In [2]:
import numpy as np
import pandas as pd
import pickle
from yaml import safe_load
import os
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error

In [3]:
filenames = []
for file in os.listdir('data'):
    filenames.append(os.path.join('data',file))

In [4]:
filenames[0:5]

['data\\1001349.yaml',
 'data\\1001351.yaml',
 'data\\1001353.yaml',
 'data\\1004729.yaml',
 'data\\1007655.yaml']

In [5]:
final_df = pd.DataFrame()
counter = 1
dataframes = []
for file in tqdm(filenames):
    with open(file, 'r') as f:
        df = pd.json_normalize(safe_load(f))
        df['match_id'] = counter
        dataframes.append(df)  # Append the dataframe to the list
        counter += 1

# Concatenate all dataframes at once
final_df = pd.concat(dataframes, ignore_index=True)

final_df

100%|██████████| 2506/2506 [07:55<00:00,  5.27it/s]


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,...,info.registry.people.Usman Mushtaq,info.registry.people.Mohammad Shahid,info.registry.people.Muktar Ali,info.registry.people.A Deshmukh,info.registry.people.Mohammad Usman,info.registry.people.Farhan Ahmed,info.registry.people.VS Wategaonkar,info.registry.people.Khurram Manzoor,info.registry.people.P Negi,info.registry.people.Hafiz Qaleem
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-18,2,6,[2017-02-17],male,T20,5.0,Sri Lanka,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-19,2,6,[2017-02-19],male,T20,2.0,Sri Lanka,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-23,1,6,[2017-02-22],male,T20,,Australia,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.91,2016-09-12,1,6,[2016-09-05],male,T20,,Hong Kong,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.91,2016-06-19,1,6,[2016-06-18],male,T20,,Zimbabwe,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2501,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.91,2016-03-05,2,6,[2016-03-04],male,T20,6.0,Pakistan,...,,,,,,,,,,
2502,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",0.91,2016-03-08,1,6,[2016-03-06],male,T20,8.0,India,...,,,,,,,,,,
2503,"[{'1st innings': {'team': 'Netherlands', 'deli...",0.91,2016-02-03,1,6,[2016-02-03],male,T20,,Netherlands,...,,,,,cee89f44,927694f7,,,,f566cd7d
2504,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2016-09-12,1,6,[2016-09-06],male,T20,,Australia,...,,,,,,,,,,


In [6]:
backup = final_df.copy()
final_df

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,...,info.registry.people.Usman Mushtaq,info.registry.people.Mohammad Shahid,info.registry.people.Muktar Ali,info.registry.people.A Deshmukh,info.registry.people.Mohammad Usman,info.registry.people.Farhan Ahmed,info.registry.people.VS Wategaonkar,info.registry.people.Khurram Manzoor,info.registry.people.P Negi,info.registry.people.Hafiz Qaleem
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-18,2,6,[2017-02-17],male,T20,5.0,Sri Lanka,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-19,2,6,[2017-02-19],male,T20,2.0,Sri Lanka,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2017-02-23,1,6,[2017-02-22],male,T20,,Australia,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.91,2016-09-12,1,6,[2016-09-05],male,T20,,Hong Kong,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.91,2016-06-19,1,6,[2016-06-18],male,T20,,Zimbabwe,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2501,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.91,2016-03-05,2,6,[2016-03-04],male,T20,6.0,Pakistan,...,,,,,,,,,,
2502,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",0.91,2016-03-08,1,6,[2016-03-06],male,T20,8.0,India,...,,,,,,,,,,
2503,"[{'1st innings': {'team': 'Netherlands', 'deli...",0.91,2016-02-03,1,6,[2016-02-03],male,T20,,Netherlands,...,,,,,cee89f44,927694f7,,,,f566cd7d
2504,"[{'1st innings': {'team': 'Australia', 'delive...",0.91,2016-09-12,1,6,[2016-09-06],male,T20,,Australia,...,,,,,,,,,,


In [14]:
final_df.drop(columns=[
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.balls_per_over',
    'info.outcome.bowl_out',
    'info.bowl_out',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
    'info.outcome.by.runs',
    'info.outcome.by.wickets',
    'info.toss.uncontested'
],inplace=True,errors='ignore')
final_df.drop(columns=final_df.filter(like='info.registry.people').columns, inplace=True,errors='ignore')
final_df.drop(columns=final_df.filter(like='info.players').columns, inplace=True,errors='ignore')


In [15]:
final_df

Unnamed: 0,innings,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],male,T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],male,T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],male,T20,Australia,20,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],male,T20,Hong Kong,20,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],male,T20,Zimbabwe,20,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2501,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],male,T20,Pakistan,20,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,2502,Mirpur
2502,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],male,T20,India,20,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,2503,Mirpur
2503,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],male,T20,Netherlands,20,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,2504,Dubai
2504,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],male,T20,Australia,20,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,2505,


In [16]:
final_df['info.gender'].value_counts()

info.gender
male    2506
Name: count, dtype: int64

In [17]:
# final_df = final_df[final_df['info.gender'] == 'male']
final_df.drop(columns=['info.gender'],inplace=True)
final_df

Unnamed: 0,innings,info.dates,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],T20,Australia,20,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],T20,Hong Kong,20,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],T20,Zimbabwe,20,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2501,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],T20,Pakistan,20,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,2502,Mirpur
2502,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],T20,India,20,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,2503,Mirpur
2503,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],T20,Netherlands,20,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,2504,Dubai
2504,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],T20,Australia,20,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,2505,


In [18]:
final_df['info.match_type'].value_counts()

info.match_type
T20    2506
Name: count, dtype: int64

In [19]:
final_df['info.overs'].value_counts()

info.overs
20    2503
50       3
Name: count, dtype: int64

In [20]:
final_df = final_df[final_df['info.overs'] == 20]
final_df.drop(columns=['info.overs','info.match_type'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.overs','info.match_type'],inplace=True)


Unnamed: 0,innings,info.dates,info.outcome.winner,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],Australia,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],Hong Kong,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],Zimbabwe,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...
2501,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],Pakistan,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,2502,Mirpur
2502,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],India,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,2503,Mirpur
2503,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],Netherlands,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,2504,Dubai
2504,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],Australia,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,2505,


In [21]:
pickle.dump(final_df,open('dataset_level1.pkl','wb'))

In [35]:
matches = pickle.load(open('dataset_level1.pkl','rb'))

matches.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.2: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.3: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
 {0.4: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 2, 'extras': 0, 'total': 2}}},
 {0.5: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.6: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 3, 'extras': 0, 'total': 3}}},
 {1.1: {'batsman': 'M Klinger',
   'bowler': 'KMDN Kulasekara',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 0, 'extras': 0, 'total': 

In [36]:
count = 1
delivery_df = pd.DataFrame()  # This will hold the final data
dataframes = []  # A list to store dataframes for concatenation later

for index, row in matches.iterrows():
    if count in [75, 108, 150, 180, 268, 360, 443, 458, 584, 748, 982, 1052, 1111, 1226, 1345]:
        count += 1
        continue
    count += 1
    
    # Temporary lists to hold data for each ball
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    match_id = []
    city = []
    venue = []
    
    for ball in row['innings'][0]['1st innings']['deliveries']:
        for key in ball.keys():
            match_id.append(count)
            batting_team.append(row['innings'][0]['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball[key]['batsman'])
            bowler.append(ball[key]['bowler'])
            runs.append(ball[key]['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            try:
                player_of_dismissed.append(ball[key]['wicket']['player_out'])
            except KeyError:
                player_of_dismissed.append('0')
    
    # Create a dataframe for this loop's data
    loop_df = pd.DataFrame({
        'match_id': match_id,
        'teams': teams,
        'batting_team': batting_team,
        'ball': ball_of_match,
        'batsman': batsman,
        'bowler': bowler,
        'runs': runs,
        'player_dismissed': player_of_dismissed,
        'city': city,
        'venue': venue
    })
    
    # Append this loop's dataframe to the list of dataframes
    dataframes.append(loop_df)

# Concatenate all dataframes once the loop is done
delivery_df = pd.concat(dataframes, ignore_index=True)

delivery_df


Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,2,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
1,2,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
2,2,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground
3,2,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground
4,2,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...,...,...
301349,2504,"[Sri Lanka, Australia]",Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium
301350,2504,"[Sri Lanka, Australia]",Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium
301351,2504,"[Sri Lanka, Australia]",Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium
301352,2504,"[Sri Lanka, Australia]",Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium


In [37]:
def bowl(row):
    for team in row['teams']:
        if team != row['batting_team']:
            return team

In [38]:
delivery_df['bowling_team'] = delivery_df.apply(bowl,axis=1)

In [39]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,2,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,2,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,2,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,2,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,2,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...
301349,2504,"[Sri Lanka, Australia]",Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
301350,2504,"[Sri Lanka, Australia]",Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
301351,2504,"[Sri Lanka, Australia]",Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
301352,2504,"[Sri Lanka, Australia]",Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [40]:
delivery_df.drop(columns=['teams'],inplace=True)

In [41]:
delivery_df['batting_team'].unique()

array(['Australia', 'Hong Kong', 'Zimbabwe', 'India', 'Bangladesh',
       'New Zealand', 'South Africa', 'England', 'West Indies', 'Ireland',
       'Afghanistan', 'Pakistan', 'United Arab Emirates', 'Scotland',
       'Oman', 'Papua New Guinea', 'Sri Lanka', 'Netherlands', 'Nepal',
       'Vanuatu', 'Philippines', 'United States of America', 'Germany',
       'Ghana', 'Uganda', 'Kenya', 'Namibia', 'Nigeria', 'Botswana',
       'Guernsey', 'Denmark', 'Jersey', 'Italy', 'Norway', 'Thailand',
       'Malaysia', 'Maldives', 'Singapore', 'Kuwait', 'Bermuda', 'Canada',
       'Cayman Islands', 'Portugal', 'Gibraltar', 'Spain', 'Bhutan',
       'Qatar', 'Iran', 'Belgium', 'Isle of Man', 'Bulgaria', 'Romania',
       'Luxembourg', 'Austria', 'Czech Republic', 'Greece', 'Serbia',
       'Malta', 'France', 'Sweden', 'Rwanda', 'Finland', 'Hungary',
       'Estonia', 'Cyprus', 'Switzerland', 'Seychelles', 'Malawi',
       'Lesotho', 'Swaziland', 'Tanzania', 'Mozambique', 'Sierra Leone',
       '

In [42]:
teams = [
    'Australia',
    'India',
    'Bangladesh',
    'New Zealand',
    'South Africa',
    'England',
    'West Indies',
    'Afghanistan',
    'Pakistan',
    'Sri Lanka'    
]

In [43]:
delivery_df = delivery_df[delivery_df['batting_team'].isin(teams)]
delivery_df = delivery_df[delivery_df['bowling_team'].isin(teams)]

In [44]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,2,Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,2,Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,2,Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,2,Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,2,Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
301350,2504,Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
301351,2504,Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
301352,2504,Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [45]:
# delivery_df['venue'].unique()

array(['Melbourne Cricket Ground', 'Simonds Stadium, South Geelong',
       'Adelaide Oval', 'McLean Park', 'Bay Oval', 'Eden Park',
       'The Rose Bowl', 'County Ground', 'Sophia Gardens',
       'Riverside Ground', 'Green Park',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'M Chinnaswamy Stadium',
       'Central Broward Regional Park Stadium Turf Ground',
       'Dubai International Cricket Stadium', 'Sheikh Zayed Stadium',
       'Sydney Cricket Ground', 'Bellerive Oval', 'Westpac Stadium',
       'Seddon Park', 'Mangaung Oval', 'Senwes Park',
       'Kensington Oval, Bridgetown', "Queen's Park Oval, Port of Spain",
       'R Premadasa Stadium', 'Warner Park, Basseterre',
       'Sabina Park, Kingston', 'R.Premadasa Stadium, Khettarama',
       'Saxton Oval', 'JSCA International Stadium Complex', 'Edgbaston',
       'Old Trafford', 'Arun Jaitley Stadium',
       'Saurashtra Cricket Association Stadium',
       'Greenfield International Stadium', 'Gaddafi Stadium'

In [33]:
output = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [34]:
output

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


In [35]:
pickle.dump(output,open('dataset_level2.pkl','wb'))

In [3]:
df = pickle.load(open('dataset_level2.pkl','rb'))

In [4]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


In [5]:
df.isnull().sum()


match_id               0
batting_team           0
bowling_team           0
ball                   0
runs                   0
player_dismissed       0
city                8671
venue                  0
dtype: int64

In [6]:
df[df['city'].isnull()]['venue'].value_counts()


venue
Dubai International Cricket Stadium        3092
Pallekele International Cricket Stadium    2066
Melbourne Cricket Ground                   1453
Sydney Cricket Ground                       749
Adelaide Oval                               498
Harare Sports Club                          372
Sharjah Cricket Stadium                     249
Sylhet International Cricket Stadium        128
Carrara Oval                                 64
Name: count, dtype: int64

In [7]:
cities = np.where(df['city'].isnull(),df['venue'].str.split().apply(lambda x:x[0]),df['city'])

In [8]:
df['city'] = cities

In [9]:
df.isnull().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [10]:
df.drop(columns=['venue'],inplace=True)

In [11]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo


In [12]:
eligible_cities = df['city'].value_counts()[df['city'].value_counts() > 600].index.tolist()
df = df[df['city'].isin(eligible_cities)]
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo


In [13]:
# Ensure that the 'runs' column is numeric
df['runs'] = pd.to_numeric(df['runs'], errors='coerce')

# Fill any NaN values with 0 (optional, depending on your dataset)
df['runs'] = df['runs'].fillna(0)

# Now, apply cumsum() within each group (match_id)
df['current_score'] = df.groupby('match_id')['runs'].cumsum()

# Check the result
print(df[['match_id', 'runs', 'current_score']].head())


   match_id  runs  current_score
0         2     0              0
1         2     0              0
2         2     1              1
3         2     2              3
4         2     0              3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['runs'] = pd.to_numeric(df['runs'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['runs'] = df['runs'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['current_score'] = df.groupby('match_id')['runs'].cumsum()


In [14]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3
...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,125
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,125
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,127


In [15]:
df['over'] = df['ball'].apply(lambda x:str(x).split(".")[0])
df['ball_no'] = df['ball'].apply(lambda x:str(x).split(".")[1])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['over'] = df['ball'].apply(lambda x:str(x).split(".")[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ball_no'] = df['ball'].apply(lambda x:str(x).split(".")[1])


Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5
...,...,...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6


In [16]:
df['balls_bowled'] = (df['over'].astype('int')*6) + df['ball_no'].astype('int')
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['balls_bowled'] = (df['over'].astype('int')*6) + df['ball_no'].astype('int')


Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5
...,...,...,...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5,119
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120


In [17]:
df['balls_left'] = 120 - df['balls_bowled']
df['balls_left'] = df['balls_left'].apply(lambda x:0 if x<0 else x)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['balls_left'] = 120 - df['balls_bowled']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['balls_left'] = df['balls_left'].apply(lambda x:0 if x<0 else x)


Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115
...,...,...,...,...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2
301351,2504,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5,119,1
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0


In [18]:
# Convert 'player_dismissed' column to 0 and 1
df['player_dismissed'] = df['player_dismissed'].apply(lambda x: 0 if x == '0' else 1)

# Ensure the column is of type int
df['player_dismissed'] = df['player_dismissed'].astype(int)

# Perform cumsum for each match_id
df['player_dismissed_cumsum'] = df.groupby('match_id')['player_dismissed'].cumsum()

# Calculate wickets left
df['wickets_left'] = 10 - df['player_dismissed_cumsum']

# Check the result
print(df[['match_id', 'player_dismissed', 'player_dismissed_cumsum', 'wickets_left']].head())


   match_id  player_dismissed  player_dismissed_cumsum  wickets_left
0         2                 0                        0            10
1         2                 0                        0            10
2         2                 0                        0            10
3         2                 0                        0            10
4         2                 0                        0            10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['player_dismissed'] = df['player_dismissed'].apply(lambda x: 0 if x == '0' else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['player_dismissed'] = df['player_dismissed'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['player_dismissed_cumsum'] = df.groupby('match_id')['pla

In [19]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,player_dismissed_cumsum,wickets_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,0,10
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,0,10
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,0,10
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,0,10
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3,8,2
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2,8,2
301351,2504,Sri Lanka,Australia,19.5,0,1,Colombo,125,19,5,119,1,9,1
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0,9,1


In [20]:
df['crr'] = (df['current_score']*6)/df['balls_bowled']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['crr'] = (df['current_score']*6)/df['balls_bowled']


In [21]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,player_dismissed_cumsum,wickets_left,crr
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,0,10,0.000000
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,0,10,0.000000
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,0,10,2.000000
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,0,10,4.500000
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,0,10,3.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301349,2504,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3,8,2,6.410256
301350,2504,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2,8,2,6.355932
301351,2504,Sri Lanka,Australia,19.5,0,1,Colombo,125,19,5,119,1,9,1,6.302521
301352,2504,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0,9,1,6.350000


In [22]:
# Convert 'runs' column to numeric, invalid parsing will be set as NaN
df['runs'] = pd.to_numeric(df['runs'], errors='coerce')

# Group the data by 'match_id'
groups = df.groupby('match_id')

# Get unique match IDs
match_ids = df['match_id'].unique()

last_five = []
for id in match_ids:
    # Get the group for the match and perform rolling sum on 'runs'
    runs_rolling_sum = groups.get_group(id)['runs'].rolling(window=30, min_periods=1).sum()
    # Extend the result to the list
    last_five.extend(runs_rolling_sum.values.tolist())

# Check the result
print(last_five)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['runs'] = pd.to_numeric(df['runs'], errors='coerce')


[0.0, 0.0, 1.0, 3.0, 3.0, 6.0, 6.0, 7.0, 7.0, 7.0, 11.0, 13.0, 14.0, 15.0, 15.0, 15.0, 19.0, 19.0, 19.0, 20.0, 21.0, 22.0, 26.0, 27.0, 30.0, 33.0, 37.0, 41.0, 42.0, 43.0, 44.0, 45.0, 44.0, 42.0, 43.0, 41.0, 41.0, 41.0, 42.0, 42.0, 39.0, 38.0, 39.0, 38.0, 39.0, 40.0, 37.0, 41.0, 42.0, 42.0, 44.0, 44.0, 40.0, 39.0, 37.0, 35.0, 31.0, 27.0, 26.0, 31.0, 31.0, 30.0, 31.0, 31.0, 32.0, 31.0, 31.0, 36.0, 37.0, 37.0, 37.0, 37.0, 36.0, 36.0, 36.0, 36.0, 35.0, 32.0, 32.0, 31.0, 29.0, 29.0, 31.0, 32.0, 32.0, 32.0, 33.0, 36.0, 37.0, 37.0, 39.0, 41.0, 40.0, 41.0, 41.0, 42.0, 44.0, 39.0, 43.0, 45.0, 48.0, 48.0, 48.0, 49.0, 49.0, 49.0, 51.0, 51.0, 51.0, 52.0, 52.0, 53.0, 53.0, 53.0, 54.0, 53.0, 52.0, 50.0, 53.0, 47.0, 46.0, 45.0, 46.0, 47.0, 49.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 8.0, 9.0, 9.0, 13.0, 14.0, 15.0, 15.0, 19.0, 20.0, 21.0, 21.0, 25.0, 26.0, 27.0, 31.0, 31.0, 31.0, 35.0, 35.0, 39.0, 40.0, 44.0, 46.0, 52.0, 53.0, 57.0, 58.0, 59.0, 59.0, 59.0, 53.0, 53.0, 54.0, 50.0, 50.0, 49.0, 50.0, 47.0,

In [23]:
df['last_five'] = last_five


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['last_five'] = last_five


In [24]:
final_df = df.groupby('match_id').sum()['runs'].reset_index().merge(df,on='match_id')


In [25]:
final_df=final_df[['batting_team','bowling_team','city','current_score','balls_left','wickets_left','crr','last_five','runs_x']]


In [26]:
final_df.dropna(inplace=True)


In [27]:
final_df.isnull().sum()


batting_team     0
bowling_team     0
city             0
current_score    0
balls_left       0
wickets_left     0
crr              0
last_five        0
runs_x           0
dtype: int64

In [28]:
final_df = final_df.sample(final_df.shape[0])


In [29]:
final_df.sample(2)


Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
28194,Bangladesh,Pakistan,Dhaka,64,60,7,6.4,34.0,108
24035,Bangladesh,New Zealand,Dhaka,52,66,10,5.777778,31.0,141


In [30]:
X = final_df.drop(columns=['runs_x'])
y = final_df['runs_x']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [31]:
X_train

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five
26528,Sri Lanka,West Indies,Abu Dhabi,38,94,10,8.769231,38.0
72936,New Zealand,Bangladesh,Mirpur,42,97,10,10.956522,42.0
73180,Afghanistan,Bangladesh,Mirpur,15,100,9,4.500000,15.0
20242,West Indies,England,Bridgetown,99,58,8,9.580645,51.0
45920,South Africa,West Indies,Tarouba,34,77,9,4.744186,29.0
...,...,...,...,...,...,...,...,...
3163,Pakistan,West Indies,Trinidad,45,79,8,6.585366,33.0
57428,Pakistan,England,Barbados,84,50,6,7.200000,33.0
18903,India,England,Ahmedabad,110,41,7,8.354430,42.0
47948,West Indies,South Africa,Johannesburg,10,112,10,7.500000,10.0


In [32]:


# Column transformer for one-hot encoding
trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse_output=False, drop='first'), ['batting_team', 'bowling_team', 'city'])
], remainder='passthrough')

# List of models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=1),
    "XGBoost": XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1),
    "LightGBM": LGBMRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1)
}

# Dictionary to store results
results = {}

# Loop through each model, create a pipeline, fit, and evaluate
for name, model in models.items():
    pipe1 = Pipeline(steps=[
        ('step1', trf),
        ('step2', StandardScaler()),  # Scale the data
        ('step3', model)  # Model step
    ])
    
    # Fit the pipeline
    pipe1.fit(X_train, y_train)
    
    # Predictions
    y_pred = pipe1.predict(X_test)
    
    # Evaluate model performance
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Store results
    results[name] = {
        "R2 Score": r2,
        "Mean Absolute Error": mae
    }

# Print results for each model
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"R2 Score: {metrics['R2 Score']}")
    print(f"Mean Absolute Error: {metrics['Mean Absolute Error']}")
    print("-" * 30)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 67864, number of used features: 75
[LightGBM] [Info] Start training from score 161.963825
Model: Logistic Regression
R2 Score: 0.8625262600299726
Mean Absolute Error: 3.6777862910355394
------------------------------
Model: Random Forest
R2 Score: 0.9445540607802703
Mean Absolute Error: 3.5926348032268054
------------------------------
Model: XGBoost
R2 Score: 0.9602288603782654
Mean Absolute Error: 2.7320116781321415
------------------------------
Model: LightGBM
R2 Score: 0.9507746973089413
Mean Absolute Error: 4.228131055962365
------------------------------


In [36]:
pipe = Pipeline(steps=[
    ('trf', trf),
    ('step2', StandardScaler()),  # Scale the data
    ('xgb_model', XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1))
])

# Fit the pipeline with your training data
pipe.fit(X_train, y_train)

# Save the pipeline
with open('finalpipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)


In [34]:
pickle.dump(pipe,open('pipe.pkl','wb'))

The output you provided seems to be the result of training various machine learning models, including LightGBM, Logistic Regression, Random Forest, and XGBoost. Here’s a breakdown of the information presented, along with explanations of the key metrics and messages:

LightGBM Information:
Row-wise Multi-threading:

The LightGBM library is automatically choosing to use row-wise multi-threading for efficiency. The overhead time for this selection was approximately 0.0128 seconds.
You can set force_row_wise=true to eliminate this overhead if desired.
If you encounter memory issues, you can use force_col_wise=true to switch to column-wise processing, which can be more memory efficient but might be slower.
Total Bins:

Total Bins 942: This indicates that LightGBM has created 942 bins to represent the continuous features in the dataset, which is part of its optimization process.
Training Data Information:

Number of data points in the train set: 67864: This tells you that there are 67,864 samples (data points) used for training the model.
Number of used features: 75: This indicates that 75 features are being used in the training process.
Start Training Score:

Start training from score 162.011818: This is the initial score (baseline prediction) before the model starts training on the data.
Model Performance Metrics:
The output lists the performance metrics for various models, comparing them based on two metrics: R² Score and Mean Absolute Error (MAE).

Logistic Regression:

R² Score: 0.8624
Indicates that approximately 86.24% of the variance in the target variable is explained by the model.
Mean Absolute Error: 3.54
On average, the model's predictions deviate from the actual values by about 3.54 units.
Random Forest:

R² Score: 0.9415
This model explains about 94.15% of the variance.
Mean Absolute Error: 3.52
Average prediction error is slightly lower than that of Logistic Regression.
XGBoost:

R² Score: 0.9605
This indicates a high explanatory power, with 96.05% of the variance explained.
Mean Absolute Error: 2.64
XGBoost has the lowest average prediction error among the models listed.
LightGBM:

R² Score: 0.9506
The model explains 95.06% of the variance, showing strong performance.
Mean Absolute Error: 4.11
This model has the highest average prediction error of the four, which might indicate that it could benefit from tuning or additional feature engineering.
Summary:
Model Comparison: XGBoost performs the best in terms of both R² score and MAE, followed closely by Random Forest and LightGBM. Logistic Regression performs the least well among the four models.
Insights: The output suggests that tree-based models (XGBoost and Random Forest) generally perform better on the dataset than Logistic Regression, likely due to their ability to capture non-linear relationships in the data.
Model Tuning: Given the performance metrics, you may want to consider hyperparameter tuning for the LightGBM model to improve its accuracy or explore other model architectures if applicable.
This analysis provides valuable insights into model performance and helps in deciding which model to use for predictions based on the given dataset.