In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

In [2]:
# Download the dataset to a dataframe
complete_df = pd.read_csv('../raw_data/complete_cleaned_dataset.csv')
complete_df.head(1)

Unnamed: 0.1,Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non_striker,extra_type,batsman_run,...,TossDecision,SuperOver,WinningTeam,WonBy,Margin,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,0,1312200,1,1,1,ybk jaiswal,mohammed shami,jc buttler,noextra,0,...,bat,n,gujarat titans,wickets,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",cb gaffaney,nitin menon


In [3]:
# Check the columns
complete_df.columns

Index(['Unnamed: 0', 'ID', 'innings', 'overs', 'ballnumber', 'batter',
       'bowler', 'non_striker', 'extra_type', 'batsman_run', 'extras_run',
       'total_run', 'non_boundary', 'isWicketDelivery', 'player_out', 'kind',
       'fielders_involved', 'BattingTeam', 'City', 'Date', 'Season',
       'MatchNumber', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision',
       'SuperOver', 'WinningTeam', 'WonBy', 'Margin', 'Player_of_Match',
       'Team1Players', 'Team2Players', 'Umpire1', 'Umpire2'],
      dtype='object')

In [4]:
# Create 'innings_total' feature

complete_df['innings_total'] = complete_df.groupby(['ID', 'innings'])['total_run'].transform('sum')
complete_df.Season

0         2022
1         2022
2         2022
3         2022
4         2022
          ... 
225949    2008
225950    2008
225951    2008
225952    2008
225953    2008
Name: Season, Length: 225954, dtype: int64

In [5]:
# Remove unnecessary features

to_model_df = complete_df[['ID', 'innings', 'batter',
       'bowler', 'batsman_run', 'extras_run',
       'total_run', 'BattingTeam', 'City', 'Date', 'Season',
       'MatchNumber', 'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision',
       'SuperOver', 'WinningTeam', 'WonBy', 'Margin', 'Player_of_Match',
       'Team1Players', 'Team2Players', 'innings_total']].copy()

In [6]:
# Engineer the team_batting_average feature

to_model_df['team_batting_average'] = to_model_df.groupby('BattingTeam')['total_run'].transform('sum') / to_model_df.groupby('BattingTeam')['ID'].transform('nunique')

In [7]:
# Drop any rows where innings is not == 1 or 2

to_model_df.innings.value_counts()

1    116883
2    108910
3        77
4        72
5         8
6         4
Name: innings, dtype: int64

In [8]:
to_model_df = to_model_df[(to_model_df['innings'] == 1) | (to_model_df['innings'] == 2)]

In [9]:
# Convert the innings totals into columns

# Group by 'ID' and 'innings' and calculate innings_total
innings_totals = to_model_df.groupby(['ID', 'innings'])['innings_total'].mean().unstack()
innings_totals.columns = ['TeamA_innings_total', 'TeamB_innings_total']

# Merge the innings_totals DataFrame back into new_df on 'ID'
to_model_df = pd.merge(to_model_df, innings_totals, left_on='ID', right_index=True)

In [10]:
# Convert the average innings totals into columns

# Group by 'ID' and 'innings' and calculate average innings_total
batting_averages = to_model_df.groupby(['ID', 'innings'])['team_batting_average'].mean().unstack()
batting_averages.columns = ['TeamA_batting_average', 'TeamB_batting_average']

# Merge the innings_totals DataFrame back into new_df on 'ID'
to_model_df = pd.merge(to_model_df, batting_averages, left_on='ID', right_index=True)

In [11]:
to_model_df.head(50)

Unnamed: 0,ID,innings,batter,bowler,batsman_run,extras_run,total_run,BattingTeam,City,Date,...,Margin,Player_of_Match,Team1Players,Team2Players,innings_total,team_batting_average,TeamA_innings_total,TeamB_innings_total,TeamA_batting_average,TeamB_batting_average
0,1312200,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
1,1312200,1,ybk jaiswal,mohammed shami,0,1,1,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
2,1312200,1,jc buttler,mohammed shami,1,0,1,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
3,1312200,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
4,1312200,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
5,1312200,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
6,1312200,1,jc buttler,yash dayal,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
7,1312200,1,jc buttler,yash dayal,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
8,1312200,1,jc buttler,yash dayal,4,0,4,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
9,1312200,1,jc buttler,yash dayal,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375


In [12]:
# Drop ID duplicates now that the data has been generated

to_model_df = to_model_df.drop_duplicates(subset=['ID'])
to_model_df

Unnamed: 0,ID,innings,batter,bowler,batsman_run,extras_run,total_run,BattingTeam,City,Date,...,Margin,Player_of_Match,Team1Players,Team2Players,innings_total,team_batting_average,TeamA_innings_total,TeamB_innings_total,TeamA_batting_average,TeamB_batting_average
0,1312200,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.437500
233,1312199,1,v kohli,ta boult,0,0,0,royal challengers bangalore,ahmedabad,2022-05-27,...,7.0,jc buttler,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...",157,155.707965,157.0,161.0,155.707965,155.397906
469,1312198,1,v kohli,mohsin khan,0,0,0,royal challengers bangalore,kolkata,2022-05-25,...,14.0,rm patidar,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['q de kock', 'kl rahul', 'm vohra', 'dj hooda...",207,155.707965,207.0,193.0,155.707965,169.866667
725,1312197,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,kolkata,2022-05-24,...,7.0,da miller,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",188,155.397906,188.0,191.0,155.397906,166.437500
977,1304116,1,pk garg,ls livingstone,1,0,1,sunrisers hyderabad,mumbai,2022-05-22,...,5.0,harpreet brar,"['pk garg', 'abhishek sharma', 'ra tripathi', ...","['jm bairstow', 's dhawan', 'm shahrukh khan',...",157,155.546053,157.0,160.0,155.546053,158.518349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224776,335986,1,ac gilchrist,ab dinda,1,0,1,deccan chargers,kolkata,2008-04-20,...,5.0,dj hussey,"['wp saha', 'bb mccullum', 'rt ponting', 'sc g...","['ac gilchrist', 'y venugopal rao', 'vvs laxma...",110,152.840000,110.0,112.0,152.840000,153.367713
225016,335985,1,l ronchi,p kumar,0,0,0,mumbai indians,mumbai,2008-04-20,...,5.0,mv boucher,"['l ronchi', 'st jayasuriya', 'dj thornely', '...","['s chanderpaul', 'r dravid', 'lrpl taylor', '...",165,158.714286,165.0,166.0,158.714286,155.707965
225262,335984,1,t kohli,gd mcgrath,0,0,0,rajasthan royals,delhi,2008-04-19,...,9.0,mf maharoof,"['g gambhir', 'v sehwag', 's dhawan', 'mk tiwa...","['t kohli', 'yk pathan', 'sr watson', 'm kaif'...",129,155.397906,129.0,132.0,155.397906,153.754464
225481,335983,1,pa patel,b lee,0,0,0,chennai super kings,chandigarh,2008-04-19,...,33.0,mek hussey,"['k goel', 'jr hopes', 'kc sangakkara', 'yuvra...","['pa patel', 'ml hayden', 'mek hussey', 'ms dh...",240,160.543269,240.0,207.0,160.543269,158.518349


In [13]:
to_model_df.columns

Index(['ID', 'innings', 'batter', 'bowler', 'batsman_run', 'extras_run',
       'total_run', 'BattingTeam', 'City', 'Date', 'Season', 'MatchNumber',
       'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision', 'SuperOver',
       'WinningTeam', 'WonBy', 'Margin', 'Player_of_Match', 'Team1Players',
       'Team2Players', 'innings_total', 'team_batting_average',
       'TeamA_innings_total', 'TeamB_innings_total', 'TeamA_batting_average',
       'TeamB_batting_average'],
      dtype='object')

In [14]:
checker_df = to_model_df[['ID', 'Team1', 'Team2', 'TossWinner', 'TossDecision', 'TeamA_innings_total', 'TeamB_innings_total', 'WinningTeam']]
checker_df.head(20)

Unnamed: 0,ID,Team1,Team2,TossWinner,TossDecision,TeamA_innings_total,TeamB_innings_total,WinningTeam
0,1312200,rajasthan royals,gujarat titans,rajasthan royals,bat,130.0,133.0,gujarat titans
233,1312199,royal challengers bangalore,rajasthan royals,rajasthan royals,field,157.0,161.0,rajasthan royals
469,1312198,royal challengers bangalore,lucknow super giants,lucknow super giants,field,207.0,193.0,royal challengers bangalore
725,1312197,rajasthan royals,gujarat titans,gujarat titans,field,188.0,191.0,gujarat titans
977,1304116,sunrisers hyderabad,punjab kings,sunrisers hyderabad,bat,157.0,160.0,punjab kings
1197,1304115,delhi capitals,mumbai indians,mumbai indians,field,159.0,160.0,mumbai indians
1444,1304114,chennai super kings,rajasthan royals,chennai super kings,bat,150.0,151.0,rajasthan royals
1691,1304113,gujarat titans,royal challengers bangalore,gujarat titans,bat,168.0,170.0,royal challengers bangalore
1928,1304112,lucknow super giants,kolkata knight riders,lucknow super giants,bat,210.0,208.0,lucknow super giants
2177,1304111,sunrisers hyderabad,mumbai indians,mumbai indians,field,193.0,190.0,sunrisers hyderabad


In [15]:
to_model_df.head()

Unnamed: 0,ID,innings,batter,bowler,batsman_run,extras_run,total_run,BattingTeam,City,Date,...,Margin,Player_of_Match,Team1Players,Team2Players,innings_total,team_batting_average,TeamA_innings_total,TeamB_innings_total,TeamA_batting_average,TeamB_batting_average
0,1312200,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,ahmedabad,2022-05-29,...,7.0,hh pandya,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",130,155.397906,130.0,133.0,155.397906,166.4375
233,1312199,1,v kohli,ta boult,0,0,0,royal challengers bangalore,ahmedabad,2022-05-27,...,7.0,jc buttler,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...",157,155.707965,157.0,161.0,155.707965,155.397906
469,1312198,1,v kohli,mohsin khan,0,0,0,royal challengers bangalore,kolkata,2022-05-25,...,14.0,rm patidar,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['q de kock', 'kl rahul', 'm vohra', 'dj hooda...",207,155.707965,207.0,193.0,155.707965,169.866667
725,1312197,1,ybk jaiswal,mohammed shami,0,0,0,rajasthan royals,kolkata,2022-05-24,...,7.0,da miller,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",188,155.397906,188.0,191.0,155.397906,166.4375
977,1304116,1,pk garg,ls livingstone,1,0,1,sunrisers hyderabad,mumbai,2022-05-22,...,5.0,harpreet brar,"['pk garg', 'abhishek sharma', 'ra tripathi', ...","['jm bairstow', 's dhawan', 'm shahrukh khan',...",157,155.546053,157.0,160.0,155.546053,158.518349


In [22]:
to_model_df.columns

Index(['ID', 'innings', 'batter', 'bowler', 'batsman_run', 'extras_run',
       'total_run', 'BattingTeam', 'City', 'Date', 'Season', 'MatchNumber',
       'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision', 'SuperOver',
       'WinningTeam', 'WonBy', 'Margin', 'Player_of_Match', 'Team1Players',
       'Team2Players', 'innings_total', 'team_batting_average',
       'TeamA_innings_total', 'TeamB_innings_total', 'TeamA_batting_average',
       'TeamB_batting_average'],
      dtype='object')

In [23]:
final_to_model_df = to_model_df[['ID', 'City', 'Date', 'Season', 'MatchNumber',
       'Team1', 'Team2', 'Venue', 'TossWinner', 'TossDecision',
       'WinningTeam', 'innings_total', 'TeamA_batting_average',
       'TeamB_batting_average',
       'TeamA_innings_total', 'TeamB_innings_total']].copy()

In [24]:
final_to_model_df.to_csv("../raw_data/final_to_model_df.csv", index=False)

In [25]:
final_to_model_df.head()

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,TeamA_batting_average,TeamB_batting_average,TeamA_innings_total,TeamB_innings_total
0,1312200,ahmedabad,2022-05-29,2022,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,155.397906,166.4375,130.0,133.0
233,1312199,ahmedabad,2022-05-27,2022,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,155.707965,155.397906,157.0,161.0
469,1312198,kolkata,2022-05-25,2022,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,155.707965,169.866667,207.0,193.0
725,1312197,kolkata,2022-05-24,2022,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,155.397906,166.4375,188.0,191.0
977,1304116,mumbai,2022-05-22,2022,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,155.546053,158.518349,157.0,160.0
