# IPL Betting Models: Data Engineering

### Using the existing dataset to engineer better features


In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Loading the datasets

df = pd.read_csv('~/code/patrickevans29/raw_data/complete_cleaned_dataset.csv')
ball_df = pd.read_csv('~/code/patrickevans29/raw_data/IPL_Ball_by_Ball_2008_2022_cleaned.csv')

# 1. Batsman

### Using the ball by ball data to extract key information for each batsman

In [81]:
# Exploring the data

ball_df.head()

Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non_striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,1,1,ybk jaiswal,mohammed shami,jc buttler,noextra,0,0,0,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
1,1312200,1,1,2,ybk jaiswal,mohammed shami,jc buttler,legbyes,0,1,1,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
2,1312200,1,1,3,jc buttler,mohammed shami,ybk jaiswal,noextra,1,0,1,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
3,1312200,1,1,4,ybk jaiswal,mohammed shami,jc buttler,noextra,0,0,0,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
4,1312200,1,1,5,ybk jaiswal,mohammed shami,jc buttler,noextra,0,0,0,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals


In [427]:
# Calculating the total runs scored by each batsman

batter_total_runs = ball_df.groupby(['batter'], as_index=False)['batsman_run'].sum('batsman_run')\
                        .rename(columns={'batsman_run': 'total_runs'})

In [428]:
# Calculating the balls faced by each batsman

balls_faced = ball_df[ball_df['extra_type'].isin(['noextra', 'legbyes', 'byes', 'noballs'])]\
                .groupby(['batter'], as_index=False)['ballnumber'].count()\
                .rename(columns={'ballnumber': 'balls_faced'})

In [429]:
# Calculating the number of innings for each batsman

batter_innings = ball_df.groupby(['batter'], as_index=False)['ID'].nunique()\
                    .rename(columns={'ID': 'batter_innings'})
                    
# Updating for non-strikers

non_striker_innings = ball_df.groupby(['non_striker'], as_index=False)['ID'].nunique()\
                    .rename(columns={'ID': 'non_striker_innings', 'non_striker': 'batter'})
                    
combined_innings_df = batter_innings.merge(non_striker_innings, on='batter', how='outer').fillna(0)

combined_innings_df['bat_innings'] = combined_innings_df[['batter_innings', 'non_striker_innings']].max(axis=1)

# Final dataframe

bat_innings = combined_innings_df.drop(columns=['batter_innings', 'non_striker_innings'])
                    


In [430]:
## Calculating the boundaries scored by each batsman
# A number of fours / sixes appear as non-boundary in the dataset

non_boundary = list(ball_df[ball_df['non_boundary'] == 1].index)

# Creating a temporary df to remove the non_boundary indexes

temp_df = ball_df.drop(index=non_boundary)

## Calculating the fours and sixes scored by each batsman

fours = temp_df[temp_df['batsman_run'] == 4].groupby(['batter'], as_index=False)\
    ['batsman_run'].count().rename(columns={'batsman_run': 'fours'})
    
sixes = temp_df[temp_df['batsman_run'] == 6].groupby(['batter'], as_index=False)\
    ['batsman_run'].count().rename(columns={'batsman_run': 'sixes'})

In [431]:
## Calculating the 0, 50 and 100 totals for each batsman

# Creating a batsman score for each game ID

batsman_score_game = ball_df.groupby(['ID', 'batter'], as_index=False)['batsman_run'].sum().drop(columns='ID')

# Filter the new dataframe to get 0, 50 and 100 scores only

zero = batsman_score_game[batsman_score_game['batsman_run'] == 0].groupby(['batter'], as_index=False)\
        .count().rename(columns={'batsman_run': 'zero'})

fifty = batsman_score_game[batsman_score_game['batsman_run'] >= 50].groupby(['batter'], as_index=False)\
        .count().rename(columns={'batsman_run': '50s'})
        
hundred = batsman_score_game[batsman_score_game['batsman_run'] >= 100].groupby(['batter'], as_index=False)\
        .count().rename(columns={'batsman_run': '100s'})
        
# Combining and removing the double counts

merged = zero.merge(fifty, on='batter', how='outer').merge(hundred, on='batter', how='outer').fillna(0)
merged['zero'] = merged['zero'].astype('int')
merged['50s'] = merged['50s'].sub(merged['100s']).astype('int')
merged['100s'] = merged['100s'].astype('int')

In [432]:
## Calculating batsman not out at innings end

# Finding the index of the last ball of each inning

last_ball_index_temp = list(ball_df.groupby(['ID', 'innings'], as_index=False)['innings']\
                        .idxmax().sort_values(by='innings', ascending=False)['innings'])

last_ball_index_temp.remove(0)

last_ball_index = [index - 1 for index in last_ball_index_temp]

last_ball_index.insert(0, ball_df.shape[0])

# Counting the player out on the final ball

player_out_final_ball = ball_df[(ball_df['isWicketDelivery']==1) & (ball_df.index.isin(last_ball_index))]\
                        .groupby(['player_out'], as_index=False)['ID'].count()\
                        .rename(columns={'ID': 'out_count', 'player_out': 'batter'})
                        
# Counting all batsman involved in final ball of an inning

final_ball_batter = ball_df[ball_df.index.isin(last_ball_index)].groupby(['batter'], as_index=False)\
                    ['ID'].count().rename(columns={'ID': 'at_bat_count'})
                    
final_ball_non_striker = ball_df[ball_df.index.isin(last_ball_index)].groupby(['non_striker'], as_index=False)\
                        ['ID'].count().rename(columns={'ID': 'non_striker_count', 'non_striker': 'batter'})
                        
# Combining the dataframes and calculating not_out for each batsman

combined_final_ball = final_ball_batter.merge(final_ball_non_striker, on='batter', how='outer')\
                        .merge(player_out_final_ball, on='batter', how='outer')
                        
combined_final_ball.fillna(0, inplace=True)

combined_final_ball['not_out'] = combined_final_ball['at_bat_count']\
                                .add(combined_final_ball['non_striker_count'])\
                                .sub(combined_final_ball['out_count']).astype('int')
                                
# Final dataframe

not_out = combined_final_ball[['batter', 'not_out']]


In [433]:
# Calculate the high score for each batsman

high_score = ball_df.groupby(['ID', 'batter'], as_index=False)['batsman_run']\
                        .sum().groupby('batter', as_index=False)['batsman_run'].max()\
                        .rename(columns={'batsman_run': 'high_score'})

In [434]:
# Merge all the statistics extracted

batting_stats_merged = bat_innings.merge(batter_total_runs, on='batter', how='outer')\
                        .merge(not_out, on='batter', how='outer')\
                        .merge(merged, on='batter', how='outer')\
                        .merge(fours, on='batter', how='outer')\
                        .merge(sixes, on='batter', how='outer')\
                        .merge(high_score, on='batter', how='outer')\
                        .merge(balls_faced, on='batter', how='outer').fillna(0)

In [435]:
# Check the dataframe

batting_stats_merged

Unnamed: 0,batter,bat_innings,total_runs,not_out,zero,50s,100s,fours,sixes,high_score,balls_faced
0,a ashish reddy,23.0,280.0,8.0,0.0,0.0,0.0,16.0,15.0,36.0,193.0
1,a badoni,11.0,161.0,3.0,1.0,1.0,0.0,11.0,7.0,54.0,130.0
2,a chandila,2.0,4.0,2.0,1.0,0.0,0.0,0.0,0.0,4.0,7.0
3,a chopra,6.0,53.0,0.0,0.0,0.0,0.0,7.0,0.0,24.0,71.0
4,a choudhary,3.0,25.0,2.0,0.0,0.0,0.0,1.0,1.0,15.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...
604,z khan,31.0,117.0,18.0,6.0,0.0,0.0,11.0,2.0,23.0,141.0
605,a nel,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
606,bw hilfenhaus,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,c ganapathy,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [437]:
# Calculate average score - total_runs / bat_innings

batting_stats_merged['average_score'] = round((batting_stats_merged['total_runs'] / batting_stats_merged['bat_innings']), 2).fillna(0)

In [438]:
# Calculate batting average - total_runs / (bat_innings - not_out)

batting_stats_merged['batting_average'] = round((batting_stats_merged['total_runs'] / \
    (batting_stats_merged['bat_innings'] - batting_stats_merged['not_out'])), 2).fillna(0)

# Replace infinite values with the average score

batting_stats_merged['batting_average'] = np.where(batting_stats_merged\
    ['batting_average'] == np.inf, batting_stats_merged['average_score'], batting_stats_merged['batting_average'])

In [None]:
# Calculate batting strike rate - (total_runs * 100) / balls_faced

batting_stats_merged['batting_strike_rate'] = round((batting_stats_merged['total_runs'] * 100)/ \
                                                batting_stats_merged['balls_faced'], 2).fillna(0)

In [462]:
# Final dataframe

batting_stats_merged

Unnamed: 0,batter,bat_innings,total_runs,not_out,zero,50s,100s,fours,sixes,high_score,balls_faced,average_score,batting_average,batting_strike_rate
0,a ashish reddy,23.0,280.0,8.0,0.0,0.0,0.0,16.0,15.0,36.0,193.0,12.17,18.67,145.08
1,a badoni,11.0,161.0,3.0,1.0,1.0,0.0,11.0,7.0,54.0,130.0,14.64,20.12,123.85
2,a chandila,2.0,4.0,2.0,1.0,0.0,0.0,0.0,0.0,4.0,7.0,2.00,2.00,57.14
3,a chopra,6.0,53.0,0.0,0.0,0.0,0.0,7.0,0.0,24.0,71.0,8.83,8.83,74.65
4,a choudhary,3.0,25.0,2.0,0.0,0.0,0.0,1.0,1.0,15.0,20.0,8.33,25.00,125.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,z khan,31.0,117.0,18.0,6.0,0.0,0.0,11.0,2.0,23.0,141.0,3.77,9.00,82.98
605,a nel,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00
606,bw hilfenhaus,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00
607,c ganapathy,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00


In [464]:
# Export

batting_stats_merged.to_csv('~/code/patrickevans29/raw_data/batting_features_dataset.csv', index=False)