# Cleaning the ball_by_ball dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

## Loading the dataset

In [3]:
# Read the raw ball-by-ball CSV into a DataFrame.
ball_by_ball_df = pd.read_csv(filepath_or_buffer="../raw_data/IPL_Ball_by_Ball_2008_2022.csv")

## Exploring data

In [4]:
# Preview the first five rows (head() defaults to n=5).
ball_by_ball_df.head()

Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,0,1,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
1,1312200,1,0,2,YBK Jaiswal,Mohammed Shami,JC Buttler,legbyes,0,1,1,0,0,,,,Rajasthan Royals
2,1312200,1,0,3,JC Buttler,Mohammed Shami,YBK Jaiswal,,1,0,1,0,0,,,,Rajasthan Royals
3,1312200,1,0,4,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
4,1312200,1,0,5,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals


#### shape and type of the dataset

In [5]:
# Rebind the name to an independent deep copy of the loaded frame,
# so the cleaning steps below operate on that copy.
ball_by_ball_df = ball_by_ball_df.copy(deep=True)

In [6]:
# (rows, columns) of the frame as loaded.
ball_by_ball_df.shape

(225954, 17)

In [7]:
# Column dtypes and non-null counts; shows which columns contain missing values.
ball_by_ball_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225954 entries, 0 to 225953
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   ID                 225954 non-null  int64 
 1   innings            225954 non-null  int64 
 2   overs              225954 non-null  int64 
 3   ballnumber         225954 non-null  int64 
 4   batter             225954 non-null  object
 5   bowler             225954 non-null  object
 6   non-striker        225954 non-null  object
 7   extra_type         12049 non-null   object
 8   batsman_run        225954 non-null  int64 
 9   extras_run         225954 non-null  int64 
 10  total_run          225954 non-null  int64 
 11  non_boundary       225954 non-null  int64 
 12  isWicketDelivery   225954 non-null  int64 
 13  player_out         11151 non-null   object
 14  kind               11151 non-null   object
 15  fielders_involved  7988 non-null    object
 16  BattingTeam        2

#### counting every batter, bowler and non-striker

In [8]:
# Number of distinct batter names in the raw data.
ball_by_ball_df["batter"].nunique()

605

In [9]:
# Number of distinct bowler names in the raw data.
ball_by_ball_df["bowler"].nunique()

472

In [10]:
# Number of distinct non-striker names in the raw data.
ball_by_ball_df["non-striker"].nunique()

595

#### check for duplicates

In [11]:
# Count fully duplicated rows; a result of 0 means there is nothing to drop.
ball_by_ball_df.duplicated().sum()

0

#### counting the NaN values

In [12]:
# Missing values per column, largest count first.
missing_per_column = ball_by_ball_df.isna().sum()
missing_per_column.sort_values(ascending=False)

fielders_involved    217966
kind                 214803
player_out           214803
extra_type           213905
ID                        0
extras_run                0
isWicketDelivery          0
non_boundary              0
total_run                 0
batsman_run               0
innings                   0
non-striker               0
bowler                    0
batter                    0
ballnumber                0
overs                     0
BattingTeam               0
dtype: int64

# Cleaning data

### innings Column

In [13]:
# Deliveries per innings number.
# NOTE(review): innings 3-6 are rare and presumably correspond to super overs —
# confirm before filtering or modelling on this column.
ball_by_ball_df["innings"].value_counts()

1    116883
2    108910
3        77
4        72
5         8
6         4
Name: innings, dtype: int64

### overs Column

In [14]:
# Deliveries per over index; the raw file numbers overs from 0 to 19.
ball_by_ball_df["overs"].value_counts()

0     12085
1     11935
2     11803
3     11773
4     11766
5     11720
6     11671
7     11651
8     11608
9     11578
10    11507
11    11480
12    11452
13    11355
14    11259
15    11093
16    10961
17    10650
18    10001
19     8606
Name: overs, dtype: int64

In [15]:
# The raw file numbers overs from 0 (0-19); shift them to 1-20.
# The guard on the minimum makes the cell idempotent: re-running it on the
# already shifted column (minimum 1) leaves the values untouched instead of
# pushing them to 2-21.
if ball_by_ball_df["overs"].min() == 0:
    ball_by_ball_df["overs"] = ball_by_ball_df["overs"] + 1

In [16]:
# Re-check the distribution after the shift: overs should now run from 1 to 20
# with the same per-over counts as before.
ball_by_ball_df["overs"].value_counts()

1     12085
2     11935
3     11803
4     11773
5     11766
6     11720
7     11671
8     11651
9     11608
10    11578
11    11507
12    11480
13    11452
14    11355
15    11259
16    11093
17    10961
18    10650
19    10001
20     8606
Name: overs, dtype: int64

### ballnumber Column

In [17]:
# Deliveries per ball number within an over.
# NOTE(review): ball numbers above 6 presumably come from re-bowled deliveries
# (wides / no-balls) — confirm; no cleaning is applied to this column.
ball_by_ball_df["ballnumber"].value_counts()

1     36599
2     36507
3     36413
4     36322
5     36201
6     36083
7      6585
8      1055
9       171
10       18
Name: ballnumber, dtype: int64

### batter Column

In [18]:
# Baseline count of distinct batter names before normalisation.
ball_by_ball_df["batter"].nunique()

605

In [19]:
# Distinct batter names in sorted order, to eyeball spelling or casing variants.
ball_by_ball_df["batter"].sort_values().unique()

array(['A Ashish Reddy', 'A Badoni', 'A Chandila', 'A Chopra',
       'A Choudhary', 'A Dananjaya', 'A Flintoff', 'A Kumble',
       'A Manohar', 'A Mishra', 'A Mithun', 'A Mukund', 'A Nehra',
       'A Nortje', 'A Singh', 'A Symonds', 'A Tomar', 'A Uniyal',
       'A Zampa', 'AA Bilakhia', 'AA Chavan', 'AA Jhunjhunwala',
       'AA Noffke', 'AB Agarkar', 'AB Barath', 'AB Dinda', 'AB McDonald',
       'AB de Villiers', 'AC Blizzard', 'AC Gilchrist', 'AC Thomas',
       'AC Voges', 'AD Hales', 'AD Mascarenhas', 'AD Mathews', 'AD Nath',
       'AD Russell', 'AF Milne', 'AG Murtaza', 'AG Paunikar', 'AJ Finch',
       'AJ Turner', 'AJ Tye', 'AK Markram', 'AL Menaria', 'AM Nayar',
       'AM Rahane', 'AN Ahmed', 'AN Ghosh', 'AP Dole', 'AP Majumdar',
       'AP Tare', 'AR Bawne', 'AR Patel', 'AS Joseph', 'AS Rajpoot',
       'AS Raut', 'AS Roy', 'AS Yadav', 'AT Carey', 'AT Rayudu',
       'AUK Pathan', 'Abdul Samad', 'Abdur Razzak', 'Abhishek Sharma',
       'Akash Deep', 'Aman Hakim Khan', 

In [20]:
# Strip leading/trailing whitespace from batter names using the vectorised
# .str accessor instead of a Python-level loop over every row.
ball_by_ball_df["batter"] = ball_by_ball_df["batter"].str.strip()

In [21]:
# Lower-case batter names using the vectorised .str accessor instead of a
# Python-level loop over every row.
ball_by_ball_df["batter"] = ball_by_ball_df["batter"].str.lower()

In [22]:
# Distinct batters after normalisation; compare with the count taken before
# to see whether stripping/lower-casing merged any name variants.
ball_by_ball_df["batter"].nunique()

605

### bowler Column

In [23]:
# Baseline count of distinct bowler names before normalisation.
ball_by_ball_df["bowler"].nunique()

472

In [24]:
# Distinct bowler names in sorted order, to eyeball spelling or casing variants.
ball_by_ball_df["bowler"].sort_values().unique()

array(['A Ashish Reddy', 'A Badoni', 'A Chandila', 'A Choudhary',
       'A Dananjaya', 'A Flintoff', 'A Kumble', 'A Mishra', 'A Mithun',
       'A Nehra', 'A Nel', 'A Nortje', 'A Singh', 'A Symonds', 'A Uniyal',
       'A Zampa', 'AA Chavan', 'AA Jhunjhunwala', 'AA Kazi', 'AA Noffke',
       'AB Agarkar', 'AB Dinda', 'AB McDonald', 'AC Gilchrist',
       'AC Thomas', 'AC Voges', 'AD Mascarenhas', 'AD Mathews',
       'AD Russell', 'AF Milne', 'AG Murtaza', 'AJ Finch', 'AJ Tye',
       'AK Markram', 'AL Menaria', 'AM Nayar', 'AM Rahane', 'AM Salvi',
       'AN Ahmed', 'AP Dole', 'AR Patel', 'AS Joseph', 'AS Rajpoot',
       'AS Raut', 'AS Roy', 'AU Rashid', 'AUK Pathan', 'Abdul Samad',
       'Abdur Razzak', 'Abhishek Sharma', 'Akash Deep', 'Akash Singh',
       'Aman Hakim Khan', 'Anand Rajan', 'Ankit Sharma', 'Ankit Soni',
       'Anureet Singh', 'Arshdeep Singh', 'Avesh Khan', 'Azhar Mahmood',
       'B Akhil', 'B Chipli', 'B Geeves', 'B Kumar', 'B Laughlin',
       'B Lee', 'B Stan

In [25]:
# Strip leading/trailing whitespace from bowler names using the vectorised
# .str accessor instead of a Python-level loop over every row.
ball_by_ball_df["bowler"] = ball_by_ball_df["bowler"].str.strip()

In [26]:
# Lower-case bowler names using the vectorised .str accessor instead of a
# Python-level loop over every row.
ball_by_ball_df["bowler"] = ball_by_ball_df["bowler"].str.lower()

In [27]:
# Distinct bowlers after normalisation; compare with the count taken before
# to see whether stripping/lower-casing merged any name variants.
ball_by_ball_df["bowler"].nunique()

472

### non-striker Column

In [28]:
# Rename the hyphenated column to snake_case so it matches the other column
# names. The result is reassigned rather than mutated with inplace=True, which
# keeps the step chainable and makes the data lineage explicit.
ball_by_ball_df = ball_by_ball_df.rename(columns={"non-striker": "non_striker"})

In [29]:
# Baseline count of distinct non-striker names before normalisation.
ball_by_ball_df["non_striker"].nunique()

595

In [30]:
# Distinct non-striker names in sorted order, to eyeball spelling or casing variants.
ball_by_ball_df["non_striker"].sort_values().unique()

array(['A Ashish Reddy', 'A Badoni', 'A Chandila', 'A Chopra',
       'A Choudhary', 'A Dananjaya', 'A Flintoff', 'A Kumble',
       'A Manohar', 'A Mishra', 'A Mithun', 'A Mukund', 'A Nehra',
       'A Nel', 'A Nortje', 'A Singh', 'A Symonds', 'A Tomar', 'A Uniyal',
       'A Zampa', 'AA Bilakhia', 'AA Chavan', 'AA Jhunjhunwala',
       'AA Noffke', 'AB Agarkar', 'AB Barath', 'AB Dinda', 'AB McDonald',
       'AB de Villiers', 'AC Blizzard', 'AC Gilchrist', 'AC Thomas',
       'AC Voges', 'AD Hales', 'AD Mascarenhas', 'AD Mathews', 'AD Nath',
       'AD Russell', 'AF Milne', 'AG Murtaza', 'AG Paunikar', 'AJ Finch',
       'AJ Turner', 'AJ Tye', 'AK Markram', 'AL Menaria', 'AM Nayar',
       'AM Rahane', 'AN Ahmed', 'AN Ghosh', 'AP Dole', 'AP Majumdar',
       'AP Tare', 'AR Bawne', 'AR Patel', 'AS Joseph', 'AS Rajpoot',
       'AS Raut', 'AS Yadav', 'AT Carey', 'AT Rayudu', 'AUK Pathan',
       'Abdul Samad', 'Abdur Razzak', 'Abhishek Sharma',
       'Aman Hakim Khan', 'Anirudh Singh'

In [31]:
# Strip leading/trailing whitespace from non-striker names using the
# vectorised .str accessor instead of a Python-level loop over every row.
ball_by_ball_df["non_striker"] = ball_by_ball_df["non_striker"].str.strip()

In [32]:
# Lower-case non-striker names using the vectorised .str accessor instead of
# a Python-level loop over every row.
ball_by_ball_df["non_striker"] = ball_by_ball_df["non_striker"].str.lower()

In [33]:
# Distinct non-strikers after normalisation; compare with the count taken
# before to see whether stripping/lower-casing merged any name variants.
ball_by_ball_df["non_striker"].nunique()

595

### extra_type Column

In [34]:
# Frequency of each extra type (value_counts skips missing values by default).
ball_by_ball_df["extra_type"].value_counts()

wides      7025
legbyes    3531
noballs     908
byes        583
penalty       2
Name: extra_type, dtype: int64

In [35]:
# Number of deliveries with no extra type recorded.
ball_by_ball_df["extra_type"].isna().sum()

213905

In [36]:
# Replace missing extra types with the explicit sentinel "noextra";
# only the extra_type column is filled.
ball_by_ball_df = ball_by_ball_df.fillna({"extra_type": "noextra"})

In [37]:
# Confirm the "noextra" sentinel now appears alongside the real extra types.
ball_by_ball_df["extra_type"].value_counts()

noextra    213905
wides        7025
legbyes      3531
noballs       908
byes          583
penalty         2
Name: extra_type, dtype: int64

### batsman_run Column

In [38]:
# Distinct values of batsman_run, in order of first appearance.
ball_by_ball_df["batsman_run"].unique()

array([0, 1, 4, 6, 3, 2, 5])

In [39]:
# Frequency of each batsman_run value; no cleaning is applied to this column.
ball_by_ball_df["batsman_run"].value_counts()

0    90778
1    83928
4    25500
2    14313
6    10666
3      708
5       61
Name: batsman_run, dtype: int64

### extras_run Column

In [40]:
# Distinct values of extras_run, in order of first appearance.
ball_by_ball_df["extras_run"].unique()

array([0, 1, 5, 4, 2, 3, 7])

In [41]:
# Frequency of each extras_run value; no cleaning is applied to this column.
ball_by_ball_df["extras_run"].value_counts()

0    213905
1     10760
2       499
4       436
5       275
3        78
7         1
Name: extras_run, dtype: int64

### total_run Column

In [42]:
# Distinct values of total_run, in order of first appearance.
ball_by_ball_df["total_run"].unique()

array([0, 1, 4, 6, 3, 2, 5, 7])

In [43]:
# Frequency of each total_run value; no cleaning is applied to this column.
ball_by_ball_df["total_run"].value_counts()

1    93861
0    79253
4    25828
2    15065
6    10603
3      836
5      444
7       64
Name: total_run, dtype: int64

### non_boundary Column

In [44]:
# Distinct values of the non_boundary flag.
ball_by_ball_df["non_boundary"].unique()

array([0, 1])

In [45]:
# Frequency of each non_boundary value; no cleaning is applied to this column.
ball_by_ball_df["non_boundary"].value_counts()

0    225933
1        21
Name: non_boundary, dtype: int64

### isWicketDelivery Column

In [46]:
# Distinct values of the isWicketDelivery flag.
ball_by_ball_df["isWicketDelivery"].unique()

array([0, 1])

In [47]:
# Frequency of each isWicketDelivery value; no cleaning is applied to this column.
ball_by_ball_df["isWicketDelivery"].value_counts()

0    214803
1     11151
Name: isWicketDelivery, dtype: int64

### player_out Column

In [48]:
# Number of distinct dismissed players (nunique ignores missing values by default).
ball_by_ball_df["player_out"].nunique()

573

In [49]:
# Number of deliveries with no dismissed player recorded.
ball_by_ball_df["player_out"].isna().sum()

214803

In [50]:
# Replace missing player_out entries with the explicit sentinel "noplayerout";
# only the player_out column is filled.
ball_by_ball_df = ball_by_ball_df.fillna({"player_out": "noplayerout"})

In [51]:
# Strip leading/trailing whitespace from player_out using the vectorised
# .str accessor instead of a Python-level loop over every row.
ball_by_ball_df["player_out"] = ball_by_ball_df["player_out"].str.strip()

In [52]:
# Lower-case player_out using the vectorised .str accessor so the names match
# the normalised batter / non_striker columns.
ball_by_ball_df["player_out"] = ball_by_ball_df["player_out"].str.lower()

### kind Column

In [53]:
# Distinct values of `kind`, NaN included.
ball_by_ball_df["kind"].unique()

array([nan, 'caught', 'caught and bowled', 'run out', 'bowled', 'stumped',
       'lbw', 'hit wicket', 'retired hurt', 'retired out',
       'obstructing the field'], dtype=object)

In [54]:
# Number of deliveries with no dismissal kind recorded.
ball_by_ball_df["kind"].isna().sum()

214803

In [55]:
# Frequency of each dismissal kind (value_counts skips missing values by default).
ball_by_ball_df["kind"].value_counts()

caught                   6837
bowled                   1944
run out                  1007
lbw                       685
stumped                   325
caught and bowled         323
hit wicket                 14
retired hurt               13
obstructing the field       2
retired out                 1
Name: kind, dtype: int64

In [56]:
# Replace missing dismissal kinds with the explicit sentinel "nokind";
# only the kind column is filled.
ball_by_ball_df = ball_by_ball_df.fillna({"kind": "nokind"})

In [57]:
# Strip leading/trailing whitespace from `kind` using the vectorised
# .str accessor instead of a Python-level loop over every row.
ball_by_ball_df["kind"] = ball_by_ball_df["kind"].str.strip()

In [58]:
# Lower-case `kind` using the vectorised .str accessor instead of a
# Python-level loop over every row.
ball_by_ball_df["kind"] = ball_by_ball_df["kind"].str.lower()

### fielders_involved Column

In [59]:
# Number of distinct fielders_involved values (nunique ignores missing values by default).
ball_by_ball_df["fielders_involved"].nunique()

535

In [60]:
# Number of deliveries with no fielder recorded.
ball_by_ball_df["fielders_involved"].isna().sum()

217966

In [61]:
# Replace missing fielders_involved entries with the explicit sentinel
# "nofieldersinvolved"; only the fielders_involved column is filled.
ball_by_ball_df = ball_by_ball_df.fillna({"fielders_involved": "nofieldersinvolved"})

In [62]:
# Strip leading/trailing whitespace from fielders_involved using the
# vectorised .str accessor instead of a Python-level loop over every row.
ball_by_ball_df["fielders_involved"] = ball_by_ball_df["fielders_involved"].str.strip()

In [63]:
# Lower-case fielders_involved using the vectorised .str accessor so the names
# match the other normalised player columns.
ball_by_ball_df["fielders_involved"] = ball_by_ball_df["fielders_involved"].str.lower()

In [64]:
# Frequency of each fielders_involved value after cleaning, sentinel included.
ball_by_ball_df["fielders_involved"].value_counts()

nofieldersinvolved    217966
ms dhoni                 196
kd karthik               182
ab de villiers           141
rv uthappa               133
                       ...  
ts mills                   1
vg arora                   1
pn mankad                  1
anmolpreet singh           1
nj maddinson               1
Name: fielders_involved, Length: 536, dtype: int64

### BattingTeam Column

In [65]:
# Distinct team names in the raw data; used to spot variants of the same team
# (e.g. 'Rising Pune Supergiant' vs 'Rising Pune Supergiants').
ball_by_ball_df["BattingTeam"].unique()

array(['Rajasthan Royals', 'Gujarat Titans',
       'Royal Challengers Bangalore', 'Lucknow Super Giants',
       'Sunrisers Hyderabad', 'Punjab Kings', 'Delhi Capitals',
       'Mumbai Indians', 'Chennai Super Kings', 'Kolkata Knight Riders',
       'Kings XI Punjab', 'Delhi Daredevils', 'Rising Pune Supergiant',
       'Gujarat Lions', 'Rising Pune Supergiants', 'Pune Warriors',
       'Deccan Chargers', 'Kochi Tuskers Kerala'], dtype=object)

In [66]:
# Strip leading/trailing whitespace from team names using the vectorised
# .str accessor instead of a Python-level loop over every row.
ball_by_ball_df["BattingTeam"] = ball_by_ball_df["BattingTeam"].str.strip()

In [67]:
# Lower-case team names using the vectorised .str accessor; the mapping applied
# in the next step is keyed on lower-case names.
ball_by_ball_df["BattingTeam"] = ball_by_ball_df["BattingTeam"].str.lower()

In [68]:
# Collapse alternative team names onto a single name each
# (keys are the lower-cased names to replace, values the names to keep).
team_name_dict = dict([
    ('rising pune supergiants', 'rising pune supergiant'),
    ('kings xi punjab', 'punjab kings'),
    ('delhi daredevils', 'delhi capitals'),
])

# Names that are not keys of the mapping are left unchanged.
ball_by_ball_df['BattingTeam'] = ball_by_ball_df['BattingTeam'].replace(to_replace=team_name_dict)

In [69]:
# Deliveries per batting team after the name variants have been merged.
ball_by_ball_df["BattingTeam"].value_counts()

mumbai indians                 27826
royal challengers bangalore    26512
delhi capitals                 26373
kolkata knight riders          26192
punjab kings                   26034
chennai super kings            25128
rajasthan royals               22777
sunrisers hyderabad            18196
deccan chargers                 9034
pune warriors                   5443
gujarat lions                   3566
rising pune supergiant          3480
gujarat titans                  1971
lucknow super giants            1840
kochi tuskers kerala            1582
Name: BattingTeam, dtype: int64

In [70]:
# Number of distinct team names after the name variants have been merged.
ball_by_ball_df["BattingTeam"].nunique()

15

## Visualisation of the new dataframe


In [71]:
# Preview the first five rows of the cleaned frame.
ball_by_ball_df.head(n=5)

Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non_striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,1,1,ybk jaiswal,mohammed shami,jc buttler,noextra,0,0,0,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
1,1312200,1,1,2,ybk jaiswal,mohammed shami,jc buttler,legbyes,0,1,1,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
2,1312200,1,1,3,jc buttler,mohammed shami,ybk jaiswal,noextra,1,0,1,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
3,1312200,1,1,4,ybk jaiswal,mohammed shami,jc buttler,noextra,0,0,0,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals
4,1312200,1,1,5,ybk jaiswal,mohammed shami,jc buttler,noextra,0,0,0,0,0,noplayerout,nokind,nofieldersinvolved,rajasthan royals


In [72]:
# Dtypes and non-null counts after cleaning; every column should now be fully populated.
ball_by_ball_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225954 entries, 0 to 225953
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   ID                 225954 non-null  int64 
 1   innings            225954 non-null  int64 
 2   overs              225954 non-null  int64 
 3   ballnumber         225954 non-null  int64 
 4   batter             225954 non-null  object
 5   bowler             225954 non-null  object
 6   non_striker        225954 non-null  object
 7   extra_type         225954 non-null  object
 8   batsman_run        225954 non-null  int64 
 9   extras_run         225954 non-null  int64 
 10  total_run          225954 non-null  int64 
 11  non_boundary       225954 non-null  int64 
 12  isWicketDelivery   225954 non-null  int64 
 13  player_out         225954 non-null  object
 14  kind               225954 non-null  object
 15  fielders_involved  225954 non-null  object
 16  BattingTeam        2

In [73]:
# Summary statistics of the numeric columns of the cleaned frame.
ball_by_ball_df.describe()

Unnamed: 0,ID,innings,overs,ballnumber,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery
count,225954.0,225954.0,225954.0,225954.0,225954.0,225954.0,225954.0,225954.0,225954.0
mean,832047.0,1.483868,10.185679,3.61975,1.243523,0.066907,1.31043,9.3e-05,0.049351
std,337954.2,0.503104,5.681797,1.810633,1.618166,0.34147,1.60605,0.00964,0.2166
min,335982.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,501262.0,1.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,829737.0,1.0,10.0,4.0,1.0,0.0,1.0,0.0,0.0
75%,1178395.0,2.0,15.0,5.0,1.0,0.0,1.0,0.0,0.0
max,1312200.0,6.0,20.0,10.0,6.0,7.0,7.0,1.0,1.0


In [74]:
# Persist the cleaned frame as CSV.
# NOTE(review): no `index=` argument is passed, so pandas also writes the
# RangeIndex as an unnamed first column (it reloads as "Unnamed: 0"). Passing
# index=False would avoid that — confirm what downstream readers expect first.
# NOTE(review): the cleaned file is written into the same ../raw_data folder
# as the raw input — confirm this is the intended location.
ball_by_ball_df.to_csv('../raw_data/ball_by_ball_cleaned.csv')