## **IPL T20 Match Outcome Analysis**

### *Business Problem*
*Identify the key factors that influence match outcomes in T20 cricket, with a special focus on batting team performance, bowling impact, and match context — so that franchises can make better strategic decisions about lineups, toss decisions, and player roles.*

### *Environmental Setup*

In [132]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 200) 

### *Data Loading*

In [133]:
# Load the ball-by-ball dataset. low_memory=False makes pandas parse the file
# in one pass instead of internal chunks, which avoids mixed-type inference.
df = pd.read_csv(filepath_or_buffer="IPL.csv", low_memory=False)

### *Data Understanding*

In [134]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,date,match_type,event_name,innings,batting_team,bowling_team,over,ball,ball_no,batter,bat_pos,runs_batter,balls_faced,bowler,valid_ball,runs_extras,runs_total,runs_bowler,runs_not_boundary,extra_type,non_striker,non_striker_pos,wicket_kind,player_out,fielders,runs_target,review_batter,team_reviewed,review_decision,umpire,umpires_call,player_of_match,match_won_by,win_outcome,toss_winner,toss_decision,venue,city,day,month,year,season,gender,team_type,superover_winner,result_type,method,balls_per_over,overs,event_match_no,stage,match_number,team_runs,team_balls,team_wicket,new_batter,batter_runs,batter_balls,bowler_wicket,batting_partners,next_batter,striker_out
0,131970,335982,18-04-2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,0.1,SC Ganguly,1,0,1,P Kumar,1,1,1,0,False,legbyes,BB McCullum,2,,,,,,,,,False,BB McCullum,Kolkata Knight Riders,140 runs,Royal Challengers Bangalore,field,M Chinnaswamy Stadium,Bangalore,18,4,2008,2007/08,male,club,,,,6,20,1,Unknown,Unknown,1,1,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,False
1,131971,335982,18-04-2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,0.2,BB McCullum,2,0,1,P Kumar,1,0,0,0,False,,SC Ganguly,1,,,,,,,,,False,BB McCullum,Kolkata Knight Riders,140 runs,Royal Challengers Bangalore,field,M Chinnaswamy Stadium,Bangalore,18,4,2008,2007/08,male,club,,,,6,20,1,Unknown,Unknown,1,2,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,False
2,131972,335982,18-04-2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,0.3,BB McCullum,2,0,0,P Kumar,0,1,1,1,False,wides,SC Ganguly,1,,,,,,,,,False,BB McCullum,Kolkata Knight Riders,140 runs,Royal Challengers Bangalore,field,M Chinnaswamy Stadium,Bangalore,18,4,2008,2007/08,male,club,,,,6,20,1,Unknown,Unknown,2,2,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,False
3,131973,335982,18-04-2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,0.3,BB McCullum,2,0,1,P Kumar,1,0,0,0,False,,SC Ganguly,1,,,,,,,,,False,BB McCullum,Kolkata Knight Riders,140 runs,Royal Challengers Bangalore,field,M Chinnaswamy Stadium,Bangalore,18,4,2008,2007/08,male,club,,,,6,20,1,Unknown,Unknown,2,3,0,,0,2,0,"('BB McCullum', 'SC Ganguly')",,False
4,131974,335982,18-04-2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,0.4,BB McCullum,2,0,1,P Kumar,1,0,0,0,False,,SC Ganguly,1,,,,,,,,,False,BB McCullum,Kolkata Knight Riders,140 runs,Royal Challengers Bangalore,field,M Chinnaswamy Stadium,Bangalore,18,4,2008,2007/08,male,club,,,,6,20,1,Unknown,Unknown,2,4,0,,0,3,0,"('BB McCullum', 'SC Ganguly')",,False


In [135]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,date,match_type,event_name,innings,batting_team,bowling_team,over,ball,ball_no,batter,bat_pos,runs_batter,balls_faced,bowler,valid_ball,runs_extras,runs_total,runs_bowler,runs_not_boundary,extra_type,non_striker,non_striker_pos,wicket_kind,player_out,fielders,runs_target,review_batter,team_reviewed,review_decision,umpire,umpires_call,player_of_match,match_won_by,win_outcome,toss_winner,toss_decision,venue,city,day,month,year,season,gender,team_type,superover_winner,result_type,method,balls_per_over,overs,event_match_no,stage,match_number,team_runs,team_balls,team_wicket,new_batter,batter_runs,batter_balls,bowler_wicket,batting_partners,next_batter,striker_out
278200,277951,1473511,03-06-2025,T20,Indian Premier League,2,Punjab Kings,Royal Challengers Bengaluru,19,2,19.2,Shashank Singh,6,0,1,JR Hazlewood,1,0,0,0,False,,KA Jamieson,9,,,,191.0,,,,,False,KH Pandya,Royal Challengers Bengaluru,6 runs,Punjab Kings,field,"Narendra Modi Stadium, Ahmedabad",Ahmedabad,3,6,2025,2025,male,club,,,,6,20,Unknown,Final,Unknown,162,116,7,,39,26,0,"('KA Jamieson', 'Shashank Singh')",,False
278201,277952,1473511,03-06-2025,T20,Indian Premier League,2,Punjab Kings,Royal Challengers Bengaluru,19,3,19.3,Shashank Singh,6,6,1,JR Hazlewood,1,0,6,6,False,,KA Jamieson,9,,,,191.0,,,,,False,KH Pandya,Royal Challengers Bengaluru,6 runs,Punjab Kings,field,"Narendra Modi Stadium, Ahmedabad",Ahmedabad,3,6,2025,2025,male,club,,,,6,20,Unknown,Final,Unknown,168,117,7,,45,27,0,"('KA Jamieson', 'Shashank Singh')",,False
278202,277953,1473511,03-06-2025,T20,Indian Premier League,2,Punjab Kings,Royal Challengers Bengaluru,19,4,19.4,Shashank Singh,6,4,1,JR Hazlewood,1,0,4,4,False,,KA Jamieson,9,,,,191.0,,,,,False,KH Pandya,Royal Challengers Bengaluru,6 runs,Punjab Kings,field,"Narendra Modi Stadium, Ahmedabad",Ahmedabad,3,6,2025,2025,male,club,,,,6,20,Unknown,Final,Unknown,172,118,7,,49,28,0,"('KA Jamieson', 'Shashank Singh')",,False
278203,277954,1473511,03-06-2025,T20,Indian Premier League,2,Punjab Kings,Royal Challengers Bengaluru,19,5,19.5,Shashank Singh,6,6,1,JR Hazlewood,1,0,6,6,False,,KA Jamieson,9,,,,191.0,,,,,False,KH Pandya,Royal Challengers Bengaluru,6 runs,Punjab Kings,field,"Narendra Modi Stadium, Ahmedabad",Ahmedabad,3,6,2025,2025,male,club,,,,6,20,Unknown,Final,Unknown,178,119,7,,55,29,0,"('KA Jamieson', 'Shashank Singh')",,False
278204,277955,1473511,03-06-2025,T20,Indian Premier League,2,Punjab Kings,Royal Challengers Bengaluru,19,6,19.6,Shashank Singh,6,6,1,JR Hazlewood,1,0,6,6,False,,KA Jamieson,9,,,,191.0,,,,,False,KH Pandya,Royal Challengers Bengaluru,6 runs,Punjab Kings,field,"Narendra Modi Stadium, Ahmedabad",Ahmedabad,3,6,2025,2025,male,club,,,,6,20,Unknown,Final,Unknown,184,120,7,,61,30,0,"('KA Jamieson', 'Shashank Singh')",,False


In [136]:
df.shape

(278205, 64)

In [137]:
df.columns

Index(['Unnamed: 0', 'match_id', 'date', 'match_type', 'event_name', 'innings',
       'batting_team', 'bowling_team', 'over', 'ball', 'ball_no', 'batter',
       'bat_pos', 'runs_batter', 'balls_faced', 'bowler', 'valid_ball',
       'runs_extras', 'runs_total', 'runs_bowler', 'runs_not_boundary',
       'extra_type', 'non_striker', 'non_striker_pos', 'wicket_kind',
       'player_out', 'fielders', 'runs_target', 'review_batter',
       'team_reviewed', 'review_decision', 'umpire', 'umpires_call',
       'player_of_match', 'match_won_by', 'win_outcome', 'toss_winner',
       'toss_decision', 'venue', 'city', 'day', 'month', 'year', 'season',
       'gender', 'team_type', 'superover_winner', 'result_type', 'method',
       'balls_per_over', 'overs', 'event_match_no', 'stage', 'match_number',
       'team_runs', 'team_balls', 'team_wicket', 'new_batter', 'batter_runs',
       'batter_balls', 'bowler_wicket', 'batting_partners', 'next_batter',
       'striker_out'],
      dtype='object')

In [138]:
# Columns removed before the per-column exploration below.
# NOTE(review): 'toss_winner' and 'toss_decision' are dropped here although the
# business problem mentions toss decisions, and without 'match_id' rows can no
# longer be grouped per match — confirm this is intended.
drop_cols = [
    'Unnamed: 0', 'match_id', 'date', 'match_type', 'event_name', 
    'ball', 'runs_bowler', 'runs_not_boundary', 'extra_type',
    'non_striker', 'non_striker_pos', 'fielders', 'runs_target',
    'review_batter', 'team_reviewed', 'review_decision',
    'umpire', 'umpires_call', 'player_of_match', 'toss_winner',
    'toss_decision', 'day', 'month', 'year', 'gender', 'team_type',
    'superover_winner', 'result_type', 'method', 'balls_per_over',
    'overs', 'event_match_no', 'stage', 'match_number',
    'team_runs', 'team_balls', 'team_wicket', 'new_batter',
    'batter_runs', 'batter_balls', 'bowler_wicket',
    'batting_partners', 'next_batter', 'striker_out'
]

# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been removed.
df = df.drop(columns=drop_cols, errors="ignore")

In [139]:
df.columns

Index(['innings', 'batting_team', 'bowling_team', 'over', 'ball_no', 'batter',
       'bat_pos', 'runs_batter', 'balls_faced', 'bowler', 'valid_ball',
       'runs_extras', 'runs_total', 'wicket_kind', 'player_out',
       'match_won_by', 'win_outcome', 'venue', 'city', 'season'],
      dtype='object')

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278205 entries, 0 to 278204
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   innings       278205 non-null  int64  
 1   batting_team  278205 non-null  object 
 2   bowling_team  278205 non-null  object 
 3   over          278205 non-null  int64  
 4   ball_no       278205 non-null  float64
 5   batter        278205 non-null  object 
 6   bat_pos       278205 non-null  int64  
 7   runs_batter   278205 non-null  int64  
 8   balls_faced   278205 non-null  int64  
 9   bowler        278205 non-null  object 
 10  valid_ball    278205 non-null  int64  
 11  runs_extras   278205 non-null  int64  
 12  runs_total    278205 non-null  int64  
 13  wicket_kind   13823 non-null   object 
 14  player_out    13823 non-null   object 
 15  match_won_by  278205 non-null  object 
 16  win_outcome   273503 non-null  object 
 17  venue         278205 non-null  object 
 18  city

In [141]:
# Missing values per column, printed as a one-column table named "Null Count".
print(df.isna().sum().rename("Null Count").to_frame())

              Null Count
innings                0
batting_team           0
bowling_team           0
over                   0
ball_no                0
batter                 0
bat_pos                0
runs_batter            0
balls_faced            0
bowler                 0
valid_ball             0
runs_extras            0
runs_total             0
wicket_kind       264382
player_out        264382
match_won_by           0
win_outcome         4702
venue                  0
city                   0
season                 0


In [142]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted).
# NOTE(review): 'match_id', 'date' and 'ball' were dropped above, so rows that
# look identical here may be distinct deliveries from different matches —
# confirm before removing any of them.
df.duplicated(keep="first").sum()

420

In [143]:
df.describe()

Unnamed: 0,innings,over,ball_no,bat_pos,runs_batter,balls_faced,valid_ball,runs_extras,runs_total
count,278205.0,278205.0,278205.0,278205.0,278205.0,278205.0,278205.0,278205.0,278205.0
mean,1.482914,9.193839,9.542725,3.612555,1.277378,0.967362,0.963182,0.067971,1.34535
std,0.502571,5.681511,5.682938,2.168978,1.651107,0.177687,0.188315,0.343033,1.63762
min,1.0,0.0,0.1,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,4.0,4.5,2.0,0.0,1.0,1.0,0.0,0.0
50%,1.0,9.0,9.4,3.0,1.0,1.0,1.0,0.0,1.0
75%,2.0,14.0,14.4,5.0,1.0,1.0,1.0,0.0,1.0
max,6.0,19.0,19.6,11.0,6.0,1.0,1.0,7.0,7.0


### *Data Exploration*

`innings`
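- Categorical column (innings number stored as an integer)
- Important column
- Most deliveries belong to innings 1 (144131 rows) or innings 2 (133903 rows)
- Values range from 1 to 6; innings 3–6 are rare (171 rows in total) and presumably correspond to super overs (to be confirmed)
- Data type is int64, which is correct, so no conversion is needed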

In [46]:
df['innings']

0         1
1         1
2         1
3         1
4         1
         ..
278200    2
278201    2
278202    2
278203    2
278204    2
Name: innings, Length: 278205, dtype: int64

In [47]:
df['innings'].nunique()

6

In [48]:
df['innings'].value_counts()

innings
1    144131
2    133903
3        83
4        76
5         8
6         4
Name: count, dtype: int64

In [49]:
df['innings'].dtype

dtype('int64')

`batting_team`
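- Categorical column
- Name of the team that is batting
- 19 unique team names in total
- Some teams appear under more than one name (e.g. `Rising Pune Supergiant` / `Rising Pune Supergiants`, `Royal Challengers Bangalore` / `Royal Challengers Bengaluru`); these names need to be merged
- Data type is object
- No missing values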

In [50]:
df['batting_team']

0         Kolkata Knight Riders
1         Kolkata Knight Riders
2         Kolkata Knight Riders
3         Kolkata Knight Riders
4         Kolkata Knight Riders
                  ...          
278200             Punjab Kings
278201             Punjab Kings
278202             Punjab Kings
278203             Punjab Kings
278204             Punjab Kings
Name: batting_team, Length: 278205, dtype: object

In [51]:
df['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [52]:
df['batting_team'].nunique()

19

In [53]:
df['batting_team'].value_counts()

batting_team
Mumbai Indians                 33323
Kolkata Knight Riders          30895
Chennai Super Kings            30395
Royal Challengers Bangalore    28205
Rajasthan Royals               27922
Sunrisers Hyderabad            23442
Kings XI Punjab                22646
Delhi Daredevils               18786
Delhi Capitals                 12615
Deccan Chargers                 9034
Punjab Kings                    8865
Gujarat Titans                  7331
Lucknow Super Giants            7144
Pune Warriors                   5443
Gujarat Lions                   3566
Royal Challengers Bengaluru     3531
Rising Pune Supergiant          1900
Kochi Tuskers Kerala            1582
Rising Pune Supergiants         1580
Name: count, dtype: int64

In [56]:
df['batting_team'].dtype

dtype('O')

In [69]:
df['batting_team'].isna().sum()

0

`bowling_team`
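- Categorical column
- Name of the team that is bowling
- 19 unique team names in total
- Some teams appear under more than one name (e.g. `Rising Pune Supergiant` / `Rising Pune Supergiants`, `Royal Challengers Bangalore` / `Royal Challengers Bengaluru`); these names need to be merged
- Data type is object
- No missing values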

In [57]:
df['bowling_team']

0         Royal Challengers Bangalore
1         Royal Challengers Bangalore
2         Royal Challengers Bangalore
3         Royal Challengers Bangalore
4         Royal Challengers Bangalore
                     ...             
278200    Royal Challengers Bengaluru
278201    Royal Challengers Bengaluru
278202    Royal Challengers Bengaluru
278203    Royal Challengers Bengaluru
278204    Royal Challengers Bengaluru
Name: bowling_team, Length: 278205, dtype: object

In [58]:
df['bowling_team'].unique()

array(['Royal Challengers Bangalore', 'Kolkata Knight Riders',
       'Kings XI Punjab', 'Chennai Super Kings', 'Delhi Daredevils',
       'Rajasthan Royals', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [59]:
df['bowling_team'].nunique()

19

In [60]:
df['bowling_team'].value_counts()

bowling_team
Mumbai Indians                 33427
Kolkata Knight Riders          31169
Chennai Super Kings            30211
Royal Challengers Bangalore    28358
Rajasthan Royals               28194
Sunrisers Hyderabad            23352
Kings XI Punjab                22483
Delhi Daredevils               18725
Delhi Capitals                 12886
Deccan Chargers                 9039
Punjab Kings                    8585
Gujarat Titans                  7145
Lucknow Super Giants            6934
Pune Warriors                   5457
Gujarat Lions                   3545
Royal Challengers Bengaluru     3538
Rising Pune Supergiant          1928
Rising Pune Supergiants         1615
Kochi Tuskers Kerala            1614
Name: count, dtype: int64

In [68]:
df['bowling_team'].isna().sum()

0

`over`
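- Contains the over number of the delivery (0-indexed: values run from 0 to 19)
- Categorical (ordinal) column with 20 unique values
- Most frequent value is 0, with a count of 14839
- Least frequent value is 19, with a count of 10632
- No missing values
- Data type is int64, so no type correction is needed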

In [144]:
df['over']

0          0
1          0
2          0
3          0
4          0
          ..
278200    19
278201    19
278202    19
278203    19
278204    19
Name: over, Length: 278205, dtype: int64

In [145]:
df['over'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

In [146]:
df['over'].nunique()

20

In [147]:
df['over'].value_counts()

over
0     14839
1     14682
2     14497
3     14482
4     14469
5     14400
6     14343
7     14328
8     14291
9     14248
10    14171
11    14148
12    14118
13    14002
14    13899
15    13737
16    13507
17    13097
18    12315
19    10632
Name: count, dtype: int64

In [148]:
df['over'].isna().sum()

0

In [149]:
df['over'].dtype

dtype('int64')

`ball_no`
- Categorical column
- 123 unique values in total
- Most frequent value is 0.1, with a count of 2517; the least frequent values are 18.7 and 11.7, with a count of 1 each
- Data type is float64
- No missing values in the column

In [150]:
df['ball_no']

0          0.1
1          0.2
2          0.3
3          0.3
4          0.4
          ... 
278200    19.2
278201    19.3
278202    19.4
278203    19.5
278204    19.6
Name: ball_no, Length: 278205, dtype: float64

In [151]:
df['ball_no'].unique()

array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  1.1,  1.2,  1.3,  1.4,  1.5,
        1.6,  2.1,  2.2,  2.3,  2.4,  2.5,  2.6,  3.1,  3.2,  3.3,  3.4,
        3.5,  3.6,  4.1,  4.2,  4.3,  4.4,  4.5,  4.6,  5.1,  5.2,  5.3,
        5.4,  5.5,  5.6,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,  7.1,  7.2,
        7.3,  7.4,  7.5,  7.6,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  9.1,
        9.2,  9.3,  9.4,  9.5,  9.6, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6,
       11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 12.1, 12.2, 12.3, 12.4, 12.5,
       12.6, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 14.1, 14.2, 14.3, 14.4,
       14.5, 14.6, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 16.1, 16.2, 16.3,
       16.4, 16.5, 16.6, 17.1, 17.2, 17.3, 17.4, 17.5, 17.6, 18.1, 18.2,
       18.3, 18.4, 18.5, 18.6, 19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 10.7,
       18.7, 11.7])

In [152]:
df['ball_no'].nunique()

123

In [153]:
df['ball_no'].value_counts()

ball_no
0.1     2517
0.2     2505
1.1     2495
0.3     2469
1.2     2457
        ... 
19.5    1687
19.6    1604
10.7       2
18.7       1
11.7       1
Name: count, Length: 123, dtype: int64

In [154]:
df['ball_no'].isna().sum()

0

In [156]:
df['ball_no'].dtypes

dtype('float64')

`batter`
- Contains the names of all the batters
- Categorical column
- 703 unique batters in total
- Most frequent batter is V Kohli, with a count of 6702
- Several batters appear only once (count 1), e.g. Mayank Dagar
- Data type is object
- No missing values present

In [157]:
df['batter']

0             SC Ganguly
1            BB McCullum
2            BB McCullum
3            BB McCullum
4            BB McCullum
               ...      
278200    Shashank Singh
278201    Shashank Singh
278202    Shashank Singh
278203    Shashank Singh
278204    Shashank Singh
Name: batter, Length: 278205, dtype: object

In [158]:
df['batter'].unique()

array(['SC Ganguly', 'BB McCullum', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'PA Patel', 'ML Hayden', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'K Goel',
       'JR Hopes', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'G Gambhir', 'V Sehwag', 'S Dhawan', 'L Ronchi',
       'ST Jayasuriya', 'DJ Thornely', 'RV Uthappa', 'PR Shah',
       'AM Nayar', 'SM Pollock', 'Harbhajan Singh', 'S Chanderpaul',
       'LRPL Taylor', 'AC Gilchrist', 'Y Venugopal Rao', 'VVS Laxman',
       'A Symonds', 'RG Sharma', 'SB Styris', 'AS Yadav', 'SB Bangar',
       'WPUJC Vaas', 'RP Singh', 'WP Saha', 'LR Shukla',
       'DPMD Jayawardene', 'S Sohal', 'B Lee', 'PP Cha

In [159]:
df['batter'].nunique()

703

In [160]:
df['batter'].value_counts()

batter
V Kohli         6702
S Dhawan        5483
RG Sharma       5475
DA Warner       4849
SK Raina        4177
                ... 
YA Abdulla         1
S Kaushik          1
PVSN Raju          1
S Lamichhane       1
Mayank Dagar       1
Name: count, Length: 703, dtype: int64

In [162]:
df['batter'].dtypes

dtype('O')

`bat_pos`
- Position in the batting order at which the player came out to bat in that innings
- 11 unique values in the column (1 to 11)
- Most frequent value is 2, with a count of 52587
- Least frequent value is 11, with a count of 995
- Data type is int64
- There are no missing values in the column

In [163]:
df['bat_pos']

0         1
1         2
2         2
3         2
4         2
         ..
278200    6
278201    6
278202    6
278203    6
278204    6
Name: bat_pos, Length: 278205, dtype: int64

In [164]:
df['bat_pos'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)

In [165]:
df['bat_pos'].nunique()

11

In [166]:
df['bat_pos'].value_counts()

bat_pos
2     52587
1     51389
3     46644
4     41352
5     33060
6     22552
7     14619
8      8188
9      4456
10     2363
11      995
Name: count, dtype: int64

In [167]:
df['bat_pos'].isna().sum()

0

In [169]:
df['bat_pos'].dtype

dtype('int64')

`runs_batter`
- Categorical column
- 7 unique values in total (0 to 6)
- Most frequent value is 0, with a count of 110250
- Least frequent value is 5, with a count of 67
- No missing values in the column
- Data type is int64

In [170]:
df['runs_batter']

0         0
1         0
2         0
3         0
4         0
         ..
278200    0
278201    6
278202    4
278203    6
278204    6
Name: runs_batter, Length: 278205, dtype: int64

In [171]:
df['runs_batter'].unique()

array([0, 4, 6, 1, 2, 5, 3], dtype=int64)

In [172]:
df['runs_batter'].nunique()

7

In [173]:
df['runs_batter'].value_counts()

runs_batter
0    110250
1    103187
4     32113
2     17424
6     14353
3       811
5        67
Name: count, dtype: int64

In [174]:
df['runs_batter'].isna().sum()

0

In [175]:
df['runs_batter'].dtype

dtype('int64')

`balls_faced`
- The count of legal deliveries faced by the batter for that record
- Binary column with values 0 and 1
- Can be converted to yes and no
- Most frequent value is 1, with a count of 269125
- Least frequent value is 0, with a count of 9080
- Data type is int64

In [109]:
df['balls_faced']

0         1
1         1
2         0
3         1
4         1
         ..
278200    1
278201    1
278202    1
278203    1
278204    1
Name: balls_faced, Length: 278205, dtype: int64

In [110]:
df['balls_faced'].unique()

array([1, 0], dtype=int64)

In [111]:
df['balls_faced'].nunique()

2

In [112]:
df['balls_faced'].value_counts()

balls_faced
1    269125
0      9080
Name: count, dtype: int64

In [113]:
df['balls_faced'].dtypes

dtype('int64')

`bowler`
- Names of all the bowlers
- 550 unique bowlers in total
- Most frequent bowler is R Ashwin, with a count of 4868
- Several bowlers appear only once (count 1), e.g. YBK Jaiswal
- No missing values
- Data type is object

In [114]:
df['bowler']

0              P Kumar
1              P Kumar
2              P Kumar
3              P Kumar
4              P Kumar
              ...     
278200    JR Hazlewood
278201    JR Hazlewood
278202    JR Hazlewood
278203    JR Hazlewood
278204    JR Hazlewood
Name: bowler, Length: 278205, dtype: object

In [115]:
df['bowler'].nunique()

550

In [116]:
df['bowler'].unique()

array(['P Kumar', 'Z Khan', 'AA Noffke', 'JH Kallis', 'SB Joshi',
       'CL White', 'AB Dinda', 'I Sharma', 'AB Agarkar', 'SC Ganguly',
       'LR Shukla', 'B Lee', 'S Sreesanth', 'JR Hopes', 'IK Pathan',
       'K Goel', 'PP Chawla', 'WA Mota', 'JDP Oram', 'MS Gony',
       'M Muralitharan', 'P Amarnath', 'Joginder Sharma', 'GD McGrath',
       'B Geeves', 'MF Maharoof', 'R Bhatia', 'DL Vettori', 'MM Patel',
       'SR Watson', 'SK Trivedi', 'SK Warne', 'YK Pathan', 'D Salunkhe',
       'R Vinay Kumar', 'B Akhil', 'A Nehra', 'SM Pollock', 'DS Kulkarni',
       'ST Jayasuriya', 'Harbhajan Singh', 'AM Nayar', 'M Kartik',
       'Mohammad Hafeez', 'DJ Hussey', 'WPUJC Vaas', 'RP Singh',
       'SB Styris', 'SB Bangar', 'A Symonds', 'PP Ojha', 'Pankaj Singh',
       'Mohammad Asif', 'VY Mahesh', 'Shahid Afridi', 'DJ Bravo',
       'VS Yeligati', 'MA Khote', 'D Kalyankrishna', 'VRV Singh',
       'Sohail Tanvir', 'A Kumble', 'DNT Zoysa', 'SD Chitnis',
       'Yuvraj Singh', 'Shoaib Malik',

In [117]:
df['bowler'].value_counts()

bowler
R Ashwin         4868
SP Narine        4421
B Kumar          4378
RA Jadeja        4127
YS Chahal        3905
                 ... 
DA Warner           2
Ishan Kishan        1
Atharva Taide       1
AC Gilchrist        1
YBK Jaiswal         1
Name: count, Length: 550, dtype: int64

In [118]:
df['bowler'].isna().sum()

0

In [119]:
df['bowler'].dtypes

dtype('O')

`valid_ball`
- Binary column
- Values are 0 and 1; they need to be converted to yes and no
- Most frequent value is 1, with a count of 267962
- Least frequent value is 0, with a count of 10243
- No missing values
- Data type is int64

In [120]:
df['valid_ball']

0         1
1         1
2         0
3         1
4         1
         ..
278200    1
278201    1
278202    1
278203    1
278204    1
Name: valid_ball, Length: 278205, dtype: int64

In [121]:
df['valid_ball'].unique()

array([1, 0], dtype=int64)

In [122]:
df['valid_ball'].nunique()

2

In [123]:
df['valid_ball'].value_counts()

valid_ball
1    267962
0     10243
Name: count, dtype: int64

In [124]:
df['valid_ball'].dtypes

dtype('int64')

In [125]:
df['valid_ball'].isna().sum()

0

`runs_extras`
- Categorical column
- 7 unique values in total
- Most frequent value is 0, with a count of 263072
- Least frequent value is 7, with a count of 1
- Data type is int64
- There are no missing values in the column

In [176]:
df['runs_extras']

0         1
1         0
2         1
3         0
4         0
         ..
278200    0
278201    0
278202    0
278203    0
278204    0
Name: runs_extras, Length: 278205, dtype: int64

In [177]:
df['runs_extras'].unique()

array([1, 0, 5, 4, 2, 3, 7], dtype=int64)

In [178]:
df['runs_extras'].nunique()

7

In [179]:
df['runs_extras'].value_counts()

runs_extras
0    263072
1     13549
2       622
4       529
5       349
3        83
7         1
Name: count, dtype: int64

In [180]:
df['runs_extras'].isna().sum()

0

In [181]:
df['runs_extras'].dtypes

dtype('int64')

`runs_total`
- Categorical column
- 8 unique values (0 to 7)
- Most frequent value is 1, with a count of 115711
- Least frequent value is 7, with a count of 95
- No missing values
- Data type is int64

In [182]:
df['runs_total']

0         1
1         0
2         1
3         0
4         0
         ..
278200    0
278201    6
278202    4
278203    6
278204    6
Name: runs_total, Length: 278205, dtype: int64

In [183]:
df['runs_total'].unique()

array([1, 0, 4, 6, 5, 2, 3, 7], dtype=int64)

In [184]:
df['runs_total'].nunique()

8

In [185]:
df['runs_total'].value_counts()

runs_total
1    115711
0     95778
4     32502
2     18349
6     14259
3       954
5       557
7        95
Name: count, dtype: int64

In [186]:
df['runs_total'].isna().sum()

0

In [189]:
df['runs_total'].dtypes

dtype('int64')

`wicket_kind`
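- Categorical column
- 10 categories in total
- 264382 values are missing (NaN); presumably these are the deliveries on which no wicket fell
- Least frequent category is obstructing the field, with a count of 3
- Most frequent category is caught, with a count of 8665
- Data type is object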

In [190]:
df['wicket_kind']

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
278200    NaN
278201    NaN
278202    NaN
278203    NaN
278204    NaN
Name: wicket_kind, Length: 278205, dtype: object

In [191]:
df['wicket_kind'].unique()

array([nan, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

In [192]:
df['wicket_kind'].nunique()

10

In [193]:
df['wicket_kind'].value_counts()

wicket_kind
caught                   8665
bowled                   2345
run out                  1153
lbw                       853
caught and bowled         388
stumped                   376
hit wicket                 18
retired hurt               17
retired out                 5
obstructing the field       3
Name: count, dtype: int64

In [194]:
df['wicket_kind'].isna().sum()

264382

In [196]:
df['wicket_kind'].dtypes

dtype('O')

`player_out`
- Categorical column with 655 unique players
- Most frequent value is RG Sharma, with a count of 237
- Several players appear only once (count 1), e.g. BKG Mendis
- 264382 values are missing, the same count as in `wicket_kind`
- Data type is object

In [197]:
df['player_out']

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
278200    NaN
278201    NaN
278202    NaN
278203    NaN
278204    NaN
Name: player_out, Length: 278205, dtype: object

In [198]:
df['player_out'].unique()

array([nan, 'SC Ganguly', 'RT Ponting', 'DJ Hussey', 'R Dravid',
       'V Kohli', 'JH Kallis', 'W Jaffer', 'MV Boucher', 'B Akhil',
       'CL White', 'AA Noffke', 'Z Khan', 'SB Joshi', 'PA Patel',
       'ML Hayden', 'MS Dhoni', 'SK Raina', 'JDP Oram', 'K Goel',
       'JR Hopes', 'Yuvraj Singh', 'KC Sangakkara', 'T Kohli',
       'YK Pathan', 'SR Watson', 'DS Lehmann', 'M Kaif', 'M Rawat',
       'RA Jadeja', 'SK Warne', 'V Sehwag', 'L Ronchi', 'DJ Thornely',
       'ST Jayasuriya', 'PR Shah', 'RV Uthappa', 'AM Nayar', 'SM Pollock',
       'S Chanderpaul', 'LRPL Taylor', 'Y Venugopal Rao', 'VVS Laxman',
       'AC Gilchrist', 'RG Sharma', 'SB Styris', 'AS Yadav', 'A Symonds',
       'WPUJC Vaas', 'SB Bangar', 'PP Ojha', 'BB McCullum', 'WP Saha',
       'Mohammad Hafeez', 'DPMD Jayawardene', 'IK Pathan', 'B Lee',
       'S Sohal', 'Kamran Akmal', 'Shahid Afridi', 'G Gambhir',
       'MEK Hussey', 'DJ Bravo', 'MA Khote', 'Harbhajan Singh',
       'GC Smith', 'D Salunkhe', 'RR Sarwan',

In [199]:
df['player_out'].nunique()

655

In [200]:
df['player_out'].value_counts()

player_out
RG Sharma       237
V Kohli         219
S Dhawan        193
KD Karthik      184
RV Uthappa      180
               ... 
SE Bond           1
S Lamichhane      1
SD Lad            1
OC McCoy          1
BKG Mendis        1
Name: count, Length: 655, dtype: int64

In [201]:
df['player_out'].isna().sum()

264382

In [202]:
df['player_out'].dtypes

dtype('O')

`match_won_by`
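- Categorical column
- 20 unique categories (19 team names plus the placeholder `Unknown`)
- Most frequent value is Mumbai Indians, with a count of 36185
- Least frequent value is Rising Pune Supergiants, with a count of 1105
- Data type is object
- No NaN values, but 4702 rows hold the placeholder `Unknown`, which matches the number of missing values in `win_outcome`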

In [203]:
df['match_won_by']

0               Kolkata Knight Riders
1               Kolkata Knight Riders
2               Kolkata Knight Riders
3               Kolkata Knight Riders
4               Kolkata Knight Riders
                     ...             
278200    Royal Challengers Bengaluru
278201    Royal Challengers Bengaluru
278202    Royal Challengers Bengaluru
278203    Royal Challengers Bengaluru
278204    Royal Challengers Bengaluru
Name: match_won_by, Length: 278205, dtype: object

In [204]:
df['match_won_by'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Delhi Daredevils',
       'Royal Challengers Bangalore', 'Rajasthan Royals',
       'Kings XI Punjab', 'Deccan Chargers', 'Mumbai Indians', 'Unknown',
       'Pune Warriors', 'Kochi Tuskers Kerala', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [205]:
df['match_won_by'].nunique()

20

In [206]:
df['match_won_by'].value_counts()

match_won_by
Mumbai Indians                 36185
Chennai Super Kings            34371
Kolkata Knight Riders          31729
Rajasthan Royals               27466
Royal Challengers Bangalore    26801
Sunrisers Hyderabad            22183
Kings XI Punjab                20162
Delhi Daredevils               15709
Delhi Capitals                 12296
Gujarat Titans                  9000
Punjab Kings                    8158
Lucknow Super Giants            7430
Deccan Chargers                 7013
Unknown                         4702
Royal Challengers Bengaluru     4244
Gujarat Lions                   3063
Pune Warriors                   2883
Rising Pune Supergiant          2383
Kochi Tuskers Kerala            1322
Rising Pune Supergiants         1105
Name: count, dtype: int64

In [207]:
df['match_won_by'].dtypes

dtype('O')

In [208]:
df['match_won_by'].isna().sum()

0

`win_outcome`
- Categorical column
- 111 unique values in the column
- Most frequent value is 6 wickets, with a count of 30082
- Least frequent value is 112 runs, with a count of 189
- 4702 missing values
- Data type is object

In [209]:
# Preview the win-margin column (pandas truncates the display to head and tail).
df.loc[:, 'win_outcome']

0         140 runs
1         140 runs
2         140 runs
3         140 runs
4         140 runs
            ...   
278200      6 runs
278201      6 runs
278202      6 runs
278203      6 runs
278204      6 runs
Name: win_outcome, Length: 278205, dtype: object

In [210]:
# Distinct win-margin labels, including NaN where no margin is recorded.
pd.unique(df['win_outcome'])

array(['140 runs', '33 runs', '9 wickets', '5 wickets', '6 wickets',
       '6 runs', '3 wickets', '66 runs', '7 wickets', '10 wickets',
       '4 wickets', '13 runs', '10 runs', '45 runs', '8 wickets',
       '9 runs', '3 runs', '29 runs', '5 runs', '18 runs', '23 runs',
       '12 runs', '65 runs', '25 runs', '1 runs', '14 runs', '41 runs',
       '105 runs', '19 runs', '75 runs', '92 runs', '11 runs', '24 runs',
       nan, '27 runs', '38 runs', '8 runs', '78 runs', '16 runs',
       '53 runs', '2 wickets', '2 runs', '4 runs', '31 runs', '55 runs',
       '98 runs', '34 runs', '36 runs', '17 runs', '39 runs', '40 runs',
       '67 runs', '63 runs', '37 runs', '57 runs', '35 runs', '22 runs',
       '21 runs', '48 runs', '26 runs', '20 runs', '85 runs', '32 runs',
       '76 runs', '111 runs', '82 runs', '43 runs', '58 runs', '28 runs',
       '74 runs', '42 runs', '59 runs', '46 runs', '7 runs', '47 runs',
       '86 runs', '44 runs', '87 runs', '130 runs', '15 runs', '60 runs',
   

In [211]:
# Number of distinct win-margin labels; NaN is excluded from the count.
df['win_outcome'].nunique(dropna=True)

111

In [213]:
# Rows per win-margin label, most frequent first (NaN rows are left out).
# Each row is one delivery, so these are delivery counts rather than match counts.
df['win_outcome'].value_counts(dropna=True)

win_outcome
6 wickets    30082
7 wickets    28540
5 wickets    24481
8 wickets    19424
4 wickets    15059
             ...  
93 runs        219
138 runs       212
146 runs       211
76 runs        204
112 runs       189
Name: count, Length: 111, dtype: int64

In [214]:
# Storage dtype of the win-margin column.
df['win_outcome'].dtype

dtype('O')

In [215]:
# Count of missing entries in the win-margin column.
df['win_outcome'].isnull().sum()

4702

In [None]:
# Columns singled out for the analysis.
# This cell previously contained a raw fragment of an `Index([...])` printout
# (unbalanced `]` and `)`), which is a SyntaxError and stops "Restart & Run All".
# NOTE(review): the fragment may have been cut at the start; confirm the list is complete.
columns_of_interest = [
    'innings', 'batting_team', 'bowling_team', 'over', 'ball_no', 'batter',
    'bat_pos', 'runs_batter', 'balls_faced', 'bowler', 'valid_ball',
    'runs_extras', 'runs_total', 'wicket_kind', 'player_out',
    'match_won_by', 'win_outcome', 'venue', 'city', 'season',
]
columns_of_interest

### *Data Cleaning*
- *Drop the unimportant columns*

In [None]:
# Drop the columns this notebook treats as unimportant; `df` itself is left untouched.
# The original list also held an empty string (''), which is not a column label
# and made `DataFrame.drop` raise KeyError.
df_dropped_multiple = df.drop(columns=['Unnamed: 0', 'match_type', 'event_name'])

- *Converting binary columns to Yes and No* 

In [None]:

# Turn the 1/other indicator columns into readable 'Yes'/'No' labels.
# Matching on both 1 and 'Yes' keeps the cell idempotent: re-running it on
# already converted data leaves the labels as they are instead of turning every
# row into 'No'. `np.where` is vectorized, unlike a row-wise `apply`.
for flag_col in ['balls_faced', 'valid_ball']:
    df[flag_col] = np.where(df[flag_col].isin([1, 'Yes']), 'Yes', 'No')



- *Data type conversion*

In [None]:
# Dates are stored as day-month-year strings (e.g. '18-04-2008'), so the format
# is stated explicitly. Without it, pandas has to guess: depending on the version
# it may swap day and month on ambiguous dates such as '01-05-2008', or emit a
# parsing warning. A value that does not match the format now raises instead
# of being parsed silently.
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')

# Team names repeat on every delivery; store them as categoricals.
df['batting_team'] = df['batting_team'].astype('category')
df['bowling_team'] = df['bowling_team'].astype('category')


- *Standardize team name columns*

In [None]:
# Map historical franchise names onto a single label per team.
team_mapping = {
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
    'Royal Challengers Bengaluru': 'Royal Challengers Bangalore',
    'Rising Pune Supergiant': 'Rising Pune Supergiants',
    # NOTE(review): this merges two differently named franchises, not just a
    # rename; confirm it is wanted for the analysis.
    'Deccan Chargers': 'Sunrisers Hyderabad',
}

# `batting_team` is categorical at this point. Calling `.replace` on a categorical
# in a way that merges categories is deprecated in recent pandas, so the rename
# is done on plain objects and the category dtype is restored afterwards.
# NOTE(review): `match_won_by` and `toss_winner` still carry the old names;
# comparisons against them will not match until they are mapped as well.
df['batting_team'] = (
    df['batting_team']
    .astype(object)
    .replace(team_mapping)
    .astype('category')
)


In [None]:
# Apply the same renames to `bowling_team`, reusing `team_mapping` from the
# previous cell instead of keeping a second copy of the dict that could drift.
# As with `batting_team`, the rename runs on plain objects because `.replace` on a
# categorical that merges categories is deprecated in recent pandas; the
# category dtype is restored afterwards.
df['bowling_team'] = (
    df['bowling_team']
    .astype(object)
    .replace(team_mapping)
    .astype('category')
)


In [10]:
# List every column label currently present in `df`.
df.columns

Index(['Unnamed: 0', 'match_id', 'date', 'match_type', 'event_name', 'innings',
       'batting_team', 'bowling_team', 'over', 'ball', 'ball_no', 'batter',
       'bat_pos', 'runs_batter', 'balls_faced', 'bowler', 'valid_ball',
       'runs_extras', 'runs_total', 'runs_bowler', 'runs_not_boundary',
       'extra_type', 'non_striker', 'non_striker_pos', 'wicket_kind',
       'player_out', 'fielders', 'runs_target', 'review_batter',
       'team_reviewed', 'review_decision', 'umpire', 'umpires_call',
       'player_of_match', 'match_won_by', 'win_outcome', 'toss_winner',
       'toss_decision', 'venue', 'city', 'day', 'month', 'year', 'season',
       'gender', 'team_type', 'superover_winner', 'result_type', 'method',
       'balls_per_over', 'overs', 'event_match_no', 'stage', 'match_number',
       'team_runs', 'team_balls', 'team_wicket', 'new_batter', 'batter_runs',
       'batter_balls', 'bowler_wicket', 'batting_partners', 'next_batter',
       'striker_out'],
      dtype='object')