In [237]:
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

## Importing Libraries


## Importing Dataset
Our data is divided into two CSV file I have to import those 🙂

In [238]:
deliveries = pd.read_csv('deliveries.csv')

In [239]:
matches = pd.read_csv('matches.csv')

## Looking into the data 👀

In [286]:
deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,total_runs,is_wicket
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,1,0
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,0,0
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,1,0
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,0,0
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,0,0


In [287]:
matches.head()

Unnamed: 0,match_id,city,venue,target_runs,super_over,winner
0,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders
1,335983,Chandigarh,"Punjab Cricket Association Stadium, Mohali",241.0,N,Chennai Super Kings
2,335984,Delhi,Feroz Shah Kotla,130.0,N,Delhi Daredevils
3,335985,Mumbai,Wankhede Stadium,166.0,N,Royal Challengers Bangalore
4,335986,Kolkata,Eden Gardens,111.0,N,Kolkata Knight Riders


Both of CSV file have a lot of column but for our win probability prediction we just need some of them. What's column useful for my prediction? I am extracting those columns.

In [240]:
deliveries = deliveries[['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'total_runs', 'is_wicket']]

In [241]:
matches = matches[['id', 'city', 'venue', 'target_runs', 'super_over', 'winner']]

In [242]:
matches.rename(columns={'id' : 'match_id'}, inplace=True)

In [243]:
ipl = matches.merge(deliveries, how = "inner", on = 'match_id')

In [288]:
ipl.head()

Unnamed: 0,match_id,city,venue,target_runs,super_over,winner,inning,batting_team,bowling_team,over,...,total_runs,wickets_out,current_score,over_completed,run_left,wickets_left,ball_left,result,crr,rrr
124,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,...,1,0,1,20,222.0,10,119,0,6.0,11.193277
125,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,...,1,0,2,20,221.0,10,118,0,6.0,11.237288
126,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,...,0,0,2,20,221.0,10,117,0,4.0,11.333333
127,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,...,1,0,3,20,220.0,10,116,0,4.5,11.37931
128,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,...,1,0,4,20,219.0,10,115,0,4.8,11.426087


Now I have my excepted data set I store this inside ipl variable.
Here is an overview of the data.
- **match_id** - A unique value for every match.
- **city** - Location of the game
- **venue** - Name of the Stadium
- **target_runs** - How many runs have been scored by the batting team
- **Super_over** - The batting team and bowling team have scored the same or not
- **winner** - who won the match
- **inning** - 1 for the batting team and 2 for the bowling team
- **Over** - How many over have completed out of 20
- **total_runs** - How many runs scored in a single ball
- **wicket_out** - Someone got out or not
- **current_score** - How many runs they have scored after completing of certain over.
- **over_completed** - 20 over have got completed.
- **run_left** - How many have to score bowling team to win the match?
- **wickets_left** - How many wickets they have left out of 10 
- **ball_left** - How many ball they to score the given target 
- **result** - Who won the match
- **crr** - what is the current run rate ((runs*6)/ball_left)
- **rrr** - what is the require run rate ((runs_left * 6)/ball_left)

## Feature Engineering

To predict win probability I will only consider the second innings.

In [289]:
ipl = ipl[ipl['inning'] == 2]

Take those columns where conduct more than five matches

In [245]:
city=ipl['city'].value_counts()[ipl['city'].value_counts() > 600].index.tolist()

In [246]:
ipl = ipl[ipl['city'].isin(city)]

In [247]:
ipl['batting_team'].unique()

array(['Royal Challengers Bangalore', 'Kings XI Punjab',
       'Delhi Daredevils', 'Kolkata Knight Riders', 'Rajasthan Royals',
       'Mumbai Indians', 'Chennai Super Kings', 'Deccan Chargers',
       'Pune Warriors', 'Kochi Tuskers Kerala', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

Some of the teams have changed their name and those who have played a few matches I will not consider those teams. I will remove those teams and rename the names of those who have changed their name. 

In [248]:
ipl['batting_team'] = ipl['batting_team'].replace({
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru'
})

In [249]:
teams = ['Royal Challengers Bengaluru', 'Lucknow Super Giants','Gujarat Titans',
                                                        'Kolkata Knight Riders', 'Rajasthan Royals', 'Mumbai Indians',
                                                        'Chennai Super Kings','Sunrisers Hyderabad','Delhi Capitals','Punjab Kings',]

In [250]:
ipl = ipl[ipl['batting_team'].isin(teams)]
ipl = ipl[ipl['bowling_team'].isin(teams)]

Current score after completion of every ball.

In [251]:
current_score = ipl.groupby('match_id')['total_runs'].cumsum()

In [252]:
ipl['current_score'] = current_score

In [253]:
ipl.rename(columns={'is_wicket' : 'wickets_out'}, inplace=True)

In [254]:
ipl['over_completed'] = 20 - ipl['over']

In [255]:
ipl['run_left'] = ipl['target_runs'] - ipl['current_score']

In [256]:
ipl['wickets_out'] = ipl.groupby('match_id')['wickets_out'].cumsum()

In [257]:
ipl['wickets_left'] = 10 - ipl['wickets_out']

In [258]:
ipl = ipl[ipl['super_over'] == 'N']

In [259]:
ipl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69739 entries, 124 to 260919
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   match_id        69739 non-null  int64  
 1   city            69739 non-null  object 
 2   venue           69739 non-null  object 
 3   target_runs     69739 non-null  float64
 4   super_over      69739 non-null  object 
 5   winner          69739 non-null  object 
 6   inning          69739 non-null  int64  
 7   batting_team    69739 non-null  object 
 8   bowling_team    69739 non-null  object 
 9   over            69739 non-null  int64  
 10  ball            69739 non-null  int64  
 11  total_runs      69739 non-null  int64  
 12  wickets_out     69739 non-null  int64  
 13  current_score   69739 non-null  int64  
 14  over_completed  69739 non-null  int64  
 15  run_left        69739 non-null  float64
 16  wickets_left    69739 non-null  int64  
dtypes: float64(2), int64(9), object(6

In [260]:
# ipl.dropna(inplace=True)

In [261]:
ipl['ball_left'] = 120 - ipl['ball']

In [262]:
ipl['over_completed']

124       20
125       20
126       20
127       20
128       20
          ..
260915    11
260916    11
260917    10
260918    10
260919    10
Name: over_completed, Length: 69739, dtype: int64

In [263]:
ipl.head()

Unnamed: 0,match_id,city,venue,target_runs,super_over,winner,inning,batting_team,bowling_team,over,ball,total_runs,wickets_out,current_score,over_completed,run_left,wickets_left,ball_left
124,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,1,1,0,1,20,222.0,10,119
125,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,2,1,0,2,20,221.0,10,118
126,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,3,0,0,2,20,221.0,10,117
127,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,4,1,0,3,20,220.0,10,116
128,335982,Bangalore,M Chinnaswamy Stadium,223.0,N,Kolkata Knight Riders,2,Royal Challengers Bengaluru,Kolkata Knight Riders,0,5,1,0,4,20,219.0,10,115


In [264]:
def result(row):
    return 1 if row['batting_team'] == row['winner'] else 0


In [265]:
ipl['result'] = ipl.apply(result, axis=1)

In [266]:
ipl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69739 entries, 124 to 260919
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   match_id        69739 non-null  int64  
 1   city            69739 non-null  object 
 2   venue           69739 non-null  object 
 3   target_runs     69739 non-null  float64
 4   super_over      69739 non-null  object 
 5   winner          69739 non-null  object 
 6   inning          69739 non-null  int64  
 7   batting_team    69739 non-null  object 
 8   bowling_team    69739 non-null  object 
 9   over            69739 non-null  int64  
 10  ball            69739 non-null  int64  
 11  total_runs      69739 non-null  int64  
 12  wickets_out     69739 non-null  int64  
 13  current_score   69739 non-null  int64  
 14  over_completed  69739 non-null  int64  
 15  run_left        69739 non-null  float64
 16  wickets_left    69739 non-null  int64  
 17  ball_left       69739 non-null  i

In [267]:
ipl['crr'] = (ipl['current_score']*6) / (120 - ipl['ball_left'])

In [268]:
ipl.describe()

Unnamed: 0,match_id,target_runs,inning,over,ball,total_runs,wickets_out,current_score,over_completed,run_left,wickets_left,ball_left,result,crr
count,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0,69739.0
mean,1008886.0,171.864624,2.0,8.956266,3.6169,1.336139,2.536802,75.747243,11.043734,96.117381,7.463198,116.3831,0.384032,179.211837
std,367655.3,32.000411,0.0,5.565416,1.815755,1.638,2.214605,48.39964,5.565416,51.70057,2.214605,1.815755,0.486369,188.736327
min,335982.0,48.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,-9.0,0.0,109.0,0.0,0.0
25%,598058.0,151.0,2.0,4.0,2.0,0.0,1.0,35.0,6.0,56.0,6.0,115.0,0.0,58.8
50%,1175366.0,171.0,2.0,9.0,4.0,1.0,2.0,72.0,11.0,95.0,8.0,116.0,0.0,121.5
75%,1304101.0,192.0,2.0,14.0,5.0,1.0,4.0,112.0,16.0,134.0,9.0,118.0,1.0,222.0
max,1426312.0,288.0,2.0,19.0,11.0,7.0,10.0,262.0,20.0,287.0,10.0,119.0,1.0,1554.0


In [269]:
ipl['rrr'] = (ipl['run_left']*6)/ipl['ball_left']

In [270]:
final_df = ipl[['batting_team','bowling_team','city','run_left',
                        'ball_left','wickets_left','target_runs','crr','rrr','result']]

In [271]:
final_df = final_df.sample(final_df.shape[0])

In [272]:
X = final_df.iloc[:,:-1]
y = final_df.iloc[:,-1]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [273]:
X_train

Unnamed: 0,batting_team,bowling_team,city,run_left,ball_left,wickets_left,target_runs,crr,rrr
243749,Chennai Super Kings,Gujarat Titans,Ahmedabad,117.0,118,10,171.0,162.000000,5.949153
94700,Mumbai Indians,Kolkata Knight Riders,Abu Dhabi,140.0,117,10,164.0,48.000000,7.179487
206110,Royal Challengers Bengaluru,Sunrisers Hyderabad,Abu Dhabi,6.0,116,4,142.0,204.000000,0.310345
77543,Chennai Super Kings,Mumbai Indians,Chennai,83.0,113,6,149.0,56.571429,4.407080
94510,Chennai Super Kings,Mumbai Indians,Kolkata,75.0,119,2,149.0,444.000000,3.781513
...,...,...,...,...,...,...,...,...,...
77268,Delhi Capitals,Rajasthan Royals,Delhi,107.0,118,9,166.0,177.000000,5.440678
83596,Chennai Super Kings,Rajasthan Royals,Chennai,0.0,115,5,186.0,223.200000,0.000000
218191,Punjab Kings,Lucknow Super Giants,Pune,51.0,119,5,154.0,618.000000,2.571429
113602,Royal Challengers Bengaluru,Chennai Super Kings,Bangalore,150.0,116,9,182.0,48.000000,7.758621


In [274]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse=False,drop='first'),['batting_team','bowling_team','city'])
]
,remainder='passthrough')

In [275]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [276]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LogisticRegression(solver='liblinear'))
])

In [277]:
pipe.fit(X_train,y_train)

In [278]:
y_pred = pipe.predict(X_test)

In [279]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8229853742472039

In [283]:
pipe.predict_proba(X_test)[90]

array([0.72546385, 0.27453615])

In [281]:
def match_summary(row):
    print("Batting Team-" + row['batting_team'] + " | Bowling Team-" + row['bowling_team'] + " | Target- " + str(row['target_runs']))
    

In [282]:
final_df.columns

Index(['batting_team', 'bowling_team', 'city', 'run_left', 'ball_left',
       'wickets_left', 'target_runs', 'crr', 'rrr', 'result'],
      dtype='object')

In [285]:
import pickle
pickle.dump(pipe,open('ipl.pkl','wb'))