In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [3]:
matches = pd.read_csv("matches (1).csv")
deliveries = pd.read_csv("deliveries (2).csv")


#### we take the first innings total runs to calculate the target to be scored by the 2nd team 

In [4]:
first_innings = deliveries[deliveries['inning'] == 1]

In [5]:
target_df = first_innings.groupby('match_id')['total_runs'].sum().reset_index()
target_df.rename(columns={'total_runs': 'target'}, inplace=True)
target_df['target'] += 1
target_df.head()


Unnamed: 0,match_id,target
0,1,208
1,2,185
2,3,184
3,4,164
4,5,158


#### we focus only on the 2nd innings from here

In [6]:
deliveries = deliveries[deliveries['inning'] == 2]


In [7]:
match_data = deliveries.merge(
    matches[['id', 'city', 'winner']],
    left_on='match_id',
    right_on='id'
)

match_data = match_data.merge(target_df, on='match_id') #on='match_id' is short-hand for left_on='match_id', right_on='match_id'
match_data.head()


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder,id,city,winner,target
0,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,Mandeep Singh,A Nehra,0,...,1,0,1,,,,1,Hyderabad,Sunrisers Hyderabad,208
1,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,CH Gayle,A Nehra,0,...,0,0,0,,,,1,Hyderabad,Sunrisers Hyderabad,208
2,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,CH Gayle,A Nehra,0,...,0,0,0,,,,1,Hyderabad,Sunrisers Hyderabad,208
3,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,CH Gayle,A Nehra,0,...,2,0,2,,,,1,Hyderabad,Sunrisers Hyderabad,208
4,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,CH Gayle,A Nehra,0,...,4,0,4,,,,1,Hyderabad,Sunrisers Hyderabad,208


In [8]:
match_data['runs_left'] = match_data['target'] - match_data.groupby('match_id')['total_runs'].cumsum()
#cumsum- within each match, add up total_runs from the first row up to the current row as one row represents one ball


print(match_data.loc[match_data['match_id'] == 1, 'target'].iloc[0])
match_data[['match_id', 'over', 'ball', 'total_runs', 'runs_left']].head(10)


208


Unnamed: 0,match_id,over,ball,total_runs,runs_left
0,1,1,1,1,207
1,1,1,2,0,207
2,1,1,3,0,207
3,1,1,4,2,205
4,1,1,5,4,201
5,1,1,6,4,197
6,1,2,1,0,197
7,1,2,2,0,197
8,1,2,3,1,196
9,1,2,4,0,196


#### The chasing team has balls_left balls remaining to score the target


In [9]:
match_data['balls_bowled']=((match_data['over']-1) * 6 + match_data['ball'])
match_data['balls_left'] = 120 - match_data['balls_bowled']

#sanity check
match_data[['match_id', 'over', 'ball', 'balls_bowled', 'balls_left']].head(10)


Unnamed: 0,match_id,over,ball,balls_bowled,balls_left
0,1,1,1,1,119
1,1,1,2,2,118
2,1,1,3,3,117
3,1,1,4,4,116
4,1,1,5,5,115
5,1,1,6,6,114
6,1,2,1,7,113
7,1,2,2,8,112
8,1,2,3,9,111
9,1,2,4,10,110


#### data cleaning- removes completed match rows so the model only sees live situations.

In [10]:
match_data = match_data[match_data['balls_left'] > 0]


#### to see if player is out or not for a ball and total wickets left for each ball

In [11]:
match_data['is_wicket'] = match_data['player_dismissed'].notna().astype(int)
match_data['wickets_left'] = 10 - match_data.groupby('match_id')['is_wicket'].cumsum()
match_data['wickets_left'] = match_data['wickets_left'].clip(lower=0)


In [12]:
match_data[['match_id', 'over', 'ball', 'is_wicket', 'wickets_left']].loc[30:50]


Unnamed: 0,match_id,over,ball,is_wicket,wickets_left
30,1,5,6,0,10
31,1,5,7,0,10
32,1,6,1,0,10
33,1,6,2,0,10
34,1,6,3,0,10
35,1,6,4,1,9
36,1,6,5,0,9
37,1,6,6,0,9
38,1,7,1,0,9
39,1,7,2,0,9


#### if CRR>RRR then team is ahead else behind

In [13]:
match_data['current_run_rate'] = ((match_data['target'] - match_data['runs_left']) * 6) / (120 - match_data['balls_left'])
# the average runs per over of the batting team so far.
#aka how fast they are scoring now
#runs scored*6 /(balls left) since run rate per over(6 balls)

match_data['required_run_rate'] = ( match_data['runs_left'] * 6) / match_data['balls_left'] 
#How many runs per over they must score from now on to win.
#aka how fast they need to score
#runs left*6/(balls left)


#### did the chasing team win or lose as this is the target label for prediction (y_test)

In [14]:
match_data['result'] = match_data.apply(
    lambda x: 1 if x['batting_team'] == x['winner'] else 0,
    axis=1
)


#### columns we require

In [15]:
cols = [
    'batting_team', 'bowling_team', 'city',
    'runs_left', 'balls_left', 'wickets_left',
    'current_run_rate', 'required_run_rate', 'result'
]

In [16]:
final_df = match_data.loc[:, cols].copy()

#### data cleaning - replacing infinite values w NaN and dropping rows w NaN

In [17]:
final_df.replace([np.inf, -np.inf], np.nan, inplace=True)
final_df.dropna(inplace=True)


#### features and target

In [18]:
X = final_df.drop('result', axis=1)
y = final_df['result']


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### feature encoding

In [21]:
categorical_cols = ['batting_team', 'bowling_team', 'city']
numeric_cols = X.columns.drop(categorical_cols)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)


In [22]:
for col in categorical_cols:
    print(X[col].unique())

['Royal Challengers Bangalore' 'Rising Pune Supergiant'
 'Kolkata Knight Riders' 'Kings XI Punjab' 'Delhi Daredevils'
 'Sunrisers Hyderabad' 'Mumbai Indians' 'Gujarat Lions' 'Rajasthan Royals'
 'Chennai Super Kings' 'Deccan Chargers' 'Pune Warriors'
 'Kochi Tuskers Kerala' 'Rising Pune Supergiants']
['Sunrisers Hyderabad' 'Mumbai Indians' 'Gujarat Lions'
 'Rising Pune Supergiant' 'Royal Challengers Bangalore'
 'Kolkata Knight Riders' 'Delhi Daredevils' 'Kings XI Punjab'
 'Chennai Super Kings' 'Rajasthan Royals' 'Deccan Chargers'
 'Kochi Tuskers Kerala' 'Pune Warriors' 'Rising Pune Supergiants']
['Hyderabad' 'Pune' 'Rajkot' 'Indore' 'Bangalore' 'Mumbai' 'Kolkata'
 'Delhi' 'Chandigarh' 'Kanpur' 'Jaipur' 'Chennai' 'Cape Town'
 'Port Elizabeth' 'Durban' 'Centurion' 'East London' 'Johannesburg'
 'Kimberley' 'Bloemfontein' 'Ahmedabad' 'Cuttack' 'Nagpur' 'Dharamsala'
 'Kochi' 'Visakhapatnam' 'Raipur' 'Ranchi' 'Abu Dhabi' 'Sharjah']


In [23]:
for col in categorical_cols:
    print(col, X[col].nunique())

batting_team 14
bowling_team 14
city 30


In [24]:
X_transformed = preprocessor.fit_transform(X)
print(X_transformed.shape)


(71213, 63)


#### diff models to test and check the best one

In [26]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=3000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier()
}


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd


#### model evaluation and comparison using a preprocessing-model pipeline

In [28]:
results = {}

for name, model in models.items():
    #creating pipeline
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    #training model
    pipe.fit(X_train, y_train)
    #making predictions
    y_pred = pipe.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc


In [29]:
results_df = pd.DataFrame({
    'Model': results.keys(),
    'Accuracy': results.values()
}).sort_values(by='Accuracy', ascending=False)

results_df


Unnamed: 0,Model,Accuracy
2,Random Forest,0.998245
1,Decision Tree,0.985537
3,Gradient Boosting,0.843502
0,Logistic Regression,0.831145


In [31]:
best_model_name = results_df.iloc[0]['Model']
best_model_name


'Random Forest'

#### random forest is the best model

#### final model training

In [None]:
best_model = models[best_model_name]

final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

final_pipe.fit(X, y)


In [25]:
def prepare_input(df):
    df = df.copy()
    
    df['required_run_rate'] = (df['runs_left'] * 6) / df['balls_left']
    df['current_run_rate'] = ((df['target'] - df['runs_left']) * 6) / (120 - df['balls_left'])
    
    return df


In [27]:
sample_input = pd.DataFrame([{
    'batting_team': 'Mumbai Indians',
    'bowling_team': 'Chennai Super Kings',
    'city': 'Mumbai',
    'target': 180,
    'runs_left': 45,
    'balls_left': 30,
    'wickets_left': 6
}])

sample_input = prepare_input(sample_input)

prediction = final_pipe.predict(sample_input)
prediction


array([1])

In [28]:
if prediction[0] == 1:
    print("üèè Batting team will WIN")
else:
    print("‚ùå Batting team will LOSE")


üèè Batting team will WIN
