# 1. Import Libraries

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import pickle


# 2. Load and Merge the Datasets

In [43]:
matches = pd.read_csv("matches.csv")
deliveries = pd.read_csv("deliveries.csv")

In [44]:
matches.head(5)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


In [50]:
deliveries.head(5)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


# 4. Feature Engineering

In [51]:
# Cumulative runs and wickets
merged['current_score'] = merged.groupby('match_id')['total_runs'].cumsum()
merged['wickets'] = merged['player_dismissed'].notna().astype(int)
merged['wickets'] = merged.groupby('match_id')['wickets'].cumsum()

In [52]:
# Balls bowled so far
merged['balls'] = (merged['over'] - 1) * 6 + merged['ball']

In [53]:
# Merge target score
target_df = deliveries[deliveries['inning'] == 1].groupby('match_id')['total_runs'].sum().reset_index()
target_df.columns = ['match_id', 'target']
merged = merged.merge(target_df, on='match_id')

In [54]:
# Calculate features
merged['runs_left'] = merged['target'] - merged['current_score']
merged['balls_left'] = 120 - merged['balls']
merged['crr'] = merged['current_score'] / (merged['balls'] / 6)
merged['rrr'] = merged['runs_left'] * 6 / merged['balls_left']

# 5. Final Cleaned Dataset

In [55]:
# Choose final columns
final_df = merged[['batting_team', 'bowling_team', 'city', 'runs_left', 'balls_left', 'wickets',
                   'target', 'crr', 'rrr', 'winner', 'match_id']]

In [56]:
# Drop rows with NA and duplicates
final_df.dropna(inplace=True)
final_df.drop_duplicates(subset=['match_id'], keep='last', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop_duplicates(subset=['match_id'], keep='last', inplace=True)


In [57]:
# Define target column
final_df['result'] = np.where(final_df['batting_team'] == final_df['winner'], 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['result'] = np.where(final_df['batting_team'] == final_df['winner'], 1, 0)


In [23]:
# Drop anything invalid
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# 6. Split Data & Build Pipeline

In [58]:
X = final_df.drop(columns=['winner', 'result', 'match_id'])
y = final_df['result']

In [59]:
categorical_cols = ['batting_team', 'bowling_team', 'city']
numerical_cols = ['runs_left', 'balls_left', 'wickets', 'target', 'crr', 'rrr']

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [61]:
# Pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

In [62]:
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(n_estimators=300, max_depth=15, random_state=42))
])

# 7. Train Model

In [63]:
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



# 8. Evaluate Accuracy

In [64]:
y_pred = pipe.predict(X_test)

In [65]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9855769230769231
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98        95
           1       0.99      0.98      0.99       113

    accuracy                           0.99       208
   macro avg       0.99      0.99      0.99       208
weighted avg       0.99      0.99      0.99       208



# 9. Save Model

In [66]:
pickle.dump(pipe, open("pipe.pkl", "wb"))