# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import pickle

# 2. Load Data

In [2]:
matches = pd.read_csv("matches.csv")
deliveries = pd.read_csv("deliveries.csv")

In [3]:
matches.head(5)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


In [4]:
deliveries.head(5)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


# 3. Merge match details

In [5]:
data = deliveries.merge(matches[['id', 'venue', 'city']], left_on='match_id', right_on='id')

# 4. Filter 1st innings only

In [6]:
data = data[data['inning'] == 1]

# 5. Create cumulative stats

In [7]:
agg = data.groupby(['match_id', 'over']).agg({
    'total_runs': 'sum',
    'is_wicket': 'sum'
}).reset_index()

# 6. Add max over info (used to map final score)

In [8]:
final_scores = agg.groupby('match_id')['total_runs'].sum().reset_index().rename(columns={'total_runs': 'final_score'})
data_model = agg.copy()

In [9]:
data_model = data_model.merge(final_scores, on='match_id', how='left')

# 7. Add match metadata (team, city, venue)

In [10]:
meta = data[['match_id', 'batting_team', 'bowling_team', 'city', 'venue']].drop_duplicates()
data_model = data_model.merge(meta, on='match_id', how='left')

# 8. Feature Engineering

In [12]:
# Split 'over' column into full_over and ball (e.g., 15.3 → 15 overs and 3 balls)
data_model['full_over'] = data_model['over'].astype(str).str.split('.').str[0].astype(int)
data_model['ball_in_over'] = data_model['over'].astype(str).str.split('.').str[1].fillna(0).astype(int)

# Total balls bowled = full_over * 6 + ball_in_over
data_model['balls_bowled'] = data_model['full_over'] * 6 + data_model['ball_in_over']


In [13]:
# Remove rows with 0 balls bowled
data_model = data_model[data_model['balls_bowled'] > 0]

# Balls left
data_model['balls_left'] = 120 - data_model['balls_bowled']

# Cumulative runs
data_model['runs_till_now'] = data_model.groupby('match_id')['total_runs'].cumsum()

# Cumulative wickets
data_model['wickets_till_now'] = data_model.groupby('match_id')['is_wicket'].cumsum()

# Current Run Rate
data_model['crr'] = data_model['runs_till_now'] / data_model['balls_bowled'] * 6

# Last 30-ball performance
data_model['last_5_runs'] = data_model.groupby('match_id')['total_runs'].rolling(window=30, min_periods=1).sum().reset_index(level=0, drop=True)
data_model['last_5_wkts'] = data_model.groupby('match_id')['is_wicket'].rolling(window=30, min_periods=1).sum().reset_index(level=0, drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_model['balls_left'] = 120 - data_model['balls_bowled']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_model['runs_till_now'] = data_model.groupby('match_id')['total_runs'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_model['wickets_till_now'] = data_model.groupby('match_id

# 9. Select Features & Target

In [14]:
features = ['batting_team', 'bowling_team', 'city', 'runs_till_now',
            'balls_bowled', 'wickets_till_now', 'crr', 'last_5_runs', 'last_5_wkts']

In [15]:
target = 'final_score'

In [16]:
X = data_model[features]
y = data_model[target]

# 10. Train-Test Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 11. Pipeline Setup

In [18]:
cat_cols = ['batting_team', 'bowling_team', 'city']
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], remainder='passthrough')

In [19]:
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42))
])

# 12. Fit the Model

In [20]:
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



# 13. Evaluate the Model

In [21]:
y_pred = pipe.predict(X_test)

In [22]:
print("📊 Evaluation Metrics")
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

📊 Evaluation Metrics
R2 Score: 0.704736539619812
MAE: 11.981543734026914
RMSE: 17.211448950333295


# 14. Save Model

In [23]:
pickle.dump(pipe, open('score_predictor.pkl', 'wb'))