In [6]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [3]:
# 1. Union CSVs for one team (example: Chennai Super Kings)
team_name = "Chennai Super Kings"
files = glob.glob(f"match_data/{team_name}_*.csv")

dfs = [pd.read_csv(f) for f in files]
df = pd.concat(dfs, ignore_index=True)
print(f"Dataset size for {team_name}: {df.shape}")
df.head()

Dataset size for Chennai Super Kings: (13550, 11)


Unnamed: 0,match_id,season,venue,chasing_team,runs,wickets,runs_required,balls_remaining,crr,rrr,win
0,1.0,2018,Wankhede Stadium,Chennai Super Kings,0,0,165,119,0.0,8.319328,1
1,1.0,2018,Wankhede Stadium,Chennai Super Kings,1,0,164,118,3.0,8.338983,1
2,1.0,2018,Wankhede Stadium,Chennai Super Kings,2,0,163,117,4.0,8.358974,1
3,1.0,2018,Wankhede Stadium,Chennai Super Kings,3,0,162,116,4.5,8.37931,1
4,1.0,2018,Wankhede Stadium,Chennai Super Kings,3,0,162,115,3.6,8.452174,1


In [4]:
df['win'].value_counts()

win
1    8196
0    5354
Name: count, dtype: int64

In [5]:
x=df[['runs','wickets','runs_required','balls_remaining','crr','rrr']]
y=df['win']
print(y.head())
x.head()

0    1
1    1
2    1
3    1
4    1
Name: win, dtype: int64


Unnamed: 0,runs,wickets,runs_required,balls_remaining,crr,rrr
0,0,0,165,119,0.0,8.319328
1,1,0,164,118,3.0,8.338983
2,2,0,163,117,4.0,8.358974
3,3,0,162,116,4.5,8.37931
4,3,0,162,115,3.6,8.452174


In [7]:
# Split first
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE
sm = SMOTE(random_state=42)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())

Before SMOTE: win
1    6557
0    4283
Name: count, dtype: int64
After SMOTE: win
1    6557
0    6557
Name: count, dtype: int64


In [None]:
#Train Model
model=RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train_res, y_train_res)
y_pred=model.predict(x_test)
y_prob = model.predict_proba(x_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

Accuracy: 0.9328413284132842
ROC AUC: 0.9828266877220686


In [10]:
preprocessed_df=pd.concat([x,y],axis=1)
preprocessed_df.to_csv(f"preprocessed_{team_name.replace(' ','_')}.csv",index=False)

In [12]:
#Inference
def predict_win_probability(runs, wickets, runs_required, balls_remaining, crr, rrr):
    input_data = pd.DataFrame([[runs, wickets, runs_required, balls_remaining, crr, rrr]],
                              columns=['runs', 'wickets', 'runs_required', 'balls_remaining', 'crr', 'rrr'])
    win_prob = model.predict_proba(input_data)[:, 1][0]
    return win_prob   

In [16]:
total_ball=120
balls_faced=40
runs=150
wickets=5
runs_required=100
balls_remaining=total_ball-balls_faced
crr=(runs/balls_faced)*6
rrr=(runs_required/(balls_remaining))*6

result=predict_win_probability(runs, wickets, runs_required, balls_remaining, crr, rrr)
print(f"Win Probability: {result*100:.2f}%")

Win Probability: 72.00%
