In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [2]:
matches = pd.read_csv("matches (1).csv")
deliveries = pd.read_csv("deliveries (2).csv")


In [3]:
deliveries = deliveries[deliveries['inning'] == 2]


In [4]:
target_df = deliveries.groupby('match_id')['total_runs'].sum().reset_index()
target_df.rename(columns={'total_runs': 'target'}, inplace=True)
target_df['target'] += 1


In [5]:
match_data = deliveries.merge(
    matches[['id', 'city', 'winner']],
    left_on='match_id',
    right_on='id'
)

match_data = match_data.merge(target_df, on='match_id')


In [6]:
match_data['runs_left'] = match_data['target'] - match_data.groupby('match_id')['total_runs'].cumsum()


In [7]:
match_data['balls_left'] = 120 - (match_data['over'] * 6 + match_data['ball'])


In [8]:
match_data = match_data[match_data['balls_left'] > 0]


In [9]:
match_data['is_wicket'] = match_data['player_dismissed'].notna().astype(int)
match_data['wickets_left'] = 10 - match_data.groupby('match_id')['is_wicket'].cumsum()
match_data['wickets_left'] = match_data['wickets_left'].clip(lower=0)


In [10]:
match_data['current_run_rate'] = (
    (match_data['target'] - match_data['runs_left']) * 6
) / (120 - match_data['balls_left'])

match_data['required_run_rate'] = (
    match_data['runs_left'] * 6
) / match_data['balls_left']


In [11]:
match_data['result'] = match_data.apply(
    lambda x: 1 if x['batting_team'] == x['winner'] else 0,
    axis=1
)


In [12]:
cols = [
    'batting_team', 'bowling_team', 'city',
    'runs_left', 'balls_left', 'wickets_left',
    'current_run_rate', 'required_run_rate', 'result'
]

In [13]:
final_df = match_data.loc[:, cols].copy()

In [14]:
final_df.replace([np.inf, -np.inf], np.nan, inplace=True)
final_df.dropna(inplace=True)


In [15]:
X = final_df.drop('result', axis=1)
y = final_df['result']


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
categorical_cols = ['batting_team', 'bowling_team', 'city']
numeric_cols = X.columns.drop(categorical_cols)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)


In [18]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=3000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier()
}


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd


In [20]:
results = {}

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc


In [21]:
results_df = pd.DataFrame({
    'Model': results.keys(),
    'Accuracy': results.values()
}).sort_values(by='Accuracy', ascending=False)

results_df


Unnamed: 0,Model,Accuracy
2,Random Forest,0.997613
1,Decision Tree,0.980038
3,Gradient Boosting,0.784753
0,Logistic Regression,0.741429


In [22]:
best_model_name = results_df.iloc[0]['Model']
best_model_name


'Random Forest'

In [23]:
best_model = models[best_model_name]

final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

final_pipe.fit(X, y)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
def prepare_input(df):
    df = df.copy()
    
    df['required_run_rate'] = (df['runs_left'] * 6) / df['balls_left']
    df['current_run_rate'] = ((df['target'] - df['runs_left']) * 6) / (120 - df['balls_left'])
    
    return df


In [27]:
sample_input = pd.DataFrame([{
    'batting_team': 'Mumbai Indians',
    'bowling_team': 'Chennai Super Kings',
    'city': 'Mumbai',
    'target': 180,
    'runs_left': 45,
    'balls_left': 30,
    'wickets_left': 6
}])

sample_input = prepare_input(sample_input)

prediction = final_pipe.predict(sample_input)
prediction


array([1])

In [28]:
if prediction[0] == 1:
    print("üèè Batting team will WIN")
else:
    print("‚ùå Batting team will LOSE")


üèè Batting team will WIN
