In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [4]:
# Load the match-level table and the ball-by-ball table from their CSV exports.
matches, deliveries = (
    pd.read_csv(csv_name)
    for csv_name in ("matches (1).csv", "deliveries (2).csv")
)


In [5]:
deliveries = deliveries[deliveries['inning'] == 2]


In [6]:
target_df = deliveries.groupby('match_id')['total_runs'].sum().reset_index()
target_df.rename(columns={'total_runs': 'target'}, inplace=True)
target_df['target'] += 1


In [7]:
# Attach venue city and match winner (keyed by matches.id) to every delivery,
# then attach the per-match target. Both joins use the default inner join, so
# deliveries without a matching match row or target are dropped.
match_data = (
    deliveries
    .merge(
        matches[['id', 'city', 'winner']],
        left_on='match_id',
        right_on='id',
    )
    .merge(target_df, on='match_id')
)


In [8]:
# Runs still required after each delivery: the target minus the running total
# of runs within the same match (rows are assumed to be in delivery order —
# TODO confirm).
match_data['runs_left'] = match_data['target'].sub(
    match_data.groupby('match_id')['total_runs'].cumsum()
)


In [9]:
# Deliveries remaining out of 120 (20 overs x 6 balls).
# NOTE(review): the formula treats `over` as 0-based and `ball` as 1-based —
# confirm against the dataset's numbering.
match_data['balls_left'] = (
    match_data['over'].mul(6).add(match_data['ball']).rsub(120)
)


In [10]:
# Keep only deliveries with at least one ball still to come; this also keeps
# the required-run-rate denominator strictly positive.
# The explicit .copy() detaches the filtered frame from its parent so the
# column assignments in later cells do not raise SettingWithCopyWarning.
match_data = match_data.loc[match_data['balls_left'] > 0].copy()


In [11]:
# 1 where a dismissal is recorded on the delivery, 0 otherwise.
match_data['is_wicket'] = match_data['player_dismissed'].notna().astype(int)

# Wickets in hand: 10 minus the running dismissal count within the match,
# floored at 0.
match_data['wickets_left'] = (
    match_data.groupby('match_id')['is_wicket']
    .cumsum()
    .rsub(10)
    .clip(lower=0)
)


In [12]:
# Current run rate: runs scored so far (target - runs_left) per six balls
# already bowled (120 - balls_left).
match_data['current_run_rate'] = (
    match_data['target']
    .sub(match_data['runs_left'])
    .mul(6)
    .div(match_data['balls_left'].rsub(120))
)

# Required run rate: runs still needed per six balls remaining.
match_data['required_run_rate'] = (
    match_data['runs_left'].mul(6).div(match_data['balls_left'])
)


In [13]:
# Label: 1 when the batting (chasing) team is the recorded winner, else 0.
# A vectorized comparison replaces the row-wise apply: same labels, no Python
# call per row, and well-defined on an empty frame. Rows with a missing winner
# compare unequal and get 0.
match_data['result'] = (
    match_data['batting_team'].eq(match_data['winner']).astype(int)
)


In [14]:
# Columns kept for modelling: categorical context, numeric match state, and
# finally the label.
cols = (
    ['batting_team', 'bowling_team', 'city']
    + ['runs_left', 'balls_left', 'wickets_left']
    + ['current_run_rate', 'required_run_rate']
    + ['result']
)

In [15]:
# Independent copy of just the modelling columns, so later cleaning does not
# touch match_data.
final_df = match_data[cols].copy()

In [16]:
# Treat infinite values (from zero denominators in the rate columns) as
# missing, then drop every row with any missing value.
final_df = final_df.replace([np.inf, -np.inf], np.nan).dropna()


In [17]:
# Split the frame into the label (y) and the feature matrix (X).
y = final_df['result']
X = final_df.drop(columns=['result'])


In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
# NOTE(review): rows are split individually, so deliveries from one match can
# land on both sides of the split — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [19]:
# Columns to one-hot encode; every other feature column is passed through
# unchanged.
categorical_cols = ['batting_team', 'bowling_team', 'city']
numeric_cols = X.columns.drop(categorical_cols)

# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
# sparse_threshold=0 forces a dense output array. With the default threshold
# the one-hot block would likely make the stacked result sparse, which
# estimators that need dense input (e.g. GaussianNB) reject.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ],
    sparse_threshold=0,
)


In [None]:
# Candidate classifiers keyed by display name. LogisticRegression is already
# imported in the notebook's import cell, so it is not re-imported here.
# Estimators with internal randomness are seeded with the same value as the
# train/test split so a fresh run reproduces the same scores.
# NOTE(review): SVC training time grows quickly with the number of rows —
# confirm it is feasible on the full ball-by-ball data.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=3000,
        solver='lbfgs'
    ),
    "KNN": KNeighborsClassifier(),
    "SVM Linear": SVC(kernel='linear'),
    "SVM RBF": SVC(kernel='rbf'),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}
