# In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the match-level table and the ball-by-ball table.
match = pd.read_csv('matches_info.csv')
delivery = pd.read_csv('deliveries_info.csv')


# Runs scored per (match, innings). 'Total_Runs' is selected *before* the
# aggregation so that only that column is summed; summing the whole frame
# first would also try to aggregate every non-numeric column, which is
# wasted work and, depending on the pandas version and the data, can raise.
total_runs = delivery.groupby(['Match_ID', 'Inning'])['Total_Runs'].sum().reset_index()
# Keep only the first-innings total of each match.
total_runs = total_runs[total_runs['Inning'] == 1]

# Attach the first-innings total to every match row. The key is spelled
# 'Match Id' in the match table and 'Match_ID' in the delivery table.
match_data = match.merge(
    total_runs[['Match_ID', 'Total_Runs']],
    left_on='Match Id',
    right_on='Match_ID',
)

# Franchises kept in the data set; matches involving any other side are dropped.
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals',
    'Gujarat Titans',
    'Lucknow Super Giants',
]

# Former franchise names mapped to the name used in `teams`.
replacements = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Kings XI Punjab': 'Punjab Kings',
}

# Normalise franchise names in the match table. 'Winner' is included because
# it is compared with 'Batting_Team' further down the script to build the
# label; renaming only 'Team 1'/'Team 2' would leave it on the old names.
# regex=True makes this a substring replacement rather than a whole-cell match.
for column in ['Team 1', 'Team 2', 'Winner']:
    match_data[column] = match_data[column].replace(replacements, regex=True)

# Keep matches between two retained franchises where 'Dl applied' is 0.
is_kept = (
    match_data['Team 1'].isin(teams)
    & match_data['Team 2'].isin(teams)
    & (match_data['Dl applied'] == 0)
)
match_data = match_data.loc[is_kept, ['Match Id', 'City', 'Winner', 'Total_Runs']]
match_data = match_data.rename(columns={'Match Id': 'Match_ID'})

# Join the ball-by-ball rows onto the kept matches. Both tables carry a
# 'Total_Runs' column, so the merge suffixes them: 'Total_Runs_x' comes from
# match_data (first-innings total), 'Total_Runs_y' from the delivery table.
delivery_data = match_data.merge(delivery, on='Match_ID')
# .copy() so the column assignments below write to an independent frame
# instead of a filtered slice of the merge result.
delivery_data = delivery_data[delivery_data['Inning'] == 2].copy()

# Apply the same name normalisation to the team columns that become model
# features, so a franchise is represented by a single category.
for column in ['Batting_Team', 'Bowling_Team']:
    delivery_data[column] = delivery_data[column].replace(replacements, regex=True)

# Running second-innings score per match. The cumulative sum follows row
# order, so it relies on the delivery rows being in playing order.
delivery_data['current_score'] = delivery_data.groupby('Match_ID')['Total_Runs_y'].cumsum()
delivery_data['runs_left'] = delivery_data['Total_Runs_x'] - delivery_data['current_score']
# NOTE(review): 126 = 120 + 6 presumably assumes 'Over' is 1-based and 'Ball'
# counts 1..6 within the over; confirm against deliveries_info.csv.
delivery_data['balls_left'] = 126 - (delivery_data['Over'] * 6 + delivery_data['Ball'])

# 1 on rows where a dismissal is recorded, 0 where the field is missing.
delivery_data['Player_Dismissed'] = delivery_data['Player_Dismissed'].notna().astype(int)
# Despite its name this variable holds the cumulative wickets *fallen* per
# match; it is subtracted from 10 to get the wickets in hand.
wickets_left = delivery_data.groupby('Match_ID')['Player_Dismissed'].cumsum()
delivery_data['wickets_left'] = 10 - wickets_left

# Current run rate: runs per six balls over the (120 - balls_left) balls used.
delivery_data['crr'] = (delivery_data['current_score'] * 6) / (120 - delivery_data['balls_left'])
# Required run rate: runs still needed per six balls over the balls remaining.
delivery_data['rrr'] = (delivery_data['runs_left'] * 6) / delivery_data['balls_left']


def result(row):
    """Return 1 when the row's 'Batting_Team' equals its 'Winner', otherwise 0."""
    batting_side_won = row['Batting_Team'] == row['Winner']
    return int(batting_side_won)

# Label: 1 when the batting side of the row is the match winner, else 0.
# Vectorised form of applying result() to every row; a missing 'Winner'
# compares unequal and therefore yields 0, exactly as the row-wise version.
delivery_data['result'] = (delivery_data['Batting_Team'] == delivery_data['Winner']).astype(int)

# Model inputs followed by the label in the last position; the code that
# splits features from target relies on 'result' being the final column.
final_match_data = delivery_data[[
    'Batting_Team',
    'Bowling_Team',
    'City',
    'runs_left',
    'balls_left',
    'wickets_left',
    'Total_Runs_x',
    'crr',
    'rrr',
    'result',
]]

# Shuffle all rows. The seed makes the row order, and therefore everything
# derived from it downstream, repeatable from run to run.
final_match_data = final_match_data.sample(frac=1, random_state=1)
final_match_data = final_match_data.dropna()
# Rows with no balls remaining have a division by zero in 'rrr'; drop them.
final_match_data = final_match_data[final_match_data['balls_left'] != 0]

# The label sits in the last column; every column before it is a feature.
label_position = final_match_data.shape[1] - 1
x = final_match_data.iloc[:, :label_position]
y = final_match_data.iloc[:, label_position]

# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
split_parts = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts

def _make_preprocessor():
    """Return a new, unfitted transformer that one-hot encodes the text columns.

    The three categorical columns are one-hot encoded (first level dropped);
    all remaining columns are passed through unchanged.
    """
    return ColumnTransformer([
        ('trf', OneHotEncoder(sparse_output=False, drop='first'),
         ['Batting_Team', 'Bowling_Team', 'City'])
    ], remainder='passthrough')


def _fit_and_save(estimator, path):
    """Fit a preprocessing + estimator pipeline on the training split and pickle it.

    Args:
        estimator: Unfitted scikit-learn classifier used as the final step.
        path: File the fitted pipeline is pickled to.

    Returns:
        The fitted Pipeline.
    """
    # Pipeline.fit fits its steps in place, so each pipeline gets its own
    # preprocessor instead of all of them refitting one shared object.
    pipe = Pipeline(steps=[
        ('step1', _make_preprocessor()),
        ('step2', estimator),
    ])
    pipe.fit(x_train, y_train)
    # Context manager so the file is flushed and closed even if dump() fails.
    with open(path, 'wb') as model_file:
        pickle.dump(pipe, model_file)
    return pipe


# logistic regression
logistic_pipe = _fit_and_save(LogisticRegression(solver='liblinear'), 'logistic_regression.pkl')

# Expose a fitted preprocessor under the module-level name `trf`.
trf = logistic_pipe.named_steps['step1']


# knn
knn_pipe = _fit_and_save(KNeighborsClassifier(), 'knn.pkl')


# random forest (seeded so that retraining on the same data gives the same model)
rf_pipe = _fit_and_save(RandomForestClassifier(random_state=1), 'random_forest.pkl')