# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import cufflinks as cf
cf.go_offline()
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Ball-by-ball delivery records; the per-ball features built later in this notebook are derived from this frame.
balls = pd.read_csv('../input/ipl-2008-to-2021-all-match-dataset/IPL_Ball_by_Ball_2008_2022.csv')
# (rows, columns) of the loaded frame.
balls.shape

In [None]:
# One record per match; joined to the ball-by-ball data later through the shared 'ID' column.
matches = pd.read_csv('../input/ipl-2008-to-2021-all-match-dataset/IPL_Matches_2008_2022.csv')
# (rows, columns) of the loaded frame.
matches.shape

In [None]:
balls.head()

In [None]:
balls.info()

In [None]:
balls.describe()

In [None]:
matches.head()

In [None]:
matches.info()

# Data Visualization

In [None]:
matches['City'].value_counts().iplot()

# EDA and Feature Engineering

## Finding total score of the innings

In [None]:
# Total runs scored in each innings of each match.
# 'total_run' is selected *before* aggregating so that only that column is
# summed; calling .sum() on the whole group also tries to aggregate the text
# columns, which is wasteful and version-dependent.
total_score = (
    balls.groupby(['ID', 'innings'])['total_run']
    .sum()
    .reset_index()
)

In [None]:
total_score.head()

## We only need score of 1st innings
Our goal is to predict the winner during the chase, so we only need the first-innings score (it determines the target).

In [None]:
# Keep only first-innings totals. The explicit .copy() makes this an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
total_score = total_score[total_score['innings'] == 1].copy()

In [None]:
total_score.head()

In [None]:
fig = px.histogram(total_score,nbins=30, x='total_run')
fig.show()

In [None]:
# The chasing side needs one run more than the first-innings total to win.
# assign() returns a new frame, so this never writes into a filtered slice
# of an earlier frame.
total_score = total_score.assign(target=total_score['total_run'] + 1)

Merge with the matches dataset

In [None]:
# Attach the chase target to each match record by joining on the match ID.
target_per_match = total_score[['ID', 'target']]
match_df = pd.merge(matches, target_per_match, on='ID')

In [None]:
match_df.head()

## Removing old teams / updating teams new names

In [None]:
match_df['Team1'].unique()

In [None]:
# Team names that are kept in the data; rows naming any other team are
# filtered out further down with `.isin(teams)`.
teams = [
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
    'Delhi Capitals',
    'Chennai Super Kings',
    'Gujarat Titans',
    'Lucknow Super Giants',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Mumbai Indians',
]

In [None]:
# Replace old team names with the names used in the `teams` list, in every
# column of match_df that holds a team name.
old_to_new_name = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Deccan Chargers': 'Sunrisers Hyderabad',
}
for old_name, new_name in old_to_new_name.items():
    for team_column in ('Team1', 'Team2', 'WinningTeam'):
        match_df[team_column] = match_df[team_column].str.replace(old_name, new_name)

In [None]:
# Keep a match only when both sides and the winner are in `teams`.
# A missing WinningTeam is not in the list, so such rows are dropped as well.
both_sides_known = match_df['Team1'].isin(teams) & match_df['Team2'].isin(teams)
winner_known = match_df['WinningTeam'].isin(teams)
match_df = match_df[both_sides_known & winner_known]

In [None]:
match_df.shape

In [None]:
match_df.columns

In [None]:
fig = px.histogram(match_df, x='WinningTeam')
fig.show()

## We want only the matches where D/L is not applied 
Remove all matches that were affected by rain.

In [None]:
match_df['method'].unique()

In [None]:
match_df['method'].value_counts()

In [None]:
# Keep only the matches whose 'method' value is missing, i.e. no special
# method (such as D/L) was recorded for the result.
no_method_recorded = match_df['method'].isna()
match_df = match_df[no_method_recorded]

In [None]:
match_df.shape

In [None]:
match_df.columns

In [None]:
# Narrow match_df to the columns used from here on, then discard every row
# that has a missing value in any of them.
kept_columns = ['ID', 'City', 'Team1', 'Team2', 'WinningTeam', 'target']
match_df = match_df[kept_columns]
match_df = match_df.dropna()

In [None]:
match_df.head()

In [None]:
match_df.isna().sum()

Merge the match_df dataset with the balls dataset

In [None]:
balls.columns

In [None]:
# Apply the same team-name replacements to the ball-by-ball data so that
# 'BattingTeam' uses the same names as the team columns of match_df.
for old_name, new_name in (
    ('Kings XI Punjab', 'Punjab Kings'),
    ('Delhi Daredevils', 'Delhi Capitals'),
    ('Deccan Chargers', 'Sunrisers Hyderabad'),
):
    balls['BattingTeam'] = balls['BattingTeam'].str.replace(old_name, new_name)

# Discard deliveries batted by a team that is not in `teams`.
batting_team_known = balls['BattingTeam'].isin(teams)
balls = balls[batting_team_known]

In [None]:
# Give every delivery the match-level columns (city, teams, winner, target)
# by joining the two frames on the match ID.
balls_df = pd.merge(match_df, balls, on='ID')

In [None]:
balls_df.head()

In [None]:
balls_df['BattingTeam'].value_counts()

In [None]:
fig = px.bar(balls_df['BattingTeam'].value_counts())
fig.show()

In [None]:
balls_df.columns

Only select rows where we are in 2nd innings

In [None]:
# Keep only second-innings deliveries. Several feature columns are added to
# this frame below, so take an explicit copy; writing new columns into a
# filtered slice raises SettingWithCopyWarning.
balls_df = balls_df[balls_df['innings'] == 2].copy()

In [None]:
balls_df.shape

In [None]:
balls_df.head()

In [None]:
balls_df.columns

## Create a new column current_score with the running score after each ball

In [None]:
# Running total of 'total_run' within each match, accumulated in row order.
# NOTE(review): this assumes the rows of a match are already in delivery order — confirm.
runs_per_match = balls_df.groupby('ID')['total_run']
balls_df['current_score'] = runs_per_match.cumsum()

In [None]:
balls_df

In [None]:
# Runs still required to reach the target, floored at 0 once the target has
# been reached or passed.
runs_short = balls_df['target'] - balls_df['current_score']
balls_df['runs_left'] = np.where(runs_short >= 0, runs_short, 0)

In [None]:
balls_df

In [None]:
# Deliveries remaining out of 120 after the current ball, floored at 0.
# NOTE(review): assumes 'overs' is zero-based and 'ballnumber' is the position
# of the ball inside its over — confirm. If 'ballnumber' also counts wides and
# no-balls, the value is understated in overs that contain extras.
balls_remaining = 120 - balls_df['overs']*6 - balls_df['ballnumber']
balls_df['balls_left'] = np.where(balls_remaining >= 0, balls_remaining, 0)

In [None]:
# Wickets in hand: 10 minus the running count of wicket deliveries in the match.
wickets_fallen = balls_df.groupby('ID')['isWicketDelivery'].cumsum()
balls_df['wickets_left'] = 10 - wickets_fallen

In [None]:
balls_df.columns

In [None]:
# Runs per over so far: score scaled to six balls, divided by the number of
# balls already used (120 minus the balls still left).
balls_used = 120 - balls_df['balls_left']
balls_df['current_run_rate'] = (balls_df['current_score']*6) / balls_used

In [None]:
# Runs per over still needed. When no balls are left the rate is set to 0
# instead of the result of dividing by zero.
balls_remain = balls_df['balls_left'] > 0
rate_needed = balls_df['runs_left']*6 / balls_df['balls_left']
balls_df['required_run_rate'] = np.where(balls_remain, rate_needed, 0)

In [None]:
balls_df.columns

In [None]:
def result(row):
    """Return 1 when the row's batting team is also its winning team, else 0.

    ``row`` is any object that supports ``row['BattingTeam']`` and
    ``row['WinningTeam']`` (for example a DataFrame row or a dict).
    """
    batting_side_won = row['BattingTeam'] == row['WinningTeam']
    if batting_side_won:
        return 1
    return 0

In [None]:
# Label each delivery: 1 when the batting (chasing) team went on to win the
# match, 0 otherwise. The column-wise comparison yields the same labels as
# calling result() on every row, without a Python-level call per row.
balls_df['result'] = (balls_df['BattingTeam'] == balls_df['WinningTeam']).astype(int)

In [None]:
balls_df.head()

In [None]:
balls_df.columns

In [None]:
# Row labels split by which side is batting:
#   index1 -> Team2 is batting, index2 -> Team1 is batting.
team2_is_batting = balls_df['Team2'] == balls_df['BattingTeam']
team1_is_batting = balls_df['Team1'] == balls_df['BattingTeam']
index1 = balls_df[team2_is_batting].index
index2 = balls_df[team1_is_batting].index

In [None]:
# The bowling team is whichever of Team1/Team2 is not batting:
# rows in index1 (Team2 batting) get Team1, rows in index2 (Team1 batting) get Team2.
# Rows in neither index are left without a BowlingTeam value.
balls_df.loc[index1, 'BowlingTeam'] = balls_df.loc[index1, 'Team1']
balls_df.loc[index2, 'BowlingTeam'] = balls_df.loc[index2, 'Team2']

In [None]:
balls_df.head()

In [None]:
# Modelling frame: three categorical columns, the numeric match-state
# columns, and the `result` label as the last column.
model_columns = [
    'BattingTeam',
    'BowlingTeam',
    'City',
    'runs_left',
    'balls_left',
    'wickets_left',
    'current_run_rate',
    'required_run_rate',
    'target',
    'result',
]
final_df = balls_df[model_columns]

In [None]:
final_df.head()

In [None]:
fig = px.bar(final_df['City'].value_counts())
fig.show()

In [None]:
fig = px.bar(final_df['BattingTeam'].value_counts())
fig.show()

In [None]:
fig = px.bar(final_df['BowlingTeam'].value_counts())
fig.show()

In [None]:
fig = px.histogram(final_df[final_df['runs_left']>0]['runs_left'], nbins=30,)
fig.show()

In [None]:
fig = px.bar(final_df['wickets_left'].value_counts())
fig.show()

In [None]:
fig = px.histogram(final_df['target'], nbins=30)
fig.show()

In [None]:
final_df.describe()

In [None]:
final_df.isna().sum()

In [None]:
final_df.shape

In [None]:
# Shuffle every row. sample() returns a new frame and leaves the original
# untouched, so the result must be assigned back for the shuffle to take effect.
final_df = final_df.sample(final_df.shape[0])

Randomly shuffle all the rows

In [None]:
final_df.sample()

## One hot encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding of the categorical columns, dropping the first
# category of each. scikit-learn 1.2 renamed `sparse` to `sparse_output` and
# 1.4 removed the old keyword, so try the current name first and fall back to
# the old one on releases that do not know it.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Encode the three categorical columns; all remaining columns pass through unchanged.
trf = ColumnTransformer([
    ('trf', encoder, ['BattingTeam','BowlingTeam','City'])
],
remainder = 'passthrough')

In [None]:
from sklearn.model_selection import train_test_split

# Label is the `result` column; features are every other column of final_df.
y = final_df['result']
X = final_df.drop(columns='result')
X.shape, y.shape

In [None]:
# Hold out 1% of the rows for evaluation; random_state fixes the split.
# NOTE(review): the rows are individual deliveries, so deliveries from the same
# match can land in both the training and the test set. Together with the very
# small test fraction this probably makes the reported accuracy optimistic —
# consider a larger, match-level hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Two-step pipeline: the column transformer `trf` first, then a random forest
# classifier with default settings.
pipeline_steps = [
    ('step1', trf),
    ('step2', RandomForestClassifier()),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# The documented argument order is (y_true, y_pred). Accuracy happens to be
# symmetric, but keeping the order right avoids a wrong result if this call is
# later swapped for an asymmetric metric such as precision or recall.
accuracy_score(y_test, y_pred)

In [None]:
pipe.predict_proba(X_test)

# Save model

In [None]:
teams

In [None]:
final_df['City'].unique()

In [None]:
import pickle

# Persist the fitted pipeline. The `with` block guarantees the file is flushed
# and closed even if pickling fails, instead of leaving the handle open.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)