# IPL Match Winner Prediction
Import pandas (numpy is imported later, just before it is first used)

In [None]:
import pandas as pd

Read data from CSV datasets

In [None]:
# Load the ball-by-ball and match-level tables from the local datasets folder.
delivery = pd.read_csv(filepath_or_buffer='datasets/deliveries.csv')
match = pd.read_csv(filepath_or_buffer='datasets/matches.csv')

In [None]:
# Preview the first rows of the ball-by-ball table.
delivery.head()

- find the total runs of each innings
- group by match_id and inning and sum the total runs & store in 'total_runs' column

In [None]:
# Select 'total_runs' before aggregating so only that column is summed.
# Calling sum() on the whole grouped frame also aggregates the text columns,
# which is wasteful and can raise a TypeError on recent pandas versions.
delivery.groupby(['match_id', 'inning'])['total_runs'].sum()

- reset the index of the dataframe and store in 'total_score_df'

In [None]:
# Total runs per (match_id, inning) as a flat frame with columns
# match_id, inning, total_runs. The column is selected before summing so the
# text columns of `delivery` are never aggregated.
total_score_df = (
    delivery.groupby(['match_id', 'inning'])['total_runs']
    .sum()
    .reset_index()
)

- Keep only the first innings (inning == 1)

In [None]:
# Keep only the rows describing the first innings of each match.
is_first_innings = total_score_df['inning'] == 1
total_score_df = total_score_df[is_first_innings]

In [None]:
# Display the first-innings totals.
total_score_df

- So far, we have the first-innings total runs for each match

- Now, merge the match and total_score_df dataframes by match_id present in total_score_df & id present in match, store in match_df

In [None]:
# Attach each match's first-innings total to its match-level record
# (match.id on the left is joined to total_score_df.match_id on the right).
first_innings_totals = total_score_df[['match_id', 'total_runs']]
match_df = pd.merge(match, first_innings_totals, left_on='id', right_on='match_id')

In [None]:
# Inspect the distinct team names that appear in the team1 column.
match_df['team1'].unique() # find all unique teams

In [None]:
# Current teams
# Matches are later kept only when both team1 and team2 appear in this list.
teams = [
    'Chennai Super Kings',
    'Royal Challengers Bengaluru',
    'Delhi Capitals',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Punjab Kings',
    'Rajasthan Royals',
    'Sunrisers Hyderabad',
    'Lucknow Super Giants', 
    'Gujarat Titans'
]

In [None]:
# Fix team names: map historical franchise names onto the current names.
renamed_teams = {
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru',
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
}

# Apply the same mapping to every team-name column that is used later:
# 'winner' is compared with 'batting_team' to build the label, and
# 'batting_team' / 'bowling_team' (from `delivery`) become model features.
# Renaming only team1/team2 would leave those columns on the old names.
team_columns_by_frame = (
    (match_df, ('team1', 'team2', 'winner')),
    (delivery, ('batting_team', 'bowling_team')),
)
for frame, columns in team_columns_by_frame:
    for column in columns:
        for old_name, new_name in renamed_teams.items():
            # regex=False: the names are literal text, and the default value of
            # `regex` differs between pandas 1.x and 2.x.
            frame[column] = frame[column].str.replace(old_name, new_name, regex=False)

In [None]:
# Keep only matches in which both sides are current teams.
both_current = match_df['team1'].isin(teams) & match_df['team2'].isin(teams)
match_df = match_df[both_current]

In [None]:
# List the columns available in match_df.
# (Uncomment the next line to inspect the distinct values of 'method' instead.)
# match_df['method'].unique()
match_df.columns

In [None]:
# Drop matches whose 'method' is 'D/L' (DLS method); every other match,
# including those with no method recorded, is kept.
decided_by_dls = match_df['method'] == 'D/L'
match_df = match_df[~decided_by_dls]

In [None]:
# Keep only the match-level fields needed later:
# the match id, host city, winner and first-innings total.
required_fields = ['match_id', 'city', 'winner', 'total_runs']
match_df = match_df[required_fields]

In [None]:
# Display the reduced match-level frame.
match_df

In [None]:
# Join the match-level fields onto every delivery of the same match.
# Both frames have a 'total_runs' column, so pandas suffixes them:
# total_runs_x is the first-innings total (from match_df) and
# total_runs_y is the runs scored off the individual delivery.
delivery_df = pd.merge(match_df, delivery, on="match_id")

In [None]:
# Keep only second-innings deliveries: the model predicts the chasing side's
# chances. `.copy()` makes this an independent frame, so the columns assigned
# in the following cells are not written to a slice of the merged frame
# (which triggers pandas' SettingWithCopyWarning).
delivery_df = delivery_df[delivery_df['inning'] == 2].copy()

In [None]:
# Running total of the chase: runs accumulated delivery by delivery per match.
runs_by_match = delivery_df.groupby('match_id')['total_runs_y']
delivery_df['current_score'] = runs_by_match.cumsum()


In [None]:
# Runs still required after each delivery.
# The chasing side has to beat the first-innings total, hence the extra run.
winning_score = delivery_df['total_runs_x'] + 1
delivery_df['runs_left'] = winning_score - delivery_df['current_score']

In [None]:
# Select only valid balls
# First inspect which extras_type values occur in the data.
delivery_df['extras_type'].unique()
# delivery_df['ball'].unique()

In [None]:
# Create column that indicates if ball is legal, i.e. counts towards the over.
# Only wides and no-balls have to be re-bowled. Byes and leg byes come off
# legal deliveries, so treating them as illegal undercounts the balls bowled
# and inflates balls_left / overs_left.
# NOTE(review): 'penalty' is treated as legal here because penalty runs do not
# by themselves cause a re-bowl — confirm against how the dataset records them.
delivery_df['is_legal'] = ~delivery_df['extras_type'].isin(['wides', 'noballs'])

In [None]:
# Defensive default: treat any missing flag as a legal ball.
# The flag built from `~...isin(...)` is already a plain boolean column, so this
# is expected to change nothing.
delivery_df['is_legal'] = delivery_df['is_legal'].fillna(True)

In [None]:
# Count the legal balls bowled so far within each match and innings.
legal_flags = delivery_df.groupby(['match_id', 'inning'])['is_legal']
delivery_df['legal_ball_number'] = legal_flags.cumsum()

In [None]:
# Overs bowled so far as a decimal fraction (six legal balls per over).
delivery_df['overs_done'] = delivery_df['legal_ball_number'].div(6)

In [None]:
# Wickets in hand: ten minus the wickets that have fallen so far in the match.
wickets_fallen = delivery_df.groupby('match_id')['is_wicket'].cumsum()
delivery_df['wickets_left'] = 10 - wickets_fallen

In [None]:
# Calculate current run rate
# crr = (runs scored / overs played), defined as 0 before the first legal ball.
# Column arithmetic replaces the row-by-row apply: where() keeps the quotient
# only where overs_done > 0 and substitutes 0 everywhere else, which also
# covers the rows where the division is by zero.
delivery_df['crr'] = (
    delivery_df['current_score'] / delivery_df['overs_done']
).where(delivery_df['overs_done'] > 0, 0)

In [None]:
# Calculate required run rate
# rrr = (runs needed / overs left), defined as 0 once no overs remain.
# 120 is the number of legal balls in a 20-over innings.
delivery_df['overs_left'] = (120 - delivery_df['legal_ball_number']) / 6
# Column arithmetic replaces the row-by-row apply: where() keeps the quotient
# only where overs_left > 0 and substitutes 0 everywhere else, which also
# covers the rows where the division is by zero.
delivery_df['rrr'] = (
    delivery_df['runs_left'] / delivery_df['overs_left']
).where(delivery_df['overs_left'] > 0, 0)

In [None]:
# Label for each delivery: did the side batting in this row win the match?
# Written as a row-wise function so it can be passed to DataFrame.apply(axis=1).
def result(row):
    """Return 1 when row['batting_team'] equals row['winner'], otherwise 0."""
    batting_side_won = row['batting_team'] == row['winner']
    return int(batting_side_won)

In [None]:
# result = 1 when the batting (chasing) team is the match winner, else 0.
# A single column comparison replaces the row-by-row apply; rows with a missing
# winner compare unequal and therefore get 0, as before.
delivery_df['result'] = (
    delivery_df['batting_team'] == delivery_df['winner']
).astype('int64')

In [None]:
# Legal balls remaining out of the 120 in the innings.
# The original computed this twice, the second time with a row-by-row apply that
# blanked every row outside the second innings; where() keeps that rule in one
# vectorised step (rows with inning != 2 become NaN).
delivery_df['balls_left'] = (120 - delivery_df['legal_ball_number']).where(
    delivery_df['inning'] == 2
)

In [None]:
# Columns kept for modelling: the features first, the 'result' label last
# (the train/test cell relies on the label being the final column).
model_columns = [
    'batting_team', 'bowling_team', 'city', 'runs_left', 'balls_left',
    'wickets_left', 'total_runs_x', 'crr', 'rrr', 'result',
]
final_df = delivery_df[model_columns]

In [None]:
# Shuffle the rows so they are no longer ordered by match and delivery.
# The shuffle is seeded: the train/test split further down picks rows by
# position, so an unseeded shuffle would make that split differ on every run
# even though the split itself is given a fixed random_state.
final_df = final_df.sample(frac=1, random_state=1)

In [None]:
# Display one randomly chosen row as a spot check.
final_df.sample()

In [None]:
# Remove every row of final_df that contains at least one NaN value.
# Reassigning instead of inplace=True avoids mutating a frame that was derived
# from delivery_df (a source of SettingWithCopyWarning).
final_df = final_df.dropna()

In [None]:
# Remove the rows where no balls are left; only rows with balls_left != 0 stay.
has_balls_left = final_df['balls_left'] != 0
final_df = final_df[has_balls_left]

- At this point, our dataset is clean and ready for model training

# Model Training

In [None]:
# Separate features and label: 'result' is the last column of final_df.
X = final_df.iloc[:, :-1]  # all rows, every column except the last
y = final_df.iloc[:, -1]   # all rows, last column only

from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Split by match rather than by row. Every delivery of a match carries the same
# label and near-identical features to its neighbours, so a plain random row
# split puts deliveries of the same chase in both sets and inflates the test
# score. final_df keeps delivery_df's index labels, which lets us look up the
# match each row belongs to.
match_groups = delivery_df.loc[X.index, 'match_id']

# test_size=0.2 holds out about 20% of the matches; random_state=1 keeps the
# choice of held-out matches reproducible.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
train_rows, test_rows = next(splitter.split(X, y, groups=match_groups))

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

In [None]:
# X.info()

In [None]:
# Convert categorical data to numeric formats

from sklearn.compose import ColumnTransformer   # apply transformer to specific columns
from sklearn.preprocessing import OneHotEncoder # convert categorical to binary values

# Create the ColumnTransformer
trf = ColumnTransformer([
    (
        'trf',
        OneHotEncoder(
            sparse_output=False,      # return a dense numpy array
            drop='first',             # avoid the dummy variable trap
            # A team or city that was not present when the encoder was fitted is
            # encoded as all zeros (with a warning) instead of raising at
            # predict time.
            handle_unknown='ignore',
        ),
        ['batting_team', 'bowling_team', 'city'],
    ),
], remainder='passthrough') # remainder='passthrough' keeps the other columns as they are


In [None]:
# Logistic Regression & Random Forest model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.pipeline import Pipeline

In [None]:
# Create the pipeline: encode the categorical columns, then classify.

# The active classifier is the Random Forest; swap the commented line in to
# train Logistic Regression instead.
pipe = Pipeline(steps=[
    ('step1', trf),
    # ('step2', LogisticRegression(solver='liblinear'))
    # random_state fixes the forest's bootstrap and feature sampling so the
    # fitted model and its score are reproducible between runs.
    ('step2', RandomForestClassifier(random_state=1))
])

In [None]:
# Fit the encoder and the classifier on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Find the accuracy of the fitted pipeline on the held-out test split.
from sklearn.metrics import accuracy_score
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)
# Scores noted from earlier runs (not recomputed by this cell):
# Logistic Regression = 0.8095959118008895
# Random Forest = 0.9983912179426516
# NOTE(review): the Random Forest score looks inflated if train and test share
# deliveries from the same match — confirm how the split was made.

In [None]:
import numpy as np

In [None]:
def match_progression(x_df, match_id, pipe):
    """Summarise one chase over by over with the model's win/lose probabilities.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Delivery-level frame containing 'match_id', 'ball' and the model
        feature columns ('batting_team', 'bowling_team', 'city', 'runs_left',
        'balls_left', 'wickets_left', 'total_runs_x', 'crr', 'rrr').
    match_id
        Value matched against ``x_df['match_id']``.
    pipe
        Fitted estimator exposing ``predict_proba``. Column 0 of its output is
        reported as 'lose' and column 1 as 'win'.

    Returns
    -------
    tuple
        ``(summary, target)``. ``summary`` has one row per selected delivery
        with columns 'end_of_over', 'runs_after_over', 'wickets_in_over',
        'lose' and 'win' (probabilities in percent, one decimal).
        ``target`` is the first-innings total plus one.

    Raises
    ------
    ValueError
        If the match has no usable rows after filtering.
    """
    feature_columns = ['batting_team', 'bowling_team', 'city', 'runs_left',
                       'balls_left', 'wickets_left', 'total_runs_x', 'crr', 'rrr']

    # Keep the delivery numbered 6 of each over for the requested match.
    # NOTE(review): presumably overs with wides/no-balls contain deliveries
    # numbered past 6, in which case this row is not the last ball of the
    # over — confirm against the dataset's ball numbering.
    over_rows = x_df[(x_df['match_id'] == match_id) & (x_df['ball'] == 6)]

    # Drop incomplete rows and rows where the innings is already over.
    # .copy() so the columns added below go to an independent frame.
    temp_df = over_rows[feature_columns].dropna()
    temp_df = temp_df[temp_df['balls_left'] != 0].copy()

    if temp_df.empty:
        raise ValueError(f"No end-of-over deliveries found for match_id {match_id!r}")

    # Predicted probabilities, converted to percentages.
    probabilities = pipe.predict_proba(temp_df)
    temp_df['lose'] = np.round(probabilities[:, 0] * 100, 1)
    temp_df['win'] = np.round(probabilities[:, 1] * 100, 1)

    # Number the selected rows 1..n.
    temp_df['end_of_over'] = range(1, temp_df.shape[0] + 1)

    # runs_left is built elsewhere in this file as
    # total_runs_x - current_score + 1, so it counts down from the
    # first-innings total plus one. Using that value as the starting point
    # keeps the first over's runs from being under-reported by one.
    target = temp_df['total_runs_x'].values[0] + 1

    # Runs scored / wickets lost between consecutive selected rows:
    # previous value minus current value, starting from `target` runs and
    # 10 wickets.
    temp_df['runs_after_over'] = -np.diff(temp_df['runs_left'].values, prepend=target)
    temp_df['wickets_in_over'] = -np.diff(temp_df['wickets_left'].values, prepend=10)

    # Print the target for reference
    print("Target-", target)

    # Keep only the columns needed for plotting or further analysis
    temp_df = temp_df[['end_of_over', 'runs_after_over', 'wickets_in_over', 'lose', 'win']]

    return temp_df, target


In [None]:
# Build the over-by-over summary for match 1181768 with the fitted pipeline.
temp_df,target = match_progression(delivery_df, 1181768, pipe)

In [None]:
# Plot the over-by-over summary: three lines plus a bar series on one axis.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(18, 8))
overs = temp_df['end_of_over']

# Lines, in drawing order: wickets lost per over, win probability, lose probability
ax.plot(overs, temp_df['wickets_in_over'], color='yellow', linewidth=3, label='Wickets in Over')
ax.plot(overs, temp_df['win'], color='#00a65a', linewidth=4, label='Win Probability')
ax.plot(overs, temp_df['lose'], color='red', linewidth=4, label='Lose Probability')

# Semi-transparent bars: runs scored in each over
ax.bar(overs, temp_df['runs_after_over'], alpha=0.3, label='Runs After Over')

# Labels, legend and layout
ax.set_title('Target - ' + str(target))
ax.set_xlabel('End of Over')
ax.set_ylabel('Values')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


- Fetch the fields to display

In [None]:
# Display the list of current teams.
teams

In [None]:
# Distinct host cities present in delivery_df.
# NOTE(review): this may include NaN if any city is missing — dropna() was only
# applied to final_df, not to delivery_df.
city = delivery_df['city'].unique()
city

In [None]:
import pickle  # Import the pickle module to save/load Python objects

# Save the trained pipeline object 'pipe' to models/random_forest.pkl.
# The `with` block closes the file handle even if pickling fails; the original
# passed a bare open(...) that was never closed.
# NOTE: only unpickle this file from a trusted location — loading a pickle can
# execute arbitrary code.
# pickle.dump(pipe, open('models/logistic_regression.pkl', 'wb'))
with open('models/random_forest.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)