# 🏏 IPL Win Probability Prediction
This project focuses on predicting the probability of winning for the chasing team in Indian Premier League (IPL) matches. Using ball-by-ball data from IPL (sourced from Cricsheet), we build a machine learning model that simulates real-time match scenarios.
The application is built using Streamlit, providing an interactive interface for users to input match parameters and receive win probability predictions.
## Features
- **Real-time Predictions**: Input current match details to get instant win probability.
- **Interactive UI**: User-friendly interface for easy data input and visualization.
- **Data-Driven Insights**: Leverages historical IPL data for accurate predictions.
## Technologies Used
- Python
- Streamlit
- Pandas
- Scikit-learn

# Import Libraries

In [1]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Load Datasets as Union of DataFrames

In [2]:
# 1. Union the CSVs of all teams into a single DataFrame

# glob returns paths in an arbitrary, OS-dependent order. The row order of the
# concatenated frame feeds the seeded train/test split further down, so the
# paths are sorted to keep the run reproducible across machines.
files = sorted(glob.glob("match_data/*.csv"))
if not files:
    # pd.concat on an empty list fails with an unhelpful message; fail early instead.
    raise FileNotFoundError("No CSV files found in match_data/")

dfs = [pd.read_csv(f) for f in files]
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
print(f"Dataset size for : {df.shape}")
df.head()

Dataset size for : (133903, 11)


Unnamed: 0,match_id,season,venue,chasing_team,runs,wickets,runs_required,balls_remaining,crr,rrr,win
0,1.0,2018,Wankhede Stadium,Chennai Super Kings,0,0,165,119,0.0,8.319328,1
1,1.0,2018,Wankhede Stadium,Chennai Super Kings,1,0,164,118,3.0,8.338983,1
2,1.0,2018,Wankhede Stadium,Chennai Super Kings,2,0,163,117,4.0,8.358974,1
3,1.0,2018,Wankhede Stadium,Chennai Super Kings,3,0,162,116,4.5,8.37931,1
4,1.0,2018,Wankhede Stadium,Chennai Super Kings,3,0,162,115,3.6,8.452174,1


# Data Preprocessing

In [3]:
# Count missing values per column, then the number of fully duplicated rows
null_counts = df.isnull().sum()
duplicate_rows = df.duplicated().sum()
print(null_counts)
print(duplicate_rows)

match_id           8066
season                0
venue                 0
chasing_team          0
runs                  0
wickets               0
runs_required         0
balls_remaining       0
crr                   0
rrr                   0
win                   0
dtype: int64
0


## Feature Engineering

In [4]:
# Distinct chasing-team labels present in the loaded data
df["chasing_team"].unique()

array(['Chennai Super Kings', 'Deccan Chargers', 'Delhi Capitals',
       'Delhi Daredevils', 'Gujarat Lions', 'Gujarat Titans',
       'Kings XI Punjab', 'Kochi Tuskers Kerala', 'Kolkata Knight Riders',
       'Lucknow Super Giants', 'Mumbai Indians', 'Pune Warriors',
       'Punjab Kings', 'Rajasthan Royals', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Royal Challengers Bengaluru', 'Sunrisers Hyderabad'], dtype=object)

In [5]:
# Collapse the different spellings/renames of a franchise onto one canonical label.
# Each canonical name lists every label (including itself) that should map to it.
_aliases_by_team = {
    "Royal Challengers Bangalore": ("Royal Challengers Bangalore", "Royal Challengers Bengaluru"),
    "Punjab Kings": ("Punjab Kings", "Kings XI Punjab"),
    "Rising Pune Supergiants": ("Rising Pune Supergiant", "Rising Pune Supergiants"),
    "Delhi Capitals": ("Delhi Daredevils", "Delhi Capitals"),
}

# Flatten to the alias -> canonical lookup used by Series.replace
team_mapping = {
    alias: canonical
    for canonical, aliases in _aliases_by_team.items()
    for alias in aliases
}

df["chasing_team"] = df["chasing_team"].replace(team_mapping)
df["chasing_team"].unique()

array(['Chennai Super Kings', 'Deccan Chargers', 'Delhi Capitals',
       'Gujarat Lions', 'Gujarat Titans', 'Punjab Kings',
       'Kochi Tuskers Kerala', 'Kolkata Knight Riders',
       'Lucknow Super Giants', 'Mumbai Indians', 'Pune Warriors',
       'Rajasthan Royals', 'Rising Pune Supergiants',
       'Royal Challengers Bangalore', 'Sunrisers Hyderabad'], dtype=object)

In [6]:
# One-hot encode the categorical `chasing_team` column.
# drop="first" omits the first category, so it is represented by all-zero indicators.
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded = encoder.fit_transform(df[["chasing_team"]])
feature_names = encoder.get_feature_names_out(["chasing_team"])

# Swap the text column for its indicator columns
encoded_df = pd.DataFrame(encoded, columns=feature_names)
without_team = df.drop(columns=["chasing_team"])
df = pd.concat([without_team, encoded_df], axis=1)
df.head()

Unnamed: 0,match_id,season,venue,runs,wickets,runs_required,balls_remaining,crr,rrr,win,...,chasing_team_Kochi Tuskers Kerala,chasing_team_Kolkata Knight Riders,chasing_team_Lucknow Super Giants,chasing_team_Mumbai Indians,chasing_team_Pune Warriors,chasing_team_Punjab Kings,chasing_team_Rajasthan Royals,chasing_team_Rising Pune Supergiants,chasing_team_Royal Challengers Bangalore,chasing_team_Sunrisers Hyderabad
0,1.0,2018,Wankhede Stadium,0,0,165,119,0.0,8.319328,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2018,Wankhede Stadium,1,0,164,118,3.0,8.338983,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,2018,Wankhede Stadium,2,0,163,117,4.0,8.358974,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,2018,Wankhede Stadium,3,0,162,116,4.5,8.37931,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,2018,Wankhede Stadium,3,0,162,115,3.6,8.452174,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Remove the columns that are not used as model inputs
unused_columns = ["match_id", "season", "venue"]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,runs,wickets,runs_required,balls_remaining,crr,rrr,win,chasing_team_Deccan Chargers,chasing_team_Delhi Capitals,chasing_team_Gujarat Lions,...,chasing_team_Kochi Tuskers Kerala,chasing_team_Kolkata Knight Riders,chasing_team_Lucknow Super Giants,chasing_team_Mumbai Indians,chasing_team_Pune Warriors,chasing_team_Punjab Kings,chasing_team_Rajasthan Royals,chasing_team_Rising Pune Supergiants,chasing_team_Royal Challengers Bangalore,chasing_team_Sunrisers Hyderabad
0,0,0,165,119,0.0,8.319328,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,164,118,3.0,8.338983,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,163,117,4.0,8.358974,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,162,116,4.5,8.37931,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,0,162,115,3.6,8.452174,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Target is the `win` column; every remaining column is a feature
y = df["win"]
x = df.drop(columns=["win"])

# NOTE(review): the rows are ball-by-ball states, so this row-level split can put
# balls from the same match in both train and test, which may inflate the scores.
# A match-grouped split would need `match_id`, which has been dropped by this point.
# TODO confirm and revisit.

# Hold out 20% of the rows, stratified on the label, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [9]:
# Train: 100-tree random forest with a fixed seed
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

# Evaluate on the held-out rows: hard labels for accuracy,
# probabilities from column index 1 of predict_proba for ROC AUC
y_pred = model.predict(x_test)
y_prob = model.predict_proba(x_test)[:, 1]

for metric_label, metric_value in (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("ROC AUC:", roc_auc_score(y_test, y_prob)),
):
    print(metric_label, metric_value)

Accuracy: 0.9236398939546694
ROC AUC: 0.9789092307709993


In [10]:
#Inference
def predict_win_probability(team_name,runs, wickets, runs_required, balls_remaining, crr, rrr):
    """Return the model's win probability for the chasing team in one match state.

    Builds a single-row feature frame, one-hot encodes the team with the
    notebook-level ``encoder`` and scores it with the notebook-level ``model``.

    Args:
        team_name: Name of the chasing team. Names listed in ``team_mapping``
            are converted to the canonical label the encoder was fitted on.
        runs: Value for the ``runs`` feature.
        wickets: Value for the ``wickets`` feature.
        runs_required: Value for the ``runs_required`` feature.
        balls_remaining: Value for the ``balls_remaining`` feature.
        crr: Value for the ``crr`` feature.
        rrr: Value for the ``rrr`` feature.

    Returns:
        The probability in column index 1 of ``model.predict_proba`` for the
        single input row (presumably the ``win == 1`` class; confirm against
        ``model.classes_``).

    Raises:
        ValueError: Propagated from ``encoder.transform`` when the team is not
            a category seen during fitting (the encoder is created with the
            default unknown-category handling).
    """
    # The training data was normalised through team_mapping before the encoder
    # was fitted, so renamed franchises (e.g. "Royal Challengers Bengaluru")
    # must be mapped the same way here or the encoder rejects them as unknown.
    team_name = team_mapping.get(team_name, team_name)

    input_data = pd.DataFrame(
        [[team_name, runs, wickets, runs_required, balls_remaining, crr, rrr]],
        columns=['chasing_team', 'runs', 'wickets', 'runs_required', 'balls_remaining', 'crr', 'rrr'],
    )

    # Replace the team name with the same indicator columns used at training time.
    encoded = encoder.transform(input_data[["chasing_team"]])
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(["chasing_team"]))
    input_data = pd.concat([input_data.drop(columns=["chasing_team"]), encoded_df], axis=1)

    # Predict probability: single row, column index 1
    win_prob = model.predict_proba(input_data)[:, 1][0]
    return win_prob

In [11]:
# Example usage: a chase that is 40 balls into a 120-ball innings
total_ball = 120
team_name = 'Chennai Super Kings'
balls_faced = 40
runs, wickets = 150, 5
runs_required = 100
balls_remaining = total_ball - balls_faced

# Run rates are scaled by 6, i.e. expressed per six balls
crr = (runs / balls_faced) * 6
rrr = (runs_required / (balls_remaining)) * 6

result = predict_win_probability(
    team_name, runs, wickets, runs_required, balls_remaining, crr, rrr
)
print(f"Win Probability: {result*100:.2f}%")

Win Probability: 64.00%


# Save model and encoder

In [12]:
import os

import joblib

# joblib.dump does not create parent directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('models', exist_ok=True)

# Save the trained model (compress=3 writes a compressed pickle)
joblib.dump(model, 'models/win_probability_model.pkl', compress=3)

# Save the fitted encoder so inference can rebuild the training-time team columns
joblib.dump(encoder, 'models/encoder.pkl')

['models/encoder.pkl']