# IPL Win Probability Predictor - SIMPLE & EFFECTIVE
### Based on proven working approach

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## 1. Load Data

In [2]:
match = pd.read_csv('matches.csv')
delivery = pd.read_csv('deliveries.csv')
print(f"Loaded {len(match)} matches and {len(delivery)} deliveries")

Loaded 1095 matches and 260920 deliveries


## 2. Prepare Data - CRITICAL DIFFERENCE

In [3]:
# Merge matches and deliveries
total = delivery.merge(match[['id', 'winner','city']], left_on='match_id', right_on='id')

# Filter for 2nd innings only
total = total[total['inning'] == 2].reset_index(drop=True)

# Get target runs for each match (1st innings total + 1)
first_innings_score = delivery[delivery['inning'] == 1].groupby('match_id')['total_runs'].sum().reset_index()
first_innings_score.columns = ['match_id', 'target_runs']
first_innings_score['target_runs'] = first_innings_score['target_runs'] + 1

total = total.merge(first_innings_score, on='match_id')

print(f"Total rows: {len(total)}")
total.head()

Total rows: 125741


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder,id,winner,city,target_runs
0,335982,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,1,R Dravid,AB Dinda,W Jaffer,1,...,1,,0,,,,335982,Kolkata Knight Riders,Bangalore,223
1,335982,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,2,W Jaffer,AB Dinda,R Dravid,0,...,1,wides,0,,,,335982,Kolkata Knight Riders,Bangalore,223
2,335982,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,3,W Jaffer,AB Dinda,R Dravid,0,...,0,,0,,,,335982,Kolkata Knight Riders,Bangalore,223
3,335982,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,4,W Jaffer,AB Dinda,R Dravid,1,...,1,,0,,,,335982,Kolkata Knight Riders,Bangalore,223
4,335982,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,5,R Dravid,AB Dinda,W Jaffer,1,...,1,,0,,,,335982,Kolkata Knight Riders,Bangalore,223


## 3. Build Features - Ball by Ball

In [4]:
# Current score (cumulative runs)
total['current_score'] = total.groupby('match_id')['total_runs'].cumsum()

# Runs left
total['runs_left'] = total['target_runs'] - total['current_score']

# Balls left (only count legal deliveries)
total['legal_ball'] = ~total['extras_type'].isin(['wides', 'noballs'])
total['balls_bowled'] = total.groupby('match_id')['legal_ball'].cumsum()
total['balls_left'] = 120 - total['balls_bowled']

# Wickets left
total['wickets_fallen'] = total.groupby('match_id')['is_wicket'].cumsum()
total['wickets_left'] = 10 - total['wickets_fallen']

# Current Run Rate
total['crr'] = (total['current_score'] * 6) / (total['balls_bowled'] + 1)

# Required Run Rate
total['rrr'] = (total['runs_left'] * 6) / (total['balls_left'] + 1)

print("Features created!")
total[['batting_team', 'current_score', 'runs_left', 'balls_left', 'wickets_left', 'crr', 'rrr']].head(10)

Features created!


Unnamed: 0,batting_team,current_score,runs_left,balls_left,wickets_left,crr,rrr
0,Royal Challengers Bangalore,1,222,119,10,3.0,11.1
1,Royal Challengers Bangalore,2,221,119,10,6.0,11.05
2,Royal Challengers Bangalore,2,221,118,10,4.0,11.142857
3,Royal Challengers Bangalore,3,220,117,10,4.5,11.186441
4,Royal Challengers Bangalore,4,219,116,10,4.8,11.230769
5,Royal Challengers Bangalore,4,219,115,10,4.0,11.327586
6,Royal Challengers Bangalore,4,219,114,10,3.428571,11.426087
7,Royal Challengers Bangalore,4,219,113,9,3.0,11.526316
8,Royal Challengers Bangalore,4,219,112,9,2.666667,11.628319
9,Royal Challengers Bangalore,8,215,111,9,4.8,11.517857


## 4. Define Result Label

In [5]:
# Result: 1 if batting team won, 0 if they lost
total['result'] = (total['batting_team'] == total['winner']).astype(int)

print(f"Result distribution:")
print(total['result'].value_counts(normalize=True))

Result distribution:
result
1    0.519687
0    0.480313
Name: proportion, dtype: float64


## 5. Filter Active Teams

In [6]:
# Team name mapping
teams = [
    'Sunrisers Hyderabad', 'Mumbai Indians', 'Royal Challengers Bengaluru',
    'Kolkata Knight Riders', 'Punjab Kings', 'Chennai Super Kings',
    'Rajasthan Royals', 'Delhi Capitals', 'Gujarat Titans', 'Lucknow Super Giants'
]

team_rename = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru'
}

total['batting_team'] = total['batting_team'].replace(team_rename)
total['bowling_team'] = total['bowling_team'].replace(team_rename)

# Keep only matches with active teams
total = total[
    total['batting_team'].isin(teams) & 
    total['bowling_team'].isin(teams)
].copy()

print(f"Filtered to {len(total)} rows with active teams")

Filtered to 112805 rows with active teams


## 6. CRITICAL: Sample Every 6 Balls (Not Every Ball)
### This is the key difference - reduces data size and improves balance

In [7]:
# Sample at the end of every over (ball_number % 6 == 0)
# This gives us ~20 samples per match instead of 120
total['ball_number'] = total.groupby('match_id').cumcount() + 1

# Keep only over-end situations (every 6th legal ball)
total['over_end'] = (total['balls_bowled'] % 6 == 0) | (total['balls_bowled'] == 120)

# Also keep important moments (wickets, boundaries)
total['important'] = (total['is_wicket'] == 1) | (total['batsman_runs'] >= 4)

# Sample: over ends + some important moments
sample_df = total[total['over_end'] | (total['important'] & (total['ball_number'] % 3 == 0))].copy()

print(f"Original rows: {len(total)}")
print(f"Sampled rows: {len(sample_df)}")
print(f"Reduction: {len(total)/len(sample_df):.1f}x")
print(f"\nSampled result distribution:")
print(sample_df['result'].value_counts(normalize=True))

Original rows: 112805
Sampled rows: 25048
Reduction: 4.5x

Sampled result distribution:
result
1    0.508184
0    0.491816
Name: proportion, dtype: float64


## 7. Create Final Dataset

In [9]:
# Select features
final_df = sample_df[[
    'batting_team', 'bowling_team', 'city',
    'current_score', 'runs_left', 'balls_left', 'wickets_left',
    'crr', 'rrr',
    'result'
]].copy()

# Remove missing values (Crucial: Matches in Dubai/Sharjah sometimes have NaN cities in raw data)
final_df = final_df.dropna()

print(f"Final dataset: {len(final_df)} rows")

Final dataset: 23740 rows


## 8. Train Simple Logistic Regression Model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features and target
X = final_df.drop('result', axis=1)
y = final_df['result']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple preprocessing - just one-hot encode teams
# Simple preprocessing - one-hot encode teams AND city
trf = ColumnTransformer([
    ('encoder', OneHotEncoder(sparse_output=False, drop='first'), 
     ['batting_team', 'bowling_team', 'city']) 
], remainder='passthrough')

# Simple Logistic Regression
pipe = Pipeline([
    ('preprocessor', trf),
    ('model', LogisticRegression(max_iter=1000))
])

# Train
pipe.fit(X_train, y_train)

# Evaluate
y_pred = pipe.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")

# Test predictions
test_sample = X_test.sample(5)
probs = pipe.predict_proba(test_sample)

print("\nSample Predictions:")
for i, (idx, row) in enumerate(test_sample.iterrows()):
    print(f"Runs: {int(row['current_score'])}/{int(row['current_score'] + row['runs_left'])}, "
          f"Balls left: {int(row['balls_left'])}, "
          f"Win%: {probs[i][1]*100:.1f}%")

Accuracy: 81.07%

Sample Predictions:
Runs: 86/162, Balls left: 36, Win%: 28.3%
Runs: 169/186, Balls left: 6, Win%: 23.7%
Runs: 128/148, Balls left: 12, Win%: 81.9%
Runs: 66/126, Balls left: 54, Win%: 85.9%
Runs: 45/158, Balls left: 84, Win%: 62.5%


## 11. Save Model

In [11]:
import pickle

with open('ipl_win_predictor.pkl', 'wb') as f:
    pickle.dump(pipe, f)

print("Model saved as 'ipl_win_predictor.pkl'")

Model saved as 'ipl_win_predictor.pkl'
