In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
deliveries = pd.read_csv('IPL data (2008-2024)/deliveries.csv')
matches = pd.read_csv('IPL data (2008-2024)/matches.csv')

In [3]:
deliveries.head(3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,


In [4]:
matches.head(3)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar


In [5]:
# Mapping dictionary for team name standardization
team_name_mapping = {
    "Kings XI Punjab": "Punjab Kings",
    "Delhi Daredevils": "Delhi Capitals",
    "Deccan Chargers": "Sunrisers Hyderabad",
    "Rising Pune Supergiant": "Rising Pune Supergiants",
    "Royal Challengers Bangalore": "Royal Challengers Bengaluru",
    "Gujarat Lions": "Gujarat Titans"
}

# Apply the mapping to both matches and deliveries datasets
matches["team1"] = matches["team1"].replace(team_name_mapping)
matches["team2"] = matches["team2"].replace(team_name_mapping)
matches["toss_winner"] = matches["toss_winner"].replace(team_name_mapping)
matches["winner"] = matches["winner"].replace(team_name_mapping)

deliveries["batting_team"] = deliveries["batting_team"].replace(team_name_mapping)
deliveries["bowling_team"] = deliveries["bowling_team"].replace(team_name_mapping)

In [6]:
teams_to_drop = ["Kochi Tuskers Kerala", "Pune Warriors", "Rising Pune Supergiant", 'Rising Pune Supergiants']
# Drop rows where team1 or team2 is in the list of teams to drop
matches = matches[~matches["team1"].isin(teams_to_drop) & ~matches["team2"].isin(teams_to_drop)]
deliveries = deliveries[~deliveries["batting_team"].isin(teams_to_drop) & ~deliveries["bowling_team"].isin(teams_to_drop)]

In [7]:
# Check unique team names in matches
print("Unique teams in matches:")
print(matches["team1"].unique())
print(matches["team2"].unique())

# Check unique team names in deliveries
print("Unique teams in deliveries:")
print(deliveries["batting_team"].unique())
print(deliveries["bowling_team"].unique())

Unique teams in matches:
['Royal Challengers Bengaluru' 'Punjab Kings' 'Delhi Capitals'
 'Mumbai Indians' 'Kolkata Knight Riders' 'Rajasthan Royals'
 'Sunrisers Hyderabad' 'Chennai Super Kings' 'Gujarat Titans'
 'Lucknow Super Giants']
['Kolkata Knight Riders' 'Chennai Super Kings' 'Rajasthan Royals'
 'Royal Challengers Bengaluru' 'Sunrisers Hyderabad' 'Punjab Kings'
 'Delhi Capitals' 'Mumbai Indians' 'Gujarat Titans' 'Lucknow Super Giants']
Unique teams in deliveries:
['Kolkata Knight Riders' 'Royal Challengers Bengaluru'
 'Chennai Super Kings' 'Punjab Kings' 'Rajasthan Royals' 'Delhi Capitals'
 'Mumbai Indians' 'Sunrisers Hyderabad' 'Gujarat Titans'
 'Lucknow Super Giants']
['Royal Challengers Bengaluru' 'Kolkata Knight Riders' 'Punjab Kings'
 'Chennai Super Kings' 'Delhi Capitals' 'Rajasthan Royals'
 'Mumbai Indians' 'Sunrisers Hyderabad' 'Gujarat Titans'
 'Lucknow Super Giants']


In [8]:
team_mapping = {team: idx for idx, team in enumerate(np.unique(matches['team1']))}

In [9]:
venue_mapping = {
    "Arun Jaitley Stadium, Delhi": "Arun Jaitley Stadium",
    "Brabourne Stadium, Mumbai": "Brabourne Stadium",
    "Dr DY Patil Sports Academy, Mumbai": "Dr DY Patil Sports Academy",
    "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam": "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium",
    "Eden Gardens, Kolkata": "Eden Gardens",
    "M Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
    "M.Chinnaswamy Stadium": "M Chinnaswamy Stadium",  # Fix for inconsistent naming
    "MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium",
    "MA Chidambaram Stadium, Chepauk, Chennai": "MA Chidambaram Stadium",
    "Maharashtra Cricket Association Stadium, Pune": "Maharashtra Cricket Association Stadium",
    "Punjab Cricket Association IS Bindra Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
    "Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh": "Punjab Cricket Association IS Bindra Stadium",
    "Punjab Cricket Association Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
    "Rajiv Gandhi International Stadium, Uppal": "Rajiv Gandhi International Stadium",
    "Rajiv Gandhi International Stadium, Uppal, Hyderabad": "Rajiv Gandhi International Stadium",
    "Sawai Mansingh Stadium, Jaipur": "Sawai Mansingh Stadium",
    "Wankhede Stadium, Mumbai": "Wankhede Stadium",
    "Himachal Pradesh Cricket Association Stadium": "Himachal Pradesh Cricket Association Stadium, Dharamsala",
}

# Apply the mapping to the venue column
matches["venue"] = matches["venue"].replace(venue_mapping)

In [10]:
from sklearn.preprocessing import OrdinalEncoder

# Columns to encode
columns_to_encode = ["team1", "team2", "venue"]

# Initialize the OrdinalEncoder
ord_encoder = OrdinalEncoder()

# Fit and transform the selected columns
matches[columns_to_encode] = ord_encoder.fit_transform(matches[columns_to_encode])

# Retrieve the mappings for each column
for i, column in enumerate(columns_to_encode):
    print(f"Mapping for {column}:")
    for original_value, encoded_value in zip(ord_encoder.categories_[i], range(len(ord_encoder.categories_[i]))):
        print(f"{original_value}: {encoded_value}")
    print()

Mapping for team1:
Chennai Super Kings: 0
Delhi Capitals: 1
Gujarat Titans: 2
Kolkata Knight Riders: 3
Lucknow Super Giants: 4
Mumbai Indians: 5
Punjab Kings: 6
Rajasthan Royals: 7
Royal Challengers Bengaluru: 8
Sunrisers Hyderabad: 9

Mapping for team2:
Chennai Super Kings: 0
Delhi Capitals: 1
Gujarat Titans: 2
Kolkata Knight Riders: 3
Lucknow Super Giants: 4
Mumbai Indians: 5
Punjab Kings: 6
Rajasthan Royals: 7
Royal Challengers Bengaluru: 8
Sunrisers Hyderabad: 9

Mapping for venue:
Arun Jaitley Stadium: 0
Barabati Stadium: 1
Barsapara Cricket Stadium, Guwahati: 2
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 3
Brabourne Stadium: 4
Buffalo Park: 5
De Beers Diamond Oval: 6
Dr DY Patil Sports Academy: 7
Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium: 8
Dubai International Cricket Stadium: 9
Eden Gardens: 10
Feroz Shah Kotla: 11
Green Park: 12
Himachal Pradesh Cricket Association Stadium, Dharamsala: 13
Holkar Cricket Stadium: 14
JSCA International Stadium

In [11]:
# Merge datasets
merged_data = pd.merge(deliveries, matches, left_on="match_id", right_on="id")

In [12]:
# Calculate total runs scored by each team in each match
team_runs = deliveries.groupby(["match_id", "batting_team"])["total_runs"].sum().reset_index()

# Calculate average runs in the last 5 matches for each team
team_runs["avg_runs_last_5"] = team_runs.groupby("batting_team")["total_runs"].transform(
    lambda x: x.rolling(window=5, min_periods=1).mean()
)

# Merge this back into the matches DataFrame
matches = pd.merge(matches, team_runs[["match_id", "batting_team", "avg_runs_last_5"]], 
                   left_on="id", right_on="match_id", how="left")

In [13]:
from sklearn.preprocessing import LabelEncoder

# Encode teams, venues, etc.
label_encoders = {}
for column in ["toss_winner", "toss_decision","winner"]:
    le = LabelEncoder()
    matches[column] = le.fit_transform(matches[column])
    label_encoders[column] = le

In [14]:
# Features
features = ["team1", "team2", "venue", "toss_winner", "toss_decision", "avg_runs_last_5"]
X = matches[features]

# Target
y = matches["winner"]

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rfc_model = RandomForestClassifier(random_state=42)

# Train the model
rfc_model.fit(X_train, y_train)

In [17]:
from sklearn.metrics import accuracy_score
# Predict on the test set
rfc_y_pred = rfc_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, rfc_y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.67


In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn_model= KNeighborsClassifier()
knn_model.fit(X_train, y_train)

knn_y_pred = knn_model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, knn_y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.34


In [19]:
import joblib

# Save the model and encoders
joblib.dump({
    "model": rfc_model,  # Your trained model
    "ordinal_encoder": ord_encoder,  # Your OrdinalEncoder
    "label_encoders": label_encoders,  # Any other encoders you used
}, "ipl_match_predictor.pkl")

['ipl_match_predictor.pkl']

## Deep Learning

In [20]:
def calculate_head_to_head_win_pct(row):
    t1, t2 = row["team1"], row["team2"]
    matches_team1 = matches[((matches["team1"] == t1) & (matches["team2"] == t2)) |
                             ((matches["team1"] == t2) & (matches["team2"] == t1))]
    wins_team1 = matches_team1["winner"].eq(t1).sum()
    total_matches = len(matches_team1)
    return wins_team1 / total_matches if total_matches > 0 else 0.5

# Apply head-to-head win percentage
matches["head_to_head_win_pct"] = matches.apply(calculate_head_to_head_win_pct, axis=1)

In [21]:
venue_win_pct = matches.groupby(["venue", "winner"]).size().unstack(fill_value=0)
venue_win_pct["venue_win_pct"] = venue_win_pct.div(venue_win_pct.sum(axis=1), axis=0).max(axis=1)
venue_win_pct = venue_win_pct.reset_index()[["venue", "venue_win_pct"]]

# Merge with matches ensuring correct columns
matches = pd.merge(matches, venue_win_pct[["venue", "venue_win_pct"]], on="venue", how="left")

# Fill missing values with 0.5
matches["venue_win_pct"] = matches["venue_win_pct"].fillna(0.5)

In [22]:
# Calculate Toss Win Percentage
toss_win_pct = matches.groupby("toss_winner")["winner"].apply(lambda x: (x == x.name).sum() / len(x)).reset_index(name="toss_win_pct")

# Merge toss win percentage with matches
matches = pd.merge(
    matches, 
    toss_win_pct[["toss_winner", "toss_win_pct"]], 
    on="toss_winner", 
    how="left",
    suffixes=("", "_y")
)

# Fill missing values with 0.5
matches["toss_win_pct"] = matches["toss_win_pct"].fillna(0.5)

In [23]:
def calculate_recent_win_pct(team, match_id):
    recent_matches = matches[(matches["id"] < match_id) & ((matches["team1"] == team) | (matches["team2"] == team))].tail(10)
    if len(recent_matches) == 0:
        return 0.5
    weights = np.linspace(1, 2, len(recent_matches))  # Higher weight for recent matches
    wins = recent_matches["winner"].eq(team).values
    weighted_win_pct = np.sum(weights * wins) / np.sum(weights)
    return weighted_win_pct

matches["team1_recent_form"] = matches.apply(lambda x: calculate_recent_win_pct(x["team1"], x["id"]), axis=1)
matches["team2_recent_form"] = matches.apply(lambda x: calculate_recent_win_pct(x["team2"], x["id"]), axis=1)

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
matches["avg_runs_last_5"] = scaler.fit_transform(matches[["avg_runs_last_5"]])

In [25]:
from category_encoders import TargetEncoder
# Apply Target Encoding to Team and Venue
target_enc = TargetEncoder(cols=["team1", "team2", "venue", "toss_winner", "toss_decision"])
matches[["team1", "team2", "venue", "toss_winner", "toss_decision"]] = target_enc.fit_transform(matches[["team1", "team2", "venue", "toss_winner", "toss_decision"]], matches["winner"])

In [26]:
def add_interaction_features(matches):
    # Create interaction features
    matches['team_venue_interaction'] = matches['team1'] * matches['venue']
    matches['toss_team_interaction'] = matches['team1'] * matches['toss_winner']
    
    # Seasonal performance feature
    matches['season_performance'] = matches.groupby(['team1', 'season'])['winner'].transform('mean')
    
    return matches

matches = add_interaction_features(matches)

In [27]:
def create_advanced_feature_engineering(matches):
    # Time-based features
    matches['is_weekend'] = pd.to_datetime(matches['date']).dt.dayofweek.isin([5, 6]).astype(int)
    
    # Advanced team performance metrics
    def calculate_team_momentum(team, match_id):
        team_recent_matches = matches[
            ((matches['team1'] == team) | (matches['team2'] == team)) & 
            (matches['id'] < match_id)
        ].tail(15)
        
        if len(team_recent_matches) == 0:
            return 0.5
        
        # Weighted win calculation with exponential decay
        weights = np.exp(np.linspace(0, 2, len(team_recent_matches)))
        wins = team_recent_matches['winner'].eq(team).values
        
        return np.sum(weights * wins) / np.sum(weights)
    
    # Improved player impact feature
    def player_impact_feature(team, match_id):
        recent_matches = matches[
            ((matches['team1'] == team) | (matches['team2'] == team)) & 
            (matches['id'] < match_id)
        ].tail(10)
        
        if len(recent_matches) == 0:
            return 0.5
        
        # Calculate player of the match impact
        # Check if player of match is from the team
        player_of_match_wins = recent_matches[
            (recent_matches['team1'] == team) & 
            (recent_matches['player_of_match'] != '') | 
            (recent_matches['team2'] == team) & 
            (recent_matches['player_of_match'] != '')
        ].shape[0]
        
        return player_of_match_wins / len(recent_matches)
    
    # Add team momentum and player impact
    matches['team1_momentum'] = matches.apply(lambda x: calculate_team_momentum(x['team1'], x['id']), axis=1)
    matches['team2_momentum'] = matches.apply(lambda x: calculate_team_momentum(x['team2'], x['id']), axis=1)
    matches['team1_player_impact'] = matches.apply(lambda x: player_impact_feature(x['team1'], x['id']), axis=1)
    matches['team2_player_impact'] = matches.apply(lambda x: player_impact_feature(x['team2'], x['id']), axis=1)

     # Performance in similar conditions
    def venue_performance(team, venue, match_id):
        team_venue_matches = matches[
            ((matches['team1'] == team) | (matches['team2'] == team)) & 
            (matches['venue'] == venue) & 
            (matches['id'] < match_id)
        ]
        
        if len(team_venue_matches) == 0:
            return 0.5
        
        return (team_venue_matches['winner'].eq(team).sum() / len(team_venue_matches))
    
    matches['team1_venue_performance'] = matches.apply(lambda x: venue_performance(x['team1'], x['venue'], x['id']), axis=1)
    matches['team2_venue_performance'] = matches.apply(lambda x: venue_performance(x['team2'], x['venue'], x['id']), axis=1)
    
    return matches

def prepare_advanced_features(matches):
    # Base features
    base_features = [
        "team1", "team2", "venue", "toss_winner", "toss_decision", 
        "avg_runs_last_5", "head_to_head_win_pct", "venue_win_pct", 
        "toss_win_pct", "team1_recent_form", "team2_recent_form"
    ]
    
    # New engineered features
    new_features = [
        'is_weekend', 'team1_momentum', 'team2_momentum', 
        'team1_player_impact', 'team2_player_impact',
        'team1_venue_performance', 'team2_venue_performance'
    ]
    
    # Combine features
    features = base_features + new_features
    
    # Prepare data
    X = matches[features].astype(np.float32)
    y = matches["winner"]
    
    return X, y

In [28]:
# Apply advanced feature engineering
matches = create_advanced_feature_engineering(matches)

# Prepare features
X, y = prepare_advanced_features(matches)

In [29]:
# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

rf_model = RandomForestClassifier(
    n_estimators=200, 
    max_depth=15, 
    min_samples_split=5, 
    random_state=42
)

# Gradient Boosting
gb_model = GradientBoostingClassifier(
    n_estimators=200, 
    learning_rate=0.05, 
    max_depth=5, 
    random_state=42
)

# Support Vector Machine
svm_model = SVC(
    kernel='rbf', 
    probability=True, 
    random_state=42
)

In [32]:
rf_model.fit(X_train_scaled, y_train)

In [33]:
gb_model.fit(X_train_scaled, y_train)

In [34]:
svm_model.fit(X_train_scaled, y_train)

In [35]:
from sklearn.ensemble import VotingClassifier

voting_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('svm', svm_model)
    ],
    voting='soft'
)

In [36]:
# Fit Voting Classifier
voting_model.fit(X_train_scaled, y_train)

In [37]:
print("Model Performance Evaluation:\n")
print("Random Forest Accuracy:", rf_model.score(X_test_scaled, y_test))
print("Gradient Boosting Accuracy:", gb_model.score(X_test_scaled, y_test))
print("SVM Accuracy:", svm_model.score(X_test_scaled, y_test))
print("Ensemble Voting Classifier Accuracy:", voting_model.score(X_test_scaled, y_test))

Model Performance Evaluation:

Random Forest Accuracy: 0.7661691542288557
Gradient Boosting Accuracy: 0.845771144278607
SVM Accuracy: 0.5472636815920398
Ensemble Voting Classifier Accuracy: 0.8308457711442786


In [38]:
import joblib

# Save all necessary components in one file
joblib.dump({
    'model': gb_model,
    'scaler': scaler,
    'target_encoder': target_enc,
    'team_mapping': team_mapping,
    'venue_mapping': venue_mapping,
    'feature_names': X_train.columns.tolist(),  # Optional, for reference
}, 'ipl_match_predictor_gb.pkl', protocol=4)

print("Gradient Boosting model and all tools saved successfully!")

Gradient Boosting model and all tools saved successfully!
