In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [88]:
deliveries = pd.read_csv('IPL data (2008-2024)/deliveries.csv')
matches = pd.read_csv('IPL data (2008-2024)/matches.csv')

In [36]:
deliveries.head(3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bengaluru,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bengaluru,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bengaluru,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,


In [37]:
matches.head(3)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bengaluru,Kolkata Knight Riders,Royal Challengers Bengaluru,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Punjab Kings,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,bat,Delhi Capitals,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar


In [89]:
# Mapping dictionary for team name standardization
team_name_mapping = {
    "Kings XI Punjab": "Punjab Kings",
    "Delhi Daredevils": "Delhi Capitals",
    "Deccan Chargers": "Sunrisers Hyderabad",
    "Rising Pune Supergiant": "Rising Pune Supergiants",
    "Royal Challengers Bangalore": "Royal Challengers Bengaluru",
    "Gujarat Lions": "Gujarat Titans"
}

# Apply the mapping to both matches and deliveries datasets
matches["team1"] = matches["team1"].replace(team_name_mapping)
matches["team2"] = matches["team2"].replace(team_name_mapping)
matches["toss_winner"] = matches["toss_winner"].replace(team_name_mapping)
matches["winner"] = matches["winner"].replace(team_name_mapping)

deliveries["batting_team"] = deliveries["batting_team"].replace(team_name_mapping)
deliveries["bowling_team"] = deliveries["bowling_team"].replace(team_name_mapping)

In [90]:
teams_to_drop = ["Kochi Tuskers Kerala", "Pune Warriors", "Rising Pune Supergiant", 'Rising Pune Supergiants']
# Drop rows where team1 or team2 is in the list of teams to drop
matches = matches[~matches["team1"].isin(teams_to_drop) & ~matches["team2"].isin(teams_to_drop)]
deliveries = deliveries[~deliveries["batting_team"].isin(teams_to_drop) & ~deliveries["bowling_team"].isin(teams_to_drop)]

In [91]:
# Check unique team names in matches
print("Unique teams in matches:")
print(matches["team1"].unique())
print(matches["team2"].unique())

# Check unique team names in deliveries
print("Unique teams in deliveries:")
print(deliveries["batting_team"].unique())
print(deliveries["bowling_team"].unique())

Unique teams in matches:
['Royal Challengers Bengaluru' 'Punjab Kings' 'Delhi Capitals'
 'Mumbai Indians' 'Kolkata Knight Riders' 'Rajasthan Royals'
 'Sunrisers Hyderabad' 'Chennai Super Kings' 'Gujarat Titans'
 'Lucknow Super Giants']
['Kolkata Knight Riders' 'Chennai Super Kings' 'Rajasthan Royals'
 'Royal Challengers Bengaluru' 'Sunrisers Hyderabad' 'Punjab Kings'
 'Delhi Capitals' 'Mumbai Indians' 'Gujarat Titans' 'Lucknow Super Giants']
Unique teams in deliveries:
['Kolkata Knight Riders' 'Royal Challengers Bengaluru'
 'Chennai Super Kings' 'Punjab Kings' 'Rajasthan Royals' 'Delhi Capitals'
 'Mumbai Indians' 'Sunrisers Hyderabad' 'Gujarat Titans'
 'Lucknow Super Giants']
['Royal Challengers Bengaluru' 'Kolkata Knight Riders' 'Punjab Kings'
 'Chennai Super Kings' 'Delhi Capitals' 'Rajasthan Royals'
 'Mumbai Indians' 'Sunrisers Hyderabad' 'Gujarat Titans'
 'Lucknow Super Giants']


In [92]:
venue_mapping = {
    "Arun Jaitley Stadium, Delhi": "Arun Jaitley Stadium",
    "Brabourne Stadium, Mumbai": "Brabourne Stadium",
    "Dr DY Patil Sports Academy, Mumbai": "Dr DY Patil Sports Academy",
    "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam": "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium",
    "Eden Gardens, Kolkata": "Eden Gardens",
    "M Chinnaswamy Stadium, Bengaluru": "M Chinnaswamy Stadium",
    "M.Chinnaswamy Stadium": "M Chinnaswamy Stadium",  # Fix for inconsistent naming
    "MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium",
    "MA Chidambaram Stadium, Chepauk, Chennai": "MA Chidambaram Stadium",
    "Maharashtra Cricket Association Stadium, Pune": "Maharashtra Cricket Association Stadium",
    "Punjab Cricket Association IS Bindra Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
    "Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh": "Punjab Cricket Association IS Bindra Stadium",
    "Punjab Cricket Association Stadium, Mohali": "Punjab Cricket Association IS Bindra Stadium",
    "Rajiv Gandhi International Stadium, Uppal": "Rajiv Gandhi International Stadium",
    "Rajiv Gandhi International Stadium, Uppal, Hyderabad": "Rajiv Gandhi International Stadium",
    "Sawai Mansingh Stadium, Jaipur": "Sawai Mansingh Stadium",
    "Wankhede Stadium, Mumbai": "Wankhede Stadium",
    "Himachal Pradesh Cricket Association Stadium": "Himachal Pradesh Cricket Association Stadium, Dharamsala",
}

# Apply the mapping to the venue column
matches["venue"] = matches["venue"].replace(venue_mapping)

In [93]:
from sklearn.preprocessing import OrdinalEncoder

# Columns to encode
columns_to_encode = ["team1", "team2", "venue"]

# Initialize the OrdinalEncoder
ord_encoder = OrdinalEncoder()

# Fit and transform the selected columns
matches[columns_to_encode] = ord_encoder.fit_transform(matches[columns_to_encode])

# Retrieve the mappings for each column
for i, column in enumerate(columns_to_encode):
    print(f"Mapping for {column}:")
    for original_value, encoded_value in zip(ord_encoder.categories_[i], range(len(ord_encoder.categories_[i]))):
        print(f"{original_value}: {encoded_value}")
    print()

Mapping for team1:
Chennai Super Kings: 0
Delhi Capitals: 1
Gujarat Titans: 2
Kolkata Knight Riders: 3
Lucknow Super Giants: 4
Mumbai Indians: 5
Punjab Kings: 6
Rajasthan Royals: 7
Royal Challengers Bengaluru: 8
Sunrisers Hyderabad: 9

Mapping for team2:
Chennai Super Kings: 0
Delhi Capitals: 1
Gujarat Titans: 2
Kolkata Knight Riders: 3
Lucknow Super Giants: 4
Mumbai Indians: 5
Punjab Kings: 6
Rajasthan Royals: 7
Royal Challengers Bengaluru: 8
Sunrisers Hyderabad: 9

Mapping for venue:
Arun Jaitley Stadium: 0
Barabati Stadium: 1
Barsapara Cricket Stadium, Guwahati: 2
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 3
Brabourne Stadium: 4
Buffalo Park: 5
De Beers Diamond Oval: 6
Dr DY Patil Sports Academy: 7
Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium: 8
Dubai International Cricket Stadium: 9
Eden Gardens: 10
Feroz Shah Kotla: 11
Green Park: 12
Himachal Pradesh Cricket Association Stadium, Dharamsala: 13
Holkar Cricket Stadium: 14
JSCA International Stadium

In [94]:
# Merge datasets
merged_data = pd.merge(deliveries, matches, left_on="match_id", right_on="id")

In [95]:
# Calculate total runs scored by each team in each match
team_runs = deliveries.groupby(["match_id", "batting_team"])["total_runs"].sum().reset_index()

# Calculate average runs in the last 5 matches for each team
team_runs["avg_runs_last_5"] = team_runs.groupby("batting_team")["total_runs"].transform(
    lambda x: x.rolling(window=5, min_periods=1).mean()
)

# Merge this back into the matches DataFrame
matches = pd.merge(matches, team_runs[["match_id", "batting_team", "avg_runs_last_5"]], 
                   left_on="id", right_on="match_id", how="left")

In [96]:
from sklearn.preprocessing import LabelEncoder

# Encode teams, venues, etc.
label_encoders = {}
for column in ["toss_winner", "toss_decision","winner"]:
    le = LabelEncoder()
    matches[column] = le.fit_transform(matches[column])
    label_encoders[column] = le

In [97]:
# Features
features = ["team1", "team2", "venue", "toss_winner", "toss_decision", "avg_runs_last_5"]
X = matches[features]

# Target
y = matches["winner"]

In [98]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rfc_model = RandomForestClassifier(random_state=42)

# Train the model
rfc_model.fit(X_train, y_train)

In [102]:
from sklearn.metrics import accuracy_score
# Predict on the test set
rfc_y_pred = rfc_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, rfc_y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.67


In [103]:
from sklearn.neighbors import KNeighborsClassifier

knn_model= KNeighborsClassifier()
knn_model.fit(X_train, y_train)

knn_y_pred = knn_model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, knn_y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.34


In [None]:
import joblib

# Save the model and encoders
joblib.dump({
    "model": rfc_model,  # Your trained model
    "ordinal_encoder": ord_encoder,  # Your OrdinalEncoder
    "label_encoders": label_encoders,  # Any other encoders you used
}, "ipl_match_predictor.pkl")

['ipl_match_predictor.pkl']