In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv("./output/master_dataframe.csv", index_col=0)

# Feature engineering
df["player_runs"] = df["runs.batter"]
df["player_wickets"] = np.where(df["wicket.kind"].notna(), 1, 0)
df["player_fours"] = np.where(df["runs.batter"] == 4, 1, 0)
df["player_sixes"] = np.where(df["runs.batter"] == 6, 1, 0)

# One-hot encode the categorical variables
categorical_cols = ["team_1", "team_2", "venue","bowler","non_striker"]
df = pd.get_dummies(df, columns=categorical_cols)

# Group the data by player and calculate aggregated features
agg_func={
    col: "sum"
    for col in df.columns
    if col.startswith("team_") or col.startswith("venue_") or col in ["player_runs", "player_wickets", "player_fours", "player_sixes"]
}
agg_func["over"]="mean"

player_stats = (
    df.groupby("batter")[
        ["over","player_runs", "player_wickets", "player_fours", "player_sixes"]
        + [
            col
            for col in df.columns
            if col.startswith("team_") or col.startswith("venue_")
        ]
    ]
    .agg(agg_func)
    .reset_index()
)

In [29]:
player_stats.columns

Index(['batter', 'team_1_runs.total', 'team_2_runs.total', 'team_1_over',
       'team_2_over', 'team_1_runs.extras', 'team_2_runs.extras',
       'team_1_extras.legbyes', 'team_2_extras.legbyes', 'team_1_extras.wides',
       'team_2_extras.wides', 'team_1_extras.byes', 'team_2_extras.byes',
       'team_1_extras.noballs', 'team_2_extras.noballs', 'player_runs',
       'player_wickets', 'player_fours', 'player_sixes',
       'team_1_Chennai Super Kings', 'team_1_Deccan Chargers',
       'team_1_Delhi Capitals', 'team_1_Delhi Daredevils',
       'team_1_Gujarat Lions', 'team_1_Gujarat Titans',
       'team_1_Kings XI Punjab', 'team_1_Kochi Tuskers Kerala',
       'team_1_Kolkata Knight Riders', 'team_1_Lucknow Super Giants',
       'team_1_Mumbai Indians', 'team_1_Pune Warriors', 'team_1_Punjab Kings',
       'team_1_Rajasthan Royals', 'team_1_Rising Pune Supergiant',
       'team_1_Rising Pune Supergiants', 'team_1_Royal Challengers Bangalore',
       'team_1_Royal Challengers Bengalu

In [30]:
# Merge the aggregated player stats back into the main DataFrame
df = pd.merge(df, player_stats, on="batter", how="left",suffixes=["_original",""])

In [31]:
df.columns

Index(['date', 'match_number', 'innings', 'over_original', 'batter',
       'runs.batter', 'runs.extras', 'runs.total', 'extras.legbyes',
       'extras.wides',
       ...
       'venue_ShaheedVeerNarayanSinghInternationalStadium',
       'venue_SharjahCricketStadium', 'venue_SheikhZayedStadium',
       'venue_StGeorge'sPark', 'venue_SubrataRoySaharaStadium',
       'venue_SuperSportPark', 'venue_VidarbhaCricketAssociationStadium',
       'venue_WankhedeStadium', 'venue_ZayedCricketStadium', 'over'],
      dtype='object', length=1428)

In [32]:
"player_fours" in df.columns

True

In [33]:
df.fillna(0,inplace=True)

In [None]:
df['']

In [34]:
type(df['wicket.kind'][0])

int

In [None]:


# Define the target variable (fantasy points)
df["fantasy_points"] = (
    df["runs.batter"]
    + ((df["wicket.kind"]!=0) * -10)
    +(df['player_wickets'] * -1)
    + (df["player_fours"] * 4)
    + (df["player_sixes"] * 6)
)

# Split the data into training and testing sets
X = df[
    ["player_runs", "player_wickets", "player_fours", "player_sixes","over"]
    + [col for col in df.columns if col.startswith("team_") or col.startswith("venue_")]
]
y = df["fantasy_points"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)
print(f"Random Forest MSE: {rf_mse:.2f}")
print(f"Random Forest R-squared: {rf_r2:.2f}")

# gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
# gb_model.fit(X_train, y_train)
# gb_y_pred = gb_model.predict(X_test)
# gb_mse = mean_squared_error(y_test, gb_y_pred)
# gb_r2 = r2_score(y_test, gb_y_pred)
# print(f"Gradient Boosting MSE: {gb_mse:.2f}")
# print(f"Gradient Boosting R-squared: {gb_r2:.2f}")

# nn_model = MLPRegressor(hidden_layer_sizes=(64, 32), random_state=42)
# nn_model.fit(X_train, y_train)
# nn_y_pred = nn_model.predict(X_test)
# nn_mse = mean_squared_error(y_test, nn_y_pred)
# nn_r2 = r2_score(y_test, nn_y_pred)
# print(f"Neural Network MSE: {nn_mse:.2f}")
# print(f"Neural Network R-squared: {nn_r2:.2f}")

# Cross-Validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)
rf_scores = cross_val_score(rf_model, X, y, cv=kf, scoring="r2")
print(f"Random Forest R-squared (CV): {rf_scores.mean():.2f}")

# Player Insights Visualization
top_players = (
    df.groupby("batter")["fantasy_points"].mean().sort_values(ascending=False).head(20)
)
plt.figure(figsize=(12, 6))
top_players.plot(kind="bar")
plt.title("Top 20 Players by Average Fantasy Points")
plt.xlabel("Player")
plt.ylabel("Average Fantasy Points")
plt.show()

Random Forest MSE: 9.65
Random Forest R-squared: 1.00


In [15]:
df

Unnamed: 0,date,match_number,innings,over_original,batter,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,...,venue_ShaheedVeerNarayanSinghInternationalStadium,venue_SharjahCricketStadium,venue_SheikhZayedStadium,venue_StGeorge'sPark,venue_SubrataRoySaharaStadium,venue_SuperSportPark,venue_VidarbhaCricketAssociationStadium,venue_WankhedeStadium,venue_ZayedCricketStadium,fantasy_points
0,2008-04-18,1,1,0,SC Ganguly,0,1,1,1.0,0.0,...,0,0,0,47,114,20,0,26,0,800
1,2008-04-18,1,1,0,BB McCullum,0,0,0,0.0,0.0,...,21,33,59,39,80,112,0,167,0,1952
2,2008-04-18,1,1,0,BB McCullum,0,1,1,0.0,1.0,...,21,33,59,39,80,112,0,167,0,1952
3,2008-04-18,1,1,0,BB McCullum,0,0,0,0.0,0.0,...,21,33,59,39,80,112,0,167,0,1952
4,2008-04-18,1,1,0,BB McCullum,0,0,0,0.0,0.0,...,21,33,59,39,80,112,0,167,0,1952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,Final,2,9,SS Iyer,1,0,1,0.0,0.0,...,56,150,126,0,0,0,0,166,34,1763
260755,2024-05-26,Final,2,9,VR Iyer,1,0,1,0.0,0.0,...,0,127,0,0,0,0,0,141,76,851
260756,2024-05-26,Final,2,10,VR Iyer,1,0,1,0.0,0.0,...,0,127,0,0,0,0,0,141,76,851
260757,2024-05-26,Final,2,10,SS Iyer,1,0,1,0.0,0.0,...,56,150,126,0,0,0,0,166,34,1763
