In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error


In [3]:
# Load your ball-by-ball IPL data
df = pd.read_csv("/Users/pavanbandaru/Downloads/Dataset_Final/ipl_male_csv/all_matches 10.39.09 AM 10.39.09 AM.csv")  # Replace with your actual file path

# Preview the data
df.head()


  df = pd.read_csv("/Users/pavanbandaru/Downloads/Dataset_Final/ipl_male_csv/all_matches 10.39.09 AM 10.39.09 AM.csv")  # Replace with your actual file path


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,18/04/08,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,
1,335982,2007/08,18/04/08,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
2,335982,2007/08,18/04/08,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1,1.0,,,,,,,,
3,335982,2007/08,18/04/08,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
4,335982,2007/08,18/04/08,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,


In [4]:
# Ensure runs column is clean
df['runs'] = df['runs_off_bat'].fillna(0)

# Group by match, batsman and basic metadata
batsman_match_df = df.groupby(['match_id', 'striker', 'venue', 'innings', 'batting_team', 'bowling_team']).agg({
    'runs': 'sum',
    'ball': 'count'
}).reset_index()

# Rename for clarity
batsman_match_df.rename(columns={'striker': 'batsman_name', 'ball': 'balls_faced'}, inplace=True)

# View processed data
batsman_match_df.head()


Unnamed: 0,match_id,batsman_name,venue,innings,batting_team,bowling_team,runs,balls_faced
0,335982,AA Noffke,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,9,12
1,335982,B Akhil,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,2
2,335982,BB McCullum,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,158,77
3,335982,CL White,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,6,10
4,335982,DJ Hussey,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,12,12


In [5]:
# Batsman's overall average
batsman_avg = batsman_match_df.groupby('batsman_name')['runs'].mean().reset_index()
batsman_avg.rename(columns={'runs': 'batsman_current_avg'}, inplace=True)

# Venue average runs
venue_avg = batsman_match_df.groupby('venue')['runs'].mean().reset_index()
venue_avg.rename(columns={'runs': 'venue_avg'}, inplace=True)

# Merge both
batsman_match_df = batsman_match_df.merge(batsman_avg, on='batsman_name', how='left')
batsman_match_df = batsman_match_df.merge(venue_avg, on='venue', how='left')

batsman_match_df.head()


Unnamed: 0,match_id,batsman_name,venue,innings,batting_team,bowling_team,runs,balls_faced,batsman_current_avg,venue_avg
0,335982,AA Noffke,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,9,12,9.0,19.853734
1,335982,B Akhil,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,2,6.909091,19.853734
2,335982,BB McCullum,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,158,77,25.963964,19.853734
3,335982,CL White,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,6,10,21.108696,19.853734
4,335982,DJ Hussey,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,12,12,21.672131,19.853734


In [9]:
# Step 1: Create a helper to extract batting position
def get_batsman_positions(df):
    position_data = []

    for (match_id, innings), group in df.groupby(['match_id', 'innings']):
        unique_batsmen = list(pd.unique(group['striker']))
        for pos, batsman in enumerate(unique_batsmen, start=1):
            position_data.append([match_id, innings, batsman, pos])

    position_df = pd.DataFrame(position_data, columns=['match_id', 'innings', 'batsman_name', 'batsman_position'])
    return position_df

# Step 2: Generate position DataFrame
position_df = get_batsman_positions(df)

# Step 3: Merge with match-level batsman data
batsman_match_df = batsman_match_df.merge(position_df, on=['match_id', 'innings', 'batsman_name'], how='left')

# Step 4: Fill missing positions with 11 (default lower order)
batsman_match_df['batsman_position'] = batsman_match_df['batsman_position'].fillna(11).astype(int)

# Preview
batsman_match_df.head()


Unnamed: 0,match_id,batsman_name,venue,innings,batting_team,bowling_team,runs,balls_faced,batsman_current_avg,venue_avg,batsman_position
0,335982,AA Noffke,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,9,12,9.0,19.853734,8
1,335982,B Akhil,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,0,2,6.909091,19.853734,7
2,335982,BB McCullum,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,158,77,25.963964,19.853734,2
3,335982,CL White,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,6,10,21.108696,19.853734,5
4,335982,DJ Hussey,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,12,12,21.672131,19.853734,4


In [10]:
label_cols = ['venue', 'batting_team', 'bowling_team', 'batsman_name']
label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    batsman_match_df[col] = le.fit_transform(batsman_match_df[col])
    label_encoders[col] = le

batsman_match_df.head()


Unnamed: 0,match_id,batsman_name,venue,innings,batting_team,bowling_team,runs,balls_faced,batsman_current_avg,venue_avg,batsman_position
0,335982,25,23,2,16,8,9,12,9.0,19.853734,8
1,335982,90,23,2,16,8,0,2,6.909091,19.853734,7
2,335982,102,23,1,8,16,158,77,25.963964,19.853734,2
3,335982,132,23,2,16,8,6,10,21.108696,19.853734,5
4,335982,157,23,1,8,16,12,12,21.672131,19.853734,4


In [11]:
features = ['venue', 'innings', 'batting_team', 'bowling_team',
            'batsman_name', 'batsman_position', 'venue_avg', 'batsman_current_avg']

X = batsman_match_df[features]
y = batsman_match_df['runs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Validation MAE: {mae:.2f}")


Validation MAE: 15.32


In [12]:
def predict_innings_score(players_list, model, label_encoders):
    preds = []
    for player in players_list:
        player_input = player.copy()
        for col in ['venue', 'batting_team', 'bowling_team', 'batsman_name']:
            player_input[col] = label_encoders[col].transform([player_input[col]])[0]

        input_df = pd.DataFrame([player_input])
        pred = model.predict(input_df)[0]
        preds.append((player['batsman_name'], pred))

    total = sum([p[1] for p in preds])
    return total, preds


In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

# 1. Load ball-by-ball data
df = pd.read_csv('/Users/pavanbandaru/Downloads/Dataset_Final/datasets/all_matches copy.csv')

# 2. Precompute player-innings aggregates
# Aggregate runs by player per match and innings
df_player = (
    df.groupby(['match_id','venue','innings','batting_team','bowling_team','striker'])['runs_off_bat']
      .agg(['sum','mean','std'])
      .reset_index()
      .rename(columns={'sum':'player_runs','mean':'player_mean','std':'player_std'})
)

# Add recent form: average of last 5 innings per player
player_history = []
for player, g in df_player.groupby('striker'):
    g = g.sort_values('match_id')
    g['recent_form'] = g['player_runs'].rolling(5, min_periods=1).mean().shift(1)
    player_history.append(g)

df_player = pd.concat(player_history, ignore_index=True)
# Fill NaNs
df_player[['player_std','recent_form']] = df_player[['player_std','recent_form']].fillna(0)

# 3. Compute venue-average innings total
df_total = (
    df.groupby(['match_id','venue','innings'])['runs_off_bat']
      .sum()
      .reset_index()
      .rename(columns={'runs_off_bat':'innings_total'})
)
venue_avg = (
    df_total.groupby('venue')['innings_total']
      .mean()
      .reset_index()
      .rename(columns={'innings_total':'venue_avg'})
)
# Merge venue_avg into both df_player and df_total

df_player = df_player.merge(
    df_total[['match_id','innings','venue']],
    on=['match_id','innings','venue'], how='left'
).merge(venue_avg, on='venue', how='left')

df_total = df_total.merge(venue_avg, on='venue', how='left')

# 4. Train individual player-run predictor
features_player = ['venue_avg','player_mean','player_std','recent_form']
df_p = df_player.copy()

# Split matches to avoid leakage
match_ids = df_p['match_id'].unique()
train_ids, test_ids = train_test_split(match_ids, test_size=0.2, random_state=42)
train = df_p[df_p['match_id'].isin(train_ids)]
test  = df_p[df_p['match_id'].isin(test_ids)]

# Prepare DMatrix
dtrain = xgb.DMatrix(train[features_player], label=train['player_runs'])
dtest  = xgb.DMatrix(test[features_player], label=test['player_runs'])
params = {'objective':'reg:squarederror','learning_rate':0.05,'max_depth':6,'seed':42}

# Train
model_player = xgb.train(
    params, dtrain, num_boost_round=300,
    evals=[(dtest,'eval')], early_stopping_rounds=20, verbose_eval=False
)
# Predict & metrics
pred_train = model_player.predict(dtrain)
pred_test  = model_player.predict(dtest)
print("Player Model Metrics:")
print(" Train R2:", r2_score(train['player_runs'], pred_train))
print(" Train MAE:", mean_absolute_error(train['player_runs'], pred_train))
print(" Train MSE:", mean_squared_error(train['player_runs'], pred_train))
print(" Test  R2:", r2_score(test['player_runs'], pred_test))
print(" Test  MAE:", mean_absolute_error(test['player_runs'], pred_test))
print(" Test  MSE:", mean_squared_error(test['player_runs'], pred_test))

# Save model
model_player.save_model('model_player.json')

# 5. Prepare aggregator training data
agg_rows = []
for mid, group in df_player.groupby('match_id'):
    top11 = group.nlargest(11, 'player_runs')
    preds = model_player.predict(xgb.DMatrix(top11[features_player]))
    row = {f'pred_{i+1}': preds[i] for i in range(len(preds))}
    va = top11['venue_avg'].iloc[0]
    inn = top11['innings'].iloc[0]
    total = df_total.loc[(df_total['match_id']==mid)&(df_total['innings']==inn),'innings_total'].values[0]
    row.update({'venue_avg': va, 'innings': inn, 'total': total})
    agg_rows.append(row)

df_agg = pd.DataFrame(agg_rows)
features_agg = [f'pred_{i+1}' for i in range(11)] + ['venue_avg','innings']
Xagg = df_agg[features_agg]
yagg = df_agg['total']

# Split aggregator data
Xtr_agg, Xte_agg, ytr_agg, yte_agg = train_test_split(Xagg, yagg, test_size=0.2, random_state=42)

dtrain_agg = xgb.DMatrix(Xtr_agg, label=ytr_agg)
dtest_agg  = xgb.DMatrix(Xte_agg, label=yte_agg)

# Train aggregator
model_agg = xgb.train(
    params, dtrain_agg, num_boost_round=200,
    evals=[(dtest_agg,'eval')], early_stopping_rounds=20, verbose_eval=False
)
# Predict & metrics
pred_train_agg = model_agg.predict(dtrain_agg)
pred_test_agg  = model_agg.predict(dtest_agg)
print("\nAggregator Model Metrics:")
print(" Train R2:", r2_score(ytr_agg, pred_train_agg))
print(" Train MAE:", mean_absolute_error(ytr_agg, pred_train_agg))
print(" Train MSE:", mean_squared_error(ytr_agg, pred_train_agg))
print(" Test  R2:", r2_score(yte_agg, pred_test_agg))
print(" Test  MAE:", mean_absolute_error(yte_agg, pred_test_agg))
print(" Test  MSE:", mean_squared_error(yte_agg, pred_test_agg))

# Save aggregator model
model_agg.save_model('model_agg.json')

# 6. Prediction function
def predict_innings(players, venue, innings, batting_team, bowling_team):  # Added batting_team and bowling_team
    feats = []
    for p in players:
        hist = df_player[df_player['striker'] == p].sort_values('match_id')
        pm = hist['player_mean'].iloc[-1]
        ps = hist['player_std'].iloc[-1]
        rf = hist['recent_form'].iloc[-1]
        va = venue_avg.loc[venue_avg['venue'] == venue, 'venue_avg'].values[0]
        feats.append([va, pm, ps, rf])
    df_feat = pd.DataFrame(feats, columns=features_player)
    pred_scores = model_player.predict(xgb.DMatrix(df_feat))
    agg_input = {f'pred_{i+1}': pred_scores[i] for i in range(len(pred_scores))}
    agg_input.update({'venue_avg': va, 'innings': innings, 'batting_team': batting_team, 'bowling_team': bowling_team})
    df_row = pd.DataFrame([agg_input])[features_agg]
    total_pred = model_agg.predict(xgb.DMatrix(df_row))[0]
    return total_pred

# Example usage:
# print(predict_innings(['Player1', 'Player2', ..., 'Player11'], 'Eden Gardens', 'KKR', 'MI', 1))

Player Model Metrics:
 Train R2: 0.6982037425041199
 Train MAE: 7.350256343249168
 Train MSE: 139.627847310032
 Test  R2: 0.5911485552787781
 Test  MAE: 8.192029822863029
 Test  MSE: 173.84761901470455

Aggregator Model Metrics:
 Train R2: 0.9581966400146484
 Train MAE: 4.716118381447988
 Train MSE: 41.824412318561684
 Test  R2: 0.6291456818580627
 Test  MAE: 14.61208298217216
 Test  MSE: 350.58723924330917


In [41]:
# Example prediction block
players = ['Q de Kock','SP Narine','AM Rahane','VR Iyer','RK Singh','AD Russell',
           'Ramandeep Singh','Harshit Rana','A Raghuvanshi','SH Johnson','CV Varun']
venue = 'Eden Gardens'
innings = 1
batting_team = 'Kolkata Knight Riders'
bowling_team = 'Royal Challengers Bangalore'

predicted_score = int(predict_innings(players, venue, innings, batting_team, bowling_team))  
print(f"Predicted innings score : {predicted_score}")

Predicted innings score : 151


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# 1. Load ball-by-ball data
df = pd.read_csv('/Users/pavanbandaru/Downloads/Dataset_Final/datasets/all_matches copy.csv')

# 2. Precompute player-innings aggregates
df_player = (
    df.groupby(['match_id','venue','innings','batting_team','bowling_team','striker'])['runs_off_bat']
      .agg(['sum','mean','std'])
      .reset_index()
      .rename(columns={'sum':'player_runs','mean':'player_mean','std':'player_std'})
)

# Add recent form: average of last 5 innings per player
player_history = []
for player, g in df_player.groupby('striker'):
    g = g.sort_values('match_id')
    g['recent_form'] = g['player_runs'].rolling(5, min_periods=1).mean().shift(1)
    player_history.append(g)

df_player = pd.concat(player_history, ignore_index=True)
df_player[['player_std','recent_form']] = df_player[['player_std','recent_form']].fillna(0)

# 3. Compute venue-average innings total
df_total = (
    df.groupby(['match_id','venue','innings'])['runs_off_bat']
      .sum()
      .reset_index()
      .rename(columns={'runs_off_bat':'innings_total'})
)
venue_avg = (
    df_total.groupby('venue')['innings_total']
      .mean()
      .reset_index()
      .rename(columns={'innings_total':'venue_avg'})
)

df_player = df_player.merge(df_total[['match_id','innings','venue']], on=['match_id','innings','venue'], how='left')
df_player = df_player.merge(venue_avg, on='venue', how='left')
df_total = df_total.merge(venue_avg, on='venue', how='left')

# 4. Train individual player-run predictor
features_player = ['venue_avg','player_mean','player_std','recent_form']
df_p = df_player.copy()

match_ids = df_p['match_id'].unique()
train_ids, test_ids = train_test_split(match_ids, test_size=0.2, random_state=42)
train = df_p[df_p['match_id'].isin(train_ids)]
test  = df_p[df_p['match_id'].isin(test_ids)]

model_player = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.05,
    max_depth=6,
    n_estimators=300,
    early_stopping_rounds=20,
    random_state=42
)

model_player.fit(
    train[features_player], train['player_runs'],
    eval_set=[(test[features_player], test['player_runs'])],
    verbose=False
)

# Evaluate Player Model
pred_train = model_player.predict(train[features_player])
pred_test = model_player.predict(test[features_player])
print("Improved Player Model Metrics:")
print(" Train R2:", r2_score(train['player_runs'], pred_train))
print(" Test  R2:", r2_score(test['player_runs'], pred_test))
print(" Test  MAE:", mean_absolute_error(test['player_runs'], pred_test))

# 5. Prepare aggregator training data
agg_rows = []
for mid, group in df_player.groupby('match_id'):
    top11 = group.nlargest(11, 'player_runs')
    preds = model_player.predict(top11[features_player])
    row = {f'pred_{i+1}': preds[i] for i in range(len(preds))}
    va = top11['venue_avg'].iloc[0]
    inn = top11['innings'].iloc[0]
    total = df_total.loc[(df_total['match_id']==mid)&(df_total['innings']==inn),'innings_total'].values[0]
    row.update({'venue_avg': va, 'innings': inn, 'total': total})
    agg_rows.append(row)

df_agg = pd.DataFrame(agg_rows)
features_agg = [f'pred_{i+1}' for i in range(11)] + ['venue_avg','innings']
Xagg = df_agg[features_agg]
yagg = df_agg['total']

Xtr_agg, Xte_agg, ytr_agg, yte_agg = train_test_split(Xagg, yagg, test_size=0.2, random_state=42)

model_agg = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    early_stopping_rounds=20,
    random_state=42
)

model_agg.fit(
    Xtr_agg, ytr_agg,
    eval_set=[(Xte_agg, yte_agg)],
    verbose=False
)

# Evaluate Aggregator Model
pred_train_agg = model_agg.predict(Xtr_agg)
pred_test_agg = model_agg.predict(Xte_agg)
print("\nImproved Aggregator Model Metrics:")
print(" Train R2:", r2_score(ytr_agg, pred_train_agg))
print(" Test  R2:", r2_score(yte_agg, pred_test_agg))
print(" Test  MAE:", mean_absolute_error(yte_agg, pred_test_agg))

# 6. Prediction function
def predict_innings(players, venue, innings):
    feats = []
    for p in players:
        hist = df_player[df_player['striker']==p].sort_values('match_id')
        if hist.empty:
            feats.append([venue_avg['venue_avg'].mean(), 0, 0, 0])
        else:
            pm = hist['player_mean'].iloc[-1]
            ps = hist['player_std'].iloc[-1]
            rf = hist['recent_form'].iloc[-1]
            va = venue_avg.loc[venue_avg['venue']==venue,'venue_avg'].values[0]
            feats.append([va, pm, ps, rf])
    df_feat = pd.DataFrame(feats, columns=features_player)
    pred_scores = model_player.predict(df_feat)
    agg_input = {f'pred_{i+1}': pred_scores[i] for i in range(len(pred_scores))}
    agg_input.update({'venue_avg': va, 'innings': innings})
    df_row = pd.DataFrame([agg_input])[features_agg]
    total_pred = model_agg.predict(df_row)[0]
    return total_pred

# Example usage (replace with real player names):
# print(predict_innings(['Player1', ..., 'Player11'], 'Eden Gardens', 1))


Improved Player Model Metrics:
 Train R2: 0.6882587671279907
 Test  R2: 0.5922552943229675
 Test  MAE: 8.19898211169219

Improved Aggregator Model Metrics:
 Train R2: 0.9456862807273865
 Test  R2: 0.5993441939353943
 Test  MAE: 15.319839407864226


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

# 1. Load ball-by-ball data
df = pd.read_csv('/Users/pavanbandaru/Downloads/Dataset_Final/datasets/all_matches copy.csv')

# 2. Precompute player-innings aggregates
# Aggregate runs by player per match and innings
df_player = (
    df.groupby(['match_id','venue','innings','batting_team','bowling_team','striker'])['runs_off_bat']
      .agg(['sum','mean','std'])
      .reset_index()
      .rename(columns={'sum':'player_runs','mean':'player_mean','std':'player_std'})
)

# Add recent form: average of last 5 innings per player
player_history = []
for player, g in df_player.groupby('striker'):
    g = g.sort_values('match_id')
    g['recent_form'] = g['player_runs'].rolling(5, min_periods=1).mean().shift(1)
    player_history.append(g)

df_player = pd.concat(player_history, ignore_index=True)
# Fill NaNs
df_player[['player_std','recent_form']] = df_player[['player_std','recent_form']].fillna(0)

# 3. Compute venue-average innings total
df_total = (
    df.groupby(['match_id','venue','innings'])['runs_off_bat']
      .sum()
      .reset_index()
      .rename(columns={'runs_off_bat':'innings_total'})
)
venue_avg = (
    df_total.groupby('venue')['innings_total']
      .mean()
      .reset_index()
      .rename(columns={'innings_total':'venue_avg'})
)
# Merge venue_avg into both df_player and df_total

df_player = df_player.merge(
    df_total[['match_id','innings','venue']],
    on=['match_id','innings','venue'], how='left'
).merge(venue_avg, on='venue', how='left')

df_total = df_total.merge(venue_avg, on='venue', how='left')

# 4. Train individual player-run predictor
features_player = ['venue_avg','player_mean','player_std','recent_form']
df_p = df_player.copy()

# Split matches to avoid leakage
match_ids = df_p['match_id'].unique()
train_ids, test_ids = train_test_split(match_ids, test_size=0.2, random_state=42)
train = df_p[df_p['match_id'].isin(train_ids)]
test  = df_p[df_p['match_id'].isin(test_ids)]

# Prepare DMatrix
dtrain = xgb.DMatrix(train[features_player], label=train['player_runs'])
dtest  = xgb.DMatrix(test[features_player], label=test['player_runs'])
params = {'objective':'reg:squarederror','learning_rate':0.05,'max_depth':6,'seed':42}

# Train
model_player = xgb.train(
    params, dtrain, num_boost_round=300,
    evals=[(dtest,'eval')], early_stopping_rounds=20, verbose_eval=False
)
# Predict & metrics
pred_train = model_player.predict(dtrain)
pred_test  = model_player.predict(dtest)
print("Player Model Metrics:")
print(" Train R2:", r2_score(train['player_runs'], pred_train))
print(" Train MAE:", mean_absolute_error(train['player_runs'], pred_train))
print(" Test  R2:", r2_score(test['player_runs'], pred_test))
print(" Test  MAE:", mean_absolute_error(test['player_runs'], pred_test))

# Save model
model_player.save_model('model_player.json')

# 5. Prepare aggregator training data
agg_rows = []
for mid, group in df_player.groupby('match_id'):
    top11 = group.nlargest(11, 'player_runs')
    preds = model_player.predict(xgb.DMatrix(top11[features_player]))
    row = {f'pred_{i+1}': preds[i] for i in range(len(preds))}
    for i in range(len(preds), 11):  # Fill missing pred_i with 0 if < 11
        row[f'pred_{i+1}'] = 0
    va = top11['venue_avg'].iloc[0]
    inn = top11['innings'].iloc[0]
    total = df_total.loc[(df_total['match_id']==mid)&(df_total['innings']==inn),'innings_total'].values[0]
    row.update({'venue_avg': va, 'innings': inn, 'total': total})
    agg_rows.append(row)

df_agg = pd.DataFrame(agg_rows)
features_agg = [f'pred_{i+1}' for i in range(11)] + ['venue_avg','innings']
Xagg = df_agg[features_agg]
yagg = df_agg['total']

# Split aggregator data
Xtr_agg, Xte_agg, ytr_agg, yte_agg = train_test_split(Xagg, yagg, test_size=0.2, random_state=42)

dtrain_agg = xgb.DMatrix(Xtr_agg, label=ytr_agg)
dtest_agg  = xgb.DMatrix(Xte_agg, label=yte_agg)

# Train aggregator
model_agg = xgb.train(
    params, dtrain_agg, num_boost_round=200,
    evals=[(dtest_agg,'eval')], early_stopping_rounds=20, verbose_eval=False
)
# Predict & metrics
pred_train_agg = model_agg.predict(dtrain_agg)
pred_test_agg  = model_agg.predict(dtest_agg)
print("\nAggregator Model Metrics:")
print(" Train R2:", r2_score(ytr_agg, pred_train_agg))
print(" Train MAE:", mean_absolute_error(ytr_agg, pred_train_agg))
print(" Test  R2:", r2_score(yte_agg, pred_test_agg))
print(" Test  MAE:", mean_absolute_error(yte_agg, pred_test_agg))

# Save aggregator model
model_agg.save_model('model_agg.json')

# 6. Prediction function
def predict_innings(players, venue, innings):
    feats = []
    for p in players:
        hist = df_player[df_player['striker']==p].sort_values('match_id')
        pm = hist['player_mean'].iloc[-1]
        ps = hist['player_std'].iloc[-1]
        rf = hist['recent_form'].iloc[-1]
        va = venue_avg.loc[venue_avg['venue']==venue,'venue_avg'].values[0]
        feats.append([va, pm, ps, rf])
    df_feat = pd.DataFrame(feats, columns=features_player)
    pred_scores = model_player.predict(xgb.DMatrix(df_feat))
    agg_input = {f'pred_{i+1}': pred_scores[i] for i in range(len(pred_scores))}
    for i in range(len(pred_scores), 11):  # Fill missing preds
        agg_input[f'pred_{i+1}'] = 0
    agg_input.update({'venue_avg': va, 'innings': innings})
    df_row = pd.DataFrame([agg_input])[features_agg]
    total_pred = model_agg.predict(xgb.DMatrix(df_row))[0]
    return total_pred

# Example usage:
# print(predict_innings(['Player1', 'Player2', ..., 'Player5 to 11'], 'Eden Gardens', 1))


Player Model Metrics:
 Train R2: 0.6982037425041199
 Train MAE: 7.350256343249168
 Test  R2: 0.5911485552787781
 Test  MAE: 8.192029822863029

Aggregator Model Metrics:
 Train R2: 0.9682453274726868
 Train MAE: 4.0628772413349585
 Test  R2: 0.6051228642463684
 Test  MAE: 15.121268668675532


In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

# 1. Load ball-by-ball data
df = pd.read_csv('/Users/pavanbandaru/Downloads/Dataset_Final/datasets/all_matches copy.csv')

# 2. Precompute player-innings aggregates
df_player = (
    df.groupby(['match_id', 'venue', 'innings', 'batting_team', 'bowling_team', 'striker'])['runs_off_bat']
      .agg(['sum', 'mean', 'std'])
      .reset_index()
      .rename(columns={'sum': 'player_runs', 'mean': 'player_mean', 'std': 'player_std'})
)

# Add recent form: average of last 5 innings per player
player_history = []
for player, g in df_player.groupby('striker'):
    g = g.sort_values('match_id')
    g['recent_form'] = g['player_runs'].rolling(5, min_periods=1).mean().shift(1)
    player_history.append(g)

df_player = pd.concat(player_history, ignore_index=True)
df_player[['player_std', 'recent_form']] = df_player[['player_std', 'recent_form']].fillna(0)

# 3. Improved Venue-Innings Wise Average (dynamic over time)
df_total = (
    df.groupby(['match_id', 'venue', 'innings'])['runs_off_bat']
      .sum()
      .reset_index()
      .rename(columns={'runs_off_bat': 'innings_total'})
)
df_total = df_total.sort_values('match_id')

# Compute expanding average per venue and innings
venue_history = []
for (venue, inns), g in df_total.groupby(['venue', 'innings']):
    g = g.sort_values('match_id').copy()
    g['venue_avg'] = g['innings_total'].expanding().mean().shift(1).fillna(0)
    venue_history.append(g)

df_total = pd.concat(venue_history, ignore_index=True)

# Merge refined venue_avg into df_player
df_player = df_player.merge(
    df_total[['match_id', 'venue', 'innings', 'venue_avg']],
    on=['match_id', 'venue', 'innings'], how='left'
)

# 4. Train individual player-run predictor
features_player = ['venue_avg', 'player_mean', 'player_std', 'recent_form']
df_p = df_player.copy()

match_ids = df_p['match_id'].unique()
train_ids, test_ids = train_test_split(match_ids, test_size=0.2, random_state=42)
train = df_p[df_p['match_id'].isin(train_ids)]
test = df_p[df_p['match_id'].isin(test_ids)]

dtrain = xgb.DMatrix(train[features_player], label=train['player_runs'])
dtest = xgb.DMatrix(test[features_player], label=test['player_runs'])

params = {'objective': 'reg:squarederror', 'learning_rate': 0.05, 'max_depth': 6, 'seed': 42}

model_player = xgb.train(
    params, dtrain, num_boost_round=300,
    evals=[(dtest, 'eval')], early_stopping_rounds=20, verbose_eval=False
)

pred_train = model_player.predict(dtrain)
pred_test = model_player.predict(dtest)
print("Player Model Metrics:")
print(" Train R2:", r2_score(train['player_runs'], pred_train))
print(" Train MAE:", mean_absolute_error(train['player_runs'], pred_train))
print(" Test  R2:", r2_score(test['player_runs'], pred_test))
print(" Test  MAE:", mean_absolute_error(test['player_runs'], pred_test))

model_player.save_model('model_player.json')

# 5. Prepare aggregator training data
agg_rows = []
for mid, group in df_player.groupby('match_id'):
    for inn in [1, 2]:
        group_inn = group[group['innings'] == inn]
        if group_inn.empty:
            continue
        top11 = group_inn.nlargest(11, 'player_runs')
        preds = model_player.predict(xgb.DMatrix(top11[features_player]))
        row = {f'pred_{i+1}': preds[i] for i in range(len(preds))}
        for i in range(len(preds), 11):
            row[f'pred_{i+1}'] = 0
        va = top11['venue_avg'].iloc[0]
        total = df_total.loc[
            (df_total['match_id'] == mid) & (df_total['innings'] == inn),
            'innings_total'
        ].values[0]
        row.update({'venue_avg': va, 'innings': inn, 'total': total})
        agg_rows.append(row)

df_agg = pd.DataFrame(agg_rows)
features_agg = [f'pred_{i+1}' for i in range(11)] + ['venue_avg', 'innings']
Xagg = df_agg[features_agg]
yagg = df_agg['total']

Xtr_agg, Xte_agg, ytr_agg, yte_agg = train_test_split(Xagg, yagg, test_size=0.2, random_state=42)

dtrain_agg = xgb.DMatrix(Xtr_agg, label=ytr_agg)
dtest_agg = xgb.DMatrix(Xte_agg, label=yte_agg)

model_agg = xgb.train(
    params, dtrain_agg, num_boost_round=200,
    evals=[(dtest_agg, 'eval')], early_stopping_rounds=20, verbose_eval=False
)

pred_train_agg = model_agg.predict(dtrain_agg)
pred_test_agg = model_agg.predict(dtest_agg)
print("\nAggregator Model Metrics:")
print(" Train R2:", r2_score(ytr_agg, pred_train_agg))
print(" Train MAE:", mean_absolute_error(ytr_agg, pred_train_agg))
print(" Test  R2:", r2_score(yte_agg, pred_test_agg))
print(" Test  MAE:", mean_absolute_error(yte_agg, pred_test_agg))

model_agg.save_model('model_agg.json')

# 6. Prediction function
def predict_innings(players, venue, innings):
    feats = []
    for p in players:
        hist = df_player[df_player['striker'] == p].sort_values('match_id')
        if hist.empty:
            pm, ps, rf = 0, 0, 0
        else:
            pm = hist['player_mean'].iloc[-1]
            ps = hist['player_std'].iloc[-1]
            rf = hist['recent_form'].iloc[-1]
        va = df_total[(df_total['venue'] == venue) & (df_total['innings'] == innings)].sort_values('match_id')
        va = va['venue_avg'].iloc[-1] if not va.empty else 0
        feats.append([va, pm, ps, rf])

    df_feat = pd.DataFrame(feats, columns=features_player)
    pred_scores = model_player.predict(xgb.DMatrix(df_feat))
    agg_input = {f'pred_{i+1}': pred_scores[i] for i in range(len(pred_scores))}
    for i in range(len(pred_scores), 11):
        agg_input[f'pred_{i+1}'] = 0
    agg_input.update({'venue_avg': va, 'innings': innings})
    df_row = pd.DataFrame([agg_input])[features_agg]
    total_pred = model_agg.predict(xgb.DMatrix(df_row))[0]
    return total_pred

# Example usage:
# predicted_total = predict_innings(['Player1', ..., 'Player11'], 'Wankhede Stadium', 2)


Player Model Metrics:
 Train R2: 0.7222804427146912
 Train MAE: 7.092842631368948
 Test  R2: 0.6033191680908203
 Test  MAE: 8.12347653273828

Aggregator Model Metrics:
 Train R2: 0.9054068326950073
 Train MAE: 7.32900210664639
 Test  R2: 0.6204249858856201
 Test  MAE: 14.603097541147172


In [None]:
players = [
    'PD Salt', 'V Kohli', 'RM Patidar', 'LS Livingstone', 'JM Sharma', 'TH David', 'KH Pandya', 'B Kumar', 'Rasikh Salam', 'Yash Dayal', 'JR Hazlewood'
]
venue = 'Eden Gardens'
innings = 1
predicted_total = int(predict_innings(players, venue, innings))
print(f"\nPredicted Total Runs: {predicted_total}")



Predicted Total Runs: 150


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb

# Load dataset
df = pd.read_csv('/Users/pavanbandaru/Downloads/Dataset_Final/datasets/all_matches copy.csv')

# Player-level features without using match_id
df_player = (
    df.groupby(['venue', 'innings', 'batting_team', 'bowling_team', 'striker'])['runs_off_bat']
    .agg(['sum', 'mean', 'std'])
    .reset_index()
    .rename(columns={'sum': 'player_runs', 'mean': 'player_mean', 'std': 'player_std'})
)

# Add recent form based on player and innings
player_history = []
for player, g in df_player.groupby('striker'):
    g = g.sort_values(['venue', 'innings'])  # approximate sort
    g['recent_form'] = g['player_runs'].rolling(5, min_periods=1).mean().shift(1)
    player_history.append(g)

df_player = pd.concat(player_history, ignore_index=True)
df_player[['player_std', 'recent_form']] = df_player[['player_std', 'recent_form']].fillna(0)

# Venue-Innings level target without match_id
df_total = (
    df.groupby(['venue', 'innings'])['runs_off_bat']
    .sum()
    .reset_index()
    .rename(columns={'runs_off_bat': 'innings_total'})
)

# Compute average per venue/innings dynamically
venue_history = []
for (venue, inn), g in df_total.groupby(['venue', 'innings']):
    g['venue_avg'] = g['innings_total'].expanding().mean().shift(1).fillna(0)
    venue_history.append(g)

df_total = pd.concat(venue_history, ignore_index=True)

# Merge venue avg into player data
df_player = df_player.merge(df_total[['venue', 'innings', 'venue_avg']], on=['venue', 'innings'], how='left')

# Prepare features
features_player = ['venue_avg', 'player_mean', 'player_std', 'recent_form']
df_p = df_player.copy()

# Train-test split without match_id
df_p['split'] = np.random.rand(len(df_p))
train = df_p[df_p['split'] < 0.8]
test = df_p[df_p['split'] >= 0.8]

dtrain = xgb.DMatrix(train[features_player], label=train['player_runs'])
dtest = xgb.DMatrix(test[features_player], label=test['player_runs'])

params = {'objective': 'reg:squarederror', 'learning_rate': 0.05, 'max_depth': 6, 'seed': 42}

model_player = xgb.train(params, dtrain, num_boost_round=300, evals=[(dtest, 'eval')], early_stopping_rounds=20, verbose_eval=False)

pred_train = model_player.predict(dtrain)
pred_test = model_player.predict(dtest)

print("Player Model Metrics:")
print(" Train R2:", r2_score(train['player_runs'], pred_train))
print(" Train MAE:", mean_absolute_error(train['player_runs'], pred_train))
print(" Test  R2:", r2_score(test['player_runs'], pred_test))
print(" Test  MAE:", mean_absolute_error(test['player_runs'], pred_test))

# Aggregator stage
agg_rows = []
for (venue, inn), group in df_player.groupby(['venue', 'innings']):
    top11 = group.nlargest(11, 'player_runs')
    preds = model_player.predict(xgb.DMatrix(top11[features_player]))
    row = {f'pred_{i+1}': preds[i] for i in range(len(preds))}
    for i in range(len(preds), 11):
        row[f'pred_{i+1}'] = 0
    va = top11['venue_avg'].iloc[0]
    total = df_total[(df_total['venue'] == venue) & (df_total['innings'] == inn)]['innings_total'].values[0]
    row.update({'venue_avg': va, 'innings': inn, 'total': total})
    agg_rows.append(row)

df_agg = pd.DataFrame(agg_rows)
features_agg = [f'pred_{i+1}' for i in range(11)] + ['venue_avg', 'innings']
Xagg = df_agg[features_agg]
yagg = df_agg['total']

Xtr_agg, Xte_agg, ytr_agg, yte_agg = train_test_split(Xagg, yagg, test_size=0.2, random_state=42)
dtrain_agg = xgb.DMatrix(Xtr_agg, label=ytr_agg)
dtest_agg = xgb.DMatrix(Xte_agg, label=yte_agg)

model_agg = xgb.train(params, dtrain_agg, num_boost_round=200, evals=[(dtest_agg, 'eval')], early_stopping_rounds=20, verbose_eval=False)

pred_train_agg = model_agg.predict(dtrain_agg)
pred_test_agg = model_agg.predict(dtest_agg)

print("\nAggregator Model Metrics:")
print(" Train R2:", r2_score(ytr_agg, pred_train_agg))
print(" Train MAE:", mean_absolute_error(ytr_agg, pred_train_agg))
print(" Test  R2:", r2_score(yte_agg, pred_test_agg))
print(" Test  MAE:", mean_absolute_error(yte_agg, pred_test_agg))


Player Model Metrics:
 Train R2: 0.680705726146698
 Train MAE: 8.928921911836564
 Test  R2: 0.48059970140457153
 Test  MAE: 10.85984425456369

Aggregator Model Metrics:
 Train R2: 0.9947566986083984
 Train MAE: 179.41153507232667
 Test  R2: 0.5614006519317627
 Test  MAE: 1925.6480529785156
