In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import necessary libraries
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import warnings

In [None]:
# 01_data_loading_exploration.ipynb

# Define file paths
train_data_path = '/content/drive/MyDrive/Amex/663e2b6d54457_train_data_with_samplefeatures.csv'
match_data_path = '/content/drive/MyDrive/Amex/664389efa0868_match_level_scorecard.csv'
batsman_data_path = '/content/drive/MyDrive/Amex/663e2b548c98c_batsman_level_scorecard.csv'
bowler_data_path = '/content/drive/MyDrive/Amex/663e2b2c60743_bowler_level_scorecard.csv'
round1_data_path = '/content/drive/MyDrive/Amex/6644a1e287df6_test_data_with_samplefeatures.csv'

# Load datasets
train_data = pd.read_csv(train_data_path)
match_data = pd.read_csv(match_data_path)
batsman_data = pd.read_csv(batsman_data_path)
bowler_data = pd.read_csv(bowler_data_path)
round1_data = pd.read_csv(round1_data_path)

train_data['target'] = train_data.apply(lambda row: 1 if row['winner_id'] == row['team1_id'] else 0, axis=1)
# Display the first few rows of each dataset to understand their structures
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,target
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571,0
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5,1
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,day/night match,Vy Bt,2023,251,0.857143,0.672131,173.266667,0.0,154.333333,1
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,day match,Cn Pr Le,2023,14300,2.166667,1.97561,164.266667,50.0,144.25,1
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,night match,In Pr Le,2023,7118,0.818182,1.327869,164.666667,0.0,189.0,0


In [None]:
# Calculate average strike rate for each batsman across all matches
avg_strike_rates = batsman_data.groupby('batsman_id')['strike_rate'].mean().reset_index()

# Example function to calculate average strike rate per match ID
def average_strike_rate(roster_ids):
  player_ids = roster_ids.split(':')
  strike_rates = []
  # Iterate over each player_id in player_ids
  for player_id in player_ids:
    # Check if player_id is in avg_strike_rates DataFrame
    # if player_id in avg_strike_rates['batsman_id'].values:
    # Fetch strike rate corresponding to player_id and append to strike_rates
    strike_rate = avg_strike_rates.loc[avg_strike_rates['batsman_id'] == float(player_id)]['strike_rate']
    if not strike_rate.empty:
      strike_rates.append(strike_rate.iloc[0])
  return np.mean(strike_rates) if strike_rates else 0

# Apply the function to both teams
train_data['team1_avg_strike_rate'] = train_data['team1_roster_ids'].apply(average_strike_rate)
train_data['team2_avg_strike_rate'] = train_data['team2_roster_ids'].apply(average_strike_rate)

round1_data['team1_avg_strike_rate'] = round1_data['team1_roster_ids'].apply(average_strike_rate)
round1_data['team2_avg_strike_rate'] = round1_data['team2_roster_ids'].apply(average_strike_rate)

In [None]:
# Calculate average strike rate for each batsman across all matches
avg_economy_rates = bowler_data.groupby('bowler_id')['economy'].mean().reset_index()

# Example function to calculate average strike rate per match ID
def average_economy_rate(roster_ids):
  player_ids = roster_ids.split(':')
  economy_rates = []
  # Iterate over each player_id in player_ids
  for player_id in player_ids:
    # Check if player_id is in avg_economy_rates DataFrame
    # if player_id in avg_economy_rates['bowler_id'].values:
    # Fetch economy rate corresponding to player_id and append to economy_rates
    economy = avg_economy_rates.loc[avg_economy_rates['bowler_id'] == float(player_id)]['economy']
    if not economy.empty:
      economy_rates.append(economy.iloc[0])
  return np.mean(economy_rates) if economy_rates else 0

# Apply the function to both teams
train_data['team1_avg_economy_rate'] = train_data['team1_roster_ids'].apply(average_economy_rate)
train_data['team2_avg_economy_rate'] = train_data['team2_roster_ids'].apply(average_economy_rate)

round1_data['team1_avg_economy_rate'] = round1_data['team1_roster_ids'].apply(average_economy_rate)
round1_data['team2_avg_economy_rate'] = round1_data['team2_roster_ids'].apply(average_economy_rate)

In [None]:
# Historical win rate at the venue for both teams
def win_rate_at_venue(team_id, venue):
    matches_at_venue = train_data[train_data['venue'] == venue]
    wins_at_venue = len(matches_at_venue[matches_at_venue['winner_id'] == team_id])
    total_matches = len(matches_at_venue[matches_at_venue['team1_id'] == team_id])
    return (wins_at_venue / total_matches) * 100 if total_matches > 0 else 0

# Apply the function to both teams
train_data['team1_win_rate_at_venue'] = train_data.apply(lambda row: win_rate_at_venue(row['team1_id'], row['venue']), axis=1)
train_data['team2_win_rate_at_venue'] = train_data.apply(lambda row: win_rate_at_venue(row['team2_id'], row['venue']), axis=1)

round1_data['team1_win_rate_at_venue'] = round1_data.apply(lambda row: win_rate_at_venue(row['team1_id'], row['venue']), axis=1)
round1_data['team2_win_rate_at_venue'] = round1_data.apply(lambda row: win_rate_at_venue(row['team2_id'], row['venue']), axis=1)

In [None]:
important_columns = [
    'match id', 'team1_id', 'team2_id', 'ground_id', 'venue', 'team_count_50runs_last15',	'team_winp_last5',	'team1only_avg_runs_last15', 'team1_winp_team2_last15',	'ground_avg_runs_last15',
    'team1_avg_strike_rate', 'team2_avg_strike_rate', 'team1_avg_economy_rate', 'team2_avg_economy_rate', 'team1_win_rate_at_venue', 'team2_win_rate_at_venue',  'target'
]

train_data_selected = train_data[important_columns]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Assuming 'target' is the target variable
X = train_data_selected.drop(['match id','target'], axis=1)
y = train_data_selected[['target']]

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

# Ensure consistent categories between train and test sets
for col in cats:
    X[col] = X[col].cat.set_categories(X[col].cat.categories.union(round1_data[col].unique()), ordered=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, enable_categorical=True)

# Set parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'tree_method': 'hist',
    'enable_categorical': True
}

# Train the model
xgb_model = xgb.train(params, dtrain, num_boost_round=500)

# Make predictions
predictions = xgb_model.predict(dtest)
y_pred = [1 if pred > 0.5 else 0 for pred in predictions]

# Evaluate the model
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Parameters: { "enable_categorical" } are not used.



Accuracy on test set: 0.7684210526315789
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.67      0.73        45
           1       0.74      0.86      0.80        50

    accuracy                           0.77        95
   macro avg       0.78      0.76      0.76        95
weighted avg       0.77      0.77      0.77        95



In [None]:
# Assuming 'target' is the target variable
X = train_data_selected.drop(['match id', 'target'], axis=1)

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

# Ensure consistent categories between train and test sets
for col in cats:
    X[col] = X[col].cat.set_categories(X[col].cat.categories.union(round1_data[col].unique()), ordered=False)


# Create DMatrix for XGBoost
dtest_train = xgb.DMatrix(X, enable_categorical=True)

# Make predictions
predictions_train = xgb_model.predict(dtest_train)
y_pred = [1 if pred > 0.5 else 0 for pred in predictions_train]

In [None]:
# Extract winning team ID and probabilities for train data
win_pred_team_id_train = train_data_selected.apply(lambda row: row['team1_id'] if y_pred[row.name] == 1 else row['team2_id'], axis=1)

# Add model details to the original train data
train_data_selected['win_pred_team_id'] = win_pred_team_id_train
train_data_selected['win_pred_score'] = predictions_train
train_data_selected['train_algorithm'] = 'xgboost'
train_data_selected['is_ensemble'] = 'no'
train_data_selected['train_hps_trees'] = 100
train_data_selected['train_hps_depth'] = params.get('max_depth', 6)  # Default depth is 6 if not specified
train_data_selected['train_hps_lr'] = params.get('learning_rate', 0.3)  # Default learning rate is 0.3 if not specified
train_data_selected['dataset_type'] = 'train'

# Select necessary columns
train_data_selected = train_data_selected[['match id', 'dataset_type', 'win_pred_team_id', 'win_pred_score', 'train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth', 'train_hps_lr', 'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15', 'team1_winp_team2_last15', 'ground_avg_runs_last15',
    'team1_avg_strike_rate', 'team2_avg_strike_rate', 'team1_avg_economy_rate', 'team2_avg_economy_rate', 'team1_win_rate_at_venue']]
train_data_selected.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected['win_pred_team_id'] = win_pred_team_id_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected['win_pred_score'] = predictions_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected['train_algorithm'] = 'xgboost'
A value is trying to be set on 

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,team1_avg_strike_rate,team2_avg_strike_rate,team1_avg_economy_rate,team2_avg_economy_rate,team1_win_rate_at_venue
0,9331181,train,12634,0.003844,xgboost,no,100,6,0.3,1.666667,0.672131,139.0,100.0,157.178571,97.34349,103.884434,7.367183,7.833869,50.0
1,8797060,train,20,0.981991,xgboost,no,100,6,0.3,1.285714,1.952381,156.0,50.0,103.5,113.163861,115.051911,8.265474,8.778177,50.0
2,9433269,train,10576,0.986998,xgboost,no,100,6,0.3,0.857143,0.672131,173.266667,0.0,154.333333,103.483922,111.789417,8.770945,9.164045,100.0
3,9587073,train,36084,0.99477,xgboost,no,100,6,0.3,2.166667,1.97561,164.266667,50.0,144.25,126.837172,109.644381,8.362809,8.463754,100.0
4,9516457,train,48341,0.005904,xgboost,no,100,6,0.3,0.818182,1.327869,164.666667,0.0,189.0,120.471347,96.755714,8.043134,8.032344,50.0


In [None]:
# Preprocess and encode the new test data similarly to how train data was processed
important_columns.remove('target')
test_data_selected = round1_data[important_columns]
X_test_r1 = test_data_selected.drop('match id', axis=1)

# Extract text features
cats = X_test_r1.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X_test_r1[col] = X_test_r1[col].astype('category')

# Ensure consistent categories between train and test sets
for col in cats:
    X_test_r1[col] = X_test_r1[col].cat.set_categories(X_test_r1[col].cat.categories.union(round1_data[col].unique()), ordered=False)

# Create DMatrix for XGBoost
dtest_r1 = xgb.DMatrix(X_test_r1, enable_categorical=True)

In [None]:
# Make predictions for test data (r1)
predictions_r1 = xgb_model.predict(dtest_r1)
predictions_binary_r1 = [1 if pred > 0.5 else 0 for pred in predictions_r1]

In [None]:
# Add predictions to test_data_r1
test_data_selected['win_pred_team_id'] = test_data_selected.apply(lambda row: row['team1_id'] if predictions_binary_r1[row.name] == 1 else row['team2_id'], axis=1)
test_data_selected['win_pred_score'] = predictions_r1
test_data_selected['train_algorithm'] = 'xgboost'
test_data_selected['is_ensemble'] = 'no'
test_data_selected['train_hps_trees'] = 100
test_data_selected['train_hps_depth'] = params.get('max_depth', 6)  # Default depth is 6 if not specified
test_data_selected['train_hps_lr'] = params.get('learning_rate', 0.3)  # Default learning rate is 0.3 if not specified
test_data_selected['dataset_type'] = 'r1'

# Select necessary columns
test_data_required = test_data_selected[['match id', 'dataset_type', 'win_pred_team_id', 'win_pred_score', 'train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth', 'train_hps_lr', 'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15', 'team1_winp_team2_last15', 'ground_avg_runs_last15',
    'team1_avg_strike_rate', 'team2_avg_strike_rate', 'team1_avg_economy_rate', 'team2_avg_economy_rate', 'team1_win_rate_at_venue']]
test_data_required.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_selected['win_pred_team_id'] = test_data_selected.apply(lambda row: row['team1_id'] if predictions_binary_r1[row.name] == 1 else row['team2_id'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_selected['win_pred_score'] = predictions_r1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,team1_avg_strike_rate,team2_avg_strike_rate,team1_avg_economy_rate,team2_avg_economy_rate,team1_win_rate_at_venue
0,9250275,r1,90,0.000182,xgboost,no,100,6,0.3,0.0,0.019608,,0.0,,138.7705,108.479,6.7075,7.631187,0.0
1,9262189,r1,36084,0.995462,xgboost,no,100,6,0.3,0.615385,0.344262,151.285714,66.67,153.5,123.14489,109.455779,8.252163,8.579393,133.333333
2,9128776,r1,48334,0.058289,xgboost,no,100,6,0.3,0.842105,0.753086,171.066667,100.0,179.625,104.448755,108.302414,8.948133,8.00383,100.0
3,9586919,r1,36112,0.022069,xgboost,no,100,6,0.3,0.285714,1.487805,166.0,75.0,155.5,88.545492,120.46018,8.292504,8.332112,33.333333
4,9128538,r1,30414,0.990065,xgboost,no,100,6,0.3,2.375,0.31033,169.933333,0.0,164.125,115.901728,97.460336,8.279965,8.264879,100.0


In [None]:
# Concatenate the new test data (r1) with the original train data
final_data = pd.concat([test_data_required, train_data_selected], ignore_index=True)

In [None]:
# Add additional columns based on important_features
important_columns = [
    'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15', 'team1_winp_team2_last15', 'ground_avg_runs_last15',
    'team1_avg_strike_rate', 'team2_avg_strike_rate', 'team1_avg_economy_rate', 'team2_avg_economy_rate', 'team1_win_rate_at_venue'
]

for i, col in enumerate(important_columns, start=1):
  final_data.rename(columns = {col: f'indep_feat_id{i}'}, inplace = True)

# Fill NaN with 0 in final_df
final_data = final_data.fillna(0)

# Save to CSV
final_data.to_csv('/content/drive/MyDrive/Amex/primary_submission.csv', index=False)

In [None]:
final_data

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10
0,9250275,r1,90,0.000182,xgboost,no,100,6,0.3,0.000000,0.019608,0.000000,0.00,0.000000,138.770500,108.479000,6.707500,7.631187,0.000000
1,9262189,r1,36084,0.995462,xgboost,no,100,6,0.3,0.615385,0.344262,151.285714,66.67,153.500000,123.144890,109.455779,8.252163,8.579393,133.333333
2,9128776,r1,48334,0.058289,xgboost,no,100,6,0.3,0.842105,0.753086,171.066667,100.00,179.625000,104.448755,108.302414,8.948133,8.003830,100.000000
3,9586919,r1,36112,0.022069,xgboost,no,100,6,0.3,0.285714,1.487805,166.000000,75.00,155.500000,88.545492,120.460180,8.292504,8.332112,33.333333
4,9128538,r1,30414,0.990065,xgboost,no,100,6,0.3,2.375000,0.310330,169.933333,0.00,164.125000,115.901728,97.460336,8.279965,8.264879,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214,9128601,train,30414,0.019065,xgboost,no,100,6,0.3,0.823529,1.000000,147.333333,66.67,166.400000,108.555728,112.039258,8.244350,8.140777,66.666667
1215,9433241,train,9701,0.000494,xgboost,no,100,6,0.3,1.571429,0.012346,167.400000,0.00,170.466667,107.579355,115.795672,10.273110,8.665860,0.000000
1216,9097227,train,23869,0.005823,xgboost,no,100,6,0.3,3.000000,1.000000,0.000000,0.00,0.000000,108.108499,85.547038,8.020950,8.134883,100.000000
1217,9516695,train,36014,0.008684,xgboost,no,100,6,0.3,0.789474,1.487805,182.800000,66.67,133.375000,97.403997,116.296778,8.552488,8.953509,50.000000


In [None]:
important_columns = [
    'match id', 'team1_id', 'team2_id', 'ground_id', 'venue', 'team_count_50runs_last15',	'team_winp_last5',	'team1only_avg_runs_last15', 'team1_winp_team2_last15',	'ground_avg_runs_last15',
    'team1_avg_strike_rate', 'team2_avg_strike_rate', 'team1_avg_economy_rate', 'team2_avg_economy_rate', 'team1_win_rate_at_venue', 'team2_win_rate_at_venue',  'target'
]

# Get feature importance scores
feature_importance = xgb_model.get_score(importance_type='weight')

# Normalize importance scores to sum to 100%
total_importance = sum(feature_importance.values())
normalized_importance = {k: (v / total_importance) * 100 for k, v in feature_importance.items()}

# Create DataFrame from feature importance scores
feature_importance_df = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Importance'])

# Create a DataFrame to hold feature information
feat_df = pd.DataFrame({
    'feat_id': range(len(important_columns) - 2),  # Excluding 'match id' and 'target'
    'feat_name': X.columns,
    'feat_description': [
        'team1_id', 'team2_id', 'ground_id', 'venue',
        'Ratio of number of 50s by players in team1 to number of 50s by players in team2 in last 15 games',
        "Ratio of team1's win% to team2's win% in last 5 games",
        "team1's avg inning runs in last 15 games", "Team1's win percentage against Team2 in last 15 games",
        'average runs scored in the ground in last 15 games', 'average strike rate of team 1 batsmen',
        'average strike rate of team 2 batsmen', 'average economy of team 1 bowlers',
        'average economy of team 1 bowlers', "Team1's win percentage at the venue",
        "Team2's win percentage at the venue"
    ]
})

# Map importance scores to the feature DataFrame
feat_df['model_feat_imp_train'] = feat_df['feat_name'].map(normalized_importance)
feat_df['model_feat_imp_train'].fillna(0, inplace=True)

# Rank features based on importance
feat_df['feat_rank_train'] = feat_df['model_feat_imp_train'].rank(ascending=True).astype(int)

# Display the feature importance DataFrame
feat_df

# Save to CSV file
feat_df.to_csv('/content/drive/MyDrive/Amex/secondary_submission.csv', index=False)

In [None]:
train_data['dataset_type'] = 'train'
train_data.rename(columns = {'winner_id':'win_team_id'}, inplace = True)
final_data = train_data[['match id', 'dataset_type', 'win_team_id']]

# Fill NaN with 0 in final_df
final_data = final_data.fillna(0)

# Save to CSV
final_data.to_csv('/content/drive/MyDrive/Amex/dep_var.csv', index=False)