In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Suppress warnings to improve code readability (optional)
warnings.filterwarnings("ignore")

In [2]:
# Read the cleaned and preprocessed data from 'cricket.csv' into a DataFrame 'df'
df = pd.read_csv('cricket2--.csv')

In [3]:
df.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_date,opposition,match_id,...,years_of_experience,bowling_average,runs_scored_mean/yr,wickets_mean/yr,runs_conceded_mean/yr,runs_scored_mean/opp,wickets_mean/opp,runs_conceded_mean/opp,mean_runs_scored,mean_wicket
0,4,Virat Kohli,31,1,15,1,0,2013-12-05,v South Africa Johannesburg,112,...,15,166.25,37.294118,0.029412,5.235294,42.666667,0.333333,5.0,46.558719,0.014235
1,10,Mohammed Siraj,4,3,29,0,0,2022-02-11,v West Indies Ahmedabad,230,...,4,20.018519,1.333333,1.6,37.6,2.333333,1.666667,31.0,1.233333,1.8
2,13,Ravichandran ashwin,0,1,20,0,0,2013-07-09,v Sri Lanka Port of Spain,97,...,13,33.2,5.758621,1.37931,46.655172,0.0,1.5,31.0,6.147826,1.347826
3,27,Salman Agha,27,0,25,0,0,2022-08-16,v Netherlands Rotterdam,444,...,1,88.25,33.666667,0.0,10.0,33.666667,0.0,10.0,24.222222,0.222222
4,4,Virat Kohli,22,0,6,1,0,2014-11-02,v Sri Lanka Cuttack,338,...,15,166.25,50.190476,0.047619,5.095238,22.0,0.0,6.0,46.558719,0.014235


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2021 entries, 0 to 2020
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   player_id                     2021 non-null   int64  
 1   player_name                   2021 non-null   object 
 2   runs_scored                   2021 non-null   int64  
 3   wickets                       2021 non-null   int64  
 4   runs_conceded                 2021 non-null   int64  
 5   catches                       2021 non-null   int64  
 6   stumpings                     2021 non-null   int64  
 7   match_date                    2021 non-null   object 
 8   opposition                    2021 non-null   object 
 9   match_id                      2021 non-null   int64  
 10  avg_runs_scored_last7matches  2021 non-null   float64
 11  DNB                           2021 non-null   int64  
 12  TDNB                          2021 non-null   int64  
 13  yea

In [5]:
# Create a new DataFrame 'new_df' by dropping columns 'player_name', 'match_date', and 'opposition' from 'df'
new_df = df.drop(['player_name','match_date'], axis=1)

In [6]:
new_df = pd.get_dummies(new_df, columns=['opposition'], drop_first=True)

In [8]:
new_df.head()

Unnamed: 0,player_id,runs_scored,wickets,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,...,opposition_v West Indies Sharjah,opposition_v West Indies Tarouba,opposition_v West Indies The Oval,opposition_v West Indies Thiruvananthapuram,opposition_v West Indies Visakhapatnam,opposition_v Zimbabwe Auckland,opposition_v Zimbabwe Bulawayo,opposition_v Zimbabwe Harare,opposition_v Zimbabwe Lahore,opposition_v Zimbabwe Rawalpindi
0,4,31,1,15,1,0,112,39.142857,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,4,3,29,0,0,230,2.428571,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13,0,1,20,0,0,97,7.714286,1,0,...,0,0,0,0,0,0,0,0,0,0
3,27,27,0,25,0,0,444,22.285714,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,22,0,6,1,0,338,39.142857,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Create the feature matrix 'X' by dropping the 'wickets' column from 'new_df'
X = new_df.drop('wickets', axis=1)

In [10]:
# Create the target variable 'y' by selecting the 'wickets' column from 'new_df'
y = new_df['wickets']

In [13]:
# Import required libraries and modules for machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

In [14]:
X.head()

Unnamed: 0,player_id,runs_scored,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,...,opposition_v West Indies Sharjah,opposition_v West Indies Tarouba,opposition_v West Indies The Oval,opposition_v West Indies Thiruvananthapuram,opposition_v West Indies Visakhapatnam,opposition_v Zimbabwe Auckland,opposition_v Zimbabwe Bulawayo,opposition_v Zimbabwe Harare,opposition_v Zimbabwe Lahore,opposition_v Zimbabwe Rawalpindi
0,4,31,15,1,0,112,39.142857,0,0,2013,...,0,0,0,0,0,0,0,0,0,0
1,10,4,29,0,0,230,2.428571,0,0,2022,...,0,0,0,0,0,0,0,0,0,0
2,13,0,20,0,0,97,7.714286,1,0,2013,...,0,0,0,0,0,0,0,0,0,0
3,27,27,25,0,0,444,22.285714,0,0,2022,...,0,0,0,0,0,0,0,0,0,0
4,4,22,6,1,0,338,39.142857,0,0,2014,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Split the data into training and testing sets
# X_train and y_train will be used for training the model
# X_test and y_test will be used for evaluating the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [16]:
X_train

Unnamed: 0,player_id,runs_scored,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,...,opposition_v West Indies Sharjah,opposition_v West Indies Tarouba,opposition_v West Indies The Oval,opposition_v West Indies Thiruvananthapuram,opposition_v West Indies Visakhapatnam,opposition_v Zimbabwe Auckland,opposition_v Zimbabwe Bulawayo,opposition_v Zimbabwe Harare,opposition_v Zimbabwe Lahore,opposition_v Zimbabwe Rawalpindi
1763,19,20,0,2,0,459,17.714286,0,0,2023,...,0,0,0,0,0,0,0,0,0,0
289,3,49,0,0,0,277,67.142857,0,0,2022,...,0,0,0,0,0,0,0,0,0,0
1668,11,0,65,0,0,223,3.857143,1,0,2020,...,0,0,0,0,0,0,0,0,0,0
534,4,78,12,0,0,116,39.142857,0,0,2014,...,0,0,0,0,0,0,0,0,0,0
806,19,88,0,0,0,408,17.714286,0,0,2018,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,2,64,48,1,0,227,30.714286,0,0,2021,...,0,0,0,0,0,0,0,0,0,0
1294,1,2,0,0,0,172,41.000000,0,0,2017,...,0,0,0,0,0,0,0,0,0,0
860,26,7,35,0,0,449,8.142857,0,0,2023,...,0,0,0,0,0,0,0,0,0,0
1459,1,20,0,0,0,99,41.000000,0,0,2013,...,0,0,0,0,0,0,0,1,0,0


In [17]:
# Define a parameter grid with various hyperparameter options
param_grid = {
    'n_estimators': [50, 100, 200, 300, 350],  # Number of estimators (trees)
    'max_depth': [3, 4, 5],  # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate for boosting
}

# Initialize an XGBoost Regressor
xgb_regressor = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Initialize GridSearchCV with the regressor and parameter grid
grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the grid search to the training data to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Initialize an XGBoost Regressor with the best hyperparameters
best_xgb_regressor = xgb.XGBRegressor(objective="reg:squarederror", **best_params, random_state=42)

# Fit the model with the best hyperparameters to the training data
best_xgb_regressor.fit(X_train, y_train)


XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=300, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

In [18]:
# Make predictions using the trained XGBoost regressor
y_pred = np.round(best_xgb_regressor.predict(X_test)).astype(int)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)  # Calculate Mean Squared Error
r2 = r2_score(y_test, y_pred)  # Calculate R-squared (coefficient of determination)

# Print the best hyperparameters and evaluation metrics
print("Best Hyperparameters:", best_params)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Mean Squared Error: 0.14285714285714285
R-squared: 0.9004194999830847


In [19]:
model =best_xgb_regressor
# Get feature importances
importances = model.feature_importances_

# Get the names of the features
feature_names = X_train.columns

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [20]:
feature_importance_df

Unnamed: 0,Feature,Importance
16,wickets_mean/opp,0.707034
10,years_of_experience,0.047229
2,runs_conceded,0.026123
1,runs_scored,0.025930
13,wickets_mean/yr,0.025171
...,...,...
97,opposition_v Hong Kong Dubai (DSC),0.000000
98,opposition_v Hong Kong Karachi,0.000000
99,opposition_v India Birmingham,0.000000
100,opposition_v India Colombo (RPS),0.000000


In [21]:
# Create a DataFrame to compare true and predicted values
comparison_df = pd.DataFrame({'true': y_test, 'pred': y_pred})

In [22]:
# Add a new column 'wickets_pred' to the X_test DataFrame to store the predicted wicket values
X_test['wickets_pred'] = y_pred

In [23]:
X_test.head()

Unnamed: 0,player_id,runs_scored,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,...,opposition_v West Indies Tarouba,opposition_v West Indies The Oval,opposition_v West Indies Thiruvananthapuram,opposition_v West Indies Visakhapatnam,opposition_v Zimbabwe Auckland,opposition_v Zimbabwe Bulawayo,opposition_v Zimbabwe Harare,opposition_v Zimbabwe Lahore,opposition_v Zimbabwe Rawalpindi,wickets_pred
674,13,5,42,0,0,135,7.714286,0,0,2015,...,0,0,0,0,0,0,0,0,0,1
1383,5,9,0,0,0,172,37.714286,0,0,2017,...,0,0,0,0,0,0,0,0,0,0
720,4,4,0,1,0,341,39.142857,0,0,2015,...,0,0,0,0,0,0,0,0,0,0
590,4,108,0,2,0,78,39.142857,0,0,2012,...,0,0,0,0,0,0,0,0,0,0
576,24,3,19,0,0,442,14.142857,0,0,2022,...,0,0,0,0,0,0,0,0,0,2


In [24]:
X_test['wickets'] =y_test

In [25]:
from scipy import stats

# Calculate the mean and mode of predictions for each player_id
mean_predictions = X_test.groupby('player_id')['wickets_pred'].mean()
mode_predictions = X_test.groupby('player_id')['wickets_pred'].apply(lambda x: stats.mode(x)[0][0])
mean_y_test = X_test.groupby('player_id')['wickets'].mean()
mode_y_test = X_test.groupby('player_id')['wickets'].apply(lambda x: stats.mode(x)[0][0])

# Create a DataFrame to store both mean and mode predictions
player_predictions = pd.DataFrame({
    'player_id': mean_predictions.index,
    'mean_prediction': np.round(mean_predictions.values).astype(int),
    'mode_prediction': mode_predictions.values,
    'mean_Wickets': np.round(mean_y_test).astype(int),
    'mode_wickets': mode_y_test
})

In [26]:
player_predictions.tail()

Unnamed: 0_level_0,player_id,mean_prediction,mode_prediction,mean_Wickets,mode_wickets
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
24,24,1,0,1,0
25,25,0,0,0,0
26,26,0,0,0,0
27,27,0,0,0,0
29,29,2,1,2,1


In [27]:
mse = mean_squared_error(player_predictions['mean_Wickets'], player_predictions['mean_prediction'])  # Calculate Mean Squared Error
r2 = r2_score(player_predictions['mean_Wickets'], player_predictions['mean_prediction'])  # Calculate R-squared (coefficient of determination)

# Print the best hyperparameters and evaluation metrics
print("Best Hyperparameters:", best_params)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Mean Squared Error: 0.0
R-squared: 1.0


In [28]:
mse = mean_squared_error(player_predictions['mode_wickets'], player_predictions['mode_prediction'])  # Calculate Mean Squared Error
r2 = r2_score(player_predictions['mode_wickets'], player_predictions['mode_prediction'])  # Calculate R-squared (coefficient of determination)

# Print the best hyperparameters and evaluation metrics
print("Best Hyperparameters:", best_params)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Mean Squared Error: 0.2222222222222222
R-squared: 0.4375000000000001


In [29]:
wickets_predd = player_predictions[['mean_prediction']]

In [30]:
wickets_predd = wickets_predd.reset_index()

In [31]:
wickets_predd

Unnamed: 0,player_id,mean_prediction
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0
5,6,0
6,7,1
7,8,1
8,9,2
9,10,1


In [32]:
 X_test[X_test['player_id'] == 18]

Unnamed: 0,player_id,runs_scored,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,...,opposition_v West Indies The Oval,opposition_v West Indies Thiruvananthapuram,opposition_v West Indies Visakhapatnam,opposition_v Zimbabwe Auckland,opposition_v Zimbabwe Bulawayo,opposition_v Zimbabwe Harare,opposition_v Zimbabwe Lahore,opposition_v Zimbabwe Rawalpindi,wickets_pred,wickets


In [33]:
id_30 = X_train[X_train['player_id'] == 30]
id_28 = X_train[X_train['player_id'] == 28]
id_18 = X_train[X_train['player_id'] == 18]

# Make predictions using the trained XGBoost regressor
id30_pred = np.round(best_xgb_regressor.predict(id_30)).astype(int)
id18_pred = np.round(best_xgb_regressor.predict(id_18)).astype(int)
id28_pred = np.round(best_xgb_regressor.predict(id_28)).astype(int)

print(id30_pred)
print(id18_pred)
print(id28_pred)

# Calculate the mean for each of the four indices and round to the nearest integer
mean_id30_pred = int(round(id30_pred.mean()))
mean_id18_pred = int(round(id18_pred.mean()))
mean_id28_pred = int(round(id28_pred.mean()))

# Print the rounded means
print("Mean of id30_pred (rounded):", mean_id30_pred)
print("Mean of id18_pred (rounded):", mean_id18_pred)
print("Mean of id28_pred (rounded):", mean_id28_pred)

[1 2 1 0 2 2 2 2]
[0 0 0 0]
[0 1 0 0 0 0]
Mean of id30_pred (rounded): 2
Mean of id18_pred (rounded): 0
Mean of id28_pred (rounded): 0


In [34]:
missing_id = pd.DataFrame({'player_id':[18 ,28, 30], 'mean_prediction':[0, 0, 2]})

In [35]:
missing_id

Unnamed: 0,player_id,mean_prediction
0,18,0
1,28,0
2,30,2


In [36]:
wickets_predd = pd.concat([wickets_predd, missing_id], ignore_index=True).sort_values(by='player_id').reset_index()

In [37]:
wiwickets_predd = wickets_predd.drop('index',axis=1)

In [38]:
wickets_predd

Unnamed: 0,index,player_id,mean_prediction
0,0,1,0
1,1,2,1
2,2,3,0
3,3,4,0
4,4,5,0
5,5,6,0
6,6,7,1
7,7,8,1
8,8,9,2
9,9,10,1


In [39]:
# Read the sample submission CSV file into a Pandas DataFrame
sub = pd.read_csv('sample_submission_br9x6st.csv')

In [40]:
sub.head()

Unnamed: 0,player_id,runs,wickets
0,1,30,2
1,2,24,0
2,3,45,1
3,4,5,2
4,5,9,0


In [41]:
wickets_predd['mean_prediction'].shape

(30,)

In [42]:
# Assign the predicted wickets values to the 'wickets' column in the submission DataFrame
sub['wickets'] = wickets_predd['mean_prediction']

In [43]:
sub.isnull().sum()

player_id    0
runs         0
wickets      0
dtype: int64

In [44]:
sub.head()

Unnamed: 0,player_id,runs,wickets
0,1,30,0
1,2,24,1
2,3,45,0
3,4,5,0
4,5,9,0


In [48]:
# Save the submission DataFrame to a CSV file
sub.to_csv('cricket_predictions2--.csv', index=False)