In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Suppress warnings to improve code readability (optional)
warnings.filterwarnings("ignore")

In [2]:
# Read the cleaned and preprocessed data from 'cricket.csv' into a DataFrame 'df'
df = pd.read_csv('cricket2.csv')

In [3]:
df.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_date,opposition,match_id,...,TDNB,year,years_of_experience,bowling_average,runs_scored_mean/yr,wickets_mean/yr,runs_scored_mean/opp,wickets_mean/opp,mean_runs_scored,mean_wicket
0,4,Virat Kohli,28,0,16,0,0,2010-06-24,v Sri Lanka Dambulla,50,...,0,2010,15,923.5,39.8,0.0,29.428571,0.0,46.558719,0.014235
1,14,Ishan Kishan,33,0,0,0,0,2023-09-12,v Sri Lanka Colombo (RPS),248,...,0,2023,2,0.0,27.266667,0.0,29.0,0.0,35.44,0.0
2,20,Haris Rauf,0,0,57,2,0,2022-03-31,v Australia Lahore,439,...,0,2022,3,24.320755,1.0,2.142857,2.333333,1.666667,0.535714,1.892857
3,25,Mohammad Rizwan,34,0,0,1,0,2016-09-04,v England Cardiff,374,...,0,2016,8,0.0,12.833333,0.0,23.5,0.0,26.046154,0.0
4,12,Mohammed Shami,0,1,59,0,0,2018-10-24,v West Indies Visakhapatnam,190,...,0,2018,10,25.427746,0.0,1.5,0.0,2.0,2.234043,1.840426


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2021 entries, 0 to 2020
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   player_id                     2021 non-null   int64  
 1   player_name                   2021 non-null   object 
 2   runs_scored                   2021 non-null   int64  
 3   wickets                       2021 non-null   int64  
 4   runs_conceded                 2021 non-null   int64  
 5   catches                       2021 non-null   int64  
 6   stumpings                     2021 non-null   int64  
 7   match_date                    2021 non-null   object 
 8   opposition                    2021 non-null   object 
 9   match_id                      2021 non-null   int64  
 10  avg_runs_scored_last7matches  2021 non-null   float64
 11  DNB                           2021 non-null   int64  
 12  TDNB                          2021 non-null   int64  
 13  yea

In [5]:
# Create a new DataFrame 'new_df' by dropping columns 'player_name', 'match_date', and 'opposition' from 'df'
new_df = df.drop(['player_name','match_date','opposition'], axis=1)

In [6]:
#new_df = pd.get_dummies(new_df, columns=['opposition'], drop_first=True)

In [7]:
new_df.head()

Unnamed: 0,player_id,runs_scored,wickets,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,years_of_experience,bowling_average,runs_scored_mean/yr,wickets_mean/yr,runs_scored_mean/opp,wickets_mean/opp,mean_runs_scored,mean_wicket
0,4,28,0,16,0,0,50,39.142857,0,0,2010,15,923.5,39.8,0.0,29.428571,0.0,46.558719,0.014235
1,14,33,0,0,0,0,248,38.428571,0,0,2023,2,0.0,27.266667,0.0,29.0,0.0,35.44,0.0
2,20,0,0,57,2,0,439,1.857143,1,0,2022,3,24.320755,1.0,2.142857,2.333333,1.666667,0.535714,1.892857
3,25,34,0,0,1,0,374,40.714286,0,0,2016,8,0.0,12.833333,0.0,23.5,0.0,26.046154,0.0
4,12,0,1,59,0,0,190,7.142857,0,0,2018,10,25.427746,0.0,1.5,0.0,2.0,2.234043,1.840426


In [8]:
# Create the feature matrix 'X' by dropping the 'wickets' column from 'new_df'
X = new_df.drop('runs_scored', axis=1)

In [9]:
# Create the target variable 'y' by selecting the 'wickets' column from 'new_df'
y = new_df['runs_scored']

In [10]:
# Import required libraries and modules for machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

In [11]:
X.head()

Unnamed: 0,player_id,wickets,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,years_of_experience,bowling_average,runs_scored_mean/yr,wickets_mean/yr,runs_scored_mean/opp,wickets_mean/opp,mean_runs_scored,mean_wicket
0,4,0,16,0,0,50,39.142857,0,0,2010,15,923.5,39.8,0.0,29.428571,0.0,46.558719,0.014235
1,14,0,0,0,0,248,38.428571,0,0,2023,2,0.0,27.266667,0.0,29.0,0.0,35.44,0.0
2,20,0,57,2,0,439,1.857143,1,0,2022,3,24.320755,1.0,2.142857,2.333333,1.666667,0.535714,1.892857
3,25,0,0,1,0,374,40.714286,0,0,2016,8,0.0,12.833333,0.0,23.5,0.0,26.046154,0.0
4,12,1,59,0,0,190,7.142857,0,0,2018,10,25.427746,0.0,1.5,0.0,2.0,2.234043,1.840426


In [12]:
# Split the data into training and testing sets
# X_train and y_train will be used for training the model
# X_test and y_test will be used for evaluating the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [13]:
X_train

Unnamed: 0,player_id,wickets,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,years_of_experience,bowling_average,runs_scored_mean/yr,wickets_mean/yr,runs_scored_mean/opp,wickets_mean/opp,mean_runs_scored,mean_wicket
1763,6,0,0,0,1,344,48.142857,0,0,2020,7,0.000000,49.222222,0.000000,88.000000,0.0,37.557377,0.000000
289,4,0,13,0,0,286,39.142857,0,0,2009,15,923.500000,32.500000,0.000000,79.000000,0.0,46.558719,0.014235
1668,13,2,42,0,0,314,7.714286,0,0,2011,13,32.955975,6.777778,1.500000,1.000000,2.0,6.147826,1.382609
534,15,0,0,0,0,243,33.428571,0,0,2023,2,0.000000,20.214286,0.000000,0.000000,0.0,22.233333,0.000000
806,1,0,18,1,0,246,41.000000,0,0,2023,16,543.625000,41.125000,0.000000,74.000000,0.0,40.286853,0.031873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,4,0,26,0,0,90,39.142857,0,0,2013,15,923.500000,37.294118,0.029412,22.000000,0.0,46.558719,0.014235
1294,4,0,13,0,0,117,39.142857,0,0,2014,15,923.500000,50.190476,0.047619,10.500000,0.0,46.558719,0.014235
860,27,0,29,0,0,458,22.285714,0,0,2023,1,124.500000,22.333333,0.266667,5.000000,0.0,24.222222,0.222222
1459,4,0,13,0,0,299,39.142857,1,0,2010,15,923.500000,39.800000,0.000000,0.000000,0.0,46.558719,0.014235


In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [16]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import mean_squared_error, r2_score

# Define your model architecture
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)  # Output layer with 1 neuron for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
model.fit(scaled_X_train, y_train, epochs=60, batch_size=32, validation_split=0.2)

# Make predictions
predictions = model.predict(scaled_X_test)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, predictions)

# Print the R2 score
print("R-squared (R2) Score:", r2)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
R-squared (R2) Score: 0.7366654447774412


In [17]:
# Create a DataFrame to compare true and predicted values
comparison_df = pd.DataFrame({'true': y_test, 'pred': np.round(predictions.reshape(-1)).astype(int)})

In [18]:
# Add a new column 'wickets_pred' to the X_test DataFrame to store the predicted wicket values
X_test['runs_scored_pred'] = comparison_df['pred'].values

In [19]:
X_test.head()

Unnamed: 0,player_id,wickets,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,years_of_experience,bowling_average,runs_scored_mean/yr,wickets_mean/yr,runs_scored_mean/opp,wickets_mean/opp,mean_runs_scored,mean_wicket,runs_scored_pred
674,17,0,54,0,0,187,16.714286,0,0,2018,6,33.188235,10.176471,1.411765,9.0,0.5,11.46875,1.328125,8
1383,29,2,31,1,0,457,5.571429,0,0,2023,5,23.363636,3.25,2.0,2.0,2.0,3.204545,2.0,3
720,1,0,11,0,0,24,41.0,0,0,2008,16,543.625,19.0,0.0,19.333333,0.0,40.286853,0.031873,14
590,5,0,8,0,0,216,37.714286,0,1,2019,6,0.0,44.333333,0.0,0.0,0.0,38.319149,0.0,0
576,13,2,64,0,0,107,7.714286,1,0,2013,13,32.955975,5.758621,1.37931,0.0,2.0,6.147826,1.382609,0


In [20]:
X_test['runs_scored'] =y_test

In [21]:
from scipy import stats

# Calculate the mean and mode of predictions for each player_id
mean_predictions = X_test.groupby('player_id')['runs_scored_pred'].mean()
mode_predictions = X_test.groupby('player_id')['runs_scored_pred'].apply(lambda x: stats.mode(x)[0][0])
mean_y_test = X_test.groupby('player_id')['runs_scored'].mean()
mode_y_test = X_test.groupby('player_id')['runs_scored'].apply(lambda x: stats.mode(x)[0][0])

# Create a DataFrame to store both mean and mode predictions
player_predictions = pd.DataFrame({
    'player_id': mean_predictions.index,
    'mean_prediction': np.round(mean_predictions.values).astype(int),
    'mode_prediction': mode_predictions.values,
    'mean_runs_scored': np.round(mean_y_test).astype(int),
    'mode_runs_scored': mode_y_test
})

In [22]:
player_predictions.tail()

Unnamed: 0_level_0,player_id,mean_prediction,mode_prediction,mean_runs_scored,mode_runs_scored
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
25,25,22,-3,22,0
26,26,2,0,2,0
27,27,22,22,23,23
29,29,3,3,2,0
30,30,1,1,0,0


In [23]:
mse = mean_squared_error(player_predictions['mean_runs_scored'], player_predictions['mean_prediction'])  # Calculate Mean Squared Error
r2 = r2_score(player_predictions['mean_runs_scored'], player_predictions['mean_prediction'])  # Calculate R-squared (coefficient of determination)

# Print the best hyperparameters and evaluation metrics
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 13.384615384615385
R-squared: 0.9543843551647819


In [24]:
mse = mean_squared_error(player_predictions['mode_runs_scored'], player_predictions['mode_prediction'])  # Calculate Mean Squared Error
r2 = r2_score(player_predictions['mode_runs_scored'], player_predictions['mode_prediction'])  # Calculate R-squared (coefficient of determination)

# Print the best hyperparameters and evaluation metrics
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 154.65384615384616
R-squared: -4.9850011449507665


In [25]:
runs_predd = player_predictions[['mean_prediction']]

In [26]:
runs_predd = runs_predd.reset_index()

In [27]:
runs_predd

Unnamed: 0,player_id,mean_prediction
0,1,35
1,2,12
2,3,54
3,4,38
4,5,38
5,6,47
6,7,11
7,8,31
8,9,1
9,10,0


In [28]:
 X_test[X_test['player_id'] == 18]

Unnamed: 0,player_id,wickets,runs_conceded,catches,stumpings,match_id,avg_runs_scored_last7matches,DNB,TDNB,year,years_of_experience,bowling_average,runs_scored_mean/yr,wickets_mean/yr,runs_scored_mean/opp,wickets_mean/opp,mean_runs_scored,mean_wicket,runs_scored_pred,runs_scored


In [29]:
id_15 = X_train[X_train['player_id'] == 15]
id_24 = X_train[X_train['player_id'] == 24]
id_28 = X_train[X_train['player_id'] == 28]
id_18 = X_train[X_train['player_id'] == 18]

In [30]:
# Make predictions using the trained XGBoost regressor
id15_pred = np.round(model.predict(scaler.transform(id_15))).astype(int)
id18_pred = np.round(model.predict(scaler.transform(id_18))).astype(int)
id24_pred = np.round(model.predict(scaler.transform(id_24))).astype(int)
id28_pred = np.round(model.predict(scaler.transform(id_28))).astype(int)

print(id15_pred)
print(id18_pred)
print(id24_pred)
print(id28_pred)

# Calculate the mean for each of the four indices and round to the nearest integer
mean_id15_pred = int(round(id15_pred.mean()))
mean_id18_pred = int(round(id18_pred.mean()))
mean_id24_pred = int(round(id24_pred.mean()))
mean_id28_pred = int(round(id28_pred.mean()))

# Print the rounded means
print("Mean of id15_pred (rounded):", mean_id15_pred)
print("Mean of id18_pred (rounded):", mean_id18_pred)
print("Mean of id24_pred (rounded):", mean_id24_pred)
print("Mean of id28_pred (rounded):", mean_id28_pred)

[[ 3]
 [-1]
 [30]
 [ 7]
 [22]
 [10]
 [34]
 [45]
 [10]
 [48]
 [ 5]
 [ 9]
 [26]
 [37]
 [ 4]
 [45]
 [-1]
 [70]
 [27]
 [45]
 [37]
 [39]
 [ 3]
 [33]
 [22]
 [34]
 [ 3]
 [15]
 [10]
 [14]]
[[57]
 [ 9]
 [11]
 [25]]
[[ 0]
 [ 4]
 [ 6]
 [ 0]
 [ 0]
 [14]
 [ 7]
 [ 1]
 [ 3]
 [37]
 [ 9]
 [ 5]
 [ 7]
 [ 0]
 [12]
 [12]
 [19]
 [55]
 [24]
 [10]
 [ 3]
 [ 5]
 [13]
 [ 7]
 [ 1]
 [ 9]
 [17]
 [ 8]
 [-1]
 [ 7]
 [ 0]
 [28]]
[[13]
 [-2]
 [ 5]
 [ 8]
 [ 9]
 [56]]
Mean of id15_pred (rounded): 23
Mean of id18_pred (rounded): 26
Mean of id24_pred (rounded): 10
Mean of id28_pred (rounded): 15


In [31]:
missing_id = pd.DataFrame({'player_id':[15 ,18 ,24 ,28], 'mean_prediction':[23, 26, 10, 15]})

In [32]:
missing_id

Unnamed: 0,player_id,mean_prediction
0,15,23
1,18,26
2,24,10
3,28,15


In [33]:
runs_predd = pd.concat([runs_predd, missing_id], ignore_index=True).sort_values(by='player_id').reset_index()

In [34]:
runs_predd = runs_predd.drop('index',axis=1)

In [35]:
runs_predd

Unnamed: 0,player_id,mean_prediction
0,1,35
1,2,12
2,3,54
3,4,38
4,5,38
5,6,47
6,7,11
7,8,31
8,9,1
9,10,0


In [36]:
# Read the sample submission CSV file into a Pandas DataFrame
subm = pd.read_csv('cricket_predictionstf.csv')

In [37]:
subm.head()

Unnamed: 0,player_id,runs,wickets
0,1,30,0
1,2,24,1
2,3,45,0
3,4,5,0
4,5,9,0


In [38]:
runs_predd['mean_prediction'].shape

(30,)

In [39]:
# Assign the predicted wickets values to the 'wickets' column in the submission DataFrame
subm['runs'] = runs_predd['mean_prediction']

In [40]:
subm.isnull().sum()

player_id    0
runs         0
wickets      0
dtype: int64

In [41]:
subm.head()

Unnamed: 0,player_id,runs,wickets
0,1,35,0
1,2,12,1
2,3,54,0
3,4,38,0
4,5,38,0


In [42]:
# Save the submission DataFrame to a CSV file
subm.to_csv('cricket_predictionstf.csv', index=False)

In [43]:
pd.read_csv('cricket_predictionstf.csv')

Unnamed: 0,player_id,runs,wickets
0,1,35,0
1,2,12,1
2,3,54,0
3,4,38,0
4,5,38,0
5,6,47,0
6,7,11,1
7,8,31,1
8,9,1,2
9,10,0,2
