# Train Model

**INPUT**: "./data/1finalDataset.csv"

**OUTPUT**: Saves the trained XGBoost models to "./models/xgb_model.json" and "./models/best_xgb_model.json"

In this notebook, we take the final dataset (which contains all the tennis statistics), and we train several models with it (Random Forest, XGBoost, Neural Net). Then, we will save the best models to the models folder.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras import layers
from google.colab import drive
# Show every column when a DataFrame is displayed (the dataset is wide).
pd.set_option("display.max_columns", None)

# Mount Google Drive so the project files under /content/drive are reachable.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the final feature dataset on the mounted Drive.
dataset_csv_path = "/content/drive/MyDrive/Tennis_Project/data/1finalDataset.csv"

# NOTE(review): the CSV's saved row index is read back as an "Unnamed: 0"
# column, which is later passed to the models as a feature. Consider
# `index_col=0` -- but that changes the feature count of the saved models,
# so confirm against whatever consumes them first.
final_dataset = pd.read_csv(dataset_csv_path)

# Bare last expression: renders the frame with the notebook's rich display.
final_dataset

Unnamed: 0,AGE_DIFF,ATP_POINTS_DIFF,ATP_RANK_DIFF,BEST_OF,DRAW_SIZE,ELO_DIFF,ELO_GRAD_LAST_100_DIFF,ELO_GRAD_LAST_10_DIFF,ELO_GRAD_LAST_200_DIFF,ELO_GRAD_LAST_25_DIFF,ELO_GRAD_LAST_3_DIFF,ELO_GRAD_LAST_50_DIFF,ELO_GRAD_LAST_5_DIFF,ELO_SURFACE_DIFF,H2H_DIFF,H2H_SURFACE_DIFF,HEIGHT_DIFF,N_GAMES_DIFF,P_1ST_IN_LAST_100_DIFF,P_1ST_IN_LAST_10_DIFF,P_1ST_IN_LAST_200_DIFF,P_1ST_IN_LAST_25_DIFF,P_1ST_IN_LAST_3_DIFF,P_1ST_IN_LAST_50_DIFF,P_1ST_IN_LAST_5_DIFF,P_1ST_WON_LAST_100_DIFF,P_1ST_WON_LAST_10_DIFF,P_1ST_WON_LAST_200_DIFF,P_1ST_WON_LAST_25_DIFF,P_1ST_WON_LAST_3_DIFF,P_1ST_WON_LAST_50_DIFF,P_1ST_WON_LAST_5_DIFF,P_2ND_WON_LAST_100_DIFF,P_2ND_WON_LAST_10_DIFF,P_2ND_WON_LAST_200_DIFF,P_2ND_WON_LAST_25_DIFF,P_2ND_WON_LAST_3_DIFF,P_2ND_WON_LAST_50_DIFF,P_2ND_WON_LAST_5_DIFF,P_ACE_LAST_100_DIFF,P_ACE_LAST_10_DIFF,P_ACE_LAST_200_DIFF,P_ACE_LAST_25_DIFF,P_ACE_LAST_3_DIFF,P_ACE_LAST_50_DIFF,P_ACE_LAST_5_DIFF,P_BP_SAVED_LAST_100_DIFF,P_BP_SAVED_LAST_10_DIFF,P_BP_SAVED_LAST_200_DIFF,P_BP_SAVED_LAST_25_DIFF,P_BP_SAVED_LAST_3_DIFF,P_BP_SAVED_LAST_50_DIFF,P_BP_SAVED_LAST_5_DIFF,P_DF_LAST_100_DIFF,P_DF_LAST_10_DIFF,P_DF_LAST_200_DIFF,P_DF_LAST_25_DIFF,P_DF_LAST_3_DIFF,P_DF_LAST_50_DIFF,P_DF_LAST_5_DIFF,WIN_LAST_100_DIFF,WIN_LAST_10_DIFF,WIN_LAST_200_DIFF,WIN_LAST_25_DIFF,WIN_LAST_3_DIFF,WIN_LAST_50_DIFF,WIN_LAST_5_DIFF,RESULT
0,5.0,1028.0,-69.0,3,32,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.000000,0,0,5.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,1
1,10.3,-257.0,126.0,3,32,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.000000,0,0,8.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0
2,3.7,352.0,-135.0,3,32,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.000000,0,0,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,1
3,2.0,19.0,-7.0,3,32,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.000000,0,0,7.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,1
4,0.9,734.0,-162.0,3,32,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.000000,0,0,11.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95370,3.2,-40.0,250.0,3,4,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.000000,0,0,8.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0
95371,2.2,187.0,-621.0,3,4,-11.415089,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.000000,0,0,-10.0,1,53.820988,53.820988,53.820988,53.820988,53.820988,53.820988,53.820988,69.954545,69.954545,69.954545,69.954545,69.954545,69.954545,69.954545,40.040541,40.040541,40.040541,40.040541,40.040541,40.040541,40.040541,3.203704,3.203704,3.203704,3.203704,3.203704,3.203704,3.203704,49.500000,49.500000,49.500000,49.500000,49.500000,49.500000,49.500000,3.203704,3.203704,3.203704,3.203704,3.203704,3.203704,3.203704,0,0,0,0,0,0,0,1
95372,-2.1,-21.0,124.0,3,4,-21.484266,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,-22.190451,0,0,3.0,-2,-72.147059,-72.147059,-72.147059,-72.147059,-72.147059,-72.147059,-72.147059,-63.450893,-63.450893,-63.450893,-63.450893,-63.450893,-63.450893,-63.450893,-47.714286,-47.714286,-47.714286,-47.714286,-47.714286,-47.714286,-47.714286,-0.713235,-0.713235,-0.713235,-0.713235,-0.713235,-0.713235,-0.713235,-70.333333,-70.333333,-70.333333,-70.333333,-70.333333,-70.333333,-70.333333,-2.477941,-2.477941,-2.477941,-2.477941,-2.477941,-2.477941,-2.477941,0,0,0,0,0,0,0,0
95373,10.3,434.0,-480.0,3,4,-21.029457,0.0,0.0,0.0,0.0,-2.305865e-16,0.0,0.0,-58.946102,0,0,-10.0,226,-8.322480,-6.413177,-7.644898,-7.970157,-7.202387,-7.201900,-8.811394,2.903634,3.002937,2.913438,1.397046,0.017734,3.707832,0.920725,-2.842961,-0.235478,-0.350060,-2.964308,0.613553,-1.651779,0.863322,2.280146,2.584554,2.241952,1.703725,0.876012,2.251091,0.513701,-10.982150,-14.552512,-11.355782,-9.835787,5.698653,-8.524708,-6.877104,1.190927,1.091163,0.910355,0.749971,1.297126,0.634542,1.540796,0,0,0,0,-2,0,0,1


## Split Training vs Testing Data

We'll shuffle the data and use an 85%/15% split between training and testing data.

In [None]:
# Seed NumPy's global RNG so the shuffle below (and therefore the train/test
# split and every accuracy reported afterwards) is reproducible on
# Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Convert data to numpy (exclude the first 5k matches, since ELO hasn't been properly calculated yet)
data = final_dataset.to_numpy(dtype=object)[5000:,:]

# Shuffle the rows in place (np.random.shuffle permutes along the first axis)
np.random.shuffle(data)

# Split the data using an 85% split between training and testing
split = 0.85
value = round(split*len(data))  # index of the first test row

data_train = data[:value,:]
data_test = data[value:,:]

# Sanity check: the two parts must cover every row exactly once
assert len(data_train) + len(data_test) == len(data)

print(f"Training Data: {data_train.shape}")
print(f"Testing Data: {data_test.shape}")

Training Data: (76819, 68)
Testing Data: (13556, 68)


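We map the numeric result column to string labels so that the scikit-learn output (e.g. the printed decision tree) is easier to read. scikit-learn classifiers accept either numeric or string labels; for XGBoost we convert the labels back to 0/1 with the reverse mapper.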

In [None]:
# Define several mappers
def mapper(values):
    """Map numeric results to string class labels, element-wise.

    A value equal to 0 becomes "Player 2 Wins"; any other value becomes
    "Player 1 Wins". The input can be any array-like (including an empty one);
    the result is a NumPy string array with the same shape as the input.
    """
    return np.where(np.asarray(values) == 0, "Player 2 Wins", "Player 1 Wins")


def reverse_mapper(labels):
    """Map string class labels back to integers, element-wise.

    "Player 2 Wins" becomes 0; any other label becomes 1. The result is a
    NumPy integer array with the same shape as the input.
    """
    return np.where(np.asarray(labels) == "Player 2 Wins", 0, 1)

# Features are every column except the last; the last column is the result,
# which is converted to string labels and flattened to 1-D.
# NOTE: despite the `y_pred_` prefix, these arrays hold the ground-truth labels
# taken from the data, not model predictions.

# Training split
x_train, y_pred_train = data_train[:, :-1], np.squeeze(mapper(data_train[:, -1:]))

# Testing split
x_test, y_pred_test = data_test[:, :-1], np.squeeze(mapper(data_test[:, -1:]))

## Train Models

### Train Simple Decision Tree

We can start by training a really simple decision tree (max_depth=4) to see how good it is.

In [None]:
# Fit a shallow decision tree (depth capped at 4) on the training split
decision_sklearn = DecisionTreeClassifier(max_depth=4).fit(x_train, y_pred_train)

# Predict on both splits
predictions_train = decision_sklearn.predict(x_train)
predictions_test = decision_sklearn.predict(x_test)

# Report the fraction of correctly classified matches on each split
train_accuracy = accuracy_score(y_pred_train, predictions_train)
test_accuracy = accuracy_score(y_pred_test, predictions_test)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.656907796248324
Test Accuracy: 0.6637651224550015


In [None]:
# Render the fitted tree as indented text rules. The feature names are every
# column except the last (the target). They are passed as a plain list rather
# than a pandas Index, because a list is the form export_text accepts across
# scikit-learn releases.
text_representation = tree.export_text(
    decision_sklearn,
    feature_names=list(final_dataset.columns[:-1]),
)
print(text_representation)

|--- ELO_DIFF <= -23.81
|   |--- ELO_DIFF <= -161.27
|   |   |--- ELO_DIFF <= -272.74
|   |   |   |--- ELO_DIFF <= -402.64
|   |   |   |   |--- class: Player 2 Wins
|   |   |   |--- ELO_DIFF >  -402.64
|   |   |   |   |--- class: Player 2 Wins
|   |   |--- ELO_DIFF >  -272.74
|   |   |   |--- ELO_SURFACE_DIFF <= -11.34
|   |   |   |   |--- class: Player 2 Wins
|   |   |   |--- ELO_SURFACE_DIFF >  -11.34
|   |   |   |   |--- class: Player 2 Wins
|   |--- ELO_DIFF >  -161.27
|   |   |--- ELO_SURFACE_DIFF <= -48.23
|   |   |   |--- AGE_DIFF <= -0.35
|   |   |   |   |--- class: Player 2 Wins
|   |   |   |--- AGE_DIFF >  -0.35
|   |   |   |   |--- class: Player 2 Wins
|   |   |--- ELO_SURFACE_DIFF >  -48.23
|   |   |   |--- ATP_RANK_DIFF <= -8.50
|   |   |   |   |--- class: Player 1 Wins
|   |   |   |--- ATP_RANK_DIFF >  -8.50
|   |   |   |   |--- class: Player 2 Wins
|--- ELO_DIFF >  -23.81
|   |--- ELO_DIFF <= 108.47
|   |   |--- ELO_SURFACE_DIFF <= 41.98
|   |   |   |--- ATP_RANK_DIFF <=

As we can see in the output, the tree relies almost entirely on the ELO features (ELO_DIFF and ELO_SURFACE_DIFF), with only a few minor splits on age and ATP rank. We don't really want that, since otherwise we could just predict using ELO alone.

Let's see if a Random Forest works better :)

### Train Random Forest

We start by training a pretty big random forest (n_estimators=500)

In [None]:
# Instantiate a Random Forest.
# random_state pins the bootstrap samples and per-split feature subsets so the
# reported accuracies are reproducible; n_jobs=-1 builds the 500 trees on all
# available cores (it does not change the fitted model).
forest_sklearn = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    max_features="sqrt",
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
forest_sklearn = forest_sklearn.fit(x_train, y_pred_train)

# Make predictions and test accuracy
predictions_train = forest_sklearn.predict(x_train)
predictions_test = forest_sklearn.predict(x_test)
print("Train Accuracy: "+str(accuracy_score(y_pred_train, predictions_train)))
print("Test Accuracy: "+str(accuracy_score(y_pred_test, predictions_test)))

Train Accuracy: 0.7097072338874497
Test Accuracy: 0.6751254057244025


That's a slight improvement :). Let's try a simpler, less overfitted model.

In [None]:
# Instantiate a smaller, more constrained Random Forest: fewer and shallower
# trees, plus minimum sample counts for splitting a node and for each leaf.
# random_state makes the run reproducible; n_jobs=-1 trains the trees in
# parallel (it does not change the fitted model).
forest_sklearn2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    min_samples_split=400,
    min_samples_leaf=250,
    max_features="sqrt",
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
forest_sklearn2 = forest_sklearn2.fit(x_train, y_pred_train)

# Make predictions and test accuracy
predictions_train = forest_sklearn2.predict(x_train)
predictions_test = forest_sklearn2.predict(x_test)
print("Train Accuracy: "+str(accuracy_score(y_pred_train, predictions_train)))
print("Test Accuracy: "+str(accuracy_score(y_pred_test, predictions_test)))

Train Accuracy: 0.6698993738528228
Test Accuracy: 0.6718058424313957


Seems like accuracy kinda decreased. I'm going to run a quick GridSearch to see if we could improve this. Let's see if we can find the best hyperparameters :)

The GridSearchCV run (not shown here) wasn't that successful: its test accuracy was only 0.6611, lower than both random forests above. Let's train an XGBoost model and see if it does better.

### Train XGBoost Algorithm

Let's try with XGBoost and see if we can get better results.

In [None]:
# XGBoost is trained on the numeric (0/1) form of the labels, so convert the
# string labels once up front and reuse them for fitting and scoring.
y_train_numeric = reverse_mapper(y_pred_train)
y_test_numeric = reverse_mapper(y_pred_test)

# Hyperparameters for the first (large, unregularized) XGBoost model
xgb_params = dict(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.7,
)

# Build and fit the classifier
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(x_train, y_train_numeric)

# Predict on both splits
predictions_train = xgb_model.predict(x_train)
predictions_test = xgb_model.predict(x_test)

# Report accuracy on each split
train_accuracy = accuracy_score(y_train_numeric, predictions_train)
test_accuracy = accuracy_score(y_test_numeric, predictions_test)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.9684453064996941
Test Accuracy: 0.6659781646503393


In [None]:
# Pair each feature column (all but the last, which is the target) with the
# importance the fitted XGBoost model assigned to it, most important first.
importance_pairs = zip(final_dataset.columns[:-1], xgb_model.feature_importances_)
sorted_mapped_results = sorted(importance_pairs, key=lambda pair: pair[1], reverse=True)

# Collect the ordered labels and importances, printing each pair as we go
sorted_labels = []
sorted_importances = []
for label, importance in sorted_mapped_results:
    sorted_labels.append(label)
    sorted_importances.append(importance)
    print(f"{label}: {importance:.4f}")


ELO_DIFF: 0.0914
ELO_SURFACE_DIFF: 0.0448
BEST_OF: 0.0228
ATP_POINTS_DIFF: 0.0190
ATP_RANK_DIFF: 0.0166
AGE_DIFF: 0.0157
WIN_LAST_200_DIFF: 0.0142
H2H_DIFF: 0.0142
P_BP_SAVED_LAST_200_DIFF: 0.0140
P_1ST_WON_LAST_5_DIFF: 0.0140
ELO_GRAD_LAST_200_DIFF: 0.0138
H2H_SURFACE_DIFF: 0.0136
P_1ST_WON_LAST_10_DIFF: 0.0136
P_1ST_WON_LAST_3_DIFF: 0.0136
P_ACE_LAST_50_DIFF: 0.0136
P_2ND_WON_LAST_50_DIFF: 0.0136
P_1ST_IN_LAST_200_DIFF: 0.0135
P_2ND_WON_LAST_200_DIFF: 0.0135
P_1ST_WON_LAST_200_DIFF: 0.0134
P_1ST_WON_LAST_25_DIFF: 0.0133
P_2ND_WON_LAST_5_DIFF: 0.0133
P_1ST_IN_LAST_5_DIFF: 0.0133
P_DF_LAST_25_DIFF: 0.0132
P_2ND_WON_LAST_3_DIFF: 0.0132
ELO_GRAD_LAST_100_DIFF: 0.0132
N_GAMES_DIFF: 0.0132
P_DF_LAST_50_DIFF: 0.0131
P_2ND_WON_LAST_10_DIFF: 0.0131
P_1ST_IN_LAST_50_DIFF: 0.0131
P_BP_SAVED_LAST_5_DIFF: 0.0131
P_1ST_IN_LAST_100_DIFF: 0.0131
P_BP_SAVED_LAST_50_DIFF: 0.0130
P_1ST_IN_LAST_3_DIFF: 0.0130
P_1ST_WON_LAST_50_DIFF: 0.0130
P_1ST_IN_LAST_25_DIFF: 0.0129
P_DF_LAST_200_DIFF: 0.0129
P_ACE_L

Okay, this is overfitting significantly (train accuracy 0.97 vs. test accuracy 0.67). Let's try regularization.

In [None]:
# Numeric (0/1) labels, converted once and reused for fitting and scoring
y_train_numeric = reverse_mapper(y_pred_train)
y_test_numeric = reverse_mapper(y_pred_test)

# A smaller, regularized configuration: fewer and shallower trees, a lower
# learning rate, stronger row/column subsampling, and L1/L2 penalties.
regularized_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
}

# Build and fit the classifier (replaces the previous `xgb_model`)
xgb_model = XGBClassifier(**regularized_params)
xgb_model.fit(x_train, y_train_numeric)

# Predict on both splits
predictions_train = xgb_model.predict(x_train)
predictions_test = xgb_model.predict(x_test)

# Report accuracy on each split
train_accuracy = accuracy_score(y_train_numeric, predictions_train)
test_accuracy = accuracy_score(y_test_numeric, predictions_test)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.6781395227743137
Test Accuracy: 0.6737238123340218


In [None]:
# Persist the regularized XGBoost model as JSON in the project's models folder
# on Drive. (The doubled slash in the path is kept as-is; POSIX path resolution
# treats it like a single separator.)
xgb_model_path = "/content/drive/MyDrive/Tennis_Project//models/xgb_model.json"
xgb_model.save_model(xgb_model_path)

That's slightly better. Let's run a gridsearch to really make sure.

In [None]:
# Hyperparameter grid to search: 2*2*2*1*1*2*2 = 32 combinations, which with
# 5-fold cross-validation means 160 cross-validation fits.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[5, 10],
    learning_rate=[0.01, 0.05],
    subsample=[0.7],
    colsample_bytree=[0.6],
    reg_alpha=[0.1, 0.5],
    reg_lambda=[0.5, 1.0],
)

# Base estimator for the search.
# NOTE: this rebinds `xgb_model` to a fresh, unfitted classifier, so the
# previously trained model is no longer reachable through that name.
xgb_model = XGBClassifier()

# Exhaustive search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)

# Fit on the training split with numeric (0/1) labels; left as the cell's last
# expression so the fitted search object is displayed.
grid_search.fit(x_train, reverse_mapper(y_pred_train))

In [None]:
# Show the winning hyperparameter combination
print(f"Best Parameters: {grid_search.best_params_}")

# Retrieve the estimator that the grid search refit with those parameters
best_xgb_model = grid_search.best_estimator_

# Predict on both splits
predictions_train = best_xgb_model.predict(x_train)
predictions_test = best_xgb_model.predict(x_test)

# Score against the numeric (0/1) form of the labels
train_accuracy = accuracy_score(reverse_mapper(y_pred_train), predictions_train)
test_accuracy = accuracy_score(reverse_mapper(y_pred_test), predictions_test)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Persist the best estimator found by the grid search as JSON in the project's
# models folder on Drive. The path is absolute ("/content/..."), matching the
# other Drive paths in this notebook; a relative "./content/..." would be
# resolved against the current working directory, not the mounted Drive.
best_xgb_model = grid_search.best_estimator_
best_xgb_model.save_model("/content/drive/MyDrive/Tennis_Project/models/best_xgb_model.json")