In [4]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

import seaborn as sns
import matplotlib.patches as mpatches

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier




In [5]:
def conf_matrix_classifier_from_predictions(classifier, X, y, name = ""):
    disp = ConfusionMatrixDisplay.from_estimator(
        classifier,
        X,
        y,
        display_labels=['0','1'],
        cmap=plt.cm.Blues,
        normalize=None,
    )
    disp.ax_.set_title('Confusion Matrix: ' + name)
    plt.show()

## Load data

In [6]:
df_encoded = pd.read_csv("data/encoded_data.csv", sep="\t")
df = pd.read_csv("data/combined_dataframe_clean.csv", sep="\t")

## Split dataframe

In [7]:
train_years= [2019, 2020, 2021]
test_years = [2022]

y_train_years_GP = df[df['Year'].isin(train_years)]['GoodPitStop']
y_test_years_GP= df[df['Year'].isin(test_years)]['GoodPitStop']
X_train_years_GP = df_encoded.loc[y_train_years_GP.index]
X_test_years_GP = df_encoded.loc[y_test_years_GP.index]

y_train_years_HP = df[df['Year'].isin(train_years)]['HasPitLap']
y_test_years_HP= df[df['Year'].isin(test_years)]['HasPitLap']
X_train_years_HP = df_encoded.loc[y_train_years_HP.index]
X_test_years_HP = df_encoded.loc[y_test_years_HP.index]


from sklearn.model_selection import train_test_split 
X_train_GP, X_temp_GP, y_train_GP, y_temp_GP = train_test_split(df_encoded, df['GoodPitStop'], test_size=0.3, random_state=4815)
X_val_GP, X_test_GP, y_val_GP, y_test_GP = train_test_split(X_temp_GP, y_temp_GP, test_size=0.5, random_state=4815)
X_train_HP, X_temp_HP, y_train_HP, y_temp_HP = train_test_split(df_encoded, df['HasPitLap'], test_size=0.3, random_state=4815)
X_val_HP, X_test_HP, y_val_HP, y_test_HP = train_test_split(X_temp_HP, y_temp_HP, test_size=0.5, random_state=4815)

## PCA

In [8]:
from sklearn.decomposition import PCA

# Create a PCA object with the specified variance threshold
pca = PCA(n_components=0.95)

# Fit the PCA on the training data and transform both the training and test data
X_train_pca = pca.fit_transform(X_train_HP)
X_test_pca = pca.transform(X_test_HP)

# Print the number of features after PCA
print(f"Number of features after PCA: {X_train_pca.shape[1]}")

Number of features after PCA: 48


In [9]:
# Fit the PCA on the training data and transform both the training and validation data
X_train_pca = pca.fit_transform(X_train_HP)
X_val_pca = pca.transform(X_val_HP)
X_test_pca = pca.transform(X_test_HP)


## SVM

In [10]:

#Creation an SVM classifier
svm_model = svm.SVC(class_weight="balanced")

#Definition the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1, 10]
}

# Perform cross-validation and grid search
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search.fit(X_val_HP, y_val_HP)

In [11]:
#best hyperparameters and evaluate on the validation set
best_svm = grid_search.best_estimator_
y_test_pred = best_svm.predict(X_test_HP)
f1_test = f1_score(y_test_HP, y_test_pred)
print("Best hyperparameters:", grid_search.best_params_)
print("Test F1 score:", f1_test)

Best hyperparameters: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Test F1 score: 0.41294298921417566


In [12]:
print("Grid search results:")
for params, mean_score, std_score in zip(grid_search.cv_results_['params'],
                                         grid_search.cv_results_['mean_test_score'],
                                         grid_search.cv_results_['std_test_score']):
    print("Hyperparameters:", params)
    print("Mean F1 score:", mean_score)
    print("Standard deviation:", std_score)
    print()

Grid search results:
Hyperparameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
Mean F1 score: 0.20930654546778493
Standard deviation: 0.01200121528617797

Hyperparameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
Mean F1 score: 0.27564234722626973
Standard deviation: 0.028016892918740495

Hyperparameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
Mean F1 score: 0.3544660299277186
Standard deviation: 0.029101825547442513

Hyperparameters: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Mean F1 score: 0.20930654546778493
Standard deviation: 0.01200121528617797

Hyperparameters: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Mean F1 score: 0.0
Standard deviation: 0.0

Hyperparameters: {'C': 0.1, 'gamma': 1, 'kernel': 'poly'}
Mean F1 score: 0.3780911649905501
Standard deviation: 0.04281219563079282

Hyperparameters: {'C': 0.1, 'gamma': 10, 'kernel': 'linear'}
Mean F1 score: 0.20930654546778493
Standard deviation: 0.01200121528617797

Hyperparameters: {'C': 0.1, 'gamma': 10, 'kernel': 'rbf'}
Mean

### With PCA

In [13]:
svm_model = svm.SVC(class_weight="balanced")

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1, 10]
}

grid_search_pca = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search_pca.fit(X_val_pca, y_train_HP)

ValueError: Found input variables with inconsistent numbers of samples: [9385, 43796]

In [None]:
#best hyperparameters 
best_svm = grid_search_pca.best_estimator_
y_test_pred = best_svm.predict(X_test_pca)
f1_test = f1_score(y_test_HP, y_test_pred)
print("Best hyperparameters:", grid_search_pca.best_params_)
print("Test F1 score:", f1_test)

Best hyperparameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Test F1 score: 0.5325238508239375


### With good pit stop

In [None]:
svm_model = svm.SVC(class_weight="balanced")

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1, 10]
}

grid_search_gp = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search_gp.fit(X_val_GP, y_val_GP)

In [None]:
best_svm = grid_search_gp.best_estimator_
y_test_pred = best_svm.predict(X_test_GP)
f1_test = f1_score(y_test_GP, y_test_pred)
print("Best hyperparameters:", grid_search_gp.best_params_)
print("Test F1 score:", f1_test)

Best hyperparameters: {'C': 0.1, 'gamma': 1, 'kernel': 'poly'}
Test F1 score: 0.4177449168207024


### With year split

In [None]:
#Definition the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1, 10]
}

# Perform cross-validation and grid search
grid_search_years = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search_years.fit(X_train_years_HP, y_train_years_HP)

In [None]:
#best hyperparameters and evaluate on the validation set
best_svm = grid_search_years.best_estimator_
y_test_pred = best_svm.predict(X_test_years_HP)
f1_test = f1_score(y_test_years_HP, y_test_pred)
print("Best hyperparameters:", grid_search_years.best_params_)
print("Test F1 score:", f1_test)

Best hyperparameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Test F1 score: 0.4101796407185629


## Random forest

In [None]:
rf_model = RandomForestClassifier(class_weight="balanced")

In [None]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Perform cross-validation and grid search on the training set
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search_rf.fit(X_val_HP, y_val_HP)


In [None]:
# Retrieve the best hyperparameters and evaluate on the test set
best_rf = grid_search_rf.best_estimator_
y_test_pred = best_rf.predict(X_test_HP)
f1_test = f1_score(y_test_HP, y_test_pred)

# Print the best hyperparameters and the corresponding test F1 score
print("Best hyperparameters:", grid_search_rf.best_params_)
print("Test F1 score with best hyperparameters:", f1_test)

Best hyperparameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
Test F1 score with best hyperparameters: 0.3690637720488467


### With good pit stop

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Perform cross-validation and grid search on the training set
grid_search_rf_gp = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search_rf_gp.fit(X_val_GP, y_val_GP)

In [None]:
# Retrieve the best hyperparameters and evaluate on the test set
best_rf = grid_search_rf_gp.best_estimator_
y_test_pred = best_rf.predict(X_test_GP)
f1_test = f1_score(y_test_GP, y_test_pred)

# Print the best hyperparameters and the corresponding test F1 score
print("Best hyperparameters:", grid_search_rf_gp.best_params_)
print("Test F1 score with best hyperparameters:", f1_test)

Best hyperparameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}
Test F1 score with best hyperparameters: 0.2037141249296567


### With PCA

In [None]:
# Perform cross-validation and grid search on the training set
grid_search_rf_pca = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search_rf_pca.fit(X_val_pca, y_train_HP)

In [None]:
# Retrieve the best hyperparameters and evaluate on the test set
best_rf = grid_search_rf_pca.best_estimator_
y_test_pred = best_rf.predict(X_test_pca)
f1_test = f1_score(y_test_HP, y_test_pred)

# Print the best hyperparameters and the corresponding test F1 score
print("Best hyperparameters:", grid_search_rf_pca.best_params_)
print("Test F1 score with best hyperparameters:", f1_test)

Best hyperparameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 300}
Test F1 score with best hyperparameters: 0.37016574585635365
