In [2]:
#Imports
import warnings

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


In [None]:
def conf_matrix_classifier_from_predictions(classifier, X, y, name = ""):
    """Draw and show the confusion matrix of ``classifier`` on ``X`` / ``y``.

    Despite the function name, the matrix is built with
    ``ConfusionMatrixDisplay.from_estimator``: the estimator itself is handed
    over together with the features, rather than precomputed predictions.

    Args:
        classifier: Estimator passed to ``ConfusionMatrixDisplay.from_estimator``.
        X: Features passed alongside the estimator.
        y: True labels the matrix is computed against.
        name: Text appended to the ``'Confusion Matrix: '`` plot title.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    matrix_display = ConfusionMatrixDisplay.from_estimator(
        classifier, X, y,
        normalize=None,             # raw counts, no normalisation
        cmap=plt.cm.Blues,
        display_labels=['0','1'],   # tick labels are fixed to these two classes
    )
    title = 'Confusion Matrix: ' + name
    matrix_display.ax_.set_title(title)
    plt.show()

## Load data

In [5]:
# Read the two tab-separated inputs: the encoded table first, then the
# combined/clean table. Later cells take the targets from `df` and look the
# matching rows up in `df_encoded` by index label.
# NOTE(review): this presumes both files hold the same rows in the same order - confirm.
df_encoded, df = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("data/encoded_data.csv", "data/combined_dataframe_clean.csv")
)

## Split dataframe

In [None]:
# --- Split by year: rows whose Year is in train_years vs. test_years ---------
train_years = [2019, 2020, 2021]
test_years = [2022]

# Build each row mask once and reuse it for both targets, using a single
# .loc[mask, column] lookup instead of chained df[mask][column] indexing.
is_train_year = df['Year'].isin(train_years)
is_test_year = df['Year'].isin(test_years)

# Targets come from `df`; the matching feature rows are looked up in
# `df_encoded` by index label.
# NOTE(review): assumes `df` and `df_encoded` share the same row index - confirm.
y_train_years_GP = df.loc[is_train_year, 'GoodPitStop']
y_test_years_GP = df.loc[is_test_year, 'GoodPitStop']
X_train_years_GP = df_encoded.loc[y_train_years_GP.index]
X_test_years_GP = df_encoded.loc[y_test_years_GP.index]

y_train_years_HP = df.loc[is_train_year, 'HasPitLap']
y_test_years_HP = df.loc[is_test_year, 'HasPitLap']
X_train_years_HP = df_encoded.loc[y_train_years_HP.index]
X_test_years_HP = df_encoded.loc[y_test_years_HP.index]

# --- Random hold-out split over all rows -------------------------------------
# Both targets are split with the same test fraction and the same seed.
TEST_SIZE = 0.3
SPLIT_SEED = 4815

X_train_GP, X_test_GP, y_train_GP, y_test_GP = train_test_split(
    df_encoded, df['GoodPitStop'], test_size=TEST_SIZE, random_state=SPLIT_SEED
)
X_train_HP, X_test_HP, y_train_HP, y_test_HP = train_test_split(
    df_encoded, df['HasPitLap'], test_size=TEST_SIZE, random_state=SPLIT_SEED
)

## PCA

In [None]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain at least that fraction of the variance.
PCA_EXPLAINED_VARIANCE = 0.95
pca = PCA(n_components=PCA_EXPLAINED_VARIANCE)

# Fit on the training split only, then project the test split with the same
# fitted components, so the test rows do not influence the projection.
# NOTE(review): PCA is variance-driven; confirm the encoded features are already
# on comparable scales, otherwise large-valued columns dominate the components.
# NOTE(review): the SVM section fits on X_train_HP, not on these projections - confirm intent.
X_train_pca = pca.fit_transform(X_train_HP)
X_test_pca = pca.transform(X_test_HP)

# Report how many components were kept.
print(f"Number of features after PCA: {X_train_pca.shape[1]}")

## SVM

In [None]:
# Create an SVM classifier; class_weight="balanced" lets SVC reweight the
# classes based on their frequencies in y.
svm_model = svm.SVC(class_weight="balanced")

# Define the parameter grid for hyperparameter tuning.
# SVC only uses `gamma` for the 'rbf' and 'poly' kernels here; the 'linear'
# kernel ignores it. A single flat grid would therefore cross-validate every
# linear model three times with identical results (27 candidates, 21 distinct).
# The linear kernel gets its own sub-grid without `gamma`.
# Note: if a linear model wins, `best_params_` will not contain a 'gamma' key.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10],
        'gamma': [0.1, 1, 10],
    },
]

# Perform 5-fold cross-validated grid search, selecting by F1 score.
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring='f1', cv=5)
grid_search.fit(X_train_HP, y_train_HP)



In [None]:
# Take the best estimator found by the grid search and score it on the
# held-out test split (X_test_HP / y_test_HP), then report the winning
# hyperparameters followed by the test F1 score.
best_svm = grid_search.best_estimator_
y_test_pred = best_svm.predict(X_test_HP)
f1_test = metrics.f1_score(y_true=y_test_HP, y_pred=y_test_pred)

print("Best hyperparameters:", grid_search.best_params_)
print("Test F1 score:", f1_test)

## Random forest