In [1]:
import pandas as pd
import numpy as np
import warnings
import statistics

from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn import set_config

# Render estimators and pipelines as HTML diagrams when they are displayed in the notebook.
set_config(display='diagram')

In [2]:
# Load the processed feature table (first CSV column is the index, 'date' parsed as datetime).
df_all = pd.read_csv(
    '../data/processed/0.3-feature-processing.csv',
    parse_dates=['date'],
    index_col=0,
)

# Keep seasons after 1959, order rows chronologically, and move 'driverRef' and
# 'finished' into the index (alongside a fresh positional level) so they are no
# longer regular columns of the frame.
df = (
    df_all[df_all['year'] > 1959]
    .sort_values(by=['date'])
    .reset_index(drop=True)
    .set_index(['driverRef', 'finished'], append=True)
)

In [3]:
# Race dates of the last 5 rounds of the 2021 season, keyed by round number.
race_rounds_dates = {
    round_number: datetime(2021, month, day)
    for round_number, (month, day) in {
        18: (11, 7),
        19: (11, 14),
        20: (11, 21),
        21: (12, 5),
        22: (12, 12),
    }.items()
}

def splitByRaceDate(race_date, data=None):
    """Split race results chronologically around ``race_date``.

    Rows dated strictly before ``race_date`` form the training set (the cut-off
    date itself is excluded from training); rows dated on or after it form the
    test set.

    Parameters
    ----------
    race_date : datetime-like
        Cut-off date, compared against the 'date' column.
    data : pandas.DataFrame, optional
        Frame to split. It must contain the columns 'date', 'positionOrder'
        and 'round'. When omitted, the notebook-level ``df`` is used, which
        keeps existing single-argument calls working unchanged.

    Returns
    -------
    X_train : pandas.DataFrame
        Training rows without 'positionOrder', 'date', 'round' and 'year'.
    X_test : pandas.DataFrame
        Test rows without 'positionOrder' only; 'date', 'round' and 'year'
        are kept. NOTE(review): presumably so later cells can select a single
        race from the test set -- confirm, and drop them before predicting.
    y_train : pandas.Series
        'positionOrder' of the training rows.
    y_test : pandas.DataFrame
        'positionOrder' and 'round' of the test rows.

    Notes
    -----
    ``Index.difference`` returns the remaining column labels in sorted order,
    so the feature columns come back sorted rather than in file order.
    """
    # Fall back to the notebook-global frame only when no frame is passed in.
    if data is None:
        data = df

    is_before_race = data['date'] < race_date
    df_train = data.loc[is_before_race]
    df_test = data.loc[~is_before_race] if not data['date'].isna().any() else data.loc[data['date'] >= race_date]

    X_train = df_train[df_train.columns.difference(['positionOrder', 'date', 'round', 'year'])]
    X_test = df_test[df_test.columns.difference(['positionOrder'])]

    y_train = df_train['positionOrder']
    y_test = df_test[['positionOrder', 'round']]

    return X_train, X_test, y_train, y_test

In [4]:
# Cut at round 18 (2021-11-07): rows dated before it are training data, rows on or after it are test data.
X_train, X_test, y_train, y_test = splitByRaceDate(race_rounds_dates[18])

In [5]:
# One-hot encode 'circuitId'; every other column is passed through unchanged.
# With drop='first' combined with handle_unknown='ignore', a circuit unseen during
# fit is encoded as all zeros, i.e. the same encoding as the dropped first category.
one_hot_encoder = ColumnTransformer(
    transformers=[
        (
            'one_hot',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['circuitId'],
        ),
    ],
    remainder='passthrough',
)

In [6]:
def my_custom_loss_func(y_true, y_pred):
    """Mean squared error between the true values and the *ranks* of the predictions.

    The raw predictions are replaced by their rank among all predictions in the
    same call (pandas default ranking: ascending, ties get the average rank)
    before the error is computed.

    NOTE(review): ranking over the whole ``y_pred`` is only meaningful if each
    call receives the predictions of a single race -- confirm against the CV
    splitter used with this loss.
    """
    predicted_ranks = pd.DataFrame(y_pred).rank()
    loss = mean_squared_error(y_true=y_true, y_pred=predicted_ranks)
    return loss
    
# greater_is_better=False marks this as a loss: scikit-learn negates it, so reported
# scores are <= 0 and values closer to 0 are better.
scorer = make_scorer(my_custom_loss_func, greater_is_better=False)

In [7]:
# 55 chronological folds, each validated on the next 20 rows.
# NOTE(review): 20 rows presumably corresponds to one race's field -- confirm, since the
# rank-based scorer ranks predictions across the whole validation fold.
tscv = TimeSeriesSplit(n_splits=55, test_size=20)

In [8]:
def calculateMSEgridstart(tscv):
    """Baseline score: use the 'gridStart' column itself as the prediction.

    For every validation fold produced by ``tscv`` over the notebook-level
    ``X_train``, the MSE between ``y_train`` and ``X_train['gridStart']`` on
    that fold is computed; the mean of the per-fold MSEs is returned.
    """
    fold_scores = [
        mean_squared_error(
            y_train.iloc[fold_index],
            X_train.iloc[fold_index]['gridStart'],
        )
        for _, fold_index in tscv.split(X_train)
    ]
    return statistics.mean(fold_scores)

In [9]:
# Baseline to beat: mean per-fold MSE when 'gridStart' is used directly as the prediction of 'positionOrder'.
print('Mean MSE baseline: ', round(calculateMSEgridstart(tscv),3))

Mean MSE baseline:  25.565


In [10]:
def gridSearchCV_results(X_train, y_train, cv, scorer, pipeline, param_grid, output):
    """Run an exhaustive grid search and return the best estimator.

    Parameters
    ----------
    X_train : array-like or DataFrame
        Training features.
    y_train : array-like, Series or single-column DataFrame
        Training target; it is flattened to 1-D before fitting.
    cv : cross-validation splitter
        Passed to ``GridSearchCV`` as ``cv``.
    scorer : callable or str
        Passed to ``GridSearchCV`` as ``scoring``.
    pipeline : estimator
        Estimator (typically a ``Pipeline``) to tune.
    param_grid : dict or list of dicts
        Parameter grid to search.
    output : bool
        When truthy, print the best parameters and the best mean CV score.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search, i.e. the best candidate
        refitted on the full training data (``refit`` is left at its default).

    Notes
    -----
    The printed score is whatever ``scorer`` measures, not an accuracy. For a
    scorer built with ``greater_is_better=False`` it is the negated loss, so
    values closer to zero are better.
    """
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,  # use all available cores
        scoring=scorer,
        verbose=1,
    )
    # np.ravel accepts Series, single-column DataFrames and plain arrays alike.
    search.fit(X_train, np.ravel(y_train))
    if output:
        print('Best parameters:\n', search.best_params_)
        print('Best mean cross-validation score:\n', round(search.best_score_, 3))
    return search.best_estimator_

### Ridge regression

In [11]:
# Ridge pipeline: one-hot encode circuits -> scaling step -> Ridge regression.
pipeline = Pipeline(steps=[
    ('one_hot', one_hot_encoder),
    # Placeholder: the 'scaler' entry of param_grid replaces this step for every candidate.
    ('scaler', StandardScaler()),
    ('model', Ridge(random_state=42)),
])

# Search space: 2 scaler options x 13 regularisation strengths = 26 candidates.
param_grid = {
    # Either no scaling at all, or variance scaling without centering.
    'scaler': [
        'passthrough',
        StandardScaler(with_mean=False),
    ],
    'model__alpha': [
        0, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05,
        0.1, 0.15, 0.2, 0.3, 0.5, 1,
    ],
}

In [12]:
# Tune the Ridge pipeline over param_grid with the time-series folds and keep the best estimator.
ridgeRegression = gridSearchCV_results(X_train, y_train, tscv, scorer, pipeline, param_grid, output=True)

Fitting 55 folds for each of 26 candidates, totalling 1430 fits




Best parameters:
 {'model__alpha': 0.15, 'scaler': StandardScaler(with_mean=False)}
Best mean accuracy score in cross-validation:
 -23.996
