In [25]:
import pandas as pd
import numpy as np
import warnings
import statistics

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.decomposition import PCA


from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn import set_config

from datetime import datetime

# Show scikit-learn estimators as diagrams when they are displayed in the notebook.
set_config(display='diagram')

In [4]:
# Load the processed feature table; the first CSV column becomes the index
# and 'date' is parsed as a datetime.
df_all = pd.read_csv(
    '../data/processed/0.3-feature-processing.csv',
    parse_dates=['date'],
    index_col=0,
)

# Keep rows with year > 1959, order them by date, and append driverRef and
# finished as extra index levels on top of a fresh positional index.
df = (
    df_all[df_all['year'] > 1959]
    .sort_values(by=['date'])
    .reset_index(drop=True)
    .set_index(['driverRef', 'finished'], append=True)
)

# 18th round of the 2021 season: rows from this date onward form the test set.
split_date = datetime(2021, 11, 7)

df_train = df[df['date'] < split_date]
df_test = df[df['date'] >= split_date]

In [30]:
# Inspect the last rows of the date-sorted frame and its extended index.
df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,driverId,constructorId,gridStart,positionOrder,year,round,circuitId,date,ageAtRace,ageAtDebut,...,constructorStandingsWins,previousRaceGridStart,previousRacePosition,racesWon,racesRetired,racesFinished,polePositions,racesWonByConstructor,racesRetiredByConstructor,percentageOfBestQuali
Unnamed: 0_level_1,driverRef,finished,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
23400,alonso,1.0,4,214,11,8,2021,22,24,2021-12-12,40,19,...,1.0,13.0,13.0,32.0,60.0,273.0,22.0,1.0,5.0,101.645374
23401,hamilton,1.0,1,131,2,2,2021,22,24,2021-12-12,36,22,...,9.0,1.0,1.0,103.0,24.0,262.0,103.0,124.0,61.0,100.451838
23402,mick_schumacher,1.0,854,210,19,14,2021,22,24,2021-12-12,22,22,...,0.0,19.0,20.0,0.0,3.0,18.0,0.0,0.0,48.0,103.406448
23403,max_verstappen,1.0,830,9,1,1,2021,22,24,2021-12-12,24,17,...,10.0,3.0,2.0,19.0,28.0,112.0,12.0,74.0,111.0,100.0
23404,mazepin,,853,210,20,20,2021,22,77,2021-12-12,22,22,...,0.0,19.0,18.0,0.0,4.0,2.0,0.0,0.0,46.0,115.0


In [31]:
# Training features exclude the target and the calendar columns; test
# features exclude only the target, so 'round', 'date' and 'year' stay
# available on the test rows.
target_col = 'positionOrder'
train_excluded = [target_col, 'date', 'round', 'year']

X_train = df_train[df_train.columns.difference(train_excluded)]
X_test = df_test[df_test.columns.difference([target_col])]

# Training target is the finishing position; the test target keeps 'round'
# alongside it.
y_train = df_train[target_col]
y_test = df_test[[target_col, 'round']]

# X = df_train[['gridStart','lastRaceRank']]

In [32]:
# One-hot encode the three identifier columns; every other column is passed
# through unchanged.
categorical_columns = ['driverId', 'circuitId', 'constructorId']
categorical_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

one_hot_encoder = ColumnTransformer(
    transformers=[('one_hot', categorical_encoder, categorical_columns)],
    remainder='passthrough',
)

In [33]:
# Ridge regression on the one-hot encoded features.
# The default scaler is created with with_mean=False: StandardScaler cannot
# centre sparse input, and the one-hot step presumably yields a sparse matrix
# here (TODO confirm), so the previous default StandardScaler() would fail if
# this pipeline were fitted directly. Every candidate in param_grid overrides
# the 'scaler' step, so grid-search results are unaffected.
pipeline = Pipeline([
    ('one_hot', one_hot_encoder),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', Ridge(random_state=42))
])

# Search space: no scaling vs. variance-only scaling, across a range of
# regularisation strengths (alpha=0 is unregularised least squares).
param_grid = {
    'scaler':['passthrough', StandardScaler(with_mean=False)],
    'model__alpha':[0, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1],
}
#pipeline.get_params()

In [38]:
def my_custom_loss_func(y_true, y_pred):
    """Mean squared error between true positions and the rank of the predictions.

    The raw regression outputs are converted to ranks (1 = lowest predicted
    value, ties share the average rank) among the samples being scored, and
    those ranks are compared with ``y_true``.

    Parameters
    ----------
    y_true : array-like
        True finishing positions of the scored samples.
    y_pred : array-like
        Raw model predictions for the same samples, in the same order.

    Returns
    -------
    float
        MSE between ``y_true`` and the ranked predictions.
    """
    # Previously the predictions were only wrapped in a DataFrame and never
    # ranked, so the score was plain MSE on the raw outputs.
    y_rank = pd.Series(np.asarray(y_pred).ravel()).rank()
    return mean_squared_error(y_true=y_true, y_pred=y_rank)

# Lower MSE is better, so the scorer negates the loss for model selection.
scorer = make_scorer(my_custom_loss_func, greater_is_better=False)

In [37]:
# Time-ordered cross-validation: 55 splits, each validating on 20 rows.
tscv = TimeSeriesSplit(n_splits=55, test_size=20)

# Exhaustive search over param_grid, scored with the custom scorer on all cores.
grid_search_settings = dict(cv=tscv, n_jobs=-1, scoring=scorer, verbose=1)
model = GridSearchCV(estimator=pipeline, param_grid=param_grid, **grid_search_settings)
model.fit(X_train, y_train)

print('Best parameters:\n', model.best_params_)
print('Best mean score in cross-validation:\n', model.best_score_ )

Fitting 55 folds for each of 24 candidates, totalling 1320 fits




Best parameters:
 {'model__alpha': 0.005, 'scaler': 'passthrough'}
Best mean score in cross-validation:
 -23.954545454545453


In [14]:
def fitRidgeRegression(pipeline, param_grid, n_cv_splits, X_train, y_train, output,
                       test_size=20, scoring='neg_mean_squared_error'):
    """Grid-search ``pipeline`` over ``param_grid`` with time-series cross-validation.

    Parameters
    ----------
    pipeline : estimator
        Estimator (pipeline) to tune.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    n_cv_splits : int
        Number of ``TimeSeriesSplit`` folds.
    X_train, y_train
        Training features and target.
    output : bool
        When truthy, print the best parameters and the best mean CV score.
    test_size : int, default 20
        Number of rows in each validation fold.
    scoring : str or callable, default 'neg_mean_squared_error'
        Scoring passed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    cv = TimeSeriesSplit(n_splits=n_cv_splits, test_size=test_size)
    model = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        scoring=scoring,
        verbose=1
    )
    model.fit(X_train, y_train)
    if output:
        print('Best parameters:\n', model.best_params_)
        # np.round instead of the builtin: the report keeps working even if the
        # global name `round` has been rebound in the notebook session
        # (e.g. used as a loop variable).
        print('Best mean score in cross-validation:\n', np.round(model.best_score_, 3))
    return model

In [15]:
# Tune the Ridge pipeline with 15 time-series CV folds and print the best result.
model = fitRidgeRegression(pipeline, param_grid, 15, X_train, y_train, output=True)

Fitting 15 folds for each of 24 candidates, totalling 360 fits
Best parameters:
 {'model__alpha': 0.005, 'scaler': 'passthrough'}
Best mean score in cross-validation:
 -18.323182260215933


In [None]:
# clf = RidgeCV(alphas=[0, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1, 5, 20, 100],cv=tscv, scoring='neg_mean_squared_error').fit(X_train, y_train)

# print('Best alpha:\n', clf.alpha_)
# print('Best mean score in cross-validation:\n', round(clf.best_score_,3))
# print(clf.coef_)
# cdf = pd.concat([pd.DataFrame(X_train.columns),pd.DataFrame(np.transpose(clf.coef_))], axis = 1)
# print(cdf)

In [None]:
# random_forest_pipeline = Pipeline([
#     ('one_hot', one_hot_encoder),
#     ('scaler', StandardScaler()),
#     ('model', RandomForestRegressor(random_state=42))
# ])

# random_forest = GridSearchCV(
#     estimator = random_forest_pipeline,
#     param_grid = {
#         'scaler':['passthrough', StandardScaler(with_mean=False)],
#         'model__max_depth':[2,3,5,None],
#         'model__min_samples_split':[2,3,4],
#         'model__min_samples_leaf': [2,3,4],
#         'model__n_estimators':[100],
#         'model__max_features':['auto'],
#     },
#     cv=tscv,
#     n_jobs=-1,
#     scoring='neg_mean_squared_error',
#     verbose=1
# )
# random_forest.fit(X_train, y_train)

# print('Best parameters:\n', random_forest.best_params_)
# print('Best mean score in cross-validation:\n', round(random_forest.best_score_,3))

In [16]:
# Race dates of the last 5 rounds of the 2021 season, keyed by round number.
# Each entry below is (round, month, day); the year is always 2021.
_LAST_ROUNDS_2021 = [
    (18, 11, 7),
    (19, 11, 14),
    (20, 11, 21),
    (21, 12, 5),
    (22, 12, 12),
]
race_rounds_dates = {
    round_number: datetime(2021, month, day)
    for round_number, month, day in _LAST_ROUNDS_2021
}

#Return Train/test split based on a race date (exclusive)
def splitByRaceDate(race_date, data=None):
    """Split the race table into train/test sets around ``race_date``.

    Rows dated strictly before ``race_date`` form the training set; rows on or
    after it form the test set.

    Parameters
    ----------
    race_date : datetime
        Date of the first race that belongs to the test set.
    data : pandas.DataFrame, optional
        Table to split. It must contain the 'date', 'positionOrder', 'round'
        and 'year' columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    X_train : DataFrame
        Training features without 'positionOrder', 'date', 'round' and 'year'.
    X_test : DataFrame
        Test features without 'positionOrder' only ('round', 'date' and 'year'
        are kept).
    y_train : Series
        Training target 'positionOrder'.
    y_test : DataFrame
        Test target 'positionOrder' together with 'round'.
    """
    # Fall back to the notebook-level frame so existing one-argument calls
    # behave exactly as before.
    frame = df if data is None else data

    df_train = frame.loc[frame['date'] < race_date]
    df_test = frame.loc[frame['date'] >= race_date]

    X_train = df_train[df_train.columns.difference(['positionOrder', 'date', 'round', 'year'])]
    X_test = df_test[df_test.columns.difference(['positionOrder'])]

    y_train = df_train['positionOrder']
    y_test = df_test[['positionOrder','round']]

    return X_train, X_test, y_train, y_test

In [17]:
def createResultsTable(X_test, y_pred, y_test, round):
    """Build a per-driver table comparing predicted and true finishing ranks.

    Parameters
    ----------
    X_test : DataFrame
        Features of one race; needs a 'driverRef' index level and a
        'gridStart' column.
    y_pred : array-like
        Model outputs for the rows of ``X_test``, in the same order.
    y_test : DataFrame
        True results; needs a 'driverRef' index level and a 'positionOrder'
        column.
    round
        Value written to the 'round' column of every row.

    Returns
    -------
    DataFrame
        Columns 'driverRef', 'gridStart', 'rank_pred', 'rank_true', 'round',
        sorted by 'gridStart'.
    """
    # Move driverRef out of the index and renumber rows positionally so they
    # line up with the positional predictions.
    drivers = X_test.reset_index(level=['driverRef']).reset_index(drop=True)
    scores = pd.DataFrame(y_pred, columns=['y_pred'])
    predicted = pd.concat([drivers[['driverRef', 'gridStart']], scores], axis=1)

    # Replace the raw score by its rank (lowest score = rank 1); ranks are
    # truncated to int.
    predicted = predicted.assign(
        rank_pred=predicted['y_pred'].rank().astype(int)
    ).drop(columns='y_pred')

    actual = y_test.reset_index(level=['driverRef'])[['driverRef', 'positionOrder']]

    table = (
        predicted
        .merge(actual, on=['driverRef'])
        .rename(columns={'positionOrder': 'rank_true'})
        .sort_values(by=['gridStart'])
    )
    table['round'] = round
    return table

In [20]:
# Time-series CV with 55 splits, each validating on 20 rows.
tscv=TimeSeriesSplit(n_splits=55, test_size=20)
# Candidate Ridge regularisation strengths iterated over by gridSearchCrossVal.
alphas=[0, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1]

# X_train, X_test, y_train, y_test = splitByRaceDate(datetime(2021,11,7))

def gridSearchCrossVal(timeseries_cv):
    """Pick the Ridge alpha with the lowest rank-based MSE under time-series CV.

    Uses the notebook-level ``X_train``, ``y_train``, ``alphas`` and
    ``one_hot_encoder``. For every fold the predictions are converted to
    ranks before being compared with the true positions. A baseline that
    predicts ``gridStart`` as the finishing position is printed first.

    Parameters
    ----------
    timeseries_cv : cross-validator
        Object whose ``split(X)`` yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    Pipeline or None
        The pipeline built for the best alpha, as fitted on the last fold, or
        ``None`` if no alpha produced a score below infinity.
    """
    # The folds do not depend on alpha, so compute them once.
    folds = list(timeseries_cv.split(X_train))

    # Baseline: use the starting grid position as the predicted result.
    baseline_scores = [
        mean_squared_error(y_train.iloc[test_index], X_train.iloc[test_index]['gridStart'])
        for _, test_index in folds
    ]
    mean_mse_baseline = statistics.mean(baseline_scores)
    print('Mean MSE baseline: ', mean_mse_baseline)

    best_alpha = -1
    best_cv_score = np.inf  # np.Inf was removed in NumPy 2.0
    best_model = None

    for alpha in alphas:
        fold_scores = []
        for train_index, test_index in folds:
            X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
            y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[test_index]

            clf = Pipeline(steps=[("onehot", one_hot_encoder), ("classifier", Ridge(alpha = alpha))])
            clf.fit(X_train_cv, y_train_cv)

            # Score the predicted order, not the raw regression output.
            y_rank_cv = pd.DataFrame(clf.predict(X_test_cv)).rank()
            fold_scores.append(mean_squared_error(y_test_cv, y_rank_cv))

        cv_mean_score = statistics.mean(fold_scores)
        if cv_mean_score < best_cv_score:
            best_alpha = alpha
            best_model = clf
            best_cv_score = cv_mean_score
    print(f'Best alpha: {best_alpha} with MSE: {best_cv_score}')
    return best_model

In [21]:
# Run the manual rank-based alpha search over the 55-split CV defined above.
best_model = gridSearchCrossVal(tscv)

Mean MSE baseline:  25.564545454545456




Best alpha: 0.001 with MSE: 23.954545454545453


In [22]:
# Evaluate the selected model on each held-out race: refit on all earlier
# races, predict the round, and compare against the grid-start baseline.
round_results = []
mse_results = []
# `race_round` (not `round`) so the builtin round() is not shadowed at
# notebook scope.
for race_round, race_date in race_rounds_dates.items():
    X_train, X_test, y_train, y_test = splitByRaceDate(race_date)
    X_test_current_round = X_test.loc[X_test['round'] == race_round]
    y_test_current_round = y_test.loc[y_test['round'] == race_round]

    best_model = best_model.fit(X_train, y_train)
    y_pred_current_round = best_model.predict(X_test_current_round)

    # Build the per-driver table once and reuse it for storage and scoring.
    df_results = createResultsTable(X_test_current_round, y_pred_current_round, y_test_current_round, race_round)
    round_results.append(df_results)

    model_mse = mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['rank_pred'])
    grid_start_mse = mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['gridStart'])

    mse_results.append([race_round, model_mse, grid_start_mse])

# DataFrame.append was removed in pandas 2.0; concatenate the collected tables once.
df_every_round_results = pd.concat(round_results)
mse_results = pd.DataFrame(mse_results, columns=['round', 'model_mse', 'grid_start_mse'])



In [23]:
# Per-round MSE of the model's predicted ranks vs. the grid-start baseline.
mse_results

Unnamed: 0,round,model_mse,grid_start_mse
0,18,19.4,22.4
1,19,12.8,15.9
2,20,23.2,26.0
3,21,20.6,20.8
4,22,17.3,14.0


**Test dataset: Predict the last 5 races of the 2021 season**

In [None]:
# For each held-out race: re-tune the Ridge pipeline on all earlier races
# (17 CV folds), predict the round with the best estimator, and compare
# against the grid-start baseline.
round_results = []
mse_results = []
# `race_round` (not `round`) so the builtin round() is not shadowed at
# notebook scope.
for race_round, race_date in race_rounds_dates.items():
    X_train, X_test, y_train, y_test = splitByRaceDate(race_date)
    model = fitRidgeRegression(pipeline, param_grid, 17, X_train, y_train, output=False)

    X_test_current_round = X_test.loc[X_test['round'] == race_round]
    y_test_current_round = y_test.loc[y_test['round'] == race_round]
    y_pred_current_round = model.best_estimator_.predict(X_test_current_round)

    # Build the per-driver table once and reuse it for storage and scoring.
    df_results = createResultsTable(X_test_current_round, y_pred_current_round, y_test_current_round, race_round)
    round_results.append(df_results)

    model_mse = mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['rank_pred'])
    grid_start_mse = mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['gridStart'])

    mse_results.append([race_round, model_mse, grid_start_mse])

# DataFrame.append was removed in pandas 2.0; concatenate the collected tables once.
df_every_round_results = pd.concat(round_results)
mse_results = pd.DataFrame(mse_results, columns=['round', 'model_mse', 'grid_start_mse'])

In [None]:
# Per-round MSE of the re-tuned model vs. the grid-start baseline.
mse_results

In [None]:
# Per-driver predicted vs. true ranks for every evaluated round.
df_every_round_results

In [None]:
# Write the per-driver results of all evaluated rounds to CSV, without the row index.
df_every_round_results.to_csv('../data/processed/0.4-test_results.csv', index=False)