In [1]:
import pandas as pd
import numpy as np
import warnings

from IPython.utils import io

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn import set_config

from datetime import datetime

# Ask scikit-learn to use its 'diagram' display mode when estimators are shown.
set_config(display='diagram')

In [2]:
# Load the processed feature table; the first CSV column becomes the index
# and 'date' is parsed as a datetime.
df_all = pd.read_csv(
    '../data/processed/0.3-feature-processing.csv',
    parse_dates=['date'],
    index_col=0,
)

# Keep seasons after 1959, order the rows chronologically, renumber them, and
# add 'driverRef' as a second index level.
df = (
    df_all[df_all['year'] > 1959]
    .sort_values(by=['date'])
    .reset_index(drop=True)
    .set_index('driverRef', append=True)
)

# 18th round of the 2021 season
split_date = datetime(2021, 11, 7)

# Everything strictly before the split date is training data; the rest is test data.
df_train = df.loc[df['date'] < split_date]
df_test = df.loc[df['date'] >= split_date]

In [3]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,driverId,constructorId,gridStart,positionOrder,year,round,circuitId,date,ageAtRace,ageAtDebut,...,constructorStandingsWins,previousRaceGridStart,previousRacePosition,racesWon,racesRetired,racesFinished,polePositions,racesWonByConstructor,racesRetiredByConstructor,percentageOfBestQuali
Unnamed: 0_level_1,driverRef,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,hill,289,66,3,18,1960,1,25,1960-02-07,30,29,...,0.0,-1.0,-1.0,0.0,12.0,4.0,0.0,1.0,30.0,100.0
1,creus,504,105,22,21,1960,1,25,1960-02-07,35,35,...,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,9.0,197.0,100.0
2,chimeri,503,105,21,20,1960,1,25,1960-02-07,38,38,...,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,9.0,197.0,100.0
3,stacey,502,32,14,19,1960,1,25,1960-02-07,26,24,...,0.0,-1.0,-1.0,0.0,2.0,1.0,0.0,0.0,22.0,100.0
4,schell,501,170,9,15,1960,1,25,1960-02-07,38,28,...,0.0,-1.0,-1.0,0.0,31.0,31.0,0.0,5.0,15.0,100.0


In [4]:
# Feature columns: everything except the target. The training features also
# leave out 'date', 'round' and 'year', while the test features keep them.
train_feature_cols = df_train.columns.difference(['positionOrder', 'date', 'round', 'year'])
test_feature_cols = df_test.columns.difference(['positionOrder'])

X_train = df_train.loc[:, train_feature_cols]
X_test = df_test.loc[:, test_feature_cols]

# Target: finishing position. The test target also carries 'round'.
y_train = df_train['positionOrder']
y_test = df_test.loc[:, ['positionOrder', 'round']]

# X = df_train[['gridStart','lastRaceRank']]

In [5]:
# One-hot encode the three ID columns; all remaining columns pass through as-is.
one_hot_encoder = ColumnTransformer(
    transformers=[
        (
            'one_hot',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['driverId', 'circuitId', 'constructorId'],
        ),
    ],
    remainder='passthrough',
)

In [6]:
# Encode -> (optional) scale -> Ridge regression. The 'scaler' step defined here
# is only a placeholder: every grid candidate below replaces it.
pipeline = Pipeline(
    steps=[
        ('one_hot', one_hot_encoder),
        ('scaler', StandardScaler()),
        ('model', Ridge(random_state=42)),
    ]
)

# Search space: 2 scaler options x 12 alpha values = 24 candidates.
param_grid = dict(
    scaler=['passthrough', StandardScaler(with_mean=False)],
    model__alpha=[0, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1],
)

In [7]:
#pipeline.get_params()

In [8]:
# tscv = TimeSeriesSplit(n_splits=15, test_size=20)

# # for train_index, test_index in tscv.split(X):
# #     print("TRAIN:", train_index, "TEST:", test_index)
# #     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
# #     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# model = GridSearchCV(
#     estimator = pipeline,
#     param_grid = param_grid,
#     cv=tscv,
#     n_jobs=-1,
#     scoring='neg_mean_squared_error',
#     verbose=1
# )
# model.fit(X_train,y_train)

# print('Best parameters:\n', model.best_params_)
# print('Best mean score in cross-validation:\n', round(model.best_score_,3))

In [9]:
def fitRidgeRegression(pipeline, param_grid, n_cv_splits, X_train, y_train, output, test_size=20):
    """Grid-search ``pipeline`` over ``param_grid`` using time-series cross-validation.

    Despite the name, nothing here is Ridge-specific: whatever estimator is
    passed as ``pipeline`` is tuned.

    Parameters
    ----------
    pipeline : estimator
        Estimator (in this notebook a scikit-learn ``Pipeline``) given to ``GridSearchCV``.
    param_grid : dict
        Parameter names mapped to the candidate values to try.
    n_cv_splits : int
        Number of ``TimeSeriesSplit`` folds.
    X_train, y_train
        Training features and target. ``TimeSeriesSplit`` splits by row
        position, so rows are expected to be in chronological order.
    output : bool
        When truthy, print the best parameters and the best mean CV score.
    test_size : int, default 20
        Number of rows in each validation fold. This was previously hard-coded
        to 20 (presumably one race's worth of drivers; TODO confirm).

    Returns
    -------
    GridSearchCV
        The fitted search object. Scoring is ``neg_mean_squared_error``, so
        ``best_score_`` is the negated MSE (closer to zero is better).
    """
    model = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=TimeSeriesSplit(n_splits=n_cv_splits, test_size=test_size),
        n_jobs=-1,
        scoring='neg_mean_squared_error',
        verbose=1,
    )
    model.fit(X_train, y_train)
    if output:
        print('Best parameters:\n', model.best_params_)
        print('Best mean score in cross-validation:\n', model.best_score_)
    return model

In [10]:
# Tune the pipeline on the initial training split with 15 time-ordered folds.
model = fitRidgeRegression(
    pipeline=pipeline,
    param_grid=param_grid,
    n_cv_splits=15,
    X_train=X_train,
    y_train=y_train,
    output=True,
)

Fitting 15 folds for each of 24 candidates, totalling 360 fits
Best parameters:
 {'model__alpha': 0.005, 'scaler': 'passthrough'}
Best mean score in cross-validation:
 -18.323182260215933


In [11]:
# tscv.split(X)
# alphas=[0, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1, 5, 20, 100]

# for alpha in alphas:
#     cv_mean_score = []
#     # iterate over k folds
#     for train_index, test_index in tscv.split(X):
#         cv_score = np.Inf
#         #print("TRAIN:", train_index, "TEST:", test_index)
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#         clf = Ridge(alpha = alpha)
#         clf.fit(X_train, y_train)
#         y_pred = clf.predict(X_test)
#         print(mean_squared_error(y_test, y_pred))

In [12]:
# clf = RidgeCV(alphas=[0, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1, 5, 20, 100],cv=tscv, scoring='neg_mean_squared_error').fit(X_train, y_train)

# print('Best alpha:\n', clf.alpha_)
# print('Best mean score in cross-validation:\n', round(clf.best_score_,3))
# print(clf.coef_)
# cdf = pd.concat([pd.DataFrame(X_train.columns),pd.DataFrame(np.transpose(clf.coef_))], axis = 1)
# print(cdf)

In [13]:
# random_forest_pipeline = Pipeline([
#     ('one_hot', one_hot_encoder),
#     ('scaler', StandardScaler()),
#     ('model', RandomForestRegressor(random_state=42))
# ])

# random_forest = GridSearchCV(
#     estimator = random_forest_pipeline,
#     param_grid = {
#         'scaler':['passthrough', StandardScaler(with_mean=False)],
#         'model__max_depth':[2,3,5,None],
#         'model__min_samples_split':[2,3,4],
#         'model__min_samples_leaf': [2,3,4],
#         'model__n_estimators':[100],
#         'model__max_features':['auto'],
#     },
#     cv=tscv,
#     n_jobs=-1,
#     scoring='neg_mean_squared_error',
#     verbose=1
# )
# random_forest.fit(X_train, y_train)

# print('Best parameters:\n', random_forest.best_params_)
# print('Best mean score in cross-validation:\n', round(random_forest.best_score_,3))

In [14]:
# Round number -> race date for the last 5 races of the 2021 season.
race_rounds_dates = {
    round_number: datetime(2021, month, day)
    for round_number, month, day in [
        (18, 11, 7),
        (19, 11, 14),
        (20, 11, 21),
        (21, 12, 5),
        (22, 12, 12),
    ]
}

def splitByRaceDate(race_date, data=None):
    """Split the race table into train/test features and targets around ``race_date``.

    Parameters
    ----------
    race_date : datetime
        Rows dated strictly before this go to the training set; rows on or
        after it go to the test set.
    data : pandas.DataFrame, optional
        Table to split. It must contain 'date', 'round', 'year' and
        'positionOrder' columns. Defaults to the notebook-level ``df`` so
        existing single-argument calls behave as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where

        * ``X_train`` holds every column except 'positionOrder', 'date',
          'round' and 'year';
        * ``X_test`` holds every column except 'positionOrder' (it keeps
          'round' so callers can filter by round);
        * ``y_train`` is the 'positionOrder' Series;
        * ``y_test`` is a DataFrame with 'positionOrder' and 'round'.
    """
    # Fall back to the global frame only when no explicit table is supplied.
    frame = df if data is None else data

    df_train = frame.loc[frame['date'] < race_date]
    df_test = frame.loc[frame['date'] >= race_date]

    X_train = df_train[df_train.columns.difference(['positionOrder', 'date', 'round', 'year'])]
    X_test = df_test[df_test.columns.difference(['positionOrder'])]

    y_train = df_train['positionOrder']
    y_test = df_test[['positionOrder', 'round']]

    return X_train, X_test, y_train, y_test


In [15]:
# Suppress every Python warning from this point on.
# NOTE(review): this blanket filter hides all warning categories, including
# deprecation notices from pandas/scikit-learn; consider narrowing it to the
# specific categories that are known to be noisy.
warnings.filterwarnings('ignore')

In [16]:
def createResultsTable(X_test, y_pred, y_test, round):
    """Build a per-driver table comparing predicted and true finishing ranks.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Features of a single race, indexed by a MultiIndex that contains a
        'driverRef' level, with a 'gridStart' column.
    y_pred : array-like
        Predicted finishing positions, in the same row order as ``X_test``.
    y_test : pandas.DataFrame
        True results with the same 'driverRef' index level and a
        'positionOrder' column.
    round : int
        Round number stored in the 'round' column of the result. The name
        shadows the builtin inside this function; it is kept so existing
        callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns 'driverRef', 'gridStart', 'rank_pred', 'rank_true', 'round',
        sorted by 'gridStart'.
    """
    # Move 'driverRef' from the index into a column and renumber rows 0..n-1 so
    # the predictions can be attached by position.
    features = X_test.reset_index(level=['driverRef']).reset_index(drop=True)
    df_results = features[['driverRef', 'gridStart']].copy()

    # Convert raw predictions to an ordering. method='first' gives tied
    # predictions distinct consecutive ranks; the default average method yields
    # fractional ranks (e.g. 2.5) that int-casting would collapse into
    # duplicates.
    predictions = pd.Series(np.asarray(y_pred).ravel(), index=df_results.index)
    df_results['rank_pred'] = predictions.rank(method='first').astype(int)

    truth = y_test.reset_index(level=['driverRef'])

    final_result = pd.merge(df_results, truth[['driverRef', 'positionOrder']], on=['driverRef'])
    final_result = final_result.rename(columns={'positionOrder': 'rank_true'})

    final_result = final_result.sort_values(by=['gridStart'])
    final_result['round'] = round
    return final_result


In [17]:
# Walk forward through the final rounds: for each race, refit on all data dated
# before it, predict that round only, and score the predicted ranks against the
# grid-start baseline.
round_frames = []
mse_rows = []
# Loop variables are named race_round / race_date so the builtin round() is not
# shadowed at notebook scope.
for race_round, race_date in race_rounds_dates.items():
    X_train, X_test, y_train, y_test = splitByRaceDate(race_date)

    # The grid search is verbose; capture its output so the loop stays quiet.
    with io.capture_output():
        model = fitRidgeRegression(pipeline, param_grid, 15, X_train, y_train, output=False)

    # The test split holds every race from race_date onward; keep this round only.
    X_test_current_round = X_test.loc[X_test['round'] == race_round]
    y_test_current_round = y_test.loc[y_test['round'] == race_round]
    y_pred_current_round = model.best_estimator_.predict(X_test_current_round)

    # Build the results table once and reuse it for both the combined output
    # and the error metrics.
    df_results = createResultsTable(
        X_test_current_round, y_pred_current_round, y_test_current_round, race_round
    )
    round_frames.append(df_results)

    model_mse = mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['rank_pred'])
    grid_start_mse = mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['gridStart'])
    mse_rows.append([race_round, model_mse, grid_start_mse])

# DataFrame.append was removed in pandas 2.0; concatenate once after the loop.
# Row indices are preserved, as they were with append.
df_every_round_results = pd.concat(round_frames) if round_frames else pd.DataFrame()
mse_results = pd.DataFrame(mse_rows, columns=['round', 'model_mse', 'grid_start_mse'])



In [18]:
# Per-round MSE of the model's predicted ranks next to the grid-start baseline.
mse_results

Unnamed: 0,round,model_mse,grid_start_mse
0,18,19.4,22.4
1,19,12.8,15.9
2,20,23.2,26.0
3,21,20.6,20.8
4,22,15.2,14.0


In [19]:
# Per-driver grid position, predicted rank and true rank for every evaluated round.
df_every_round_results

Unnamed: 0,driverRef,gridStart,rank_pred,rank_true,round
13,bottas,1,2,15,18
18,hamilton,2,1,2,18
11,max_verstappen,3,3,1,18
10,perez,4,4,3,18
0,gasly,5,10,4,18
...,...,...,...,...,...
5,latifi,16,16,16,22
4,russell,17,18,18,22
14,raikkonen,18,17,19,22
17,mick_schumacher,19,19,14,22


In [20]:
# Persist the combined per-round results; the row index is not written.
df_every_round_results.to_csv(
    '../data/processed/0.4-test_results.csv',
    index=False,
)