In [1]:
import pandas as pd
import numpy as np
import warnings
import statistics

from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn import set_config

from sklearn.ensemble import GradientBoostingRegressor
#import xgboost as xgb

set_config(display='diagram')

In [2]:
df_all = pd.read_csv('../data/processed/0.3-feature-processing.csv', parse_dates=['date'], index_col=0)
df = df_all[df_all['year'] > 1959]
df = df.sort_values(by=['date']).reset_index(drop=True)
df = df.set_index('driverRef', append=True)
df = df.set_index('finished', append=True)

In [3]:
#The dates of the last 5 races of Season 2021
race_rounds_dates = {
    18: datetime(2021,11,7),
    19: datetime(2021,11,14),
    20: datetime(2021,11,21),
    21: datetime(2021,12,5),
    22: datetime(2021,12,12),
}

#Return Train/test split based on a race date (exclusive)
def splitByRaceDate(race_date):
    df_train = df.loc[df['date'] < race_date]
    df_test = df.loc[df['date'] >= race_date]

    X_train = df_train[df_train.columns.difference(['positionOrder', 'date', 'round', 'year'])]
    X_test = df_test[df_test.columns.difference(['positionOrder'])]

    y_train = df_train['positionOrder']
    y_test = df_test[['positionOrder','round']]

    return X_train, X_test, y_train, y_test

In [4]:
X_train, X_test, y_train, y_test = splitByRaceDate(race_rounds_dates[18])

In [5]:
one_hot_encoder = ColumnTransformer([
    ('one_hot', OneHotEncoder(drop='first', handle_unknown='ignore'), ['circuitId'])], remainder='passthrough')

In [6]:
y_train

       driverRef       finished
0      hill            False       18
1      creus           False       21
2      chimeri         False       20
3      stacey          False       19
4      schell          False       15
                                   ..
23300  vettel          True        10
23301  raikkonen       True        13
23302  alonso          False       18
23303  hamilton        True         2
23304  max_verstappen  True         1
Name: positionOrder, Length: 23305, dtype: int64

In [7]:
x = np.random.randn(10, 1)
s = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
sx = pd.concat([s, pd.Series(x.reshape(10))], axis=1)

sx['rank'] = sx.iloc[:,1].rank()

print(sx)

          a         b         c         0  rank
0  0.011109 -0.683982  1.356180  0.826328   5.0
1 -1.593501 -2.170794 -1.126874  1.217106   1.0
2 -1.634479  1.161283 -1.907180  0.136741  10.0
3 -1.058746 -1.463932  0.329885 -0.227390   2.0
4  2.272271  0.745645  1.136777 -0.485857   9.0
5 -0.132640  0.199675 -0.935068  2.567139   8.0
6  0.632849 -0.550752 -0.111083 -2.811744   6.0
7 -1.063923 -0.060447  0.290072  1.156899   7.0
8  0.422541 -0.849876 -0.648573 -1.906803   4.0
9  0.794815 -0.944254  0.288905  0.855073   3.0


In [8]:
y_rank = pd.DataFrame(sx['a']).rank()

In [9]:
s1 = pd.Series([1, 2], index=['A', 'B'], name='s1')
s2 = pd.Series([3, 4], index=['A', 'B'], name='s2')

pd.concat([s1, s2], axis=1)

Unnamed: 0,s1,s2
A,1,3
B,2,4


**Make costum scoring function**

In [10]:
def my_custom_loss_func(y_true, y_pred):
    y_rank = pd.DataFrame(y_pred).rank(method='first')
    return mean_squared_error(y_true=y_true, y_pred=y_rank)
    
scorer = make_scorer(my_custom_loss_func, greater_is_better=False)

In [11]:
def costum_loss_no_retirements(y_true, y_pred):
    #y_true: df / y_pred: nd array
    y_rank = pd.DataFrame(y_pred).rank(method='first')
    y_true = y_true.reset_index(level=0)
    print(y_pred)
    print(y_rank.head())
    print(y_true.head())
    #n_rows = len(y_rank)
    #y_concat = pd.concat([y_true, pd.Series(x.reshape(n_rows))], axis=1)
    y_concat = pd.concat([y_rank, y_true], axis=1)
    print(y_concat.head())
    y_true_dropped = y_concat[y_concat.finished == True]
    #y_true_dropped['rank'] = y_true_dropped.iloc[:,0].rank(method='first')

    return mean_squared_error(y_true=y_true_dropped.iloc[:,1], y_pred=y_true_dropped.iloc[:,0],)

scorer_droped = make_scorer(costum_loss_no_retirements, greater_is_better=False)

In [31]:
def gridStart_mse_drop(y_true, y_pred):
    y_pred_df = pd.DataFrame(y_pred)
    y_true = y_true.reset_index(level=0, drop=True)
    print(y_true.head())
    print(y_pred_df + y_true)

In [12]:
tscv = TimeSeriesSplit(n_splits=55, test_size=20)

In [32]:
def calculateMSEgridstart(tscv):
    mse_scores = []
    for _, test_index in tscv.split(X_train):
        X_test_cv = X_train.iloc[test_index]
        y_test_cv = y_train.iloc[test_index]
        #score = mean_squared_error(y_test_cv, X_test_cv['gridStart'])
        score = gridStart_mse_drop(y_test_cv, X_test_cv['gridStart'].values)
        mse_scores.append(score)

    return statistics.mean(mse_scores)

In [33]:
print('Mean MSE baseline: ', round(calculateMSEgridstart(tscv),3))

driverRef  finished
sainz      False       20
russell    True        16
albon      True        14
norris     True        12
leclerc    True         5
Name: positionOrder, dtype: int64


ValueError: cannot join with no overlapping index names

In [None]:
def gridSearchCV_results(X_train, y_train, cv, scorer, pipeline, param_grid, output):
    model = GridSearchCV(
        estimator = pipeline,
        param_grid = param_grid,
        cv=cv,
        n_jobs=-1,
        scoring=scorer,
        verbose=1
    )
    model.fit(X_train,y_train)
    if output:
        print('Best parameters:\n', model.best_params_)
        print('Best mean accuracy score in cross-validation:\n', round(model.best_score_,3))
    return model.best_estimator_

### Ridge regression

In [None]:
ridge_pipeline = Pipeline([
    ('one_hot', one_hot_encoder),
    ('scaler', StandardScaler()),
    ('model', Ridge(random_state=42))
])
ridge_param_grid = {
    'scaler':['passthrough', StandardScaler(with_mean=False)],
    'model__alpha':[0, 0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1],
}
#pipeline.get_params()

In [None]:
ridgeRegression_model = gridSearchCV_results(X_train, y_train, tscv, scorer, ridge_pipeline, ridge_param_grid, output=True)

### XGBoost

In [None]:
xgb_pipeline = Pipeline([
    ('one_hot', one_hot_encoder),
    ('scaler', StandardScaler()),
    ('model', GradientBoostingRegressor(random_state=42))
])

xgb_param_grid = {
    'scaler':['passthrough', StandardScaler(with_mean=False)],
    'model__learning_rate':[0.05, 0.1],
    # 'model__max_depth':[2,3,4,5,6],
    # 'model__min_samples_split':[2,3,4],
    # 'model__min_samples_leaf':[2,3,4,5],
    'model__subsample': [0.8],
    'model__n_estimators':[100]
},

In [None]:
xgboost_model = gridSearchCV_results(X_train, y_train, tscv, scorer, xgb_pipeline, xgb_param_grid, output=True)

## Test on 5 last races of 2021 season

In [None]:
def createResultsTable(X_test, y_pred, y_test, round):
    X_test = X_test.reset_index(level=['driverRef'])
    X_test = X_test.reset_index(drop=True)

    df_results = pd.concat([X_test[['driverRef','gridStart']], pd.DataFrame(y_pred, columns=['y_pred'])], axis=1)

    df_results['rank_pred'] = df_results['y_pred'].rank(method='first').astype(int)
    df_results = df_results.drop('y_pred', axis=1)

    y_test = y_test.reset_index(level=['driverRef'])

    final_result = pd.merge(df_results, y_test[['driverRef','positionOrder']], on=['driverRef'])
    final_result = final_result.rename(columns={'positionOrder':'rank_true'})
    #final_result['y_pred'] = final_result['y_pred'].round(2)

    final_result = final_result.sort_values(by=['gridStart'])
    final_result['round'] = round
    return final_result

In [None]:
df_every_round_results = pd.DataFrame()
mse_results = []

model = ridgeRegression_model

for round, date in race_rounds_dates.items():
    X_train, X_test, y_train, y_test = splitByRaceDate(date)
    model.fit(X_train, y_train)
    
    X_test_current_round = X_test.loc[X_test['round'] == round]
    y_pred_current_round = model.predict(X_test_current_round)
    y_test_current_round = y_test.loc[y_test['round'] == round]

    df_results = createResultsTable(X_test_current_round, y_pred_current_round, y_test_current_round, round)
    df_every_round_results = df_every_round_results.append(createResultsTable(X_test_current_round, y_pred_current_round, y_test_current_round, round))

    model_mse =  mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['rank_pred'])
    grid_start_mse = mean_squared_error(y_true=df_results['rank_true'], y_pred=df_results['gridStart'])

    mse_results.append([round, model_mse, grid_start_mse])

mse_results = pd.DataFrame(mse_results, columns=['round', 'model_mse', 'grid_start_mse'])

In [None]:
mse_results

In [None]:
df_every_round_results.reset_index(drop=True)

In [None]:
df_every_round_results.to_csv('../data/processed/0.4-test_results.csv', index=False)