In [1]:
import pandas as pd
import numpy as np

from collections import OrderedDict

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit


from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from sklearn import set_config
set_config(display='diagram')

In [2]:
df_all = pd.read_csv('../data/processed/0.3-feature-processing.csv', index_col=0)
df = df_all[df_all['year'] > 1980]

In [3]:
df = df.sort_values(by=['date']).reset_index(drop=True)

In [4]:
# df = df.drop(['year','round','date'],1)

**Group by race (year,round)**

In [5]:
# groups_year = df.groupby(['year','round']).groups

# new_groups_year = dict((str(key[0])+'-'+str(key[1]), value.tolist()) for (key, value) in groups_year.items())
# new_groups_year = OrderedDict(new_groups_year)

# sorted_groups = [value for (_, value) in sorted(groups_year.items())] 

**Define train/test splits based on year and round of the race**

In [6]:
# list(new_groups_year.keys()).index('2019-21')

In [7]:
# sorted_groups = [value for (_, value) in sorted(new_groups_year.items())] 

**Add to cross validation iterable list of race from 2019 round 1 to 2019 round 21 (first to last race)**

In [8]:
# cv = []
# for idx in range(655,675):
#     cv.append((sum(sorted_groups[:idx], []), sorted_groups[idx]))

# cv = [(sum(sorted_groups[:655], []), sorted_groups[655])]

In [9]:
one_hot_encoder = ColumnTransformer([
    ('one_hot', OneHotEncoder(drop='first'), ['driverId','circuitId','constructorId'])], 
    remainder='passthrough')

In [10]:
pipeline = Pipeline([
    # ('one_hot', one_hot_encoder),
    # ('scaller', StandardScaler(with_mean=False)),
    # ('pca', PCA()),
    ('model', Ridge())
])
param_grid = {
    'model__alpha': np.arange(0, 0.2, 0.01),
    'model_random_state': 42
}

In [11]:
pipeline

In [12]:
pipeline.get_params()

{'memory': None,
 'steps': [('model', Ridge())],
 'verbose': False,
 'model': Ridge(),
 'model__alpha': 1.0,
 'model__copy_X': True,
 'model__fit_intercept': True,
 'model__max_iter': None,
 'model__normalize': 'deprecated',
 'model__positive': False,
 'model__random_state': None,
 'model__solver': 'auto',
 'model__tol': 0.001}

In [13]:
# X = df.loc[:, df.columns != 'positionOrder']
X = df[['gridStart','lastRaceRank']]
y = df['positionOrder']

tscv = TimeSeriesSplit(n_splits=10, test_size=20)

for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

model = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    cv=tscv,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    verbose=1
)


model.fit(X,y)

print('Best parameters:\n', model.best_params_)
print('Best mean score in cross-validation:\n', round(model.best_score_,3))


TRAIN: [    0     1     2 ... 16886 16887 16888] TEST: [16889 16890 16891 16892 16893 16894 16895 16896 16897 16898 16899 16900
 16901 16902 16903 16904 16905 16906 16907 16908]
TRAIN: [    0     1     2 ... 16906 16907 16908] TEST: [16909 16910 16911 16912 16913 16914 16915 16916 16917 16918 16919 16920
 16921 16922 16923 16924 16925 16926 16927 16928]
TRAIN: [    0     1     2 ... 16926 16927 16928] TEST: [16929 16930 16931 16932 16933 16934 16935 16936 16937 16938 16939 16940
 16941 16942 16943 16944 16945 16946 16947 16948]
TRAIN: [    0     1     2 ... 16946 16947 16948] TEST: [16949 16950 16951 16952 16953 16954 16955 16956 16957 16958 16959 16960
 16961 16962 16963 16964 16965 16966 16967 16968]
TRAIN: [    0     1     2 ... 16966 16967 16968] TEST: [16969 16970 16971 16972 16973 16974 16975 16976 16977 16978 16979 16980
 16981 16982 16983 16984 16985 16986 16987 16988]
TRAIN: [    0     1     2 ... 16986 16987 16988] TEST: [16989 16990 16991 16992 16993 16994 16995 16996 16997 

AttributeError: 'GridSearchCV' object has no attribute 'coef_'

In [14]:
df.columns

Index(['driverId', 'constructorId', 'gridStart', 'positionOrder', 'year',
       'round', 'circuitId', 'date', 'ageAtRace', 'ageAtDebut',
       'yearsExperience', 'racingAtHome', 'driverStandingsPoints',
       'driverStandingsPosition', 'driverStandingsWins', 'lastRaceRank',
       'constructorStandingsPoints', 'constructorStandingsPosition',
       'constructorStandingsWins', 'previousRaceGridStart',
       'previousRacePosition', 'racesWon', 'racesRetired', 'racesFinished',
       'polePositions', 'racesWonByConstructor', 'racesRetiredByConstructor',
       'percentageOfBestQuali'],
      dtype='object')

In [15]:
one_hot_columns = ['driverId','circuitId']

In [16]:
df = df[['gridStart','year','round','positionOrder']]

In [17]:
df_train=df[(df['year'] <= 2021) & (df['round']<19)]
df_validation=df[(df['year'] == 2021) & (df['round']==19)]

In [18]:
df_train.head(3)

Unnamed: 0,gridStart,year,round,positionOrder
0,7,1981,1,19
1,25,1981,1,28
2,25,1981,1,27


In [19]:
reg = LinearRegression().fit(df_train.iloc[:,:-3], df_train.iloc[:,-1:])
reg.score(df_train.iloc[:,:-3], df_train.iloc[:,-1:])

0.33956727142582255

In [20]:
prediction = reg.predict(df_validation.iloc[:,:-3])
df_prediction = pd.DataFrame(data=prediction,columns=['predictedOrder'])
df_validation = df_validation.reset_index(drop=True)

In [21]:
df_compare = pd.concat([df_validation, df_prediction], axis=1)

In [22]:
df_compare.head(20)

Unnamed: 0,gridStart,year,round,positionOrder,predictedOrder
0,7,2021,19,7,9.399022
1,6,2021,19,5,8.786607
2,19,2021,19,17,16.748005
3,14,2021,19,20,13.685929
4,16,2021,19,16,14.910759
5,15,2021,19,15,14.298344
6,18,2021,19,18,16.13559
7,17,2021,19,13,15.523175
8,13,2021,19,14,13.073514
9,5,2021,19,10,8.174192
