In [65]:
import pandas as pd
import numpy as np
import warnings

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score

from sys import platform

# Root of the local f1-analytics checkout; the data files are read relative to it.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # No machine-specific checkout is known for other platforms (e.g. Linux).
    # Fall back to the current working directory so `path` is always defined
    # and the notebook works when started from the repository root.
    path = './'

# Silence numeric RuntimeWarnings and pandas' chained-assignment warning for this notebook.
warnings.filterwarnings("ignore", category=RuntimeWarning)
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [66]:
# Load the two CSV inputs from the checkout's data/ directory:
#   data   - ml_input.csv, the frame the model features and target are taken from
#   merged - merged.csv, which carries readable columns such as `driver` used when reporting results
data = pd.read_csv(path+'data/ml_input.csv')
merged = pd.read_csv(path+'data/merged.csv')

In [67]:
### Season held out for testing: rows of this season are excluded from the
### training set and are the ones scored, round by round, during evaluation.

N = 2018

In [68]:
# Preview the first rows of the frame loaded from ml_input.csv.
data.head()

Unnamed: 0,season,round,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,...,driver_sainz,driver_sirotkin,driver_stevens,driver_stroll,driver_sutil,driver_tsunoda,driver_vandoorne,driver_vergne,driver_vettel,driver_wehrlein
0,2014,1,3,1,25,1,1,25,1,2,...,0,0,0,0,0,0,0,0,0,0
1,2014,1,4,2,18,0,2,33,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2014,1,10,3,15,0,3,33,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2014,1,5,4,12,0,4,18,0,3,...,0,0,0,0,0,0,0,0,0,0
4,2014,1,15,5,10,0,5,10,0,4,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Work on a copy so the frame loaded from disk keeps its original `podium` values.
df = data.copy()

# Binary target: 1 when the row's `podium` value is 1 (a win), 0 for anything else.
df['podium'] = df['podium'].apply(lambda finish: int(finish == 1))

# Every season except the held-out one is used for fitting.
train = df.loc[df['season'] != N]

# Features are everything except the target and the driver/constructor
# points, wins and standings columns.
X_train = train.drop(
    columns=[
        'podium',
        'driver_points',
        'driver_wins',
        'driver_standings_pos',
        'constructor_points',
        'constructor_wins',
        'constructor_standings_pos',
    ]
)

# Fit the scaler on the training features only; the same fitted `scaler`
# is reused later to transform the held-out rows.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

y_train = train['podium'].to_numpy()

In [70]:
# List the feature columns left after the target and standings columns were dropped.
X_train.columns

Index(['season', 'round', 'grid', 'final_time', 'q_delta',
       'circuit_id_albert_park', 'circuit_id_americas', 'circuit_id_bahrain',
       'circuit_id_baku', 'circuit_id_catalunya',
       ...
       'driver_sainz', 'driver_sirotkin', 'driver_stevens', 'driver_stroll',
       'driver_sutil', 'driver_tsunoda', 'driver_vandoorne', 'driver_vergne',
       'driver_vettel', 'driver_wehrlein'],
      dtype='object', length=107)

In [71]:
def score_classification(model):
    """Return the model's win probability for every row of season ``N``.

    Reads the notebook-level ``df`` (feature frame), ``N`` (held-out season)
    and ``scaler`` (already fitted on the training features). The season is
    scored one round at a time, in the order the rounds first appear in
    ``df``, and the per-round results are concatenated.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba`` and return two columns per row.

    Returns
    -------
    list
        One probability per season-``N`` row (second ``predict_proba``
        column), grouped by round in first-appearance order.

    Raises
    ------
    ValueError
        If ``predict_proba`` does not return exactly two columns.
    """
    non_feature_columns = [
        'podium',
        'driver_points',
        'driver_wins',
        'driver_standings_pos',
        'constructor_points',
        'constructor_wins',
        'constructor_standings_pos',
    ]

    # The season filter does not change between rounds, so compute it once.
    season_rows = df[df.season == N]
    probabilities = []

    for race_round in season_rows['round'].unique():
        race = season_rows[season_rows['round'] == race_round]
        X_test = race.drop(columns=non_feature_columns)

        # Apply the training-time scaling; keep the column names so the model
        # sees the same feature layout it was fitted on.
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        proba = np.asarray(model.predict_proba(X_test))
        if proba.ndim != 2 or proba.shape[1] != 2:
            raise ValueError(
                'predict_proba must return two columns, got shape {}'.format(proba.shape)
            )

        # predict_proba columns follow the sorted class labels, so with a 0/1
        # target the second column is the probability of class 1.
        probabilities.extend(proba[:, 1])

    return probabilities

In [72]:
# Linear-kernel SVM with probability estimates enabled.
# probability=True calibrates the scores with an internal, shuffled
# cross-validation, so a fixed random_state is needed for the predicted
# probabilities (and the accuracy derived from them) to be reproducible.
# gamma has no effect with a linear kernel; it is kept only so that switching
# the kernel later does not silently change the setting.
svm_wins = svm.SVC(probability=True, kernel="linear", gamma='auto', random_state=0)

svm_wins.fit(X_train, y_train)  # Train the SVM model

In [73]:
probabilities = score_classification(svm_wins)

# Take an explicit copy so the new column is written to an independent frame,
# not to a filtered slice of `merged`.
# NOTE(review): this assumes the season-N rows of `merged` are in the same order
# as the probabilities come back (grouped by round, rounds in first-appearance
# order of the feature frame) - confirm both CSVs share that row order.
test = merged[merged.season == N].copy()
test['prob'] = probabilities
test = test[['round', 'driver', 'prob', 'grid', 'podium']]

# Highest probability per round (kept under its original name).
max_prob = test.groupby('round').agg({'prob': 'max'}).reset_index()

# Predicted winner = the single row with the highest probability in each round.
# idxmax yields exactly one row label per round, so a tie on the maximum can no
# longer produce several "winners" for one race and distort the accuracy.
winner_rows = test.groupby('round')['prob'].idxmax()
predict_wins = test.loc[winner_rows].drop('prob', axis=1).reset_index(drop=True)

# Share of rounds where the predicted winner's `podium` value is 1.
accuracy = (predict_wins['podium'] == 1).mean() * 100

print('Accuracy: {:.2f}%'.format(accuracy))

Accuracy: 47.62%


In [74]:
# Show the per-round predicted winner next to that driver's grid slot and `podium` value.
predict_wins

Unnamed: 0,round,driver,grid,podium
0,1,hamilton,1,2
1,2,hamilton,9,3
2,3,hamilton,4,4
3,4,hamilton,2,1
4,5,hamilton,1,1
5,6,hamilton,3,3
6,7,hamilton,4,5
7,8,hamilton,1,1
8,9,hamilton,2,16
9,10,hamilton,1,2


In [75]:
# Sanity check: every round number from 1 up to the highest round present for
# season N should appear in the feature frame; collect the ones that do not.
rounds_in_season = df.loc[df.season == N, 'round'].unique()
total = max(rounds_in_season)
missing = list(set(range(1, total + 1)) - set(rounds_in_season))

print('Missing rounds: {}'.format(missing))

Missing rounds: []


In [76]:
### Now where are those missing rounds?!

# Look the gaps up in `merged` for the same season N that `missing` was computed
# for (the season used to be hard-coded and no longer matched N).
merged.query('season == @N & round in @missing')

Unnamed: 0,season,round,circuit_id,country,lat,long,date,driver,nationality,constructor,...,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,pos,final_time,stage,q_delta,driver_age
