In [19]:
import pandas as pd
import numpy as np
import warnings

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score

from sys import platform

# Resolve the project root directory for the current operating system.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    # path = '~/Documents/GitHub/f1-analytics/'
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # Any other platform (e.g. Linux) used to leave `path` undefined, which made
    # the first read_csv call fail with a NameError. Fall back to the current
    # working directory, i.e. start the notebook from the repository root.
    path = './'

# Hide RuntimeWarnings and pandas' chained-assignment warning for the whole notebook.
warnings.filterwarnings("ignore", category=RuntimeWarning) 
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [20]:
# Read the two CSV inputs from the project's data folder.
data, merged = (
    pd.read_csv(path + csv_name)
    for csv_name in ('data/ml_input.csv', 'data/merged.csv')
)

In [21]:
### Season used to test the models
# Rows with season < N form the training set; rows with season == N are the
# ones each fitted model is scored on.

N = 2022

In [22]:
# Column-oriented accumulator for the grid-search results: one empty list per
# output column, filled row by row while the models are evaluated.
comparison_dict = {
    column: [] for column in ('model', 'gamma', 'C', 'kernel', 'score')
}

In [23]:
# Show the column names of the loaded `data` frame.
data.columns

Index(['season', 'round', 'podium', 'driver_points', 'driver_wins',
       'driver_standings_pos', 'constructor_points', 'constructor_wins',
       'constructor_standings_pos', 'q_delta', 'starting_grid',
       'driver_points_per_race', 'driver_points_from_race',
       'constructor_points_per_race', 'constructor_points_from_race',
       'driver_points_last_3_races', 'driver_points_before_race', 'stage_q1',
       'stage_q2', 'stage_q3', 'circuit_id_albert_park', 'circuit_id_americas',
       'circuit_id_bahrain', 'circuit_id_baku', 'circuit_id_catalunya',
       'circuit_id_hockenheimring', 'circuit_id_hungaroring',
       'circuit_id_imola', 'circuit_id_interlagos', 'circuit_id_istanbul',
       'circuit_id_jeddah', 'circuit_id_losail', 'circuit_id_marina_bay',
       'circuit_id_miami', 'circuit_id_monaco', 'circuit_id_monza',
       'circuit_id_mugello', 'circuit_id_nurburgring', 'circuit_id_portimao',
       'circuit_id_red_bull_ring', 'circuit_id_ricard', 'circuit_id_rodriguez'

In [24]:
# Work on a copy so the loaded `data` frame is left untouched.
df = data.copy()
# Binary target: 1 where podium equals 1, otherwise 0.
df['podium'] = df['podium'].map(lambda position: int(position == 1))

# Every season before N is used for training.
train = df[df.season < N]

# Columns excluded from the model inputs (the target plus three points columns).
excluded_columns = [
    'podium',
    'driver_points_from_race',
    'constructor_points_from_race',
    'driver_points_before_race',
]
train_features = train.drop(columns=excluded_columns)

# Fit the scaler on the training features only; the fitted `scaler` is reused
# later to transform the test rows.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(train_features),
    columns=train_features.columns,
)

y_train = train['podium'].to_numpy()

In [25]:
def score_classification(model):
    """Score a fitted classifier on season N, one round at a time.

    For every round of season N the rows are scaled with the module-level
    ``scaler``, ranked by the predicted probability of class 1, and only the
    top-ranked row is marked as a positive prediction. The precision of that
    single prediction (1.0 if the row's ``podium`` label is 1, else 0.0) is
    averaged over all rounds.

    Reads the module-level ``df``, ``N`` and ``scaler``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba`` returning two columns (class 0, class 1).

    Returns
    -------
    float
        Fraction of season-N rounds whose top-ranked row has ``podium == 1``.

    Raises
    ------
    ValueError
        If ``df`` contains no rows for season N (previously a bare
        ZeroDivisionError from the final division).
    """
    non_feature_columns = [
        'podium',
        'driver_points_from_race',
        'constructor_points_from_race', 
        'driver_points_before_race']

    # Filter the evaluation season once instead of on every loop iteration.
    season_rows = df[df.season == N]
    rounds = season_rows['round'].unique()
    if len(rounds) == 0:
        raise ValueError('No rows found for season {}; cannot score the model.'.format(N))

    correct_predictions = 0

    for race_round in rounds:

        test = season_rows[season_rows['round'] == race_round]
        X_test = test.drop(non_feature_columns, axis=1)
        y_test = test.podium

        # Scaling (with the scaler fitted on the training data)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Make predictions and rank the rows by probability of class 1
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns=['proba_0', 'proba_1'])
        prediction_df['actual'] = y_test.reset_index(drop=True)
        prediction_df = (
            prediction_df
            .sort_values('proba_1', ascending=False)
            .reset_index(drop=True)
        )
        # After the reset, index 0 is the most likely row: flag only that one.
        prediction_df['predicted'] = (prediction_df.index == 0).astype(int)

        # With exactly one predicted positive, precision is 1.0 or 0.0.
        correct_predictions += precision_score(prediction_df.actual, prediction_df.predicted)

    return correct_predictions / len(rounds)

In [26]:
# Support Vector Machines
#
# Exhaustive grid search: fit one SVC per (gamma, C, kernel) combination on the
# training set and append its score_classification result to comparison_dict.

# SVC(probability=True) calibrates its probabilities with an internal, shuffled
# cross-validation, so without a fixed seed the scores change between runs.
SEED = 42

params={'gamma': np.logspace(-4, -1, 20),
        'C': np.logspace(-2, 1, 20),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']} 

for gamma in params['gamma']:
    for c in params['C']:
        for kernel in params['kernel']:
            model_params = (gamma, c, kernel)
            model = svm.SVC(probability=True, gamma=gamma, C=c, kernel=kernel,
                            random_state=SEED)
            model.fit(X_train, y_train)
            
            model_score = score_classification(model)
            
            comparison_dict['model'].append('svm_classifier')
            comparison_dict['gamma'].append(gamma)
            comparison_dict['C'].append(c)
            comparison_dict['kernel'].append(kernel)
            comparison_dict['score'].append(model_score)

In [27]:
# Turn the accumulated grid-search records into a table and show it, best score first.
comparison_df = pd.DataFrame(data=comparison_dict)

comparison_df.sort_values(by='score', ascending=False)

Unnamed: 0,model,gamma,C,kernel,score
186,svm_classifier,0.000207,0.088587,rbf,0.681818
727,svm_classifier,0.002637,0.014384,sigmoid,0.681818
182,svm_classifier,0.000207,0.061585,rbf,0.681818
509,svm_classifier,0.000886,0.127427,poly,0.636364
329,svm_classifier,0.000428,0.020691,poly,0.636364
...,...,...,...,...,...
344,svm_classifier,0.000428,0.088587,linear,0.045455
1368,svm_classifier,0.048329,0.020691,linear,0.045455
648,svm_classifier,0.001833,0.020691,linear,0.045455
200,svm_classifier,0.000207,0.379269,linear,0.000000


In [1]:
# Keep only the parameter sets scoring above 0.5, best first, and save them.
above_threshold = comparison_df[comparison_df['score'] > 0.5]
chosen_inputs = above_threshold.sort_values(by='score', ascending=False)

chosen_inputs.to_csv(path + 'parameters/svm_classifier.csv')

NameError: name 'comparison_df' is not defined

In [29]:
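# Disabled check: list the round numbers between 1 and the highest round of
# season N that have no rows in df.
# total = max(df[df.season == N]['round'].unique())
# missing = list(set([i for i in range(1, total + 1)]) - set(df[df.season == N]['round'].unique()))

# print('Missing rounds: {}'.format(missing))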