In [34]:
import pandas as pd
import numpy as np
import warnings

from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler

from sys import platform

# Project root used as the prefix for every data file read below.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
else:
    # macOS ("darwin") and every other platform share the home-relative
    # checkout location. Previously `path` stayed undefined on anything
    # other than win32/darwin, so the first read_csv raised a NameError.
    path = '~/Documents/GitHub/f1-analytics/'

# Silence every RuntimeWarning for the remainder of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning) 
# Disable pandas' chained-assignment warning (it is 'warn' by default).
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [35]:
# Read the two CSV inputs from the project's data directory.
ml_data = pd.read_csv(f'{path}data/ml_input.csv')
data = pd.read_csv(f'{path}data/merged.csv')

In [36]:
# Held-out season: rows with season < N are used for training, and the
# rounds of season N are the ones predictions are made for.

N = 2022

In [37]:
# Column-oriented accumulator for model comparison results: one independent,
# initially empty list per column.
comparison_dict = {
    column: []
    for column in ('model', 'gamma', 'C', 'kernel', 'score')
}

In [38]:
# Lookup table: key -> points. Built once instead of on every call.
_SCORING = {
    1: 25.,
    2: 18.,
    3: 15.,
    4: 12.,
    5: 10.,
    6: 8.,
    7: 6.,
    8: 4.,
    9: 2.,
    10: 1.,
}


def format_points(x):
    """Translate ``x`` into points using the scoring table.

    Parameters
    ----------
    x : hashable
        Key to look up; the table defines the integers 1 through 10
        (presumably finishing positions -- confirm against the data).

    Returns
    -------
    float
        The table value for ``x``, or ``0.0`` when ``x`` is not in the table
        (including NaN). The fallback is now a float so that every return
        value has the same type.
    """
    return _SCORING.get(x, 0.0)

In [46]:
# Work on a copy of the loaded frame and derive the target column from `podium`.
df = ml_data.copy()
df['points'] = df['podium'].map(format_points)

# Every season before N is training data.
train = df.loc[df['season'] < N]

# Features are all columns except the two outcome columns.
X_train = train.drop(columns=['podium', 'points'])
print(X_train.shape)

# Fit the scaler on the training features; the same fitted scaler is applied
# to the test rounds later.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

y_train = np.asarray(train['points'].values)

(2680, 48)


In [54]:
def score_classification(model):
    """Predict class probabilities for season ``N`` and return the last round's frame.

    For every distinct ``round`` value of season ``N`` in the global ``df``,
    the round's features are scaled with the already-fitted global ``scaler``
    and passed to ``model.predict_proba``.

    NOTE: despite the name, no score is computed yet. Each round overwrites
    the previous round's predictions, so only the frame of the last round
    iterated is returned.
    TODO: compute a per-round metric and return its mean over all rounds. An
    earlier draft ranked rows by a ``proba_1`` column and summed
    ``precision_score``; that column does not exist in the frame built here.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One row per test row of the last round and one positionally labelled
        column per class (for scikit-learn classifiers the column order
        follows ``model.classes_``). An empty frame is returned when season
        ``N`` has no rows; previously that case raised UnboundLocalError.
    """
    # Filter the season once instead of on every iteration.
    season_rows = df[df.season == N]

    # Returned unchanged when the loop body never runs.
    prediction_df = pd.DataFrame()

    for circuit in season_rows['round'].unique():
        test = season_rows[season_rows['round'] == circuit]
        X_test = test.drop(['podium', 'points'], axis=1)

        # Scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Make Predictions
        prediction_df = pd.DataFrame(model.predict_proba(X_test))

    return prediction_df

In [55]:
# Logistic Regression

params = {
    'penalty': ['l1', 'l2'],
    'solver': ['saga', 'liblinear'],
    'C': np.logspace(-3, 1, 20),
}

# All (penalty, solver, C) combinations, produced lazily in the same order
# nested loops over penalty -> solver -> C would visit them.
grid = (
    (penalty, solver, c)
    for penalty in params['penalty']
    for solver in params['solver']
    for c in params['C']
)

for model_params in grid:
    penalty, solver, c = model_params
    model = LogisticRegression(penalty=penalty, solver=solver, C=c, max_iter=10000)
    model.fit(X_train, y_train)

    model_score = score_classification(model)

    # TODO: record every run once the scoring is finalised. Note that
    # comparison_dict, as defined in this notebook, has no 'params' key.
    # comparison_dict['model'].append('logistic_regression')
    # comparison_dict['params'].append(model_params)
    # comparison_dict['score'].append(model_score)

    # NOTE: the search deliberately stops after the first combination, so
    # only penalty='l1', solver='saga', C=params['C'][0] is ever fitted.
    break

In [56]:
# Show what score_classification returned for the model fitted above.
model_score

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
1,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
2,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
3,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
4,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
5,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
6,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
7,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
8,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605
9,0.486408,0.051952,0.052336,0.050491,0.051644,0.052036,0.052021,0.050489,0.050491,0.050527,0.051605


In [None]:
# Turn the accumulated results into a table, one column per dict key.
comparison_df = pd.DataFrame(data=comparison_dict)

# Display the runs ordered from highest to lowest score.
comparison_df.sort_values(by='score', ascending=False)