In [66]:
import pandas as pd
import numpy as np
import warnings

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score

from sys import platform

# Root of the f1-analytics checkout; every data file below is read relative to it.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # Any other platform (e.g. Linux) previously left `path` undefined, which
    # surfaced later as a NameError when loading the CSVs. Fall back to the
    # current working directory so the notebook runs when started from the
    # repository root.
    path = './'

# Suppress every RuntimeWarning for the rest of the session.
warnings.simplefilter("ignore", category=RuntimeWarning)
# Turn off pandas' chained-assignment check (its default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

%matplotlib inline

In [67]:
# Read both input tables from the project's data directory:
# `data` is split into train/test rows further down, `merged` is the table the
# predictions are later attached to.
data, merged = (
    pd.read_csv(path + relative_csv)
    for relative_csv in ('data/ml_input.csv', 'data/merged.csv')
)

In [68]:
# Held-out season: rows with season < N form the training set, and rows with
# season == N are the ones predictions are generated for.

N = 2022

In [69]:
# Work on a copy so the frame loaded from disk stays untouched.
df = data.copy()

# Every season before the held-out one is training data.
train = df.loc[df['season'] < N]

# Feature matrix: all columns except the four excluded below.
X_train = train.drop(
    columns=['podium', 'driver_points_from_race', 'constructor_points_from_race', 'qualifying_pos']
)

# Fit the scaler on the training rows only; the same fitted `scaler` is reused
# to transform the held-out rows at prediction time.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)

# Regression target: points the driver scored in the race.
y_train = np.asarray(train['driver_points_from_race'].values)

In [70]:
# Display the names of the feature columns that make up X_train.
X_train.columns

Index(['season', 'round', 'driver_points', 'driver_wins',
       'driver_standings_pos', 'constructor_points', 'constructor_wins',
       'constructor_standings_pos', 'starting_grid', 'stage_q1', 'stage_q2',
       'stage_q3', 'circuit_id_albert_park', 'circuit_id_americas',
       'circuit_id_bahrain', 'circuit_id_baku', 'circuit_id_catalunya',
       'circuit_id_hockenheimring', 'circuit_id_hungaroring',
       'circuit_id_imola', 'circuit_id_interlagos', 'circuit_id_istanbul',
       'circuit_id_jeddah', 'circuit_id_losail', 'circuit_id_marina_bay',
       'circuit_id_miami', 'circuit_id_monaco', 'circuit_id_monza',
       'circuit_id_mugello', 'circuit_id_nurburgring', 'circuit_id_portimao',
       'circuit_id_red_bull_ring', 'circuit_id_ricard', 'circuit_id_rodriguez',
       'circuit_id_sepang', 'circuit_id_shanghai', 'circuit_id_silverstone',
       'circuit_id_sochi', 'circuit_id_spa', 'circuit_id_suzuka',
       'circuit_id_villeneuve', 'circuit_id_yas_marina',
       'circuit

In [71]:
# Column-wise accumulator for model-comparison results: one independent,
# initially empty list per field.
comparison_dict = {
    field: []
    for field in ('model', 'criterion', 'max_features', 'max_depth', 'score')
}

In [72]:
def score_regression(model):
    """Predict driver points for every row of the held-out season ``N``.

    Despite its name, this function does not compute a score: it returns the
    raw predictions. It reads the notebook-level globals ``df`` (full feature
    table), ``N`` (held-out season) and ``scaler`` (already fitted on the
    training features).

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)`` that returns one value per row.

    Returns
    -------
    list
        Predicted values, concatenated round by round in the order the rounds
        first appear in ``df`` for season ``N``; within a round the rows keep
        their order from ``df``.
    """
    non_feature_cols = ['podium', 'driver_points_from_race', 'constructor_points_from_race', 'qualifying_pos']
    season_rows = df[df.season == N]

    predictions = []
    for race_round in season_rows['round'].unique():
        race = season_rows[season_rows['round'] == race_round]
        X_test = race.drop(columns=non_feature_cols)

        # Apply the scaling learned on the training set (no refitting here).
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        predictions.extend(np.asarray(model.predict(X_test)).ravel())

    return predictions

In [73]:
# Random Forest Regressor

# Rows of the held-out season that the predictions are attached to.
# Filter on N rather than a literal year so this stays in sync with the
# train/test split used everywhere else in the notebook.
test = merged.loc[
    merged['season'] == N,
    ['season', 'round', 'driver', 'starting_grid', 'podium', 'driver_points_from_race'],
].copy()

# Hyper-parameter grid. max_depth covers 5, 7, ..., 55.
params={'criterion': ['squared_error'],
        'max_features': [0.8, 1.0],
        'max_depth': list(range(5, 56, 2))
        }

# NOTE: the `break` statements stop after the FIRST parameter combination
# (squared_error, max_features=0.8, max_depth=5); the full grid search is
# currently disabled.
for criterion in params['criterion']:
    for max_features in params['max_features']:
        for max_depth in params['max_depth']:

            # Fixed random_state so re-running the cell reproduces the same forest.
            model = RandomForestRegressor(criterion = criterion, max_features = max_features,
                                          max_depth = max_depth, random_state = 42)
            model.fit(X_train, y_train)

            # score_regression returns a flat list of predictions, which is
            # assigned positionally. This assumes `test` holds the same rows,
            # in the same order, as the season-N rows of `df` -- TODO confirm.
            score = score_regression(model)
            test['predicted_points'] = score
            break
        break
    break

            # Grid-search bookkeeping, disabled together with the full search:
            # comparison_dict['model'].append('random_forest_regressor')
            # comparison_dict['criterion'].append(criterion)
            # comparison_dict['max_features'].append(max_features)
            # comparison_dict['max_depth'].append(max_depth)
            # comparison_dict['score'].append(score)

In [89]:
# Show the round-14 rows of `test`, ordered from highest to lowest predicted points.
test.query('round == 14').sort_values('predicted_points', ascending=False)

Unnamed: 0,season,round,driver,starting_grid,podium,driver_points_from_race,predicted_points
3443,2022,14,carlos_sainz,1,3,15.0,16.143021
3442,2022,14,sergio_perez,2,2,18.0,14.240996
3441,2022,14,max_verstappen,14,1,26.0,12.912266
3446,2022,14,charles_leclerc,15,6,8.0,11.466202
3460,2022,14,lewis_hamilton,4,20,0.0,10.665043
3445,2022,14,fernando_alonso,3,5,10.0,10.290122
3444,2022,14,george_russell,5,4,12.0,8.007546
3450,2022,14,alexander_albon,6,10,1.0,4.367018
3455,2022,14,daniel_ricciardo,7,15,0.0,3.554697
3447,2022,14,esteban_ocon,16,7,6.0,3.195881


In [75]:
# comparison_df = pd.DataFrame(comparison_dict)

# chosen_inputs = comparison_df.query('score > 0.3').sort_values('score', ascending=False)

# chosen_inputs.head()

In [76]:
# chosen_inputs.to_csv(path+'parameters/random-forest-regression.csv')