In [161]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.feature_selection import SelectKBest, f_regression, f_classif

from sys import platform

# Root folder of the local f1-analytics checkout; later cells append
# 'data/<name>.csv' to it, so the value must end with a slash.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
else:
    # macOS ("darwin") and every other platform share the home-relative
    # layout. Previously only "darwin" was handled, which left `path`
    # undefined elsewhere (e.g. "linux") and made the first read_csv call
    # fail with a NameError instead of a clear missing-file error.
    path = '~/Documents/GitHub/f1-analytics/'

# Silence every RuntimeWarning for the rest of the session.
warnings.simplefilter("ignore", category=RuntimeWarning)
# Turn off pandas' chained-assignment check (its default setting is 'warn').
pd.set_option("mode.chained_assignment", None)

# IPython magic: select matplotlib's inline backend so figures are embedded in the cell output.
%matplotlib inline

In [162]:
# Load the two input tables from the project's data/ folder.
merged = pd.read_csv(f"{path}data/merged.csv")
processed = pd.read_csv(f"{path}data/processed.csv")

In [163]:
# Prepare `merged` in a single chain:
#   1. keep only rows whose season is earlier than 2022,
#   2. recode podium to 1 where it equals 1 and to 0 for every other value,
#   3. one-hot encode the categorical columns,
#   4. drop the listed date, location and season/round columns.
merged = (
    merged
    .loc[lambda frame: frame.season < 2022]
    .assign(podium=lambda frame: frame.podium.map(lambda finish: int(finish == 1)))
    .pipe(
        pd.get_dummies,
        columns=['driver', 'constructor', 'circuit_id', 'country', 'nationality', 'stage'],
    )
    .drop(columns=['date_of_birth', 'lat', 'long', 'date', 'season', 'round'])
)

In [164]:
# Prepare `processed` in a single chain:
#   1. keep only rows whose season is earlier than 2022,
#   2. recode podium to 1 where it equals 1 and to 0 for every other value,
#   3. one-hot encode the 'stage' column,
#   4. drop the listed columns, including the stage_q1 / stage_q2 dummies
#      produced in step 3.
processed = (
    processed
    .loc[lambda frame: frame.season < 2022]
    .assign(podium=lambda frame: frame.podium.map(lambda finish: int(finish == 1)))
    .pipe(pd.get_dummies, columns=['stage'])
    .drop(columns=[
        'season', 'round', 'driver', 'constructor', 'circuit_id',
        'qual_time', 'q_delta',
        'stage_q1', 'stage_q2',
        'driver_points_from', 'constructor_points_from',
        'driver_points_after', 'constructor_points_after',
        'driver_wins_after', 'constructor_wins_after',
        'driver_standings_pos_after', 'constructor_standings_pos_after',
    ])
)

In [165]:
# Display the columns that remain in `processed` after the preprocessing cell.
processed.columns

Index(['podium', 'qualifying_pos', 'starting_grid', 'driver_points_before',
       'constructor_points_before', 'driver_points_per', 'points_percentage',
       'driver_last_3', 'constructor_last_3', 'stage_q3'],
      dtype='object')

In [166]:
# Work on a copy so `processed` itself is left untouched by this cell.
df = processed.copy()

# Split into the feature matrix and the podium target.
X = df.drop('podium', axis=1)
y = df.podium

# Score each feature against the target with f_classif (the earlier comment
# named f_regression, which is not what the code uses).
# NOTE(review): with the nine feature columns listed in the output above,
# k=9 keeps every feature, so the selector filters nothing; only its
# per-feature scores are used below.
selector = SelectKBest(f_classif, k=9)
selector.fit(X, y)

# Pair every score with its column name and rank the pairs once, highest
# score first (previously the same sort was computed twice).
features = sorted(zip(selector.scores_, X.columns), reverse=True)

# Unzip the ranked pairs into two parallel tuples, both in ranked order.
scores, feature_names = zip(*features)

In [167]:
# Permutation importance of each feature under a linear-regression fit.
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.linear_model import LinearRegression

# Define and fit the model on the X / y built in the feature-scoring cell.
model = LinearRegression()
model.fit(X, y)

# Permutation importance relies on random shuffling, so pin the seed;
# without it the reported weights differ slightly on every run.
perm = PermutationImportance(model, random_state=42).fit(X, y)

# Collect the importances into a DataFrame, highest weight first.
imp_df = eli5.explain_weights_df(perm, feature_names=list(X.columns))
imp_df = imp_df.sort_values("weight", ascending=False)

# Print at most the 15 highest-weighted features (all of them when X has
# fewer than 15 columns).
print(imp_df[:15])

                     feature    weight       std
0          driver_points_per  0.301907  0.005017
1  constructor_points_before  0.196333  0.005140
2       driver_points_before  0.121361  0.008413
3              starting_grid  0.089064  0.004086
4                   stage_q3  0.051782  0.004937
5          points_percentage  0.026971  0.004124
6              driver_last_3  0.010656  0.001193
7             qualifying_pos  0.000211  0.000143
8         constructor_last_3  0.000011  0.000087
