In [10]:
import pandas as pd
import numpy as np
import matplotlib as plt
from datetime import datetime

# Load the six source tables; the path order below matches the name order
# on the left-hand side of the assignment.
races, qualifying, constructors, drivers, results, weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/races.csv',
        'data/qualifying.csv',
        'data/constructor_standings.csv',
        'data/driver_standings.csv',
        'data/results.csv',
        'data/weather.csv',
    )
)


In [11]:
# Rename `grid_pos` to `grid` so qualifying shares the column name used as a
# join key in the later merge.
qualifying = qualifying.rename(columns={'grid_pos': 'grid'})

In [12]:
# Columns shared by the race-level tables and used to join them.
race_key = ['season', 'round', 'circuit_id']

# races + weather, minus the location and raw weather columns.
df1 = races.merge(weather, how='inner', on=race_key).drop(
    columns=['lat', 'long', 'country', 'weather']
)
# + results, minus columns that are not carried forward.
df2 = df1.merge(results, how='inner', on=race_key).drop(
    columns=['url', 'points', 'status', 'time']
)
# + standings tables; left joins keep rows that have no standings match.
df3 = df2.merge(drivers, how='left', on=['season', 'round', 'driver'])
df4 = df3.merge(constructors, how='left', on=['season', 'round', 'constructor'])

# + qualifying, matched on grid slot. Both sides carry `driver` and
# `constructor`, so pandas suffixes them with _x/_y; keep the left-hand (_x)
# copies under their original names.
merged_df = (
    df4.merge(qualifying, how='inner', on=['season', 'round', 'grid'])
    .drop(columns=['driver_y', 'constructor_y'])
    .rename(columns={'driver_x': 'driver', 'constructor_x': 'constructor'})
)

In [13]:
# Import only the name that is used; a wildcard import would also pull the
# weekday constants (MO, TU, ...) into the notebook namespace.
from dateutil.relativedelta import relativedelta

# Parse both date columns so they can be compared as timestamps.
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df['date_of_birth'] = pd.to_datetime(merged_df['date_of_birth'])

# Whole calendar years elapsed between `date_of_birth` and `date`, row by row.
merged_df['driver_age'] = merged_df.apply(
    lambda row: relativedelta(row['date'], row['date_of_birth']).years,
    axis=1,
)

# The raw dates are no longer needed once the age has been derived.
merged_df = merged_df.drop(columns=['date', 'date_of_birth'])

In [14]:
# fill/drop nulls

# Standings columns come from left joins, so rows without a standings match
# hold NaN here; treat those as zero and store the columns as integers.
for col in ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points',
            'constructor_wins', 'constructor_standings_pos']:
    # Assign the result back rather than calling fillna(inplace=True) on
    # `merged_df[col]`: that form is chained assignment, which warns in
    # pandas 2.x and does not modify `merged_df` under Copy-on-Write.
    # astype('int64') truncates toward zero, like the element-wise int() did.
    merged_df[col] = merged_df[col].fillna(0).astype('int64')

# Any row that still has a missing value in another column is discarded.
merged_df = merged_df.dropna()

In [15]:
# Scratch check of the raw qualifying-time format before writing the parser.
# Use positional .iloc[0]: a label lookup with [0] raises KeyError when the
# row labelled 0 was removed by the earlier dropna().
x = merged_df['qualy_time'].iloc[0]
# Pin a fixed sample so the check below does not depend on the data.
x = '1:02.921'
# The part after the colon must parse as a float (result intentionally unused).
float(str(x).split(':')[1])
print(x[:8])

1:02.921


In [16]:
def format_qualifying(x):
    """Convert a qualifying lap time to seconds.

    Parameters
    ----------
    x : str or number
        A ``'minutes:seconds'`` string such as ``'1:02.921'``, a plain seconds
        string such as ``'58.3'``, or an already-numeric value (for example
        ``0``).

    Returns
    -------
    float
        The lap time in seconds.

    Raises
    ------
    ValueError
        If a string component cannot be parsed as a number.
    """
    # Numeric input has no ':' to search for; testing `':' in x` on a number
    # raises TypeError, so convert it directly.
    if not isinstance(x, str):
        return float(x)
    if ':' not in x:
        return float(x)
    parts = x.split(':')
    # seconds + 60 * minutes
    return float(parts[1]) + 60 * float(parts[0])

In [17]:
# Replace each qualifying time with its value in seconds, then summarise
# the resulting numeric column.
merged_df['qualy_time'] = merged_df['qualy_time'].map(format_qualifying)

merged_df['qualy_time'].describe()

count    687.000000
mean      83.817259
std       14.045557
min       53.377000
25%       75.980500
50%       81.640000
75%       90.854000
max      132.909000
Name: qualy_time, dtype: float64

In [18]:
# calculate difference in qualifying times

# Drop rows whose qualifying time is 0 and order each race by grid slot.
# Written as a non-inplace chain: sorting or adding columns in place on a
# boolean-filtered frame triggers SettingWithCopyWarning.
merged_df = (
    merged_df[merged_df['qualy_time'] != 0]
    .sort_values(['season', 'round', 'grid'])
)

# Time gap to the previous row within the same (season, round); the first row
# of each race has no predecessor and therefore gets NaN.
step_gap = merged_df.groupby(['season', 'round'])['qualy_time'].diff()

# Running total of those gaps within each race, i.e. the gap to the first row
# of that race. The first row's NaN becomes 0.
q_delta = (
    step_gap
    .groupby([merged_df['season'], merged_df['round']])
    .cumsum()
    .fillna(0)
)
merged_df = merged_df.assign(q_delta=q_delta)

# One-hot encode the categorical columns and persist the model-ready table.
dummies_df = pd.get_dummies(merged_df, columns=['constructor', 'circuit_id', 'nationality'])

dummies_df.to_csv('data/merged.csv', index=False)
dummies_df.head()

Unnamed: 0,season,round,warm,cold,dry,wet,cloudy,driver,grid,podium,...,nationality_French,nationality_German,nationality_Italian,nationality_Japanese,nationality_Mexican,nationality_Monegasque,nationality_Polish,nationality_Russian,nationality_Spanish,nationality_Thai
0,2020,1,True,False,False,False,False,bottas,1,1,...,0,0,0,0,0,0,0,0,0,0
19,2020,1,True,False,False,False,False,max_verstappen,2,20,...,0,0,0,0,0,0,0,0,0,0
2,2020,1,True,False,False,False,False,norris,3,3,...,0,0,0,0,0,0,0,0,0,0
12,2020,1,True,False,False,False,False,albon,4,13,...,0,0,0,0,0,0,0,0,0,1
3,2020,1,True,False,False,False,False,hamilton,5,4,...,0,0,0,0,0,0,0,0,0,0
