In [1]:
import pandas as pd

from sys import platform

# Root of the local f1-analytics checkout; every data file below is read
# relative to it. Only Windows needs a drive-letter path. Every other platform
# (macOS "darwin", Linux, ...) uses the home-relative location, so `path` is
# always defined instead of surfacing later as a NameError on the first read.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
else:
    path = '~/Documents/GitHub/f1-analytics/'
    # path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'

In [6]:
# Read the five core tables from the data folder under `path`; the names on
# the left are bound in the same order as the files listed below.
races, qualifying, constructors, drivers, results = (
    pd.read_csv(path + relative_csv)
    for relative_csv in (
        'data/races.csv',
        'data/qualifying.csv',
        'data/constructor_standings.csv',
        'data/driver_standings.csv',
        'data/results.csv',
    )
)

### Redundant until new weather collection method is implemented
# weather = pd.read_csv(path+'data/weather.csv')

In [2]:
# Lap data is split across two CSV files; load each into its own frame.
laps1, laps2 = (
    pd.read_csv(path + lap_file)
    for lap_file in ('data/laps-2014-2017.csv', 'data/laps-2017-22.csv')
)

In [29]:
# Stack both lap files and discard exact duplicate rows (presumably the files
# overlap, given their names -- TODO confirm).
# concat keeps each input's own row labels, so without a reset the combined
# frame would carry repeated index values; rebuild a clean 0..n-1 index.
laps = pd.concat([laps1, laps2]).drop_duplicates().reset_index(drop=True)

laps.shape

(168937, 6)

In [30]:
# Show any rows that are still exact duplicates of an earlier row.
laps.loc[laps.duplicated()]

Unnamed: 0,season,round,lap,driver,position,time


In [12]:
# laps.to_csv(path+'data/laps.csv', index=False)

In [7]:
def lookup(df, team, points):
    """Give every row the `points` value recorded for the previous round.

    For each row, the row with the same `season` and the same `team` value
    whose `round` is exactly one lower is located, and its `points` value is
    copied across. Rows without such a predecessor (for example the first
    round of a season) receive 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `season` and `round` columns (with numeric `round`)
        plus the columns named by `team` and `points`. It is not modified.
    team : str
        Name of the column identifying the entity being tracked
        (e.g. 'driver' or 'constructor').
    points : str
        Name of the column whose previous-round value is wanted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as `df`. The original `points` column
        is renamed to `<points>_after_race`, and a new `points` column holding
        the previous round's value is appended as the last column.
    """
    keys = ['season', team, 'round']

    # Re-label every row with the round that follows it, so that an ordinary
    # equality join on (season, team, round) pairs each race with the values
    # recorded one round earlier. Joining on the real key columns avoids
    # string-concatenated keys, which cannot tell ("a1", round 2) apart from
    # ("a", round 12), and leaves the caller's frame free of helper columns.
    previous = df[keys + [points]].assign(round=df['round'] + 1)

    new_df = df.merge(previous, how='left', on=keys)
    new_df = new_df.rename(
        columns={points + '_x': points + '_after_race', points + '_y': points}
    )

    # Assign the filled column back: calling fillna(inplace=True) on a
    # selected column is not guaranteed to write through to the frame.
    new_df[points] = new_df[points].fillna(0)
    return new_df

In [8]:
# Swap each driver standings figure for the value from the previous round
# (what `lookup` appends), then discard the `*_after_race` copies of the
# original values that `lookup` leaves behind.
for standings_col in ('driver_points', 'driver_wins', 'driver_standings_pos'):
    drivers = lookup(drivers, 'driver', standings_col)

drivers = drivers.drop(
    columns=[
        'driver_points_after_race',
        'driver_wins_after_race',
        'driver_standings_pos_after_race',
    ]
)

In [9]:
# Swap each constructor standings figure for the value from the previous
# round (what `lookup` appends), then discard the `*_after_race` copies of the
# original values that `lookup` leaves behind.
for standings_col in ('constructor_points', 'constructor_wins', 'constructor_standings_pos'):
    constructors = lookup(constructors, 'constructor', standings_col)

constructors = constructors.drop(
    columns=[
        'constructor_points_after_race',
        'constructor_wins_after_race',
        'constructor_standings_pos_after_race',
    ]
)

In [10]:
# Build the modelling table step by step:
#   df1: races joined to results (inner), minus result columns not carried forward
#   df2: + driver standings (left join, so results without standings are kept)
#   df3: + constructor standings (left join)
#   merged_df: + qualifying (inner join, so only rows with a qualifying entry remain)
df1 = (
    races
    .merge(results, how='inner', on=['season', 'round', 'circuit_id'])
    .drop(columns=['url', 'points', 'status', 'time', 'grid'])
)
df2 = df1.merge(drivers, how='left', on=['season', 'round', 'driver'])
df3 = df2.merge(constructors, how='left', on=['season', 'round', 'constructor'])

merged_df = df3.merge(qualifying, how='inner', on=['season', 'round', 'circuit_id', 'driver'])

merged_df.shape

(3671, 23)

In [14]:
# Spot-check a single race: grid versus podium for round 1 of 2015, ordered by grid.
(
    merged_df
    .query('season == 2015 & round == 1')
    .loc[:, ['season', 'round', 'driver', 'grid', 'podium']]
    .sort_values('grid')
)

Unnamed: 0,season,round,driver,grid,podium
405,2015,1,hamilton,1,1
406,2015,1,rosberg,2,2
408,2015,1,massa,3,4
407,2015,1,vettel,4,3
416,2015,1,raikkonen,5,12
422,2015,1,bottas,6,18
410,2015,1,ricciardo,7,6
413,2015,1,sainz,8,9
418,2015,1,grosjean,9,14
419,2015,1,maldonado,10,15


In [16]:
# List the column labels of the merged frame.
merged_df.columns

Index(['season', 'round', 'circuit_id', 'country', 'lat', 'long', 'date',
       'driver', 'date_of_birth', 'nationality', 'constructor', 'podium',
       'fastest_lap', 'driver_points', 'driver_wins', 'driver_standings_pos',
       'constructor_points', 'constructor_wins', 'constructor_standings_pos',
       'grid', 'fastest_time', 'stage', 'q_delta'],
      dtype='object')

In [24]:
# Per column: True if it contains at least one missing value.
merged_df.isna().any()

season                       False
round                        False
circuit_id                   False
country                      False
lat                          False
long                         False
date                         False
driver                       False
date_of_birth                False
nationality                  False
constructor                  False
grid                         False
podium                       False
fastest_lap                   True
driver_points                False
driver_wins                  False
driver_standings_pos         False
constructor_points           False
constructor_wins             False
constructor_standings_pos    False
fastest_time                  True
stage                         True
q_delta                      False
dtype: bool

In [17]:
# Fill/drop nulls.
### Data is being lost here: the dropna() below discards every row that still
### has a missing value in any remaining column.

# Standings columns: treat a missing value as 0 and store them as integers.
# The filled column is assigned back explicitly, because calling
# fillna(inplace=True) on a selected column is not guaranteed to update
# merged_df, and any NaN left behind would break the integer conversion.
for col in ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points',
            'constructor_wins', 'constructor_standings_pos']:
    merged_df[col] = merged_df[col].fillna(0).astype(int)

merged_df.dropna(inplace=True)

merged_df.shape

(3462, 23)

In [26]:
# Import only the name that is used; a wildcard import hides where
# `relativedelta` comes from and can shadow other notebook names.
from dateutil.relativedelta import relativedelta

# Driver age in whole years on the day of each race.
merged_df['date_of_birth'] = pd.to_datetime(merged_df['date_of_birth'])
# The race dates are parsed once for the whole column rather than once per
# row, and the ages are built with a plain loop over (race date, birth date)
# pairs instead of a row-wise DataFrame.apply.
merged_df['driver_age'] = [
    relativedelta(race_day, born).years
    for race_day, born in zip(pd.to_datetime(merged_df['date']), merged_df['date_of_birth'])
]
# Only the derived age is kept; the raw birth date is dropped.
merged_df.drop(columns=['date_of_birth'], inplace=True)

In [27]:
# Write the merged frame to data/merged.csv under `path`, without the index column.
merged_df.to_csv(path+'data/merged.csv', index=False)