In [1]:
import pandas as pd

from sys import platform

import os

# Root of the f1-analytics checkout; every data file below is addressed as
# `path + 'data/<name>.csv'`, so the value must end with a slash.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # Without this branch `path` is never assigned on other platforms (e.g.
    # Linux) and the first read_csv call fails with a NameError. Fall back to
    # the F1_ANALYTICS_PATH environment variable, or the current working
    # directory when it is not set.
    path = os.environ.get('F1_ANALYTICS_PATH', os.getcwd()).rstrip('/') + '/'

In [2]:
# Load the five source tables from `<path>data/`. The tuple on the left is
# matched positionally with the file stems on the right.
races, qualifying, constructors, drivers, results = (
    pd.read_csv(f'{path}data/{stem}.csv')
    for stem in (
        'races',
        'qualifying',
        'constructor_standings',
        'driver_standings',
        'results',
    )
)

# Weather data is intentionally not loaded: it is redundant until a new
# weather collection method is implemented.
# weather = pd.read_csv(path+'data/weather.csv')

In [3]:
def lookup(df, team, points):
    """Replace a post-race standings column with its value from the previous round.

    Each row is matched to the row that has the same ``season`` and the same
    ``df[team]`` value but a ``round`` one lower, and that row's ``points``
    value is attached.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``season``, ``round``, ``team`` and ``points``
        columns. It is not modified.
    team : str
        Name of the column that identifies the entity within a season
        (the notebook passes ``'driver'`` or ``'constructor'``).
    points : str
        Name of the column whose previous-round value is wanted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` in their original order, where
        the original ``points`` column is renamed ``<points>_after_race`` and
        a new trailing ``points`` column holds the previous round's value.
        Rows with no previous-round match get 0.
    """
    keys = ['season', team, 'round']

    # Shift the round forward by one so that the value recorded for round r
    # lines up with the rows of round r + 1. Joining on the real key columns
    # (rather than on a concatenated string) avoids collisions such as
    # team 'x1' + round 1 versus team 'x' + round 11.
    previous = df[keys + [points]].copy()
    previous['round'] = previous['round'] + 1

    new_df = (
        df
        .rename(columns={points: points + '_after_race'})
        .merge(previous, how='left', on=keys)
    )

    # Plain assignment instead of a chained inplace fillna, which is a no-op
    # when pandas Copy-on-Write is enabled.
    new_df[points] = new_df[points].fillna(0)
    return new_df

In [4]:
# Shift each driver standings column to its previous-round value via lookup(),
# then discard the '<column>_after_race' columns that lookup() leaves behind.
drivers = (
    drivers
    .pipe(lookup, 'driver', 'driver_points')
    .pipe(lookup, 'driver', 'driver_wins')
    .pipe(lookup, 'driver', 'driver_standings_pos')
    .drop(columns=[
        'driver_points_after_race',
        'driver_wins_after_race',
        'driver_standings_pos_after_race',
    ])
)

In [5]:
# Shift each constructor standings column to its previous-round value via
# lookup(), then discard the '<column>_after_race' columns it leaves behind.
constructors = (
    constructors
    .pipe(lookup, 'constructor', 'constructor_points')
    .pipe(lookup, 'constructor', 'constructor_wins')
    .pipe(lookup, 'constructor', 'constructor_standings_pos')
    .drop(columns=[
        'constructor_points_after_race',
        'constructor_wins_after_race',
        'constructor_standings_pos_after_race',
    ])
)

In [6]:
# Races joined to results (inner), minus the columns that are dropped here.
df1 = (
    races
    .merge(results, how='inner', on=['season', 'round', 'circuit_id'])
    .drop(columns=['url', 'points', 'status', 'time'])
)

# Left joins: result rows are kept even when no standings row matches.
df2 = df1.merge(drivers, how='left', on=['season', 'round', 'driver'])
df3 = df2.merge(constructors, how='left', on=['season', 'round', 'constructor'])

# Qualifying is matched on season/round/grid (inner join). Both sides carry
# 'driver' and 'constructor' columns, so the merge suffixes them; the
# qualifying-side copies (_y) are dropped and the _x copies get their plain
# names back.
merged_df = (
    df3
    .merge(qualifying, how='inner', on=['season', 'round', 'grid'])
    .drop(columns=['driver_y', 'constructor_y'])
    .rename(columns={'driver_x': 'driver', 'constructor_x': 'constructor'})
)

merged_df.shape

(3191, 24)

In [7]:
# 2021 round numbers to inspect (presumably the rounds with missing
# qualifying data, going by the variable name -- confirm).
missing = [10, 19, 12, 14]

# Qualifying rows for those 2021 rounds.
qualifying[(qualifying['season'] == 2021) & qualifying['round'].isin(missing)]

Unnamed: 0,grid,pos,season,round,driver,constructor,final_time,stage,q_delta
2961,1,2,2021,10,Max Verstappen VER,Red Bull Racing Honda,,q3,0.0
2962,2,1,2021,10,Lewis Hamilton HAM,Mercedes,,q3,0.0
2963,3,3,2021,10,Valtteri Bottas BOT,Mercedes,,q3,0.0
2964,4,4,2021,10,Charles Leclerc LEC,Ferrari,,q3,0.0
2965,5,6,2021,10,Lando Norris NOR,McLaren Mercedes,,q3,0.0
...,...,...,...,...,...,...,...,...,...
3152,16,16,2021,19,Nicholas Latifi LAT,Williams Mercedes,,q1,0.0
3153,17,17,2021,19,George Russell RUS,Williams Mercedes,,q1,0.0
3154,18,18,2021,19,Mick Schumacher MSC,Haas Ferrari,,q1,0.0
3155,19,19,2021,19,Nikita Mazepin MAZ,Haas Ferrari,,q1,0.0


In [8]:
# Per column: True when the column contains at least one null value.
merged_df.isnull().any(axis=0)

season                       False
round                        False
circuit_id                   False
country                      False
lat                          False
long                         False
date                         False
driver                       False
date_of_birth                False
nationality                  False
constructor                  False
grid                         False
podium                       False
fastest_lap                   True
driver_points                False
driver_wins                  False
driver_standings_pos         False
constructor_points           False
constructor_wins             False
constructor_standings_pos    False
pos                          False
final_time                    True
stage                         True
q_delta                      False
dtype: bool

In [9]:
# Merged rows for the first round listed in `missing`, 2021 season.
merged_df[(merged_df['season'] == 2021) & (merged_df['round'] == missing[0])]

Unnamed: 0,season,round,circuit_id,country,lat,long,date,driver,date_of_birth,nationality,...,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,pos,final_time,stage,q_delta
2942,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,hamilton,1985-01-07,British,...,150.0,3.0,2.0,242.0,3.0,2.0,1,,q3,0.0
2943,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,leclerc,1997-10-16,Monegasque,...,62.0,0.0,6.0,122.0,0.0,4.0,4,,q3,0.0
2944,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,bottas,1989-08-28,Finnish,...,92.0,0.0,5.0,242.0,3.0,2.0,3,,q3,0.0
2945,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,norris,1999-11-13,British,...,101.0,0.0,4.0,141.0,0.0,3.0,6,,q3,0.0
2946,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,ricciardo,1989-07-01,Australian,...,40.0,0.0,8.0,141.0,0.0,3.0,7,,q3,0.0
2947,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,sainz,1994-09-01,Spanish,...,60.0,0.0,7.0,122.0,0.0,4.0,9,,q3,0.0
2948,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,alonso,1981-07-29,Spanish,...,20.0,0.0,11.0,32.0,0.0,7.0,11,,q2,0.0
2949,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,stroll,1998-10-29,Canadian,...,14.0,0.0,12.0,44.0,0.0,6.0,15,,q2,0.0
2950,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,ocon,1996-09-17,French,...,12.0,0.0,13.0,32.0,0.0,7.0,13,,q2,0.0
2951,2021,10,silverstone,UK,52.0786,-1.01694,2021-07-18 14:00:00,tsunoda,2000-05-11,Japanese,...,9.0,0.0,14.0,48.0,0.0,5.0,16,,q1,0.0


In [10]:
# Fill/drop nulls.
# NOTE: data is being lost here -- dropna() removes every row that has a null
# in *any* column, and fastest_lap, final_time and stage all contain nulls.

for col in ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points',
            'constructor_wins', 'constructor_standings_pos']:
    # Assign the filled result back instead of calling fillna(inplace=True) on
    # the selected column: that chained form does not update merged_df when
    # pandas Copy-on-Write is enabled, and the integer cast would then fail on
    # any remaining NaN.
    merged_df[col] = merged_df[col].fillna(0).astype('int64')

merged_df = merged_df.dropna()

merged_df.shape

(2896, 24)

In [11]:
# Merged rows for round 1 of the 2014 season.
merged_df[(merged_df['season'] == 2014) & (merged_df['round'] == 1)]

Unnamed: 0,season,round,circuit_id,country,lat,long,date,driver,date_of_birth,nationality,...,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,pos,final_time,stage,q_delta
0,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,rosberg,1985-06-27,German,...,0,0,0,0,0,0,3,104.595,q3,0.364
1,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,kevin_magnussen,1992-10-05,Danish,...,0,0,0,0,0,0,4,105.745,q3,1.514
2,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,button,1980-01-19,British,...,0,0,0,0,0,0,11,104.437,q2,0.206
3,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,alonso,1981-07-29,Spanish,...,0,0,0,0,0,0,5,105.819,q3,1.588
4,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,bottas,1989-08-28,Finnish,...,0,0,0,0,0,0,10,108.147,q3,3.916
5,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,hulkenberg,1987-08-19,German,...,0,0,0,0,0,0,7,106.03,q3,1.799
6,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,raikkonen,1979-10-17,Finnish,...,0,0,0,0,0,0,12,104.494,q2,0.263
7,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,vergne,1990-04-25,French,...,0,0,0,0,0,0,6,105.864,q3,1.633
8,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,kvyat,1994-04-26,Russian,...,0,0,0,0,0,0,8,107.368,q3,3.137
9,2014,1,albert_park,Australia,-37.8497,144.968,2014-03-16 06:00:00,perez,1990-01-26,Mexican,...,0,0,0,0,0,0,16,107.293,q2,3.062


In [12]:
# Import only the name that is used; a wildcard import would also pull
# dateutil's other public names into the notebook namespace.
from dateutil.relativedelta import relativedelta

# driver_age = whole years between the driver's date of birth and the race
# date. Both columns are parsed once up front rather than once per row, and
# date_of_birth is dropped afterwards because it is replaced by the age.
merged_df = merged_df.assign(
    driver_age=[
        relativedelta(race_date, birth_date).years
        for race_date, birth_date in zip(
            pd.to_datetime(merged_df['date']),
            pd.to_datetime(merged_df['date_of_birth']),
        )
    ]
).drop(columns=['date_of_birth'])

In [13]:
# Write the merged table to <path>data/merged.csv without the index column.
merged_df.to_csv(f'{path}data/merged.csv', index=False)