In [57]:
import pandas as pd

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '~/Documents/GitHub/f1-analytics/'
    # path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'

In [58]:
def format_time(x):
    if not any(i in x for i in ['DNF', 'DNS']):
        if ':' in x:
            return round(float(str(x).split(':')[1]) + (60 * float(str(x).split(':')[0])), 3) if x != 0 else 0
        else:
            return(round(float(x), 3))
    else:
        return x

In [95]:
def reverse_name(x):
    if str(x) == 'guanyu_zhou':
        name = x.split('_')
        return name[1] + '_' + name[0]
    else:
        return x

In [96]:
races = pd.read_csv(path+'data/races.csv')
qualifying = pd.read_csv(path+'data/qualifying.csv')
constructors = pd.read_csv(path+'data/constructor_standings.csv')
drivers = pd.read_csv(path+'data/driver_standings.csv')
results = pd.read_csv(path+'data/results.csv')
starting_grid = pd.read_csv(path+'data/starting-grid.csv')

### Redundant until new weather collection method is implemented
# weather = pd.read_csv(path+'data/weather.csv')

In [97]:
starting_grid.query('season == 2022 & round ==1')

Unnamed: 0,grid,driver,season,round
3229,1,charles_leclerc,2022,1
3230,2,max_verstappen,2022,1
3231,3,carlos_sainz,2022,1
3232,4,sergio_perez,2022,1
3233,5,lewis_hamilton,2022,1
3234,6,valtteri_bottas,2022,1
3235,7,kevin_magnussen,2022,1
3236,8,fernando_alonso,2022,1
3237,9,george_russell,2022,1
3238,10,pierre_gasly,2022,1


In [62]:
results.query('grid == 0').shape

(50, 13)

In [23]:
laps1 = pd.read_csv(path+'data/laps-2014-2017.csv')
laps2 = pd.read_csv(path+'data/laps-2017-22.csv')

In [5]:
laps = pd.concat([laps1, laps2]).drop_duplicates()

laps.shape

(168937, 6)

In [6]:
laps[laps.duplicated()]

Unnamed: 0,season,round,lap,driver,position,time


In [7]:
laps['time'] = laps.time.apply(lambda x: format_time(str(x)))

In [8]:
# laps.to_csv(path+'data/laps.csv', index=False)

In [79]:
def lookup (df, team, points):
    df['lookup1'] = df.season.astype(str) + df[team] + df['round'].astype(str)
    df['lookup2'] = df.season.astype(str) + df[team] + (df['round']-1).astype(str)
    new_df = df.merge(df[['lookup1', points]], how = 'left', left_on='lookup2',right_on='lookup1')
    new_df.drop(['lookup1_x', 'lookup2', 'lookup1_y'], axis = 1, inplace = True)
    new_df.rename(columns = {points+'_x': points+'_after_race', points+'_y': points}, inplace = True)
    new_df[points].fillna(0, inplace = True)
    return new_df

In [98]:
for col in ['driver_points', 'driver_wins', 'driver_standings_pos']:
    drivers = lookup(drivers, 'driver', col)

In [99]:
for col in ['constructor_points', 'constructor_wins', 'constructor_standings_pos']:
    constructors = lookup(constructors, 'constructor', col)

In [115]:
df1 = pd.merge(races, results, how='inner', on=['season', 'round', 'circuit_id']).drop(['url','points', 'status', 'time', 'grid'], axis=1)
df2 = pd.merge(df1, drivers, how='left', on=['season', 'round', 'driver']) 
df3 = pd.merge(df2, constructors, how='left', on=['season', 'round', 'constructor'])
df4 = pd.merge(df3, qualifying, how='inner', on=['season', 'round', 'circuit_id', 'driver']).drop(['fastest_lap'], axis=1)
df4.driver = df4.driver.apply(lambda x: reverse_name(x))

final_df = pd.merge(df4, starting_grid, how='inner', on=['season', 'round', 'driver'])
final_df.rename(columns = {'grid': 'starting_grid'}, inplace=True)

final_df.rename(columns = {'fastest_time': 'qual_time', 'grid': 'qualifying_pos'}, inplace=True)

final_df.shape

(3651, 29)

In [116]:
starting_grid.query('season == 2022 & round == 1')

Unnamed: 0,grid,driver,season,round
3229,1,charles_leclerc,2022,1
3230,2,max_verstappen,2022,1
3231,3,carlos_sainz,2022,1
3232,4,sergio_perez,2022,1
3233,5,lewis_hamilton,2022,1
3234,6,valtteri_bottas,2022,1
3235,7,kevin_magnussen,2022,1
3236,8,fernando_alonso,2022,1
3237,9,george_russell,2022,1
3238,10,pierre_gasly,2022,1


In [59]:
# final_df['fastest_lap'] = final_df['fastest_lap'].apply(lambda x: format_time(str(x)))

In [60]:
# final_df[['fastest_lap', 'qual_time']].head()

Unnamed: 0,fastest_lap,qual_time
0,92.478,104.595
1,93.066,105.745
2,92.917,104.437
3,93.186,105.819
4,92.616,108.147


In [117]:
final_df.columns

Index(['season', 'round', 'circuit_id', 'country', 'lat', 'long', 'date',
       'driver', 'date_of_birth', 'nationality', 'constructor', 'podium',
       'driver_points_after_race', 'driver_wins_after_race',
       'driver_standings_pos_after_race', 'driver_points', 'driver_wins',
       'driver_standings_pos', 'constructor_points_after_race',
       'constructor_wins_after_race', 'constructor_standings_pos_after_race',
       'constructor_points', 'constructor_wins', 'constructor_standings_pos',
       'grid_x', 'qual_time', 'stage', 'q_delta', 'grid_y'],
      dtype='object')

In [118]:
null_cols = final_df.columns[final_df.isna().any()].tolist()

null_rows = final_df[final_df.isna().any(axis=1)]
null_rows = null_rows[['season', 'round', 'driver', 'podium', 'qual_time', 'stage']]

null_rows.shape

(32, 6)

In [119]:
# fill/drop nulls
### Data is being lossed here

for col in ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points', 
            'constructor_wins', 'constructor_standings_pos']:
            
    final_df[col].fillna(0, inplace=True)
    final_df[col] = final_df[col].map(lambda x: int(x))
    
final_df.dropna(inplace=True)

final_df.shape

(3619, 29)

In [20]:
final_df.to_csv(path+'data/merged.csv', index=False)