In [335]:
import pandas as pd
import json
from pathlib import Path
from datetime import datetime
import plotly.graph_objects as go
import statistics

In [336]:
# Takes a string lap time and converts it to a nanosecond equivalent
def time_to_nanoseconds(raw_time):
    try:
        dirty = datetime.strptime(raw_time, '%M:%S.%f').time()
        #clean = timedelta(minutes=dirty.minute, seconds=dirty.second, microseconds=dirty.microsecond)
        nanoseconds = (dirty.minute*6e10)+(dirty.second*1e9)+(dirty.microsecond*1e3)
        #nanoseconds = (dirty.microsecond*1000)
        return nanoseconds/1e9
    # Catch NaaN
    except:
        return raw_time

In [337]:
# find percent different between driver time and average time
def percent_difference(driver_time,average_time):
    diff = abs((driver_time - average_time)/((driver_time + average_time)/2))*100
    if driver_time > average_time:
        return -abs(diff)
    return diff

Pulled from data_cleaning.ipynb
Creates two data frames

original_df = holds drivers and laps in time format
working_df = converts driver times to nanosecond equivalents for graphing and math purposes

In [338]:
filepath = Path('../data/races/2021/22.json')
with open(filepath, 'r', encoding='utf-8') as infile: jsondata = json.load(infile)

# Gets list of all drivers at the start of the race, sorts them alphabetically
drivers = []
for driver in jsondata['Laps'][0]['Timings']:
    drivers.append(driver['driverId'])

drivers.sort()

# Create original dataframe which just holds a column of the drivers
original_df = pd.DataFrame()
original_df['Drivers'] = drivers

# Goes through a race.json and adds the times for all laps per driver
for lap in jsondata['Laps']:
    lap_data = []
    
    for _ in range(0, len(drivers)):
        try:
            driver = next(item for item in lap['Timings'] if item['driverId']==drivers[_])
            # Create a tuple of driver and lap time
            lap_data.append([driver['driverId'],driver['time']])
        except:
            lap_data.append([drivers[_],None])

    # Sort it so it matches the rows
    lap_data.sort(key = lambda x: x[0])
    # Add new column of lap time
    time_list = list(list(zip(*lap_data))[1])
    original_df[f"Lap {lap['number']}"] = time_list


Create Working DF

In [339]:
working_df = original_df.copy()
# Convert each string laptime to nanosecond equivolent
for col in working_df.columns[1:]: working_df[col] = working_df[col].apply(lambda x : time_to_nanoseconds(x))

Shows all the times from drivers per lap

Determining if average is a good measure, since some laps drivers take pit stop, or a car might malfunction, etc... alot of factors can affect the average of a lap time

In [340]:
fig = go.Figure()
for col in working_df.columns[1:]:
    fig.add_trace(go.Box(y=working_df[col].values.tolist(), name=col, boxmean=True))
    fig.update_layout(title='All Lap times per lap')
fig.update_yaxes(type="log")
fig.show()

Plot the Median of each lap

In [341]:
median = working_df[1:].median(axis=0, skipna=True).tolist()
average = working_df[1:].mean(axis=0, skipna=True).tolist()

fig = go.Figure()
fig.add_trace(go.Box(x=median, name='Median',boxpoints='all',boxmean=True))
fig.add_trace(go.Box(x=average, name='Average',boxpoints='all',boxmean=True))
fig.update_layout(title='Average & Median for all laps')
fig.update_xaxes(type="log")
fig.show()







Compare a single drivers lap times to the average lap times

This is a on a per lap basis, each lap time is only compared to that individuals average lap time, this accounts for laps were there might be a crash or a pace car since that affects all drivers times

Visuallizes how we assign consistancy points

*   Times that are below the average are positivly rewarded
*   Times that are above the average are negativly rewarded

In [342]:
average = working_df[1:].mean(axis=0, skipna=True).tolist()
median = working_df[1:].median(axis=0, skipna=True).tolist()
driver = working_df.loc[working_df['Drivers'] == 'alonso']

fig = go.Figure()
# Average Line
fig.add_trace(go.Line(x=driver.columns[1:].values.flatten().tolist(),y=average, name='Average', mode='lines', line = dict(dash='dash')))
fig.add_trace(go.Line(x=driver.columns[1:].values.flatten().tolist(),y=median, name='Median', mode='lines', line = dict(dash='dash')))
# Drivers Line
fig.add_trace(go.Line(x=driver.columns[1:].values.flatten().tolist(),y=driver.iloc[0].values[1:].flatten().tolist(), name='Alonso'))
fig.update_layout(title='Alonso time vs Average Time vs Median Time')
fig.update_yaxes(type="log")
fig.show()






plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




Boxplot that compares all drivers to each other.

Each point on a driver is a laptime, showing their range. 

*   The higher the points, slower the time
*   The lower the points, faster the time

Each driver is expected to have some outliers due to accidents on track, but its the consistancy of the range that shows how good a driver is.

This does not factor in the team of the driver. There are always good and bad teams which is a big factor on how fast a driver is, but is not a big factor in terms of consistancy. They do contribute to a small degree, but this graph does not weight that in

Also, the average line is the total lap average across the entire race. Because of this, any accidents that affect lap times will impact the average. For this we used the median

In [343]:
average = working_df[1:].mean(axis=0, skipna=True).tolist()
average_list = []
for _ in range(len(average)): average_list.append(statistics.mean(average))
median = working_df[1:].median(axis=0, skipna=True).tolist()
median_list = []
for _ in range(len(median)): median_list.append(statistics.median(median))

fig = go.Figure()
fig.add_trace(go.Box(x=drivers,y=average_list, name='Average',marker_color = 'black'))
fig.add_trace(go.Box(x=drivers,y=median_list, name='Median',marker_color = 'black'))
for driver in drivers:
    temp = working_df.loc[working_df['Drivers'] == driver]
    lap_time = temp.iloc[0].values[1:].flatten().tolist()
    fig.add_trace(go.Box(y=lap_time, name=driver))
fig.update_yaxes(type="log")
fig.show()







Getting a Consistancy Score

*   One score based upon the average
*   One score based upon the median

In [344]:
def sort_scores(score_list):
    cleaned_list = []
    nan_list = []
    
    for score in score_list:
        if str(score[0]) != 'nan':
            cleaned_list.append(score)
        else:
            nan_list.append(score)
    
    cleaned_list = sorted(cleaned_list,key=lambda x: x[0], reverse=True)
    if cleaned_list[-1][0] < 0:
        diff = round(abs(cleaned_list[-1][0])+cleaned_list[0][0],5)
    else:
        diff = round(abs(cleaned_list[-1][0]-cleaned_list[0][0]),5)
    cleaned_list.extend(nan_list)
    return [cleaned_list,diff]

In [345]:
average_total = []
median_total = []

for driver in drivers:
    score_avg = 0
    score_med = 0
    driver_row = working_df.loc[working_df['Drivers'] == driver]
    for lap in driver_row.columns[1:]:
        score_avg += percent_difference(driver_row[lap].values[0],working_df[lap].mean())
        score_med += percent_difference(driver_row[lap].values[0],working_df[lap].median())
    average_total.append((score_avg,driver))
    median_total.append((score_med,driver))

average_total, avg_diff = sort_scores(average_total)
median_total, med_diff = sort_scores(median_total)

print(f'{"Drivers": <25}Range: {avg_diff: <15}Range: {med_diff: <15}')
for driver in range(len(average_total)):
    print(f'{average_total[driver][1]: <25}Avg:   {round(average_total[driver][0],5): <15}Med:   {round(median_total[driver][0],5): <15}')


Drivers                  Range: 90.36525       Range: 90.82972       
max_verstappen           Avg:   84.56255       Med:   51.73298       
hamilton                 Avg:   84.02651       Med:   51.31142       
sainz                    Avg:   53.66626       Med:   20.66184       
bottas                   Avg:   50.53349       Med:   17.50519       
tsunoda                  Avg:   50.03063       Med:   16.949         
gasly                    Avg:   46.5638        Med:   13.53458       
norris                   Avg:   10.82656       Med:   -22.47332      
alonso                   Avg:   4.30588        Med:   -29.00701      
leclerc                  Avg:   0.88335        Med:   -32.43202      
ocon                     Avg:   0.49227        Med:   -32.82262      
vettel                   Avg:   -5.80271       Med:   -39.09674      
giovinazzi               Avg:   nan            Med:   nan            
latifi                   Avg:   nan            Med:   nan            
mick_schumacher     