In [81]:
import pandas as pd
import json
from pathlib import Path
from datetime import datetime
import plotly.graph_objects as go
import statistics

In [82]:
# Takes a string lap time and converts it to a nanosecond equivalent
def time_to_nanoseconds(raw_time):
    try:
        dirty = datetime.strptime(raw_time, '%M:%S.%f').time()
        #clean = timedelta(minutes=dirty.minute, seconds=dirty.second, microseconds=dirty.microsecond)
        nanoseconds = (dirty.minute*6e10)+(dirty.second*1e9)+(dirty.microsecond*1e3)
        #nanoseconds = (dirty.microsecond*1000)
        return nanoseconds/1e9
    # Catch NaaN
    except:
        pass
    try:
        dirty = datetime.strptime(raw_time, '%I:%M:%S.%f').time()
        return None
    except:
        return raw_time

In [83]:
# find percent different between driver time and median time
def percent_difference(driver_time,median_time):
    diff = abs((driver_time - median_time)/((driver_time + median_time)/2))*100
    if driver_time > median_time:
        return -abs(diff)
    return diff

Pulled from data_cleaning.ipynb
Creates two data frames

original_df = holds drivers and laps in time format
working_df = converts driver times to nanosecond equivalents for graphing and math purposes

In [84]:
filepath = Path('../data/races/2012/1.json')
with open(filepath, 'r', encoding='utf-8') as infile: jsondata = json.load(infile)

# Gets list of all drivers at the start of the race, sorts them alphabetically
drivers = []
for driver in jsondata['Laps'][0]['Timings']:
    drivers.append(driver['driverId'])

drivers.sort()

# Create original dataframe which just holds a column of the drivers
original_df = pd.DataFrame()
original_df['Drivers'] = drivers

# Goes through a race.json and adds the times for all laps per driver
for lap in jsondata['Laps']:
    lap_data = []
    
    for _ in range(0, len(drivers)):
        try:
            driver = next(item for item in lap['Timings'] if item['driverId']==drivers[_])
            # Create a tuple of driver and lap time
            lap_data.append([driver['driverId'],driver['time']])
        except:
            lap_data.append([drivers[_],None])

    # Sort it so it matches the rows
    lap_data.sort(key = lambda x: x[0])
    # Add new column of lap time
    time_list = list(list(zip(*lap_data))[1])
    original_df[f"Lap {lap['number']}"] = time_list


Create Working DF

In [85]:
working_df = original_df.copy()
working_df

Unnamed: 0,Drivers,Lap 1,Lap 2,Lap 3,Lap 4,Lap 5,Lap 6,Lap 7,Lap 8,Lap 9,...,Lap 49,Lap 50,Lap 51,Lap 52,Lap 53,Lap 54,Lap 55,Lap 56,Lap 57,Lap 58
0,alonso,1:44.733,1:35.866,1:34.081,1:34.186,1:34.220,1:35.651,1:34.207,1:34.465,1:33.893,...,1:31.317,1:31.140,1:30.506,1:30.277,1:30.593,1:30.675,1:30.894,1:31.025,1:31.019,1:33.838
1,bruno_senna,2:16.893,1:42.348,1:36.241,1:36.368,1:35.740,1:35.524,1:35.434,1:35.925,1:35.029,...,1:30.855,1:31.999,1:33.469,1:31.119,,,,,,
2,button,1:39.264,1:33.414,1:33.350,1:33.131,1:32.984,1:33.117,1:33.244,1:33.124,1:33.394,...,1:30.081,1:29.858,1:30.049,1:30.047,1:30.124,1:29.697,1:29.645,1:29.187,1:29.738,1:30.846
3,glock,1:50.819,1:38.975,1:38.691,1:37.576,1:37.679,1:37.845,1:39.003,1:37.121,1:36.844,...,1:34.497,1:34.407,1:34.723,1:34.784,1:34.451,1:39.855,1:37.174,1:39.441,1:49.259,
4,grosjean,1:43.730,,,,,,,,,...,,,,,,,,,,
5,hamilton,1:40.622,1:34.297,1:33.566,1:33.347,1:33.446,1:33.380,1:33.315,1:33.461,1:33.561,...,1:30.179,1:30.019,1:30.052,1:29.927,1:30.188,1:30.041,1:29.696,1:29.641,1:29.538,1:29.867
6,kobayashi,1:46.880,1:37.177,1:35.312,1:37.945,1:34.491,1:34.858,1:34.529,1:34.347,1:35.434,...,1:31.767,1:31.235,1:31.086,1:32.677,1:32.316,1:31.699,1:30.620,1:31.190,1:32.628,1:35.335
7,kovalainen,1:53.018,1:37.690,1:38.084,1:37.656,1:37.540,1:37.799,1:35.634,1:35.239,1:35.140,...,,,,,,,,,,
8,maldonado,1:44.212,1:36.857,1:34.569,1:34.068,1:40.441,1:34.096,1:34.874,1:34.983,1:34.510,...,1:31.460,1:31.243,1:30.641,1:30.316,1:30.254,1:30.873,1:30.921,1:30.713,1:31.075,
9,massa,1:46.714,1:36.908,1:35.111,1:35.243,1:35.208,1:34.631,1:34.628,1:35.261,1:36.496,...,,,,,,,,,,


In [86]:

# Convert each string laptime to nanosecond equivolent
for col in working_df.columns[1:]: working_df[col] = working_df[col].apply(lambda x : time_to_nanoseconds(x))

In [87]:
working_df

Unnamed: 0,Drivers,Lap 1,Lap 2,Lap 3,Lap 4,Lap 5,Lap 6,Lap 7,Lap 8,Lap 9,...,Lap 49,Lap 50,Lap 51,Lap 52,Lap 53,Lap 54,Lap 55,Lap 56,Lap 57,Lap 58
0,alonso,104.733,95.866,94.081,94.186,94.22,95.651,94.207,94.465,93.893,...,91.317,91.14,90.506,90.277,90.593,90.675,90.894,91.025,91.019,93.838
1,bruno_senna,136.893,102.348,96.241,96.368,95.74,95.524,95.434,95.925,95.029,...,90.855,91.999,93.469,91.119,,,,,,
2,button,99.264,93.414,93.35,93.131,92.984,93.117,93.244,93.124,93.394,...,90.081,89.858,90.049,90.047,90.124,89.697,89.645,89.187,89.738,90.846
3,glock,110.819,98.975,98.691,97.576,97.679,97.845,99.003,97.121,96.844,...,94.497,94.407,94.723,94.784,94.451,99.855,97.174,99.441,109.259,
4,grosjean,103.73,,,,,,,,,...,,,,,,,,,,
5,hamilton,100.622,94.297,93.566,93.347,93.446,93.38,93.315,93.461,93.561,...,90.179,90.019,90.052,89.927,90.188,90.041,89.696,89.641,89.538,89.867
6,kobayashi,106.88,97.177,95.312,97.945,94.491,94.858,94.529,94.347,95.434,...,91.767,91.235,91.086,92.677,92.316,91.699,90.62,91.19,92.628,95.335
7,kovalainen,113.018,97.69,98.084,97.656,97.54,97.799,95.634,95.239,95.14,...,,,,,,,,,,
8,maldonado,104.212,96.857,94.569,94.068,100.441,94.096,94.874,94.983,94.51,...,91.46,91.243,90.641,90.316,90.254,90.873,90.921,90.713,91.075,
9,massa,106.714,96.908,95.111,95.243,95.208,94.631,94.628,95.261,96.496,...,,,,,,,,,,


Shows all the times from drivers per lap

Determining if average is a good measure, since some laps drivers take pit stop, or a car might malfunction, etc... alot of factors can affect the average of a lap time

In [88]:
fig = go.Figure()
for col in working_df.columns[1:]:
    fig.add_trace(go.Box(y=working_df[col].values.tolist(), name=col, boxmean=True))
    fig.update_layout(title='All Lap times per lap')
fig.update_yaxes(type="log")
fig.show()

Plot the Median of each lap

In [89]:
median = working_df[1:].median(axis=0, skipna=True).tolist()
average = working_df[1:].mean(axis=0, skipna=True).tolist()

fig = go.Figure()
fig.add_trace(go.Box(y=median, name='Median',boxpoints='all',boxmean=True))
fig.add_trace(go.Box(y=average, name='Average',boxpoints='all',boxmean=True))
fig.update_layout(title='Average & Median for all laps', autosize=False,height=1000,)
# fig.update_yaxes(type="log")
fig.show()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Compare a single drivers lap times to the average lap times

This is a on a per lap basis, each lap time is only compared to that individuals average lap time, this accounts for laps were there might be a crash or a pace car since that affects all drivers times

Visuallizes how we assign consistancy points

*   Times that are below the average are positivly rewarded
*   Times that are above the average are negativly rewarded

In [90]:
average = working_df[1:].mean(axis=0, skipna=True).tolist()
median = working_df[1:].median(axis=0, skipna=True).tolist()
driver = working_df.loc[working_df['Drivers'] == 'alonso']
driver1 = working_df.loc[working_df['Drivers'] == 'hamilton']

fig = go.Figure()
# Average Line
fig.add_trace(go.Line(x=driver.columns[1:].values.flatten().tolist(),y=average, name='Average', mode='lines', line = dict(dash='dash')))
fig.add_trace(go.Line(x=driver.columns[1:].values.flatten().tolist(),y=median, name='Median', mode='lines'))
# Drivers Line
# fig.add_trace(go.Line(x=driver.columns[1:].values.flatten().tolist(),y=driver.iloc[0].values[1:].flatten().tolist(), name='Alonso'))
# fig.add_trace(go.Line(x=driver1.columns[1:].values.flatten().tolist(),y=driver1.iloc[0].values[1:].flatten().tolist(), name='Hamilton'))
fig.update_layout(title='Average Time vs Median Time')
fig.update_yaxes(type="log")
fig.show()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




Boxplot that compares all drivers to each other.

Each point on a driver is a laptime, showing their range. 

*   The higher the points, slower the time
*   The lower the points, faster the time

Each driver is expected to have some outliers due to accidents on track, but its the consistancy of the range that shows how good a driver is.

This does not factor in the team of the driver. There are always good and bad teams which is a big factor on how fast a driver is, but is not a big factor in terms of consistancy. They do contribute to a small degree, but this graph does not weight that in

Also, the average line is the total lap average across the entire race. Because of this, any accidents that affect lap times will impact the average. For this we used the median

In [91]:
average = working_df[1:].mean(axis=0, skipna=True).tolist()
average_list = []
for _ in range(len(average)): average_list.append(statistics.mean(average))
median = working_df[1:].median(axis=0, skipna=True).tolist()
median_list = []
for _ in range(len(median)): median_list.append(statistics.median(median))

fig = go.Figure()
fig.add_trace(go.Box(x=drivers,y=average_list, name='Average',marker_color = 'black'))
fig.add_trace(go.Box(x=drivers,y=median_list, name='Median',marker_color = 'black'))
for driver in drivers:
    temp = working_df.loc[working_df['Drivers'] == driver]
    lap_time = temp.iloc[0].values[1:].flatten().tolist()
    fig.add_trace(go.Box(y=lap_time, name=driver))
fig.update_yaxes(type="log")
fig.show()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Getting a Consistancy Score

*   One score based upon the average
*   One score based upon the median

In [92]:
def sort_scores(score_list):
    cleaned_list = []
    nan_list = []
    
    for score in score_list:
        if str(score[0]) != 'nan':
            cleaned_list.append(score)
        else:
            nan_list.append(score)
    
    cleaned_list = sorted(cleaned_list,key=lambda x: x[0], reverse=True)
    # if cleaned_list[-1][0] < 0:
    #     diff = round(abs(cleaned_list[-1][0])+cleaned_list[0][0],5)
    # else:
    #     diff = round(abs(cleaned_list[-1][0]-cleaned_list[0][0]),5)
    cleaned_list.extend(nan_list)
    return cleaned_list

In [93]:
driver_row = working_df.loc[working_df['Drivers'] == 'hamilton']
driver_median = driver_row.values[0][1:].tolist()
driver_median = statistics.median(driver_median)
driver_median

93.1335

In [94]:
average_total = []
median_total = []

for driver in drivers:
    score_avg = 0
    score_med = 0
    driver_row = working_df.loc[working_df['Drivers'] == driver]
    for lap in driver_row.columns[1:]:
        # score_avg += percent_difference(driver_row[lap].values[0],working_df[lap].mean())
        score_med += percent_difference(driver_row[lap].values[0],working_df[lap].median())
    # average_total.append((score_avg,driver))
    median_total.append((score_med,driver))

# average_total, avg_diff = sort_scores(average_total)
median_total = sort_scores(median_total)

# print(f'{"Drivers": <25}Range: {avg_diff: <15}Range: {med_diff: <15}')
for driver in range(len(median_total)):
    print(f'{median_total[driver][1]: <20}Score:{round(median_total[driver][0],5): >15}')


button              Score:       17.76838
vettel              Score:       15.65909
hamilton            Score:        8.89086
webber              Score:        6.02738
alonso              Score:      -17.72023
raikkonen           Score:      -33.16688
perez               Score:      -36.74718
kobayashi           Score:      -38.41618
ricciardo           Score:      -39.16071
vergne              Score:       -39.5187
resta               Score:      -43.24014
rosberg             Score:      -54.65684
bruno_senna         Score:            nan
glock               Score:            nan
grosjean            Score:            nan
kovalainen          Score:            nan
maldonado           Score:            nan
massa               Score:            nan
michael_schumacher  Score:            nan
petrov              Score:            nan
pic                 Score:            nan
