In [144]:
import pandas as pd
import numpy as np
import os
import json
from pathlib import Path
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go

In [3]:
def path_exist(filepath):
    '''Checks if a filepath exist or not'''
    if os.path.exists(filepath):
        return True
    return False

In [4]:
def get_current_drivers():
    '''Returns list of current years drivers'''
    filepath = Path('../data/current_drivers.json')
    jsondata = dict()
    # Checks if data is already stored
    with open(filepath, 'r', encoding='utf-8') as infile: jsondata = json.load(infile)

    current_drivers = list()
    for driver in jsondata:
        current_drivers.append(driver['driverId'])
    return current_drivers

current_drivers = get_current_drivers()
current_drivers.sort()

In [5]:
def get_current_circuits():
    '''Returns list of current circuits'''
    filepath = Path('../data/scheduled/2022.json')
    jsondata = dict()
    # Checks if data is already stored
    with open(filepath, 'r', encoding='utf-8') as infile: jsondata = json.load(infile)
    current_circuits = list()
    for circuit in jsondata:
        current_circuits.append(circuit['Circuit']['circuitId'])
    return current_circuits
current_circuits = get_current_circuits()

In [6]:
def get_round(year,circuitId):
    filepath = Path(f'../data/scheduled/{year}.json')
    jsondata = dict()
    with open(filepath, 'r', encoding='utf-8') as infile: jsondata = json.load(infile)
    rounds = list()
    for circuit in jsondata:
        if circuit['Circuit']['circuitId'] == circuitId:
            return circuit['round']
    return None

In [7]:
# Takes a string lap time and converts it to a nanosecond equivalent
def time_to_nanoseconds(raw_time):
    try:
        dirty = datetime.strptime(raw_time, '%M:%S.%f').time()
        #clean = timedelta(minutes=dirty.minute, seconds=dirty.second, microseconds=dirty.microsecond)
        nanoseconds = (dirty.minute*6e10)+(dirty.second*1e9)+(dirty.microsecond*1e3)
        #nanoseconds = (dirty.microsecond*1000)
        return nanoseconds/1e9
    # Catch NaaN
    except:
        return None

In [8]:
# find percent different between driver time and average time
def percent_difference(driver_time,average_time):
    diff = abs((driver_time - average_time)/((driver_time + average_time)/2))*100
    if driver_time > average_time:
        return -abs(diff)
    return diff

In [77]:
def sort_scores(score_list):
    cleaned_list = []
    
    for score in score_list:
        cleaned_list.append(score)
    
    cleaned_list = sorted(cleaned_list,key=lambda x: x[1], reverse=True)
    return cleaned_list

In [135]:
def get_performance_score(year,round,current_drivers):
    filepath = Path(f'../data/races/{year}/{round}.csv')
    if path_exist(filepath):
        original_df = pd.read_csv(filepath)
    else:
        filepath = Path(f'../data/races/{year}/{round}.json')
        with open(filepath, 'r', encoding='utf-8') as infile: jsondata = json.load(infile)

        # Gets list of all drivers at the start of the race, sorts them alphabetically
        drivers = []
        for driver in jsondata['Laps'][0]['Timings']:
            drivers.append(driver['driverId'])

        drivers.sort()

        # Create original dataframe which just holds a column of the drivers
        original_df = pd.DataFrame()
        original_df['Drivers'] = drivers

        # Goes through a race.json and adds the times for all laps per driver
        for lap in jsondata['Laps']:
            lap_data = []
            
            for _ in range(0, len(drivers)):
                try:
                    driver = next(item for item in lap['Timings'] if item['driverId']==drivers[_])
                    # Create a tuple of driver and lap time
                    lap_data.append([driver['driverId'],driver['time']])
                except:
                    lap_data.append([drivers[_],None])

            # Sort it so it matches the rows
            lap_data.sort(key = lambda x: x[0])
            # Add new column of lap time
            time_list = list(list(zip(*lap_data))[1])
            original_df[f"Lap {lap['number']}"] = time_list
            original_df.to_csv(Path(f'../data/races/{year}/{round}.csv'),index=False)

    # Copy Original DF
    working_df = original_df.copy()

    # Convert all time columns to a nanosecond representation (actually seconds)
    for col in working_df.columns[1:]: working_df[col] = working_df[col].apply(lambda x : time_to_nanoseconds(x))
    

    # Track scores for current drivers
    score_list = []
    for driver in current_drivers:
        score_avg = 50
        driver_row = working_df.loc[working_df['Drivers'] == driver]
        # Skips drivers who did not participate in this race
        if not driver_row.empty:
            # Skips driver column
            for lap in driver_row.columns[1:]:
                score_avg += percent_difference(driver_row[lap].values[0],working_df[lap].median())
                # Handle Nan times
                if np.isnan(score_avg):
                    score_avg = 0
        else:
            score_avg = 0
        score_list.append(score_avg)
    return score_list
        


Get Performance Score for given track

In [139]:
temp = pd.DataFrame()
temp['Driver'] = ['latifi','hamilton']

for year in range(2012,2022):
    if year < 2020:
        temp[year] = ['False','True']
    else:
        temp[year] = ['True','True']

temp
    

Unnamed: 0,Driver,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,latifi,False,False,False,False,False,False,False,False,True,True
1,hamilton,True,True,True,True,True,True,True,True,True,True


In [11]:
score_list = get_performance_score(2012,1,current_drivers)

print(f'{"Season: 2012":<18}\tRound: 1')
print(f'{"Driver":<18}\tScore')
for _ in range(len(current_drivers)):
    print(f'{current_drivers[_]:<18}\t{score_list[_]}')


Season: 2012      	Round: 1
Driver            	Score
albon             	0
alonso            	32.279772535402884
bottas            	0
gasly             	0
hamilton          	58.89086459151089
kevin_magnussen   	0
latifi            	0
leclerc           	0
max_verstappen    	0
mick_schumacher   	0
norris            	0
ocon              	0
perez             	13.252821312263636
ricciardo         	10.839285449475764
russell           	0
sainz             	0
stroll            	0
tsunoda           	0
vettel            	65.65908803878693
zhou              	0


Get Performance Score for all circuits

In [44]:
pp_df = pd.DataFrame()
pp_df['Driver'] = current_drivers

for circuitId in current_circuits:
    # Blank score per circuit
    score_list = [0]*len(current_drivers)
    for year in range(2012,2022):
        round = get_round(year, circuitId)
        # Not all circuits are in a year
        if round != None:
            scores = get_performance_score(year,round,current_drivers)
            #combine scores
            score_list = [score_list[i] + scores[i] for i in range(len(score_list))]
    pp_df[circuitId] = score_list

pp_df

Unnamed: 0,Driver,bahrain,jeddah,albert_park,imola,miami,catalunya,monaco,baku,villeneuve,...,hungaroring,spa,zandvoort,monza,marina_bay,suzuka,americas,rodriguez,interlagos,yas_marina
0,albon,89.103159,0.0,0.0,7.598544,0,71.043959,27.213295,0.0,0.0,...,35.792931,103.396781,0.0,77.586709,33.741853,79.542212,55.691723,102.02111,16.987207,207.156396
1,alonso,71.676867,0.0,190.424411,-15.106234,0,215.308146,260.309117,73.410707,231.554612,...,329.283765,193.660083,0.0,152.253192,284.760054,40.55067,228.861667,0.0,237.09537,159.528233
2,bottas,453.832656,85.697144,364.190539,80.799936,0,685.263162,277.871271,277.05272,529.965931,...,349.57006,516.095875,126.956188,619.542243,285.312233,493.460558,490.834636,389.897135,340.759251,594.480013
3,gasly,134.499444,63.277383,0.0,-28.781454,0,88.51015,229.811209,46.224608,0.0,...,103.262744,152.624251,0.0,69.488936,23.466647,0.0,0.0,74.705417,14.429883,113.271328
4,hamilton,778.088462,112.600267,654.949432,192.534699,0,1088.833158,793.841292,363.696561,915.717513,...,1129.77685,703.027946,175.536347,789.788483,528.132249,778.165133,1012.221355,640.161554,723.226151,943.357389
5,kevin_magnussen,33.591365,0.0,-51.403464,0.0,0,67.514199,65.668659,44.472498,11.073528,...,-6.827716,93.144069,0.0,36.353996,-8.312951,19.76404,32.606955,0.0,24.995591,7.510592
6,latifi,0.0,12.421981,0.0,34.046139,0,0.0,0.0,30.353808,0.0,...,0.867753,69.256529,0.0,50.957697,0.0,0.0,0.0,0.0,0.0,0.0
7,leclerc,94.418262,70.316228,64.867981,224.19105,0,178.835802,0.0,128.583367,156.261442,...,113.007413,197.069461,0.0,179.862385,52.833605,0.0,161.911496,173.791183,127.07191,177.035386
8,max_verstappen,272.363704,105.659382,312.203597,234.621749,0,614.52033,402.019513,143.379764,305.800506,...,474.64751,307.054515,200.409732,165.53963,309.77962,355.007048,549.37717,773.06107,446.874338,675.81401
9,mick_schumacher,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.15464,0.0,...,0.0,45.182533,0.0,-17.334905,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
pp_df.to_csv(Path('../data/pp.csv'),index=False)

Normalizing Performance Score with zscore

In [47]:
norm_df = pp_df.copy()

# Take a drivers score - mean of everybody's / STD of all
for circuit in norm_df.columns[1:]:
    mean = norm_df[circuit].mean()
    std = norm_df[circuit].std()
    norm_df[circuit] = (norm_df[circuit]-mean)/std
norm_df

Unnamed: 0,Driver,bahrain,jeddah,albert_park,imola,miami,catalunya,monaco,baku,villeneuve,...,hungaroring,spa,zandvoort,monza,marina_bay,suzuka,americas,rodriguez,interlagos,yas_marina
0,albon,-0.329396,-0.894404,-0.586893,-0.709745,,-0.480867,-0.736162,-0.981789,-0.6173,...,-0.525295,-0.567377,-0.401698,-0.574852,-0.563838,-0.318448,-0.55164,-0.234812,-0.637925,-0.100402
1,alonso,-0.414551,-0.894404,0.302082,-0.965761,,-0.030848,0.226083,-0.35577,0.186877,...,0.413181,-0.129549,-0.401698,-0.237928,0.693438,-0.477317,0.079416,-0.655072,0.386837,-0.272923
2,bottas,1.4529,1.243828,1.11329,0.115663,,1.435133,0.298582,1.380811,1.223243,...,0.478049,1.434447,1.626452,1.87066,0.696204,1.368041,1.034082,0.951046,0.869467,1.302574
3,gasly,-0.107561,0.684431,-0.586893,-1.11996,,-0.426382,0.100185,-0.587603,-0.6173,...,-0.309551,-0.328596,-0.401698,-0.611392,-0.615303,-0.642538,-0.754588,-0.347335,-0.649831,-0.440476
4,hamilton,3.037416,1.915088,2.470665,1.375567,,2.694032,2.428562,2.119676,2.562937,...,2.972864,2.341173,2.402529,2.638876,1.912417,2.528055,2.934088,1.98197,2.650126,2.566289
5,kevin_magnussen,-0.600661,-0.894404,-0.826864,-0.795425,,-0.491877,-0.577414,-0.602544,-0.578842,...,-0.66158,-0.617109,-0.401698,-0.76091,-0.774478,-0.562011,-0.635764,-0.655072,-0.60064,-0.823566
6,latifi,-0.764809,-0.584463,-0.586893,-0.411526,,-0.702482,-0.848501,-0.722943,-0.6173,...,-0.636973,-0.732977,-0.401698,-0.695012,-0.732841,-0.642538,-0.754588,-0.655072,-0.717013,-0.850771
7,leclerc,-0.303423,0.860058,-0.284064,1.73252,,-0.14462,-0.848501,0.114721,-0.074612,...,-0.278391,-0.113012,-0.401698,-0.113345,-0.468213,-0.642538,-0.16456,0.060833,-0.125401,-0.209508
8,max_verstappen,0.56613,1.741906,0.870595,1.850135,,1.214457,0.811079,0.240899,0.444729,...,0.878001,0.420478,2.799886,-0.177974,0.818754,0.803919,1.247419,2.529428,1.36351,1.597185
9,mick_schumacher,-0.764809,-0.894404,-0.586893,-0.795425,,-0.702482,-0.848501,-0.983108,-0.6173,...,-0.639748,-0.849749,-0.401698,-1.003175,-0.732841,-0.642538,-0.754588,-0.655072,-0.717013,-0.850771


In [48]:
norm_df.to_csv(Path('../data/pp_norm.csv'),index=False)

Assign points to predicted outcome for a circuit

In [82]:
def merge(order,points):
    merged = list(zip(order,points))
    return merged

In [126]:
champ_df = norm_df.copy()
# for circuit in champ_df['miami']:
for circuit in champ_df.columns[1:]:
    order = champ_df[circuit].sort_values(ascending=False)
    if str(order.values[0]) == 'nan':
        champ_df[circuit] = [0]*len(champ_df['Driver'])
        continue
    points = [25,18,15,12,10,8,6,4,2,1,0,0,0,0,0,0,0,0,0,0]
    champ_points = merge(list(order.index),points)
    champ_points.sort(key = lambda x: x[0])
    temp = []
    for i in champ_points:
        temp.append(i[1])
    champ_df[circuit] = temp

champ_df

Unnamed: 0,Driver,bahrain,jeddah,albert_park,imola,miami,catalunya,monaco,baku,villeneuve,...,hungaroring,spa,zandvoort,monza,marina_bay,suzuka,americas,rodriguez,interlagos,yas_marina
0,albon,0,0,0,0,0,0,0,0,0,...,0,0,12,0,1,6,0,4,0,4
1,alonso,0,0,8,0,0,8,4,1,8,...,8,2,0,1,8,1,6,0,10,1
2,bottas,15,15,15,4,0,15,8,15,15,...,12,15,15,18,10,15,12,12,12,12
3,gasly,4,8,0,0,0,1,2,0,0,...,4,1,0,0,0,0,0,2,0,0
4,hamilton,25,25,25,15,0,25,25,18,25,...,25,25,18,25,18,25,25,18,25,25
5,kevin_magnussen,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,latifi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,leclerc,1,12,4,18,0,6,0,2,6,...,6,6,0,8,2,0,2,6,4,2
8,max_verstappen,12,18,12,25,0,12,12,8,12,...,15,8,25,4,12,10,15,25,15,15
9,mick_schumacher,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
champ_df.to_csv(Path('../data/driver_champ.csv'),index=False)

In [146]:
champ_df['Total'] = champ_df.sum(axis=1)
champ_df.sort_values('Total',ascending=False)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



Unnamed: 0,Driver,bahrain,jeddah,albert_park,imola,miami,catalunya,monaco,baku,villeneuve,...,spa,zandvoort,monza,marina_bay,suzuka,americas,rodriguez,interlagos,yas_marina,Total
4,hamilton,25,25,25,15,0,25,25,18,25,...,25,18,25,18,25,25,18,25,25,974
18,vettel,18,0,18,1,0,18,18,25,18,...,18,8,15,25,18,18,15,18,18,652
8,max_verstappen,12,18,12,25,0,12,12,8,12,...,8,25,4,12,10,15,25,15,15,590
2,bottas,15,15,15,4,0,15,8,15,15,...,15,15,18,10,15,12,12,12,12,572
13,ricciardo,10,6,10,8,0,10,15,10,10,...,12,0,12,15,12,10,8,8,10,410
12,perez,8,0,2,6,0,2,10,12,4,...,10,10,10,6,8,8,10,6,6,268
7,leclerc,1,12,4,18,0,6,0,2,6,...,6,0,8,2,0,2,6,4,2,222
15,sainz,2,4,6,10,0,4,6,6,0,...,0,2,0,4,2,4,1,2,8,144
1,alonso,0,0,8,0,0,8,4,1,8,...,2,0,1,8,1,6,0,10,1,144
11,ocon,0,10,1,0,0,0,0,0,2,...,4,0,6,0,4,0,0,1,0,60


In [143]:
fig = go.Figure()
for driver in current_drivers:
    temp = champ_df.loc[champ_df['Driver'] == driver]
    points = temp.iloc[0].values[1:].flatten().tolist()
    fig.add_trace(go.line(x=current_circuits,y=points, mode='lines'))

fig.show()

AttributeError: module 'plotly.graph_objects' has no attribute 'line'