In [13]:
#Imports
import math
import os
import warnings
from datetime import datetime, timedelta

import fastf1
import fastf1.plotting
import matplotlib as mpl
import numpy as np
import pandas as pd
import requests
from fastf1 import utils

In [14]:
# Silence every FutureWarning for the rest of the session
warnings.simplefilter(action="ignore", category=FutureWarning)

In [15]:
# Enable the FastF1 cache.
# The cache directory can be overridden with the FASTF1_CACHE environment variable;
# when it is not set, the original location is used.
CACHE_DIR = os.environ.get('FASTF1_CACHE', '/Volumes/LORETO/cache')
fastf1.Cache.enable_cache(CACHE_DIR)

# Variables

In [16]:
# TODO: decide which season the dataset should start from
# First and last season to process (both bounds are inclusive)
year_start = 2020
year_finish = 2020

# Functions

## FastF1

#### Create dataframe

In [17]:
def add_info_fastF1 (data_driver, session, laps , year, round):
    """Append one row per driver and lap, built from FastF1 data, to a DataFrame.

    Args:
        data_driver (DataFrame): DataFrame the new rows are appended to
        session (Session): session used for the event name and each driver's grid position
        laps (Laps): laps of the race; ``laps['Driver']``, ``pick_driver`` and ``iterlaps`` are used
        year(int): year of the race (copied into every row)
        round(int): race number (1 is the first race of the season, 2 the second...)

    Returns:
        DataFrame: ``data_driver`` followed by the new rows, with a fresh 0..n-1 index.
        Columns of the new rows: Year, RaceNumber, RaceName, DriverNumber, Driver, Team,
        GridPosition, LapNumber, Compound, TyreLife, TyresChange, TrackStatus.
    """

    # Same value for every row of this race
    race_name = session.event.EventName

    lap_rows = []
    for driver in pd.unique(laps['Driver']):

        # Same value for every lap of this driver, so it is looked up once
        grid_position = int(session.get_driver(driver).GridPosition)

        # Compounds recorded on the driver's EARLIER laps (missing values are not counted)
        compounds_seen = set()

        for _, lap in laps.pick_driver(driver).iterlaps():
            compound = lap.Compound

            # 1 once at least two different compounds appear on the previous laps, else 0.
            # The current lap's compound is deliberately not included, so the flag turns to 1
            # on the lap AFTER the second compound first shows up.
            tyres_change = 1 if len(compounds_seen) > 1 else 0

            lap_rows.append({'Year': year, 'RaceNumber': round, 'RaceName': race_name,
                             'DriverNumber': lap.DriverNumber, 'Driver': lap.Driver,
                             'Team': lap.Team, 'GridPosition': grid_position,
                             'LapNumber': int(lap.LapNumber), 'Compound': compound,
                             'TyreLife': lap.TyreLife, 'TyresChange': tyres_change,
                             'TrackStatus': lap.TrackStatus})

            if not pd.isna(compound):
                compounds_seen.add(compound)

    # A single concat for the whole race instead of growing the frame driver by driver
    return pd.concat([data_driver, pd.DataFrame(lap_rows)], ignore_index=True)


## Ergast

### Variables

#### Driver codes  
* This is necessary to be able to match drivers between the two sources (FastF1 and Ergast)

In [18]:
# The Ergast lap and pit-stop tables identify a driver by driverId, while FastF1 uses the driver code,
# so a driverId -> code lookup is built from the driver list of every season in range
dict_code_drivers = {}

for year in range(year_start, year_finish+1):
    url = 'http://ergast.com/api/f1/{}/drivers.json?limit=10000'
    r = requests.get(url.format(year))
    # Stop on an HTTP error instead of trying to decode an error page as JSON
    r.raise_for_status()

    for item in r.json()['MRData']['DriverTable']['Drivers']:
        # Keep the first code seen for a driverId; a driver entry without a 'code' field maps to None
        dict_code_drivers.setdefault(item['driverId'], item.get('code'))

### Function update 
#### Calculate the position, lap time and cumulative race time of each driver

In [19]:
# Module-level placeholder; the function below works on its own laps_position argument
laps_position = pd.DataFrame()

def add_info_Ergast (laps_position, numberLaps, year, round):
    """Create the per-lap information from Ergast and add it to a DataFrame

    One row is produced per driver and lap with the columns LapNumber, Driver, ResultStatus,
    QualyPosition, DriverStandings, Position, TimeLap and RaceTimeProgress
    (RaceTimeProgress is the sum of the driver's lap times up to and including that lap).

    Args:
        laps_position (DataFrame): DataFrame where add the data
        numberLaps (int): number of laps of this race
        year(int): year of the race
        round(int): race number

    Returns:
        DataFrame: ``laps_position`` followed by the new rows (fresh 0..n-1 index);
        ``laps_position`` itself when Ergast returned no lap timings.

    Raises:
        requests.HTTPError: when Ergast answers with an HTTP error status
    """

    def fetch_mrdata(url):
        # Raise on an HTTP error instead of trying to decode an error page as JSON
        response = requests.get(url)
        response.raise_for_status()
        return response.json()['MRData']

    #-----AUXILIARY INFORMATION NEEDED-----

    #Qualifying position, by driver code
    qualy_url = 'http://ergast.com/api/f1/{}/{}/qualifying.json?limit=100000'.format(year, round)
    results_qualy = {}
    for item in fetch_mrdata(qualy_url)['RaceTable']['Races'][0]['QualifyingResults']:
        results_qualy[item['Driver']['code']] = item['position']

    #Driver standings before starting the race, by driver code
    #? TODO is this useful? and the teams championship
    drivers_standings = {}
    #Before the first race there are no standings
    if (round-1 > 0):
        standings_url = 'http://ergast.com/api/f1/{}/{}/driverStandings.json?limit=10000'.format(year, round-1)
        for item in fetch_mrdata(standings_url)['StandingsTable']['StandingsLists'][0]['DriverStandings']:
            drivers_standings[item['Driver']['code']] = item['position']

    #Race status (finished or had a problem), by driver code
    results_url = 'http://ergast.com/api/f1/{}/{}/results.json?limit=10000'.format(year, round)
    results_status = {}
    for item in fetch_mrdata(results_url)['RaceTable']['Races'][0]['Results']:
        results_status[item['Driver']['code']] = item['status']

    #-----ADD VALUES-----

    # Cumulative race time of each driver so far
    driver_lap_times = {}
    # Rows are collected here and turned into a DataFrame once at the end
    # (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway)
    lap_records = []

    lap_url = 'http://ergast.com/api/f1/{}/{}/laps/{}.json?limit=10000'
    for lapN in range(1, numberLaps+1):
        races = fetch_mrdata(lap_url.format(year, round, lapN))['RaceTable']['Races']
        #Skip laps for which Ergast has no timings
        if not (len(races) > 0 and len(races[0]['Laps']) > 0 and len(races[0]['Laps'][0]['Timings']) > 0):
            continue

        for item in races[0]['Laps'][0]['Timings']:
            driver = dict_code_drivers.get(item['driverId'])

            # NOTE: '%M' only accepts 0-59 minutes, so a lap of one hour or more raises ValueError
            lap_time = datetime.strptime(item['time'], '%M:%S.%f').time()
            lap_duration = timedelta(minutes=lap_time.minute, seconds=lap_time.second,
                                     microseconds=lap_time.microsecond)

            # Add this lap to the driver's running total
            total_time = driver_lap_times.get(driver, timedelta(0)) + lap_duration
            driver_lap_times[driver] = total_time

            # With no standings available (first race) every driver gets 0
            if len(drivers_standings) == 0:
                driverStandings = 0
            else:
                driverStandings = drivers_standings.get(driver)

            lap_records.append({'LapNumber': lapN, 'Driver': driver,
                                'ResultStatus': results_status.get(driver),
                                'QualyPosition': results_qualy.get(driver),
                                'DriverStandings': driverStandings,
                                'Position': item['position'], 'TimeLap': lap_time,
                                'RaceTimeProgress': total_time})

    if not lap_records:
        return laps_position

    return pd.concat([laps_position, pd.DataFrame(lap_records)], ignore_index=True)

#### Calculate the distance to the driver ahead, to the driver behind, and to the leader

In [20]:
def calculate_add_distances (laps_position):
    """Add, for each driver and lap:
            TimeDiffAhead: the time distance to the driver ahead
            TimeDiffBehind: the time distance to the driver behind
            TimeDiffToLeader: the time distance to the leader (also called gap)

    The new columns are derived from RaceTimeProgress, which is why this runs after the
    complete lap information has been collected. "Ahead" and "behind" are the previous and
    next row of the same LapNumber, so the rows of a lap must already be in running order.
    The leader of a lap is the row with the smallest RaceTimeProgress.

    Args:
        laps_position (DataFrame): needs the columns LapNumber and RaceTimeProgress;
            it is not modified

    Returns:
        Dataframe: a copy with the three new columns added and RaceTimeProgress removed
    """

    # Work on a copy so the caller's frame is left untouched
    result = laps_position.copy()
    progress = result['RaceTimeProgress']
    progress_by_lap = progress.groupby(result['LapNumber'])
    no_gap = pd.Timedelta(0)

    #Difference with the car ahead (the first driver of a lap has nobody ahead -> 0)
    result['TimeDiffAhead'] = (progress - progress_by_lap.shift(1)).fillna(value=no_gap)

    #Difference with the car behind (the last driver of a lap has nobody behind -> 0)
    result['TimeDiffBehind'] = (progress_by_lap.shift(-1) - progress).fillna(value=no_gap)

    #Difference with the leader of the lap
    result['TimeDiffToLeader'] = progress - progress_by_lap.transform('min')

    return result.drop(columns=['RaceTimeProgress'])

#### Pit stops

In [21]:
def pit_stops_data (laps_position, year, round):
    """Add the following new columns, related to the pit stops, for each lap:
            * HasPitLap: whether there is a pit stop in the lap (0: no, 1: yes)
            * TimePit: duration of the pit stop as reported by Ergast (0 when there is none)
            * NumberPitStops: number of laps with a pit stop the driver has had up to and
              including this lap (a stop on lap 1 is counted too)

            * DriverAhead: the driver ahead (the first of each lap has none)
            * DriverBehind: the driver behind (the last of each lap has none)

            * DriverAheadPit: 1 when the driver ahead has more pit stops than this driver, else 0
            * DriverBehindPit: 1 when the driver behind has more pit stops than this driver, else 0

    "Ahead" and "behind" are the previous and next row of the same LapNumber, so the rows of
    a lap must already be in running order.

    Args:
        laps_position (DataFrame): DataFrame where add the data (columns Driver and LapNumber
            are read); the new columns are added to this same object
        year(int): year of the race
        round(int): race number

    Returns:
        DataFrame: ``laps_position`` with the new columns

    Raises:
        requests.HTTPError: when Ergast answers with an HTTP error status
    """

    #-------- PIT STOP LOOKUP -------
    url = 'http://ergast.com/api/f1/{}/{}/pitstops.json?limit=10000'
    response = requests.get(url.format(year, round))
    # Stop on an HTTP error instead of trying to decode an error page as JSON
    response.raise_for_status()
    races = response.json()['MRData']['RaceTable']['Races']
    # A race without pit stop data simply produces no pit stops
    pit_items = races[0].get('PitStops', []) if races else []

    # (driver code, lap number) -> duration string of the stop
    # If Ergast lists several stops for the same driver and lap, the last one is kept
    pit_durations = {}
    for item in pit_items:
        driver = dict_code_drivers.get(item['driverId'])
        if driver is None: # driver without a known code cannot be matched to a lap row
            continue
        pit_durations[(driver, int(item['lap']))] = item['duration']

    #------------- PIT LAP, PIT TIME --------
    drivers = laps_position['Driver'].tolist()
    lap_numbers = laps_position['LapNumber'].tolist()

    has_pit_lap = []
    time_pit = []
    for driver, lap in zip(drivers, lap_numbers):
        duration = pit_durations.get((driver, int(lap)))
        if duration is None:
            has_pit_lap.append(0)
            time_pit.append(0)
        else:
            has_pit_lap.append(1)
            time_pit.append(duration)

    #------------- NUMBER OF PITS --------
    # Running count per driver, walking the rows in lap order (the sort is stable, so rows of
    # the same lap keep their order). Counting this way also includes a stop on lap 1.
    stops_so_far = {}
    number_pit_stops = [0] * len(drivers)
    for i in sorted(range(len(drivers)), key=lambda pos: lap_numbers[pos]):
        stops_so_far[drivers[i]] = stops_so_far.get(drivers[i], 0) + has_pit_lap[i]
        number_pit_stops[i] = stops_so_far[drivers[i]]

    laps_position["TimePit"] = time_pit
    laps_position["NumberPitStops"] = number_pit_stops
    laps_position["HasPitLap"] = has_pit_lap

    #----------- DRIVER AHEAD AND BEHIND --------
    #The first driver of a lap has no driver ahead and the last has none behind (NaN)
    #? TODO delete them
    drivers_by_lap = laps_position.groupby('LapNumber')['Driver']
    laps_position['DriverAhead'] = drivers_by_lap.shift(1)
    laps_position['DriverBehind'] = drivers_by_lap.shift(-1)

    #----------- DRIVER AHEAD AND BEHIND HAS STOPPED --------
    # (driver, lap number) -> pit stops of that driver on that lap
    stops_at = dict(zip(zip(drivers, lap_numbers), number_pit_stops))

    def has_pitted_more(neighbours):
        # 1 when the neighbouring driver has more stops than the row's own driver, else 0
        flags = []
        for neighbour, lap, own_stops in zip(neighbours, lap_numbers, number_pit_stops):
            # A missing neighbour is NaN, not a string
            pitted = isinstance(neighbour, str) and stops_at[(neighbour, lap)] > own_stops
            flags.append(1 if pitted else 0)
        return flags

    laps_position["DriverAheadPit"] = has_pitted_more(laps_position['DriverAhead'].tolist())
    laps_position["DriverBehindPit"] = has_pitted_more(laps_position['DriverBehind'].tolist())

    #return the dataframe with all the new data
    return laps_position

### Was a good pit stop

In [22]:
def good_pit (laps_position, numberLaps):
    """Create and fill a new column, GoodPitStop, telling whether a pit stop was a good one

    A pit stop is good (1) when the position at the end of the stint that follows it is equal
    to or better than the position on the pit lap; otherwise, and on laps without a pit stop,
    the value is 0. Positions are compared as numbers.

    The end of the stint is the first later lap whose NumberPitStops differs from the pit
    lap's value. When there is no such lap, it is the driver's last recorded lap (up to
    ``numberLaps``), so drivers who are lapped or retire are evaluated too. A pit stop on the
    driver's last recorded lap has no following stint and stays 0.

    Args:
        laps_position (DataFrame): DataFrame where add the data (columns Driver, LapNumber,
            Position, HasPitLap and NumberPitStops are read); the column is added to this
            same object
        numberLaps (int): number of laps of this race

    Returns:
       DataFrame: ``laps_position`` with the GoodPitStop column
    """

    #creation of the new column 0 no 1 yes
    laps_position["GoodPitStop"] = 0

    good_rows = []
    for _, driver_laps in laps_position.groupby('Driver', sort=False):
        # Laps of this driver in race order
        ordered = driver_laps[driver_laps['LapNumber'] <= numberLaps].sort_values('LapNumber')
        row_labels = ordered.index.tolist()
        pit_flags = ordered['HasPitLap'].tolist()
        stop_counts = ordered['NumberPitStops'].tolist()
        # Ergast delivers positions as strings; as text '10' would sort before '9'
        positions = [int(position) for position in ordered['Position']]
        last = len(row_labels) - 1

        for i, label in enumerate(row_labels):
            if pit_flags[i] != 1 or i == last:
                continue

            # First later lap that belongs to another stint, else the driver's last lap
            stint_end = next((j for j in range(i + 1, last + 1)
                              if stop_counts[j] != stop_counts[i]), last)

            # A smaller number is a better position
            if positions[i] >= positions[stint_end]:
                good_rows.append(label)

    if good_rows:
        laps_position.loc[good_rows, 'GoodPitStop'] = 1

    return laps_position

# Create Dataframe 

In [23]:
# Empty DataFrame that accumulates the merged data of every processed season
df = pd.DataFrame()


In [24]:
#NOTE The first time (if the data is not in the cache)
# Since there is a lot of data that needs to be downloaded this process takes some time
# For each race it takes about 1 min to load the data
# And then a bit more than 1 min for all the calculations

# Season and round of the most recent race with published results.
# A season that is still running lists more rounds than have been raced, so the loop
# below stops at this race. The answer does not depend on the year, so it is asked once.
url_last = 'https://ergast.com/api/f1/current/last/results.json'
r_last = requests.get(url_last)
r_last.raise_for_status()
last_race_table = r_last.json()['MRData']['RaceTable']
round_last = int(last_race_table['round'])
year_last = int(last_race_table['season'])

# The exports below are written into this folder
os.makedirs('data', exist_ok=True)

#For each year
for year in range(year_start, year_finish+1):

    # Merged data of every race of this year
    races_of_year = []

    #Get the number of races in that year from the race schedule.
    #(Counting circuits is not enough: a circuit can host more than one race in a season.)
    url = 'http://ergast.com/api/f1/{}.json?limit=10000'
    r = requests.get(url.format(year))
    r.raise_for_status()
    numberRacesSeason = int(r.json()['MRData']['total'])
    #numberRacesSeason= 1 #! TODO change later, so the tests don't take forever

    #For each race in the year
    #(named race_round so the builtin round() is not overwritten in the notebook)
    for race_round in range(1, numberRacesSeason+1):
        #If that race of the season has not occurred yet we stop
        if year == year_last and race_round > round_last:
            break

        #Create the session for this race to access FastF1 and load its data
        session = fastf1.get_session(year, race_round, 'R')
        session.load()

        #Laps of this session (Session.load_laps is deprecated in favour of load() + .laps)
        laps = session.laps

        #Add data from FastF1
        df_FastF1 = add_info_fastF1 (pd.DataFrame(), session, laps, year, race_round)

        #Get number of laps (needed for Ergast)
        numberLaps  = int(max(df_FastF1['LapNumber']))

        #Add data from Ergast
        df_Ergast =  add_info_Ergast (pd.DataFrame(), numberLaps, year, race_round)

        #Calculate and add distances to driver ahead, behind and leader
        df_Ergast = calculate_add_distances (df_Ergast)

        #Calculate and add the information related to pit stops
        df_Ergast = pit_stops_data (df_Ergast, year, race_round)

        #Calculate if the pit was a good pit
        df_Ergast = good_pit (df_Ergast, numberLaps)

        #Combine both dataframes, the common columns are LapNumber and Driver
        data_merge = pd.merge(df_FastF1, df_Ergast, on=['LapNumber', 'Driver'])

        #Keep the data of this race for the year DataFrame
        races_of_year.append(data_merge)

    #Build the year DataFrame once (pd.concat does not accept an empty list)
    df_year = pd.concat(races_of_year, ignore_index=True) if races_of_year else pd.DataFrame()

    #Export Dataframe per year
    name_file = 'data/' + str(year) + '.csv'
    df_year.to_csv(name_file, index=False, header=True, sep ='\t')

    #add the data to the full dataframe
    df = pd.concat([df, df_year], ignore_index=True)

#Export dataframe
df.to_csv('data/combined_dataframe.csv', index=False, header=True, sep ='\t' )

core           INFO 	Loading data for Austrian Grand Prix - Race [v2.3.0]
api            INFO 	No cached data found for driver_info. Loading data...
api            INFO 	Fetching driver list...
api            INFO 	Data has been written to cache!
api            INFO 	No cached data found for timing_data. Loading data...
api            INFO 	Fetching timing data...
api            INFO 	Parsing timing data...
This might be a bug and should be reported.
This might be a bug and should be reported.
api            INFO 	Data has been written to cache!
api            INFO 	No cached data found for timing_app_data. Loading data...
api            INFO 	Fetching timing app data...
api            INFO 	Data has been written to cache!
core           INFO 	Processing timing data...
api            INFO 	No cached data found for session_status_data. Loading data...
api            INFO 	Fetching session status data...
api            INFO 	Data has been written to cache!
api            INFO 	No cached 

KeyboardInterrupt: 