# Racing Profile

Thinking about all the features that would go into a single observation. 
Observation unit: driver
Dependent variable: Grand Prix finishing position

Independent variables:??

    * Model 1: Features from Practice 1-3 and Qualifying
    * Model 2: Model 1 + features from first half of race
    * Model 3: Model 2 + features from past performance

What Features?

Model 1: Features from Practice 1-3 and Qualifying. The following features are computed for each of the four sessions (x4):

    * Min/Max/Avg lap times
    * Number of stints (stint_number max)
    * Total number of practice laps?
    * Summary stats for each lap? or stint? (brake, rpm, throttle, speed, drs) (min/max/avg)
        * max brake, min/max/avg rpm, max/avg throttle, min/max/avg speed
    * (Maybe) Number of gear changes (n_gear) 
    * Avg Pit duration
    * Number of Pits?


In [114]:
#| label: import
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
import pandas as pd
import json
from datetime import datetime
import time

# Base URL of the OpenF1 API; every query below appends an endpoint name and its filters to it.
query_base = "https://api.openf1.org/v1/"

In [30]:
# Request every meeting whose year is greater than 2023.
query_meetings = query_base + "meetings?year>2023"

# Read the body inside a context manager so the HTTP response is closed
# as soon as the payload has been decoded.
with urlopen(query_meetings) as response:
    data = json.loads(response.read().decode('utf-8'))

# One row per meeting, one column per JSON key.
meetings_df = pd.json_normalize(data)

print(meetings_df)

    circuit_key  circuit_short_name  meeting_key meeting_code  \
0            63              Sakhir         1229          BRN   
1           149              Jeddah         1230          KSA   
2            10           Melbourne         1231          AUS   
3            46              Suzuka         1232          JPN   
4            49            Shanghai         1233          CHN   
5           151               Miami         1234          USA   
6             6               Imola         1235          ITA   
7            22         Monte Carlo         1236          MON   
8            23            Montreal         1237          CAN   
9            15           Catalunya         1238          ESP   
10           19           Spielberg         1239          AUT   
11            2         Silverstone         1240          GBR   
12            4         Hungaroring         1241          HUN   
13            7   Spa-Francorchamps         1242          BEL   
14           55          

In [38]:
# Build `valid_sessions_df`: every session of each meeting that has exactly
# 3 Practice sessions, 1 Qualifying session and 1 Race session.

# convert meetings to a list
meeting_list = meetings_df['meeting_key'].to_list()

# one dict per session of every meeting that passes the check below
valid_meeting_sessions = []

# loop through each meeting
for meeting in meeting_list:
    query_sessions = query_base + "sessions?meeting_key=" + str(meeting)

    # close the HTTP response as soon as the payload has been read
    with urlopen(query_sessions) as response:
        data = json.loads(response.read().decode('utf-8'))
    sessions_df = pd.json_normalize(data)

    # an empty payload yields a frame without a 'session_type' column;
    # such a meeting cannot satisfy the check, so skip it instead of raising
    if sessions_df.empty:
        continue

    # number of sessions of each type in this meeting
    type_counts = sessions_df['session_type'].value_counts()

    # we need exactly 3 Practice sessions, 1 Qualifying, and 1 Race
    has_required_sessions = (
        type_counts.get('Practice', 0) == 3
        and type_counts.get('Qualifying', 0) == 1
        and type_counts.get('Race', 0) == 1
    )

    if has_required_sessions:
        # record every session of the meeting
        for session in sessions_df.itertuples():
            valid_meeting_sessions.append({
                'meeting_key': meeting,
                'session_key': session.session_key,
                'session_type': session.session_type,
            })

# convert to a DataFrame; naming the columns keeps them present even when
# no meeting qualified
valid_sessions_df = pd.DataFrame(
    valid_meeting_sessions,
    columns=['meeting_key', 'session_key', 'session_type'],
)

print(valid_sessions_df)

    meeting_key  session_key session_type
0          1229         9465     Practice
1          1229         9466     Practice
2          1229         9467     Practice
3          1229         9468   Qualifying
4          1229         9472         Race
..          ...          ...          ...
95         1256         9999     Practice
96         1256        10000     Practice
97         1256        10001     Practice
98         1256        10002   Qualifying
99         1256        10006         Race

[100 rows x 3 columns]


In [39]:
# Isolate the rows of valid_sessions_df whose session_type is "Race".
# Their session keys are used later to look up the label (final race position).
race_session_df = valid_sessions_df.loc[valid_sessions_df['session_type'].eq("Race")]

print(race_session_df.head())
print(race_session_df.shape[0])

# Meeting keys of those race rows: the viable meetings to loop through.
valid_meeting_list = race_session_df['meeting_key'].tolist()

print(valid_meeting_list)
print(len(valid_meeting_list))

    meeting_key  session_key session_type
4          1229         9472         Race
9          1230         9480         Race
14         1231         9488         Race
19         1232         9496         Race
24         1235         9515         Race
20
[1229, 1230, 1231, 1232, 1235, 1236, 1237, 1238, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1248, 1250, 1252, 1254, 1256]
20


In [None]:
# Build one feature row per driver for each meeting in `sub_meeting_list`.
#
# A row is [driver_number] followed by N_SESSION_FEATS values for every
# non-race session of the meeting, in this order per session:
#   min_lap, max_lap, avg_lap, num_laps, num_stints,
#   num_pits, avg_pit_time,
#   did_rain, max_wind, avg_air_temp, avg_track_temp,
#   max_brake, min_rpm, max_rpm, avg_rpm, max_throttle, avg_throttle,
#   min_speed, max_speed, avg_speed
# A session with no usable data (or a failed request) contributes
# N_SESSION_FEATS NaN values so the remaining columns stay aligned.

# 5 lap + 2 pit + 4 weather + 9 telemetry values per session
N_SESSION_FEATS = 20


def _fetch_df(url):
    """Request `url` and return its JSON payload as a DataFrame.

    The HTTP response is closed as soon as the body has been read.
    HTTPError / URLError raised by `urlopen` propagate to the caller.
    """
    with urlopen(url) as response:
        payload = json.loads(response.read().decode('utf-8'))
    return pd.json_normalize(payload)


def _to_iso(timestamps):
    """Format a datetime Series as 'YYYY-MM-DDTHH:MM:SS.ffffff+HH:MM' strings.

    strftime('%z') yields '+HHMM', so the colon is re-inserted by slicing.
    Assumes the values carry a UTC offset; for naive timestamps '%z' is empty.
    """
    offset = timestamps.dt.strftime('%z')
    return timestamps.dt.strftime('%Y-%m-%dT%H:%M:%S.%f') + offset.str[:3] + ':' + offset.str[3:]


def _weather_stats(session):
    """Return [did_rain, max_wind, avg_air_temp, avg_track_temp] for `session`.

    Returns four NaN values when the weather endpoint has no rows.
    """
    weather_df = _fetch_df(query_base + "weather?&session_key=" + str(session))
    if weather_df.empty:
        return [float('nan')] * 4
    return [
        weather_df['rainfall'].max(),
        weather_df['wind_speed'].max(),
        float(round(weather_df['air_temperature'].mean(), 3)),
        float(round(weather_df['track_temperature'].mean(), 3)),
    ]


def _session_features(driver_number, session, wx_stats):
    """Return the N_SESSION_FEATS values for one driver in one session.

    Args:
        driver_number: driver number as a string (it is spliced into URLs).
        session: session key.
        wx_stats: the four weather values for `session`.

    Returns:
        A list of N_SESSION_FEATS values, or None when the driver has no
        timed lap or no telemetry for the fastest lap in this session.

    Raises:
        HTTPError, URLError: if any of the API requests fails.
    """
    driver_filter = "driver_number=" + driver_number + "&session_key=" + str(session)

    # LAPS QUERY
    laps_df = _fetch_df(query_base + "laps?" + driver_filter)
    # no rows, or no lap with a recorded duration: nothing to summarise
    if laps_df.empty or laps_df['lap_duration'].isna().all():
        return None

    min_lap = laps_df['lap_duration'].min()
    max_lap = laps_df['lap_duration'].max()
    avg_lap = float(round(laps_df['lap_duration'].mean(), 3))
    num_laps = laps_df['lap_number'].max()

    # PARSE THE LAPS DATA BY TIME (used to window the car_data query)
    lap_times = laps_df[['lap_number', 'date_start', 'lap_duration']].copy()

    # The offset in the timestamp is kept on purpose: it is needed to write
    # the value back out in the same format for the car_data date filter.
    lap_times['date_start'] = pd.to_datetime(lap_times['date_start'], errors='coerce')

    # a lap ends where the next one starts; where there is no next start
    # (e.g. the last lap) fall back to start + lap duration
    lap_times['date_end'] = lap_times['date_start'].shift(-1).fillna(
        lap_times['date_start'] + pd.to_timedelta(lap_times['lap_duration'], unit='s')
    )

    # time window of the fastest lap (first one if several tie)
    min_lap_num = laps_df.loc[laps_df['lap_duration'] == min_lap, 'lap_number'].iloc[0]
    fastest = lap_times[lap_times['lap_number'] == min_lap_num]
    start_time = _to_iso(fastest['date_start']).iloc[0]
    end_time = _to_iso(fastest['date_end']).iloc[0]

    # an unparseable/missing timestamp leaves no window to query
    if pd.isna(start_time) or pd.isna(end_time):
        return None

    # CAR DATA QUERY ON THE FASTEST LAP
    query_car = (query_base + "car_data?" + driver_filter
                 + "&date>=" + str(start_time) + "&date<=" + str(end_time))
    car_df = _fetch_df(query_car)
    if car_df.empty:
        return None

    min_lap_stats = [
        car_df['brake'].max(),                    # max_brake
        car_df['rpm'].min(),                      # min_rpm
        car_df['rpm'].max(),                      # max_rpm
        round(car_df['rpm'].mean()),              # avg_rpm
        car_df['throttle'].max(),                 # max_throttle
        float(round(car_df['throttle'].mean())),  # avg_throttle
        car_df['speed'].min(),                    # min_speed
        car_df['speed'].max(),                    # max_speed
        round(car_df['speed'].mean()),            # avg_speed
    ]

    # STINTS QUERY FOR NUMBER OF STINTS
    stints_df = _fetch_df(query_base + "stints?" + driver_filter)
    # an empty payload has no 'stint_number' column
    num_stints = float('nan') if stints_df.empty else stints_df['stint_number'].max()

    # PITS QUERY
    pits_df = _fetch_df(query_base + "pit?" + driver_filter)
    num_pits = len(pits_df)
    # an empty payload has no 'pit_duration' column
    avg_pit_time = float(round(pits_df['pit_duration'].mean(), 1)) if num_pits else float('nan')

    return ([min_lap, max_lap, avg_lap, num_laps, num_stints]
            + [num_pits, avg_pit_time] + wx_stats + min_lap_stats)


sub_meeting_list = valid_meeting_list[1:2]

## loop through all the valid meeting keys
for meeting in sub_meeting_list:
    # get the sessions of this meeting
    sessions_df = _fetch_df(query_base + "sessions?meeting_key=" + str(meeting))

    # pick the race by its type rather than by list position
    is_race = sessions_df['session_type'] == 'Race'
    race_session_num = int(sessions_df.loc[is_race, 'session_key'].iloc[0])

    # every other session (practice and qualifying) feeds the features
    sessions = sessions_df.loc[~is_race, 'session_key'].tolist()
    print(sessions)

    ## query the drivers of the race session so that we know they raced in the grand prix
    drivers_df = _fetch_df(query_base + "drivers?session_key=" + str(race_session_num))

    # get all the driver numbers to loop through
    drivers = list(drivers_df['driver_number'])

    # add sleep time to not overload requests
    time.sleep(0.5)

    # weather depends only on the session, so fetch it once per session
    # instead of once per driver
    wx_cache = {}

    ## Loop through the drivers
    for driver in drivers:
        # list holding the observation for this driver
        driver_feats = [driver]

        driver_number = str(driver)

        ## Loop through all the sessions
        for session in sessions:
            session_feats = None

            try:
                if session not in wx_cache:
                    wx_cache[session] = _weather_stats(session)

                session_feats = _session_features(driver_number, session, wx_cache[session])

                if session_feats is None:
                    print(f"No data returned for driver {driver} and session {session}. Skipping.")

            except (HTTPError, URLError) as e:
                # Handle the error if an API call fails
                print(f"Error occurred for driver {driver} and session {session}: {e}. Skipping this session.")

            # pad a skipped session so later sessions keep their column positions
            if session_feats is None:
                session_feats = [float('nan')] * N_SESSION_FEATS

            driver_feats.extend(session_feats)

        print(driver_feats)
        # add sleep time to not overload requests
        time.sleep(2)



[9473, 9474, 9475, 9476]
[1, 89.659, 806.932, 159.431, 24, 3, 3, 536.3, 0, 6.8, 26.058, 35.126, 100, 6509, 12018, 10801, 100, 81.0, 88, 333, 246, 89.158, 624.73, 144.354, 27, 4, 4, 310.3, 0, 4.1, 25.371, 30.597, 100, 5758, 11929, 10825, 100, 82.0, 80, 330, 247, 88.412, 1284.065, 215.065, 13, 2, 2, 1182.1, 0, 5.8, 25.984, 39.559, 100, 5751, 11935, 10861, 100, 82.0, 84, 335, 249, 87.472, 1026.91, 207.627, 18, 6, 6, 345.7, 0, 2.9, 25.106, 30.786, 100, 6356, 12191, 10932, 100, 84.0, 88, 331, 254]
[2, 90.966, 457.346, 136.172, 27, 3, 3, 341.2, 0, 6.8, 26.058, 35.126, 100, 6212, 12062, 10765, 100, 79.0, 86, 327, 242, 89.934, 569.212, 143.761, 27, 4, 4, 297.5, 0, 4.1, 25.371, 30.597, 100, 6099, 12176, 10762, 100, 80.0, 84, 329, 242, 147.994, 147.994, 147.994, 2, 1, 1, nan, 0, 5.8, 25.984, 39.559, 100, 3778, 12212, 8094, 100, 37.0, 34, 327, 148, 89.526, 242.809, 126.848, 9, 2, 2, 127.4, 0, 2.9, 25.106, 30.786, 100, 6418, 12577, 10840, 100, 80.0, 86, 330, 245]
[3, 90.917, 874.249, 151.203, 25, 

In [54]:
# get session numbers for practice and qualifying, preloaded meeting_number for now
query_sessions = query_base + "sessions?meeting_key=1229"

# read the body inside a context manager so the HTTP response is closed
with urlopen(query_sessions) as response:
    data = json.loads(response.read().decode('utf-8'))
sessions_df = pd.json_normalize(data)

# get just the session keys
sessions = list(sessions_df['session_key'])

# remove the real race
# NOTE(review): assumes the API lists the race as the last session — confirm
del sessions[-1]
print(sessions)

[9465, 9466, 9467, 9468]


In [52]:
# Show the session keys currently stored in `sessions`.
print(sessions)

[9473, 9474, 9475, 9476]


Loop to create an individual driver observation

In [118]:
# Prototype: build the feature row for a single driver over four sessions.
# The row is [driver_number] followed by 20 values per session:
# lap stats, pit stats, weather stats, then telemetry of the fastest lap.

# I'll just do one driver for now
driver_number = str(16)

# create list of features for an individual driver
driver_feats = [driver_number]

session_sub = [9473, 9474, 9475, 9476]


def _fetch_df(url):
    """Request `url` and return its JSON payload as a DataFrame.

    The HTTP response is closed as soon as the body has been read.
    """
    with urlopen(url) as response:
        payload = json.loads(response.read().decode('utf-8'))
    return pd.json_normalize(payload)


def _to_iso(timestamps):
    """Format a datetime Series as 'YYYY-MM-DDTHH:MM:SS.ffffff+HH:MM' strings.

    strftime('%z') yields '+HHMM', so the colon is re-inserted by slicing.
    Assumes the values carry a UTC offset; for naive timestamps '%z' is empty.
    """
    offset = timestamps.dt.strftime('%z')
    return timestamps.dt.strftime('%Y-%m-%dT%H:%M:%S.%f') + offset.str[:3] + ':' + offset.str[3:]


for session in session_sub:
    # LAPS QUERY
    query_laps = query_base+"laps?driver_number="+driver_number+"&session_key="+str(session)
    laps_df = _fetch_df(query_laps)

    # extract lap info for current session
    min_lap = laps_df['lap_duration'].min()
    max_lap = laps_df['lap_duration'].max()
    avg_lap = float(round(laps_df['lap_duration'].mean(),3))
    num_laps = laps_df['lap_number'].max()

    # PARSE THE LAPS DATA BY TIME
    # lap_times will be used for car_data queries
    lap_times = laps_df[['lap_number','date_start','lap_duration']].copy()

    # Parse the timestamps. The offset is kept on purpose: it is needed to
    # write the value back out in the same format for the car_data filter.
    lap_times['date_start'] = pd.to_datetime(lap_times['date_start'], errors='coerce')

    # use the next lap start as the end time; where there is no next start
    # (e.g. the last lap) fall back to start + lap duration
    lap_times['date_end'] = lap_times['date_start'].shift(-1).fillna(lap_times['date_start'] + pd.to_timedelta(lap_times['lap_duration'], unit='s'))

    # convert back to string
    lap_times['date_start'] = _to_iso(lap_times['date_start'])
    lap_times['date_end'] = _to_iso(lap_times['date_end'])

    # find the lap number for their best lap
    min_lap_num = laps_df[laps_df['lap_duration']==min_lap]['lap_number'].to_list()[0]

    # CONDUCT A CAR DATA QUERY ON THE MINIMUM LAP
    # base car data query for time filter to be added to
    query_car_base = query_base+"car_data?driver_number="+driver_number+"&session_key="+str(session)

    # create start and end time for car data query
    start_time = lap_times[lap_times['lap_number']==min_lap_num]['date_start'].to_list()[0]
    end_time = lap_times[lap_times['lap_number']==min_lap_num]['date_end'].to_list()[0]

    # query for lap specific times
    query_car = query_car_base + "&date>="+str(start_time)+"&date<="+str(end_time)

    # call api for car data with lap time query
    car_df = _fetch_df(query_car)

    # get summary stats for the lap
    max_brake = car_df['brake'].max()
    max_rpm = car_df['rpm'].max()
    min_rpm = car_df['rpm'].min()
    avg_rpm = round(car_df['rpm'].mean())
    max_throttle = car_df['throttle'].max()
    avg_throttle = float(round(car_df['throttle'].mean()))
    min_speed = car_df['speed'].min()
    max_speed = car_df['speed'].max()
    avg_speed = round(car_df['speed'].mean())

    # create list of car_data stats per lap
    min_lap_stats = [max_brake, min_rpm, max_rpm, avg_rpm, max_throttle, avg_throttle, min_speed, max_speed, avg_speed]

    # STINTS QUERY FOR NUMBER OF STINTS
    query_stints = query_base + "stints?driver_number="+driver_number+"&session_key="+str(session)
    stints_df = _fetch_df(query_stints)

    # extract max stint number; an empty payload has no 'stint_number' column
    num_stints = float('nan') if stints_df.empty else stints_df['stint_number'].max()

    # PITS QUERY
    query_pits = query_base + "pit?driver_number="+driver_number+"&session_key="+str(session)
    pits_df = _fetch_df(query_pits)

    # extract num of pits and avg pit duration; an empty payload has no
    # 'pit_duration' column
    num_pits = len(pits_df)
    avg_pit_time = float(round(pits_df['pit_duration'].mean(),1)) if num_pits else float('nan')

    # WEATHER QUERY
    query_wx = query_base + "weather?&session_key="+str(session)
    weather_df = _fetch_df(query_wx)

    ### parse weather data
    did_rain = weather_df['rainfall'].max()
    max_wind = weather_df['wind_speed'].max()
    avg_air_temp = float(round(weather_df['air_temperature'].mean(),3))
    avg_track_temp = float(round(weather_df['track_temperature'].mean(),3))

    wx_stats = [did_rain, max_wind, avg_air_temp, avg_track_temp]

    # AFTER ALL QUERIES PER SESSION
    # append to driver features list: lap stats, pit stats, weather, telemetry
    session_feats = [min_lap, max_lap, avg_lap, num_laps, num_stints] + [num_pits, avg_pit_time] + wx_stats + min_lap_stats

    driver_feats.extend(session_feats)



print(driver_feats)


['16', 90.03, 832.062, 162.989, 24, 4, 4, 370.5, 0, 6.8, 26.058, 35.126, 100, 7509, 11950, 10832, 100, 81.0, 84, 329, 241, 89.18, 712.401, 155.104, 25, 5, 5, 271.7, 0, 4.1, 25.371, 30.597, 100, 6701, 12149, 10848, 100, 81.0, 87, 331, 247, 88.608, 982.957, 207.677, 16, 4, 4, 719.1, 0, 5.8, 25.984, 39.559, 100, 6715, 12238, 10921, 100, 82.0, 92, 335, 250, 87.791, 636.95, 179.428, 23, 6, 6, 422.9, 0, 2.9, 25.106, 30.786, 100, 6750, 12228, 10902, 100, 83.0, 89, 330, 250]


In [None]:
print(len(driver_feats))
print(type(driver_feats))

print(driver_feats)

# Feature names for one session, in the exact order the feature-building
# loops concatenate them:
#   [lap stats] + [pit stats] + wx_stats + min_lap_stats
# Weather therefore comes BEFORE the fastest-lap telemetry; listing them the
# other way round would attach the wrong name to 13 of every 20 values.
session_feature_names = [
    # lap stats
    'min_lap', 'max_lap', 'avg_lap', 'num_laps', 'num_stints',
    # pit stats
    'num_pits', 'avg_pit_time',
    # weather stats
    'did_rain', 'max_wind', 'avg_air_temp', 'avg_track_temp',
    # telemetry of the fastest lap
    'max_brake', 'min_rpm', 'max_rpm', 'avg_rpm', 'max_throttle',
    'avg_throttle', 'min_speed', 'max_speed', 'avg_speed',
]

# One suffix per session, in the order the sessions are processed:
# practice 1-3, then qualifying.
session_suffixes = ['p1', 'p2', 'p3', 'q']

# 'driver_num' followed by every per-session feature name, session by session
colnames = ['driver_num'] + [
    f'{name}_{suffix}'
    for suffix in session_suffixes
    for name in session_feature_names
]

print(len(colnames))

81
<class 'list'>
['16', 93.268, 888.191, 161.351, 24, 4, 4, 370.1, 100, 5550, 12128, 10177, 99, 70, 66, 312, 207, 0, 4.7, np.float64(19.373), np.float64(34.216), 91.113, 590.536, 148.184, 26, 4, 4, 319.0, 100, 5682, 12129, 10354, 100, 74, 69, 316, 214, 0, 4.0, np.float64(18.658), np.float64(26.748), 91.094, 789.228, 191.598, 17, 4, 4, 408.3, 100, 5411, 12053, 10301, 100, 74, 67, 314, 212, 0, 4.0, np.float64(20.26), np.float64(32.013), 89.165, 736.725, 202.377, 19, 7, 7, 280.4, 100, 5703, 12137, 10371, 99, 74, 65, 319, 216, 0, 3.1, np.float64(18.049), np.float64(21.514)]
81
