# OCI Data Science - pull data

This notebook provides tools and techniques to pull required data for the project.

Using the FastF1 API, we'll pull the following kinds of data:
- Lap
- Weather
- Car
- Results
- Position 

In [None]:
# Absolute location of the notebooks directory; the imports cell passes it to os.chdir().
path = '/home/datascience/redbull-pit-strategy/notebooks'
# Data directory, also used as the fastf1 cache location. It is a relative path, so it
# is resolved against the working directory set by os.chdir(path). Keep the trailing
# slash: later cells build file names by string-formatting directly onto this prefix.
data_path = '../../redbull-pit-strategy/data/'

In [2]:
import os
# Switch to the notebooks directory first so the relative data_path resolves from there.
os.chdir(path)
import pandas as pd
import logging
import json
import pickle
import requests
import numpy as np
import fastf1
import matplotlib.pyplot as plt
import pickle  # already imported above; harmless duplicate

# Enable fastf1's cache, using the data directory as the cache location.
# NOTE(review): presumably the directory must already exist — confirm against the fastf1 docs.
fastf1.Cache.enable_cache(data_path)

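Now we define a few helper functions. Each one returns one kind of session data as a DataFrame, annotated with the round number, event name, country, session type and session date.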

In [3]:
def get_lap_data(session, schedule, evnt, ses, EventDate):
    '''Return the laps of a session as a DataFrame tagged with event metadata.

    session   -- object exposing the lap table as ``session.laps``
    schedule  -- DataFrame with 'RoundNumber', 'EventName', 'Country' and the
                 column named by ``EventDate``
    evnt      -- event name to look up in ``schedule['EventName']``
    ses       -- session label, stored as-is in the 'session' column
    EventDate -- name of the schedule column holding this session's date

    Returns an empty list (not a DataFrame) when the session has no laps.
    Raises IndexError when ``evnt`` does not occur in the schedule.
    '''
    if not len(session.laps):
        return []

    # Round-trip through a dict to rebuild the table as a pandas DataFrame
    # (presumably to drop the fastf1-specific subclass -- confirm).
    lap_frame = pd.DataFrame.from_dict(session.laps.to_dict())

    # Rows of the schedule describing this event; the first match is used.
    event_rows = schedule['EventName'] == evnt

    lap_frame['RoundNumber'] = schedule.loc[event_rows, 'RoundNumber'].values[0]
    lap_frame['EventName'] = evnt
    lap_frame['country'] = schedule.loc[event_rows, 'Country'].values[0]
    lap_frame['session'] = ses
    lap_frame['EventDate'] = schedule.loc[event_rows, EventDate].values[0]
    return lap_frame

def get_weather_data(session, schedule, evnt, ses, EventDate):
    '''Return the weather samples of a session as a DataFrame tagged with
    event metadata.

    The table is read from ``session.weather_data``; the metadata columns
    ('RoundNumber', 'EventName', 'country', 'session', 'EventDate') are
    taken from the first ``schedule`` row whose 'EventName' equals ``evnt``.
    ``EventDate`` is the name of the schedule column holding the date.

    Returns an empty list (not a DataFrame) when there is no weather data.
    '''
    if len(session.weather_data) == 0:
        return []

    weather_frame = pd.DataFrame.from_dict(session.weather_data.to_dict())
    is_event = schedule['EventName'] == evnt

    # (column, value) pairs, applied in this order so the column layout is stable.
    metadata = (
        ('RoundNumber', schedule['RoundNumber'][is_event].values[0]),
        ('EventName', evnt),
        ('country', schedule['Country'][is_event].values[0]),
        ('session', ses),
        ('EventDate', schedule[EventDate][is_event].values[0]),
    )
    for column, value in metadata:
        weather_frame[column] = value
    return weather_frame

def get_car_data(session, schedule, evnt, ses, EventDate):
    '''Return the car telemetry of every driver in a session as one DataFrame.

    session   -- object whose ``car_data`` maps a driver key to that
                 driver's telemetry table
    schedule  -- DataFrame with 'RoundNumber', 'EventName', 'Country' and the
                 column named by ``EventDate``
    evnt      -- event name to look up in ``schedule['EventName']``
    ses       -- session label, stored as-is in the 'session' column
    EventDate -- name of the schedule column holding this session's date

    Returns an empty list (not a DataFrame) when there is no car data.
    Otherwise the per-driver tables are stacked (with a fresh 0..n-1 index)
    and a 'driver' column records which key each row came from.
    Raises IndexError when ``evnt`` does not occur in the schedule.
    '''
    if len(session.car_data) == 0:
        return []

    # Collect one frame per driver. Rebinding a single variable inside the
    # loop would keep only the last driver's telemetry.
    per_driver = []
    for driver in session.car_data:
        telemetry = pd.DataFrame.from_dict(session.car_data[driver].to_dict())
        telemetry['driver'] = driver
        per_driver.append(telemetry)
    car_data = pd.concat(per_driver, axis=0, ignore_index=True)

    event_rows = schedule['EventName'] == evnt
    car_data['RoundNumber'] = schedule['RoundNumber'][event_rows].values[0]
    car_data['EventName'] = evnt
    car_data['country'] = schedule['Country'][event_rows].values[0]
    car_data['session'] = ses
    car_data['EventDate'] = schedule[EventDate][event_rows].values[0]
    return car_data

def get_position_data(session, schedule, evnt, ses, EventDate):
    '''Return the position data of every driver in a session as one DataFrame.

    session   -- object whose ``pos_data`` maps a driver key to that
                 driver's position table
    schedule  -- DataFrame with 'RoundNumber', 'EventName', 'Country' and the
                 column named by ``EventDate``
    evnt      -- event name to look up in ``schedule['EventName']``
    ses       -- session label, stored as-is in the 'session' column
    EventDate -- name of the schedule column holding this session's date

    Returns an empty list (not a DataFrame) when there is no position data.
    Otherwise the per-driver tables are stacked (with a fresh 0..n-1 index)
    and a 'driver' column records which key each row came from.
    Raises IndexError when ``evnt`` does not occur in the schedule.
    '''
    if len(session.pos_data) == 0:
        return []

    # Accumulate every driver's frame. Overwriting one variable per
    # iteration would discard all drivers except the last.
    per_driver = []
    for driver in session.pos_data:
        driver_frame = pd.DataFrame.from_dict(session.pos_data[driver].to_dict())
        driver_frame['driver'] = driver
        per_driver.append(driver_frame)
    position = pd.concat(per_driver, axis=0, ignore_index=True)

    event_rows = schedule['EventName'] == evnt
    position['RoundNumber'] = schedule['RoundNumber'][event_rows].values[0]
    position['EventName'] = evnt
    position['country'] = schedule['Country'][event_rows].values[0]
    position['session'] = ses
    position['EventDate'] = schedule[EventDate][event_rows].values[0]
    return position

def get_results(session, schedule, evnt, ses, EventDate):
    '''Return the results table of a session as a DataFrame tagged with
    event metadata.

    The table is read from ``session.results``; its index is moved into a
    regular column by ``reset_index()``. The metadata columns ('RoundNumber',
    'EventName', 'country', 'session', 'EventDate') come from the first
    ``schedule`` row whose 'EventName' equals ``evnt``; ``EventDate`` names
    the schedule column that holds the date.

    Returns an empty list (not a DataFrame) when there are no results.
    '''
    if len(session.results) == 0:
        return []

    rebuilt = pd.DataFrame.from_dict(session.results.to_dict())
    result_frame = rebuilt.reset_index()

    selector = schedule['EventName'] == evnt

    def first_match(column):
        # Value of `column` in the first schedule row belonging to this event.
        return schedule[column][selector].values[0]

    result_frame['RoundNumber'] = first_match('RoundNumber')
    result_frame['EventName'] = evnt
    result_frame['country'] = first_match('Country')
    result_frame['session'] = ses
    result_frame['EventDate'] = first_match(EventDate)
    return result_frame

In [None]:
# Pull data and store it in the data directory

# Maps each session type requested below to the schedule column whose value
# is stored as that session's 'EventDate'.
sessionDateMap = {'Race': 'Session5Date',
          'Qualifying': 'Session4Date',
            'FP1': 'Session1Date',
            'FP2': 'Session2Date',
            'FP3': 'Session3Date',
            'S':  'Session4Date',
            'SQ':  'Session4Date'}

for year in [2018, 2019, 2020, 2021, 2022]:
    # One list per data type; each entry is the return value of the matching
    # get_* helper for a single session.
    laps = []
    weathers = []
    results = []

    sch = fastf1.get_event_schedule(year).to_dict()
    sch = pd.DataFrame.from_dict(sch)

    ## year 2022 doesn't have data after June (yet)
    if year == 2022:
        sch = sch[sch.index<12]

    # Keep only events whose name contains 'Grand'.
    EventName = [s for s in sch['EventName'] if 'Grand' in s]

    for event in EventName:
        for session_type in ['FP1','FP2','FP3','S','SQ','Qualifying','Race']:

            eventDateColName = sessionDateMap[session_type]

            # Catch Exception rather than using a bare except, so that a
            # kernel interrupt (KeyboardInterrupt) still stops the download.
            # The caught error is printed next to the original message.
            try:
                session = fastf1.get_session(year, event, session_type) # call fastf1 to extract data
            except Exception as err:
                print('Session: {} does not exist {}.'.format(session_type, event), repr(err))
                continue

            try:
                session.load()
            except Exception as err:
                print('Session: ' + session_type + ' does not provide usable data {}.'.format(event), repr(err))
                continue

            # the session exists and loaded: extract all data with the
            # auxiliary functions from the above cells

            ## get lap data for a session
            laps.append(get_lap_data(session, sch, event,
                                     session_type, eventDateColName))

            ## get weather data for a session
            weathers.append(get_weather_data(session, sch, event,
                                             session_type, eventDateColName))

            ## get results for a session
            results.append(get_results(session, sch, event,
                                       session_type, eventDateColName))

    ## save all extracted session data in a year
    # `with` closes (and therefore flushes) each file before moving on, so the
    # pickles are complete on disk when a later cell reads them back.
    for label, records in (('laps', laps), ('weathers', weathers), ('results', results)):
        with open('{}{}_{}.pkl'.format(data_path, label, year), 'wb') as fh:
            pickle.dump(records, fh)

Now, all that's left to do is to concatenate the per-year pickle files of each data type and save the result as a single pickle per type.

In [5]:
def concat_data_and_save(data_source, data_path):
    '''
    Merge the per-year pickles of one data type into a single pickle.

    e.g. data_source is 'weathers'
    1. finds all .pkl files in the data_path folder whose name contains
       data_source (processed in sorted order, so the result does not
       depend on the directory listing order)
    2. loads each file (a pickled list with one entry per session) and
       concatenates every DataFrame/Series found
    3. saves the result in '<data_source>_data.pickle' inside data_path

    Entries that are not pandas objects are skipped. The get_* helpers
    return an empty list for sessions without data, and pd.concat would
    raise TypeError on such an entry.

    Raises ValueError when no data is found for data_source.
    Returns None.
    '''
    frames = []
    matching = sorted(
        name for name in os.listdir(data_path)
        if name.endswith('.pkl') and data_source in name
    )

    for fl in matching:
        print(fl)
        # NOTE: unpickling can execute arbitrary code; only use this on
        # files written by this notebook / a trusted source.
        with open(os.path.join(data_path, fl), 'rb') as fh:
            entries = pickle.load(fh)
        frames.extend(
            entry for entry in entries
            if isinstance(entry, (pd.DataFrame, pd.Series))
        )

    if not frames:
        raise ValueError(
            "No data found for '{}' in {}".format(data_source, data_path)
        )
    data = pd.concat(frames, axis=0)

    # os.path.join keeps this working whether or not data_path ends with a separator.
    out_name = '{}_data.pickle'.format(data_source)
    with open(os.path.join(data_path, out_name), 'wb') as fh:
        pickle.dump(data, fh)
    return

In [6]:
# Build one combined pickle per data type, in the same order as before.
for data_source in ('weathers', 'laps', 'results'):
    concat_data_and_save(data_source, data_path)

weathers_2020.pkl
weathers_2021.pkl
weathers_2018.pkl
weathers_2019.pkl
weathers_2022.pkl
laps_2020.pkl
laps_2018.pkl
laps_2019.pkl
laps_2021.pkl
laps_2022.pkl
results_2022.pkl
results_2019.pkl
results_2020.pkl
results_2021.pkl
results_2018.pkl


At this point, we've finished the data extraction process.