In [1]:
import re
from datetime import timedelta

import requests
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Switch plotly into offline notebook mode before any iplot() call is made.
init_notebook_mode(connected=True)

# Base URL that the request URLs in this notebook are formatted from.
# NOTE(review): the Ergast service has been announced as deprecated — confirm
# this endpoint is still being served before relying on it.
API_PATH = 'http://ergast.com/api/f1'

In [2]:
def getDriversByYear(year, fmt='json', limit=1000, timeout=30):
    """Fetch the drivers listed for one season from the Ergast API.

    Args:
        year: Season to query; formatted into the URL as-is.
        fmt: Response format suffix appended to the URL. The body is always
            decoded as JSON, so only 'json' is usable here.
        limit: Page size sent as the ``limit`` query parameter. Ergast pages
            its responses (30 entries by default per its documentation), which
            would silently truncate seasons with more drivers than that.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        The list stored under MRData -> DriverTable -> Drivers in the response.

    Raises:
        requests.exceptions.HTTPError: If the server replies with an error status.
    """
    url = '{}/{}/drivers.{}'.format(API_PATH, year, fmt)
    # Without a timeout, requests can block forever on an unresponsive server.
    r = requests.get(url, params={'limit': limit}, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing decode/KeyError below.
    r.raise_for_status()
    drivers_data = r.json()
    return drivers_data['MRData']['DriverTable']['Drivers']


def getDriversByYearDf(year):
    """Return the driver records of season `year` as a pandas DataFrame."""
    return pd.DataFrame.from_records(getDriversByYear(year))
    
# Fetch the 2014 driver list; the bare name on the last line displays it.
drivers_2014 = getDriversByYearDf(2014)
drivers_2014

Unnamed: 0,code,dateOfBirth,driverId,familyName,givenName,nationality,permanentNumber,url
0,ALO,1981-07-29,alonso,Alonso,Fernando,Spanish,14,http://en.wikipedia.org/wiki/Fernando_Alonso
1,BIA,1989-08-03,jules_bianchi,Bianchi,Jules,French,17,http://en.wikipedia.org/wiki/Jules_Bianchi
2,BOT,1989-08-29,bottas,Bottas,Valtteri,Finnish,77,http://en.wikipedia.org/wiki/Valtteri_Bottas
3,BUT,1980-01-19,button,Button,Jenson,British,22,http://en.wikipedia.org/wiki/Jenson_Button
4,CHI,1991-04-21,chilton,Chilton,Max,British,4,http://en.wikipedia.org/wiki/Max_Chilton
5,ERI,1990-09-02,ericsson,Ericsson,Marcus,Swedish,9,http://en.wikipedia.org/wiki/Marcus_Ericsson
6,GRO,1986-04-17,grosjean,Grosjean,Romain,French,8,http://en.wikipedia.org/wiki/Romain_Grosjean
7,GUT,1991-08-05,gutierrez,Gutiérrez,Esteban,Mexican,21,http://en.wikipedia.org/wiki/Esteban_Guti%C3%A...
8,HAM,1985-01-07,hamilton,Hamilton,Lewis,British,44,http://en.wikipedia.org/wiki/Lewis_Hamilton
9,HUL,1987-08-19,hulkenberg,Hülkenberg,Nico,German,27,http://en.wikipedia.org/wiki/Nico_H%C3%BClkenberg


In [3]:
def getRacesByYear(year, fmt='json', timeout=30):
    """Fetch the list of races for one season from the Ergast API.

    Args:
        year: Season to query; formatted into the URL as-is.
        fmt: Response format suffix appended to the URL. The body is always
            decoded as JSON, so only 'json' is usable here.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        The list stored under MRData -> RaceTable -> Races in the response.

    Raises:
        requests.exceptions.HTTPError: If the server replies with an error status.
    """
    url = '{}/{}.{}'.format(API_PATH, year, fmt)
    # Without a timeout, requests can block forever on an unresponsive server.
    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing decode/KeyError below.
    r.raise_for_status()
    races_data = r.json()
    return races_data['MRData']['RaceTable']['Races']


def getRacesByYearDf(year):
    """Build a DataFrame with the round, race name and circuit id of each race in `year`."""
    schedule = getRacesByYear(year)

    rounds = [race['round'] for race in schedule]
    race_names = [race['raceName'] for race in schedule]
    circuit_ids = [race['Circuit']['circuitId'] for race in schedule]

    frame_source = {
        'round': rounds,
        'raceName': race_names,
        'circuitId': circuit_ids
    }
    return pd.DataFrame(frame_source)

# Fetch the 2014 race list; the bare name on the last line displays it.
races_2014 = getRacesByYearDf(2014)
races_2014

Unnamed: 0,circuitId,raceName,round
0,albert_park,Australian Grand Prix,1
1,sepang,Malaysian Grand Prix,2
2,bahrain,Bahrain Grand Prix,3
3,shanghai,Chinese Grand Prix,4
4,catalunya,Spanish Grand Prix,5
5,monaco,Monaco Grand Prix,6
6,villeneuve,Canadian Grand Prix,7
7,red_bull_ring,Austrian Grand Prix,8
8,silverstone,British Grand Prix,9
9,hockenheimring,German Grand Prix,10


In [4]:
def lapTimeToSeconds(lapTime):
    """Convert a lap-time string such as '1:51.824' into seconds as a float.

    Accepted shapes are ``[[H:]M:]S[.fraction]``: the minutes and hours fields
    are optional, and the fraction is read as a decimal fraction of a second
    (so '.8' means 0.8 s, not 8 ms). Digits beyond microsecond precision are
    dropped.

    Args:
        lapTime: The lap time as text, e.g. '1:51.824', '59.123', '1:02:03.456'.

    Returns:
        The duration in seconds.

    Raises:
        ValueError: If a field is not an integer or there are more than three
            colon-separated fields.
    """
    clock, _, fraction = lapTime.strip().partition('.')
    fields = [int(field) for field in clock.split(':')]
    if len(fields) > 3:
        raise ValueError('Unrecognised lap time: {!r}'.format(lapTime))

    # Left-pad with zeros so that missing hours/minutes default to 0.
    hours, minutes, seconds = [0] * (3 - len(fields)) + fields

    # Scale the fractional digits to microseconds: '8' -> 800000, '824' -> 824000.
    microseconds = int(fraction.ljust(6, '0')[:6]) if fraction else 0

    duration = timedelta(hours=hours, minutes=minutes, seconds=seconds,
                         microseconds=microseconds)
    return duration.total_seconds()


def fastLapTime(item):
    """Return the fastest-lap time of a result entry in seconds, or NaN when a key is missing."""
    try:
        time_text = item['FastestLap']['Time']['time']
        seconds = lapTimeToSeconds(time_text)
    except KeyError:
        # No fastest-lap data recorded for this entry.
        seconds = np.nan
    return seconds
    

def fastLapNum(item):
    """Return the 'lap' value of a result entry's FastestLap, or NaN when a key is missing."""
    try:
        fastest = item['FastestLap']
        lap_number = fastest['lap']
    except KeyError:
        # No fastest-lap data recorded for this entry.
        lap_number = np.nan
    return lap_number
    

def fastLapRank(item):
    """Return the 'rank' value of a result entry's FastestLap, or NaN when a key is missing."""
    try:
        fastest = item['FastestLap']
        rank = fastest['rank']
    except KeyError:
        # No fastest-lap data recorded for this entry.
        rank = np.nan
    return rank


def getRaceResultsByYearRound(year, roundNum, fmt='json', limit=100, timeout=30):
    """Fetch the results of a single race from the Ergast API.

    Args:
        year: Season to query; formatted into the URL as-is.
        roundNum: Round within the season; formatted into the URL as-is.
        fmt: Response format suffix appended to the URL. The body is always
            decoded as JSON, so only 'json' is usable here.
        limit: Page size sent as the ``limit`` query parameter. Ergast pages
            its responses (30 entries by default per its documentation), which
            would silently truncate races with more entries than that.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        The first element of MRData -> RaceTable -> Races in the response.

    Raises:
        requests.exceptions.HTTPError: If the server replies with an error status.
        IndexError: If the response contains no race for this year/round.
    """
    url = '{}/{}/{}/results.{}'.format(API_PATH, year, roundNum, fmt)
    # Without a timeout, requests can block forever on an unresponsive server.
    r = requests.get(url, params={'limit': limit}, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing decode/KeyError below.
    r.raise_for_status()
    results_data = r.json()
    # TODO Should data be converted to proper types here? Ergast returns all strings.
    races = results_data['MRData']['RaceTable']['Races']
    if not races:
        # Same exception type as the bare [0] lookup, but with a useful message.
        raise IndexError('No results found for year {} round {}'.format(year, roundNum))
    return races[0]


def getRaceResultsDf(year, roundNum):
    """Return the results of one race as a typed DataFrame, one row per result entry.

    Args:
        year: Season passed through to getRaceResultsByYearRound.
        roundNum: Round passed through to getRaceResultsByYearRound.

    Returns:
        DataFrame with columns carNum, pos, driverId, constructorId, grid,
        laps, status, fastLapNum, fastLapTime and fastLapRank. The three
        fastLap* columns are float64 so that they can hold NaN for entries
        without fastest-lap data.
    """
    race_data = getRaceResultsByYearRound(year, roundNum)

    columns = ['carNum', 'pos', 'driverId', 'constructorId', 'grid', 'laps',
               'status', 'fastLapNum', 'fastLapTime', 'fastLapRank']
    # int16 rather than int8: int8 tops out at 127, which is too small for
    # 200-lap races and for three-digit car numbers.
    data_types = {'carNum': np.int16, 'pos': np.int16, 'driverId': object,
                  'constructorId': object, 'grid': np.int16, 'laps': np.int16,
                  'status': object, 'fastLapNum': np.float64, 'fastLapTime': np.float64,
                  'fastLapRank': np.float64}

    data = [
        (item['number'],
         item['position'],
         item['Driver']['driverId'],
         item['Constructor']['constructorId'],
         item['grid'],
         item['laps'],
         item['status'],
         fastLapNum(item),
         fastLapTime(item),
         fastLapRank(item))
        for item in race_data['Results']
    ]

    df = pd.DataFrame(data, columns=columns)
    # Ergast delivers every value as a string; cast to the declared dtypes.
    return df.astype(data_types)

# FastestLap is not set as a key if the driver did not complete a lap.
# That is a quirk of the Ergast API, and kind of a bad one: it would be better
# if the value were either an empty object or null.
# Example: getRaceResultsByYearRound(2014, 2)['Results'][21]
results_df = getRaceResultsDf(2014, 2)
results_df

Unnamed: 0,carNum,pos,driverId,constructorId,grid,laps,status,fastLapNum,fastLapTime,fastLapRank
0,44,1,hamilton,mercedes,1,56,Finished,53.0,103.066,1.0
1,6,2,rosberg,mercedes,3,56,Finished,55.0,103.96,2.0
2,1,3,vettel,red_bull,2,56,Finished,51.0,104.289,4.0
3,14,4,alonso,ferrari,4,56,Finished,47.0,104.165,3.0
4,27,5,hulkenberg,force_india,7,56,Finished,38.0,105.982,10.0
5,22,6,button,mclaren,10,56,Finished,47.0,106.039,11.0
6,19,7,massa,williams,13,56,Finished,44.0,104.897,6.0
7,77,8,bottas,williams,18,56,Finished,31.0,105.475,9.0
8,20,9,kevin_magnussen,mclaren,8,55,+1 Lap,44.0,105.373,8.0
9,26,10,kvyat,toro_rosso,11,55,+1 Lap,36.0,106.695,13.0


In [5]:
def formatLapsData(laps):
    """Flatten lap records into row tuples, one per (lap, driver) timing.

    Returns a pair ``(rows, column_names)``; each row holds the lap number,
    driver id, position, the time string and that time converted to seconds.
    """
    column_names = ['lapNum', 'driverId', 'position', 'timeStr', 'timeSeconds']
    rows = []

    for lap_record in laps:
        number = lap_record['number']
        rows.extend(
            (number,
             entry['driverId'],
             entry['position'],
             entry['time'],
             lapTimeToSeconds(entry['time']))
            for entry in lap_record['Timings']
        )

    return rows, column_names


def buildLapsDataFrame(data, columns):
    """Build the laps DataFrame and add per-driver cumulative and delta times.

    Args:
        data: Row tuples as produced by formatLapsData.
        columns: Column names for `data`; must include lapNum, driverId,
            position, timeStr and timeSeconds.

    Returns:
        DataFrame with the input columns cast to proper dtypes plus:
        timeCuml - running total of timeSeconds per driver, in row order;
        timeDiff - change in timeSeconds versus the same driver's previous
        row (NaN on each driver's first row).
    """
    df = pd.DataFrame(data, columns=columns)

    # astype returns a new frame, so the result has to be kept. int16 rather
    # than int8 so that lap numbers above 127 survive the cast.
    df = df.astype({'lapNum': np.int16, 'driverId': object,
                    'position': np.int16, 'timeStr': object,
                    'timeSeconds': np.float64})

    drivers_group = df.groupby('driverId')
    df['timeCuml'] = drivers_group['timeSeconds'].cumsum()
    df['timeDiff'] = drivers_group['timeSeconds'].diff()

    return df
    

In [6]:
def getRaceLaps(year, roundNum, fmt='json', timeout=30):
    """Fetch the lap timings of every driver for a single race from the Ergast API.

    Args:
        year: Season to query; formatted into the URL as-is.
        roundNum: Round within the season; formatted into the URL as-is.
        fmt: Response format suffix appended to the URL. The body is always
            decoded as JSON, so only 'json' is usable here.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        The 'Laps' list of the first race in MRData -> RaceTable -> Races.

    Raises:
        requests.exceptions.HTTPError: If the server replies with an error status.
        IndexError: If the response contains no race for this year/round.
    """
    # NOTE(review): the request asks for limit=2500 in one page; confirm the
    # server does not cap the page size below that, or laps may be truncated.
    url = '{}/{}/{}/laps.{}?limit=2500'.format(API_PATH, year, roundNum, fmt)
    # Without a timeout, requests can block forever on an unresponsive server.
    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing decode/KeyError below.
    r.raise_for_status()
    laps_data = r.json()
    races = laps_data['MRData']['RaceTable']['Races']
    if not races:
        # Same exception type as the bare [0] lookup, but with a useful message.
        raise IndexError('No lap data found for year {} round {}'.format(year, roundNum))
    return races[0]['Laps']


def getRaceLapsDf(year, roundNum):
    """Fetch all lap timings of one race and return them as a laps DataFrame."""
    rows, column_names = formatLapsData(getRaceLaps(year, roundNum))
    return buildLapsDataFrame(rows, column_names)

# Fetch every driver's lap timings for 2014 round 2 and display them.
laps_df = getRaceLapsDf(2014, 2)
laps_df

Unnamed: 0,lapNum,driverId,position,timeStr,timeSeconds,timeCuml,timeDiff
0,1,hamilton,1,1:51.824,111.824,111.824,
1,1,rosberg,2,1:53.894,113.894,113.894,
2,1,ricciardo,3,1:54.677,114.677,114.677,
3,1,vettel,4,1:55.012,115.012,115.012,
4,1,alonso,5,1:56.440,116.440,116.440,
5,1,hulkenberg,6,1:56.866,116.866,116.866,
6,1,raikkonen,7,1:58.121,118.121,118.121,
7,1,kevin_magnussen,8,1:58.557,118.557,118.557,
8,1,button,9,1:59.008,119.008,119.008,
9,1,massa,10,1:59.770,119.770,119.770,


In [7]:
# Show only the rows of laps_df that belong to Hamilton.
laps_df.loc[laps_df['driverId'] == 'hamilton']

Unnamed: 0,lapNum,driverId,position,timeStr,timeSeconds,timeCuml,timeDiff
0,1,hamilton,1,1:51.824,111.824,111.824,
21,2,hamilton,1,1:47.501,107.501,219.325,-4.323
42,3,hamilton,1,1:47.763,107.763,327.088,0.262
63,4,hamilton,1,1:48.375,108.375,435.463,0.612
84,5,hamilton,1,1:47.428,107.428,542.891,-0.947
105,6,hamilton,1,1:47.532,107.532,650.423,0.104
126,7,hamilton,1,1:47.048,107.048,757.471,-0.484
147,8,hamilton,1,1:47.494,107.494,864.965,0.446
167,9,hamilton,1,1:47.695,107.695,972.66,0.201
186,10,hamilton,1,1:47.277,107.277,1079.937,-0.418


In [8]:
def getRaceResultByYearRoundPosition(year, roundNum, pos, fmt='json', timeout=30):
    """Fetch the single result entry for one position of a race from the Ergast API.

    Args:
        year: Season to query; formatted into the URL as-is.
        roundNum: Round within the season; formatted into the URL as-is.
        pos: Position segment of the results URL; formatted into the URL as-is.
        fmt: Response format suffix appended to the URL. The body is always
            decoded as JSON, so only 'json' is usable here.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        The first entry of 'Results' of the first race in
        MRData -> RaceTable -> Races.

    Raises:
        requests.exceptions.HTTPError: If the server replies with an error status.
        IndexError: If the response contains no race or no result entry.
    """
    url = '{}/{}/{}/results/{}.{}'.format(API_PATH, year, roundNum, pos, fmt)
    # Without a timeout, requests can block forever on an unresponsive server.
    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing decode/KeyError below.
    r.raise_for_status()
    result_data = r.json()
    races = result_data['MRData']['RaceTable']['Races']
    if not races or not races[0]['Results']:
        # Same exception type as the bare [0] lookups, but with a useful message.
        raise IndexError(
            'No result found for year {} round {} position {}'.format(year, roundNum, pos))
    return races[0]['Results'][0]

def racePosition(year, roundNum, pos):
    """Return the driverId found in the result entry for position `pos` of the given race."""
    return getRaceResultByYearRoundPosition(year, roundNum, pos)['Driver']['driverId']

# Look up positions 1 to 3 of 2014 round 2, in that order.
p1, p2, p3 = (racePosition(2014, 2, place) for place in (1, 2, 3))

# podium tuple
p1, p2, p3

('hamilton', 'rosberg', 'vettel')

In [9]:
def getLapsByYearRoundDriver(year, roundNum, driverId, fmt='json', timeout=30):
    """Fetch the lap timings of one driver for a single race from the Ergast API.

    Args:
        year: Season to query; formatted into the URL as-is.
        roundNum: Round within the season; formatted into the URL as-is.
        driverId: Driver identifier segment of the URL, e.g. 'hamilton'.
        fmt: Response format suffix appended to the URL. The body is always
            decoded as JSON, so only 'json' is usable here.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        The 'Laps' list of the first race in MRData -> RaceTable -> Races.

    Raises:
        requests.exceptions.HTTPError: If the server replies with an error status.
        IndexError: If the response contains no race for this query.
    """
    # NOTE(review): the request asks for limit=2500 in one page; confirm the
    # server does not cap the page size below that.
    url = '{}/{}/{}/drivers/{}/laps.{}?limit=2500'.format(API_PATH, year, roundNum, driverId, fmt)
    # Without a timeout, requests can block forever on an unresponsive server.
    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing decode/KeyError below.
    r.raise_for_status()
    data = r.json()
    races = data['MRData']['RaceTable']['Races']
    if not races:
        # Same exception type as the bare [0] lookup, but with a useful message.
        raise IndexError(
            'No lap data found for year {} round {} driver {}'.format(year, roundNum, driverId))
    return races[0]['Laps']


def getDriverLapsDf(year, roundNum, driverId):
    """Fetch one driver's lap timings for a race and return them as a laps DataFrame."""
    rows, column_names = formatLapsData(getLapsByYearRoundDriver(year, roundNum, driverId))
    return buildLapsDataFrame(rows, column_names)

# Fetch only Hamilton's lap timings for 2014 round 2 and display them.
driver_laps_df = getDriverLapsDf(2014, 2, 'hamilton')
driver_laps_df

Unnamed: 0,lapNum,driverId,position,timeStr,timeSeconds,timeCuml,timeDiff
0,1,hamilton,1,1:51.824,111.824,111.824,
1,2,hamilton,1,1:47.501,107.501,219.325,-4.323
2,3,hamilton,1,1:47.763,107.763,327.088,0.262
3,4,hamilton,1,1:48.375,108.375,435.463,0.612
4,5,hamilton,1,1:47.428,107.428,542.891,-0.947
5,6,hamilton,1,1:47.532,107.532,650.423,0.104
6,7,hamilton,1,1:47.048,107.048,757.471,-0.484
7,8,hamilton,1,1:47.494,107.494,864.965,0.446
8,9,hamilton,1,1:47.695,107.695,972.66,0.201
9,10,hamilton,1,1:47.277,107.277,1079.937,-0.418


In [10]:
# Alternative to racePosition(): pick the driverId of the pos == 1 row straight from results_df.
results_df.loc[results_df['pos'] == 1, ['driverId']]

Unnamed: 0,driverId
0,hamilton


In [11]:
# Merging Data Frames: attach each driver's code (e.g. 'HAM') to the race results.
# pd.merge defaults to an inner join, so result rows whose driverId is not
# present in drivers_2014 are dropped.
new_df = pd.merge(results_df, drivers_2014[['driverId', 'code']], on='driverId')
new_df.head()

Unnamed: 0,carNum,pos,driverId,constructorId,grid,laps,status,fastLapNum,fastLapTime,fastLapRank,code
0,44,1,hamilton,mercedes,1,56,Finished,53.0,103.066,1.0,HAM
1,6,2,rosberg,mercedes,3,56,Finished,55.0,103.96,2.0,ROS
2,1,3,vettel,red_bull,2,56,Finished,51.0,104.289,4.0,VET
3,14,4,alonso,ferrari,4,56,Finished,47.0,104.165,3.0,ALO
4,27,5,hulkenberg,force_india,7,56,Finished,38.0,105.982,10.0,HUL


In [12]:
# Keep only the driverId and code columns of the drivers DataFrame.
driverIds_df = drivers_2014[['driverId', 'code']]
# This cell overwrites laps_df with the merged result. Drop any 'code' column
# left over from a previous run first, so that re-running the cell does not
# produce 'code_x'/'code_y' duplicates.
laps_without_code = laps_df.drop(['code'], axis=1, errors='ignore')
# laps is the left ("x") frame and driverIds_df the right ("y") frame; both
# share the driverId key.
laps_df = pd.merge(laps_without_code, driverIds_df, on='driverId')
laps_df

Unnamed: 0,lapNum,driverId,position,timeStr,timeSeconds,timeCuml,timeDiff,code
0,1,hamilton,1,1:51.824,111.824,111.824,,HAM
1,2,hamilton,1,1:47.501,107.501,219.325,-4.323,HAM
2,3,hamilton,1,1:47.763,107.763,327.088,0.262,HAM
3,4,hamilton,1,1:48.375,108.375,435.463,0.612,HAM
4,5,hamilton,1,1:47.428,107.428,542.891,-0.947,HAM
5,6,hamilton,1,1:47.532,107.532,650.423,0.104,HAM
6,7,hamilton,1,1:47.048,107.048,757.471,-0.484,HAM
7,8,hamilton,1,1:47.494,107.494,864.965,0.446,HAM
8,9,hamilton,1,1:47.695,107.695,972.660,0.201,HAM
9,10,hamilton,1,1:47.277,107.277,1079.937,-0.418,HAM


In [13]:
# Plot each driver's code at (fastest-lap rank, fastest-lap time in seconds).
# Axis settings are passed as plain dicts: go.XAxis / go.YAxis are deprecated
# in newer plotly releases, while dicts are accepted by all of them.
iplot({
    'data': [
        go.Scatter(x=new_df['fastLapRank'], y=new_df['fastLapTime'], mode='text', text=new_df['code'])
    ],
    'layout': go.Layout(
        title='2014 Malaysian GP Fast Lap Time vs Rank',
        xaxis=dict(title='Fast Lap Rank'),
        yaxis=dict(title='Fast Lap Time')
    )
})