# Formule 1 Data Collection from ERGAST Web

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import time
start = time.time()

In [3]:
import os
import pandas as pd
import numpy as np
from pprint import pprint
import requests

In [4]:
# I will use this function later to calculate points and wins prior to the race

def lookup (df, team, points):
    df['lookup1'] = df.season.astype(str) + df[team] + df['round'].astype(str)
    df['lookup2'] = df.season.astype(str) + df[team] + (df['round']-1).astype(str)
    new_df = df.merge(df[['lookup1', points]], how = 'left', left_on='lookup2',right_on='lookup1')
    new_df.drop(['lookup1_x', 'lookup2', 'lookup1_y'], axis = 1, inplace = True)
    new_df.rename(columns = {points+'_x': points+'_after_race', points+'_y': points}, inplace = True)
    new_df[points].fillna(0, inplace = True)
    return new_df
   

## Formula 1 Races until 2021

In [5]:
races = {'season': [],
        'round': [],
        'circuit_id': [],
        'lat': [],
        'long': [],
        'country': [],
        'date': [],
        'url': []}

for year in list(range(1950,2021)):
    
    url = 'https://ergast.com/api/f1/{}.json'
    r = requests.get(url.format(year))
    json = r.json()

    for item in json['MRData']['RaceTable']['Races']:
        try:
            races['season'].append(int(item['season']))
        except:
            races['season'].append(None)

        try:
            races['round'].append(int(item['round']))
        except:
            races['round'].append(None)

        try:
            races['circuit_id'].append(item['Circuit']['circuitId'])
        except:
            races['circuit_id'].append(None)

        try:
            races['lat'].append(float(item['Circuit']['Location']['lat']))
        except:
            races['lat'].append(None)

        try:
            races['long'].append(float(item['Circuit']['Location']['long']))
        except:
            races['long'].append(None)

        try:
            races['country'].append(item['Circuit']['Location']['country'])
        except:
            races['country'].append(None)

        try:
            races['date'].append(item['date'])
        except:
            races['date'].append(None)

        try:
            races['url'].append(item['url'])
        except:
            races['url'].append(None)
        
races = pd.DataFrame(races)
print(races.shape)

(1035, 8)


In [6]:
races.head()

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,1950,1,silverstone,52.0786,-1.01694,UK,1950-05-13,http://en.wikipedia.org/wiki/1950_British_Gran...
1,1950,2,monaco,43.7347,7.42056,Monaco,1950-05-21,http://en.wikipedia.org/wiki/1950_Monaco_Grand...
2,1950,3,indianapolis,39.795,-86.2347,USA,1950-05-30,http://en.wikipedia.org/wiki/1950_Indianapolis...
3,1950,4,bremgarten,46.9589,7.40194,Switzerland,1950-06-04,http://en.wikipedia.org/wiki/1950_Swiss_Grand_...
4,1950,5,spa,50.4372,5.97139,Belgium,1950-06-18,http://en.wikipedia.org/wiki/1950_Belgian_Gran...


In [7]:
races.tail()

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
1030,2020,13,imola,44.3439,11.7167,Italy,2020-11-01,http://en.wikipedia.org/wiki/2020_Emilia_Romag...
1031,2020,14,istanbul,40.9517,29.405,Turkey,2020-11-15,http://en.wikipedia.org/wiki/2020_Turkish_Gran...
1032,2020,15,bahrain,26.0325,50.5106,Bahrain,2020-11-29,http://en.wikipedia.org/wiki/2020_Bahrain_Gran...
1033,2020,16,bahrain,26.0325,50.5106,Bahrain,2020-12-06,http://en.wikipedia.org/wiki/2020_Sakhir_Grand...
1034,2020,17,yas_marina,24.4672,54.6031,UAE,2020-12-13,http://en.wikipedia.org/wiki/2020_Abu_Dhabi_Gr...


In [8]:
if not os.path.exists('./data'):
    os.mkdir('./data')

In [9]:
races.to_csv('./data/races.csv', index = False)

## Formula 1 Rounds

In [10]:
race = pd.read_csv('./data/races.csv')

In [11]:
rounds = []
for year in np.array(race.season.unique()):
    rounds.append([year, list(race[race.season == year]['round'])])

In [12]:
rounds[:5]

[[1950, [1, 2, 3, 4, 5, 6, 7]],
 [1951, [1, 2, 3, 4, 5, 6, 7, 8]],
 [1952, [1, 2, 3, 4, 5, 6, 7, 8]],
 [1953, [1, 2, 3, 4, 5, 6, 7, 8, 9]],
 [1954, [1, 2, 3, 4, 5, 6, 7, 8, 9]]]

## Formula 1 Results

In [13]:
results = {'season': [],
          'round':[],
           'circuit_id':[],
          'driver': [],
           'date_of_birth': [],
           'nationality': [],
          'constructor': [],
          'grid': [],
          'time': [],
          'status': [],
          'points': [],
          'podium': [],
          'url': []}

for n in list(range(len(rounds))):
    for i in rounds[n][1]:
    
        url = 'http://ergast.com/api/f1/{}/{}/results.json'
        r = requests.get(url.format(rounds[n][0], i))
        json = r.json()

        try:

            for item in json['MRData']['RaceTable']['Races'][0]['Results']:
                try:
                    results['season'].append(int(json['MRData']['RaceTable']['Races'][0]['season']))
                except:
                    results['season'].append(None)

                try:
                    results['round'].append(int(json['MRData']['RaceTable']['Races'][0]['round']))
                except:
                    results['round'].append(None)

                try:
                    results['circuit_id'].append(json['MRData']['RaceTable']['Races'][0]['Circuit']['circuitId'])
                except:
                    results['circuit_id'].append(None)

                try:
                    results['driver'].append(item['Driver']['driverId'])
                except:
                    results['driver'].append(None)

                try:
                    results['date_of_birth'].append(item['Driver']['dateOfBirth'])
                except:
                    results['date_of_birth'].append(None)

                try:
                    results['nationality'].append(item['Driver']['nationality'])
                except:
                    results['nationality'].append(None)

                try:
                    results['constructor'].append(item['Constructor']['constructorId'])
                except:
                    results['constructor'].append(None)

                try:
                    results['grid'].append(int(item['grid']))
                except:
                    results['grid'].append(None)

                try:
                    results['time'].append(int(item['Time']['millis']))
                except:
                    results['time'].append(None)

                try:
                    results['status'].append(item['status'])
                except:
                    results['status'].append(None)

                try:
                    results['points'].append(int(item['points']))
                except:
                    results['points'].append(None)

                try:
                    results['podium'].append(int(item['position']))
                except:
                    results['podium'].append(None)

                try:
                    results['url'].append(json['MRData']['RaceTable']['Races'][0]['url'])
                except:
                    results['url'].append(None)

        except:
            print(rounds[n][0], i)
            print ("Error : %s : %s " % (rounds[n][0], i))

results = pd.DataFrame(results)
print(results.shape)

(24507, 13)


In [14]:
results.head()

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium,url
0,1950,1,silverstone,farina,1906-10-30,Italian,alfa,1,8003600.0,Finished,9.0,1,http://en.wikipedia.org/wiki/1950_British_Gran...
1,1950,1,silverstone,fagioli,1898-06-09,Italian,alfa,2,8006200.0,Finished,6.0,2,http://en.wikipedia.org/wiki/1950_British_Gran...
2,1950,1,silverstone,reg_parnell,1911-07-02,British,alfa,4,8055600.0,Finished,4.0,3,http://en.wikipedia.org/wiki/1950_British_Gran...
3,1950,1,silverstone,cabantous,1904-10-08,French,lago,6,,+2 Laps,3.0,4,http://en.wikipedia.org/wiki/1950_British_Gran...
4,1950,1,silverstone,rosier,1905-11-05,French,lago,9,,+2 Laps,2.0,5,http://en.wikipedia.org/wiki/1950_British_Gran...


In [15]:
results.tail()

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium,url
24502,2020,17,yas_marina,giovinazzi,1993-12-14,Italian,alfa,14,,+1 Lap,0.0,16,http://en.wikipedia.org/wiki/2020_Abu_Dhabi_Gr...
24503,2020,17,yas_marina,latifi,1995-06-29,Canadian,williams,18,,+1 Lap,0.0,17,http://en.wikipedia.org/wiki/2020_Abu_Dhabi_Gr...
24504,2020,17,yas_marina,kevin_magnussen,1992-10-05,Danish,haas,20,,+1 Lap,0.0,18,http://en.wikipedia.org/wiki/2020_Abu_Dhabi_Gr...
24505,2020,17,yas_marina,pietro_fittipaldi,1996-06-25,Brazilian,haas,17,,+2 Laps,0.0,19,http://en.wikipedia.org/wiki/2020_Abu_Dhabi_Gr...
24506,2020,17,yas_marina,perez,1990-01-26,Mexican,racing_point,19,,Transmission,0.0,20,http://en.wikipedia.org/wiki/2020_Abu_Dhabi_Gr...


In [16]:
results.to_csv('./data/results.csv', index = False)

## Formula 1 Drivers Standings

In [18]:
driver_standings = {'season': [],
                    'round':[],
                    'driver': [],
                    'driver_points': [],
                    'driver_wins': [],
                   'driver_standings_pos': []}

for n in list(range(len(rounds))):
    for i in rounds[n][1]:
    
        url = 'https://ergast.com/api/f1/{}/{}/driverStandings.json'
        r = requests.get(url.format(rounds[n][0], i))
        json = r.json()

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['DriverStandings']:
            try:
                driver_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                driver_standings['season'].append(None)

            try:
                driver_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                driver_standings['round'].append(None)
                                         
            try:
                driver_standings['driver'].append(item['Driver']['driverId'])
            except:
                driver_standings['driver'].append(None)
            
            try:
                driver_standings['driver_points'].append(int(item['points']))
            except:
                driver_standings['driver_points'].append(None)
            
            try:
                driver_standings['driver_wins'].append(int(item['wins']))
            except:
                driver_standings['driver_wins'].append(None)
                
            try:
                driver_standings['driver_standings_pos'].append(int(item['position']))
            except:
                driver_standings['driver_standings_pos'].append(None)
            
driver_standings = pd.DataFrame(driver_standings)
print(driver_standings.shape)


In [None]:
driver_standings = lookup(driver_standings, 'driver', 'driver_points')

In [None]:
driver_standings = lookup(driver_standings, 'driver', 'driver_wins')

In [None]:
driver_standings = lookup(driver_standings, 'driver', 'driver_standings_pos')

In [None]:
driver_standings.head()

Unnamed: 0,season,round,driver,driver_points_after_race,driver_wins_after_race,driver_standings_pos_after_race,driver_points,driver_wins,driver_standings_pos
0,1950,1,farina,9.0,1,1,0.0,0.0,0.0
1,1950,1,fagioli,6.0,0,2,0.0,0.0,0.0
2,1950,1,reg_parnell,4.0,0,3,0.0,0.0,0.0
3,1950,1,cabantous,3.0,0,4,0.0,0.0,0.0
4,1950,1,rosier,2.0,0,5,0.0,0.0,0.0


In [None]:
driver_standings.tail()

Unnamed: 0,season,round,driver,driver_points_after_race,driver_wins_after_race,driver_standings_pos_after_race,driver_points,driver_wins,driver_standings_pos
26617,2020,17,grosjean,2.0,0,19,2.0,0.0,19.0
26618,2020,17,kevin_magnussen,1.0,0,20,1.0,0.0,20.0
26619,2020,17,latifi,0.0,0,21,0.0,0.0,21.0
26620,2020,17,aitken,0.0,0,22,0.0,0.0,22.0
26621,2020,17,pietro_fittipaldi,0.0,0,23,0.0,0.0,23.0


In [None]:
driver_standings.to_csv('./data/driver_standings.csv', index = False)

## Formula 1 Team/Constructor Standings

In [None]:
constructor_rounds = rounds[8:]

constructor_standings = {'season': [],
                    'round':[],
                    'constructor': [],
                    'constructor_points': [],
                    'constructor_wins': [],
                   'constructor_standings_pos': []}

for n in list(range(len(constructor_rounds))):
    for i in constructor_rounds[n][1]:
    
        url = 'https://ergast.com/api/f1/{}/{}/constructorStandings.json'
        r = requests.get(url.format(constructor_rounds[n][0], i))
        json = r.json()

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['ConstructorStandings']:
            try:
                constructor_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                constructor_standings['season'].append(None)

            try:
                constructor_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                constructor_standings['round'].append(None)
                                         
            try:
                constructor_standings['constructor'].append(item['Constructor']['constructorId'])
            except:
                constructor_standings['constructor'].append(None)
            
            try:
                constructor_standings['constructor_points'].append(int(item['points']))
            except:
                constructor_standings['constructor_points'].append(None)
            
            try:
                constructor_standings['constructor_wins'].append(int(item['wins']))
            except:
                constructor_standings['constructor_wins'].append(None)
                
            try:
                constructor_standings['constructor_standings_pos'].append(int(item['position']))
            except:
                constructor_standings['constructor_standings_pos'].append(None)
            
constructor_standings = pd.DataFrame(constructor_standings)
print(constructor_standings.shape)


(12486, 6)


In [None]:
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_points')

In [None]:
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_wins')

In [None]:
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_standings_pos')

In [None]:
constructor_standings.head()

Unnamed: 0,season,round,constructor,constructor_points_after_race,constructor_wins_after_race,constructor_standings_pos_after_race,constructor_points,constructor_wins,constructor_standings_pos
0,1958,1,cooper,8.0,1,1,0.0,0.0,0.0
1,1958,1,ferrari,6.0,0,2,0.0,0.0,0.0
2,1958,1,maserati,3.0,0,3,0.0,0.0,0.0
3,1958,2,cooper,16.0,2,1,8.0,1.0,1.0
4,1958,2,ferrari,12.0,0,2,6.0,0.0,2.0


In [None]:
constructor_standings.tail()

Unnamed: 0,season,round,constructor,constructor_points_after_race,constructor_wins_after_race,constructor_standings_pos_after_race,constructor_points,constructor_wins,constructor_standings_pos
12481,2020,17,ferrari,131.0,0,6,131.0,0.0,6.0
12482,2020,17,alphatauri,107.0,1,7,103.0,1.0,7.0
12483,2020,17,alfa,8.0,0,8,8.0,0.0,8.0
12484,2020,17,haas,3.0,0,9,3.0,0.0,9.0
12485,2020,17,williams,0.0,0,10,0.0,0.0,10.0


In [None]:
constructor_standings.to_csv('./data/constructor_standings.csv', index = False)

In [None]:
end = time.time()

In [None]:
import datetime
str(datetime.timedelta(seconds=(end - start)))

'0:21:44.188030'

In [None]:
print(str(end - start)+" seconds")

1304.1880297660828 seconds
