In [1]:
import numpy as np
import pandas as pd
import xmltodict
import requests
# Ergast API endpoints. `url` and `url1` point at the drivers resource and are
# not requested in this cell; `url2` is the 2022 season endpoint fetched below.
url = "http://ergast.com/api/f1/drivers.json?callback=myParser"
url1 = "http://ergast.com/api/f1/drivers.json"
url2 = "http://ergast.com/api/f1/2022"
# requests has no default timeout, so a stalled connection would block the kernel
response = requests.get(url2, timeout=30)
# Surface HTTP errors here rather than later as a confusing parse failure
response.raise_for_status()

In [2]:
# Check whether the response body can be parsed as JSON.
# Calling response.json() on an empty or non-JSON body raises a JSONDecodeError,
# so only parse when the status code is not 204 and the `content-type` header
# is present and contains application/json.
# A GET is used because this cell only reads data from the API.
response = requests.get(url2, timeout=30)

print('response: 👉️', response)
print('response.text: 👉️', response.text)

print('response.status_code: 👉️', response.status_code)
print('response.headers: 👉️', response.headers)

# Header lookup falls back to '' so a missing header simply fails the check
content_type = response.headers.get('content-type', '')
if response.status_code != 204 and 'application/json' in content_type:
    parsed = response.json()
    print('✅ parsed response: 👉️', parsed)
else:
    # Taken when the body is not JSON (e.g. an application/xml response)
    print('⛔️ conditions not met')

response: 👉️ <Response [200]>
response.text: 👉️ <?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet type="text/xsl" href="/schemas/mrd-1.5.xsl"?>
<MRData xmlns="http://ergast.com/mrd/1.5" series="f1" url="http://ergast.com/api/f1/2022" limit="30" offset="0" total="22">
	<RaceTable season="2022">
		<Race season="2022" round="1" url="http://en.wikipedia.org/wiki/2022_Bahrain_Grand_Prix">
			<RaceName>Bahrain Grand Prix</RaceName>
			<Circuit circuitId="bahrain" url="http://en.wikipedia.org/wiki/Bahrain_International_Circuit">
				<CircuitName>Bahrain International Circuit</CircuitName>
				<Location lat="26.0325" long="50.5106">
					<Locality>Sakhir</Locality>
					<Country>Bahrain</Country>
				</Location>
			</Circuit>
			<Date>2022-03-20</Date>
			<Time>15:00:00Z</Time>
			<FirstPractice>
				<Date>2022-03-18</Date>
				<Time>12:00:00Z</Time>
			</FirstPractice>
			<SecondPractice>
				<Date>2022-03-18</Date>
				<Time>15:00:00Z</Time>
			</SecondPractice>
			<ThirdPractice>
				

In [3]:
# Inspect the main attributes of the response object, one per line
for attr_name in ('url', 'history', 'status_code', 'headers'):
    print(getattr(response, attr_name))
# The content type tells us which parser the body needs
print(response.headers['content-type'])

http://ergast.com/api/f1/2022
[]
200
{'Date': 'Mon, 12 Sep 2022 20:58:37 GMT', 'Server': 'Apache/2.4.6 (CentOS) OpenSSL/1.0.2k-fips PHP/5.4.16', 'X-Powered-By': 'PHP/5.4.16', 'Access-Control-Allow-Origin': '*', 'Cache-Control': 'public, max-age=300', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Transfer-Encoding': 'chunked', 'Content-Type': 'application/xml; charset=utf-8'}
application/xml; charset=utf-8


In [4]:
# Parse the XML response body and build a data frame from its list of races
races = pd.DataFrame.from_dict(
    xmltodict.parse(response.text)['MRData']['RaceTable']['Race']
)

In [5]:
# Preview the first two rows of the parsed season
races.head(n=2)

Unnamed: 0,@season,@round,@url,RaceName,Circuit,Date,Time,FirstPractice,SecondPractice,ThirdPractice,Qualifying,Sprint
0,2022,1,http://en.wikipedia.org/wiki/2022_Bahrain_Gran...,Bahrain Grand Prix,"{'@circuitId': 'bahrain', '@url': 'http://en.w...",2022-03-20,15:00:00Z,"{'Date': '2022-03-18', 'Time': '12:00:00Z'}","{'Date': '2022-03-18', 'Time': '15:00:00Z'}","{'Date': '2022-03-19', 'Time': '12:00:00Z'}","{'Date': '2022-03-19', 'Time': '15:00:00Z'}",
1,2022,2,http://en.wikipedia.org/wiki/2022_Saudi_Arabia...,Saudi Arabian Grand Prix,"{'@circuitId': 'jeddah', '@url': 'http://en.wi...",2022-03-27,17:00:00Z,"{'Date': '2022-03-25', 'Time': '14:00:00Z'}","{'Date': '2022-03-25', 'Time': '17:00:00Z'}","{'Date': '2022-03-26', 'Time': '14:00:00Z'}","{'Date': '2022-03-26', 'Time': '17:00:00Z'}",


In [6]:
# Helper functions for cleaning the races data frame

def practices_quali(i):
    '''
    Support function for converting practices and quali columns.

    Replaces column ``i`` of the global ``races`` data frame with the value
    stored under the 'Date' key of each cell. Cells that are not dicts
    (e.g. NaN for a session that did not take place) or that lack a 'Date'
    key become NaN. If the column does not exist it is created filled with NaN.

    Parameters
    ----------
    i : str
        Name of the session column to convert, e.g. 'FirstPractice' or 'Sprint'.

    Returns
    -------
    None
        The global ``races`` data frame is modified in place.
    '''
    global races

    # Column absent from the current frame: add it as an all-NaN column
    if i not in races.columns:
        races[i] = np.nan
        return

    def _session_date(cell):
        # Only dict cells carry session data; anything else has no date
        return cell.get('Date', np.nan) if isinstance(cell, dict) else np.nan

    # Extract per cell so a single missing entry leaves the other rows intact
    races[i] = races[i].map(_session_date)

#for season in range(1950, 2022):
def season_reader(season):
    '''
    Download one season's races and prepend them to the accumulated data.

    Parameters
    ----------
    season : int or str
        Value substituted into the Ergast URL ``http://ergast.com/api/f1/{season}``.

    Returns
    -------
    None
        Results are published through globals: ``races`` is rebound to the new
        season's rows followed by the rows already in ``races_ap``, and
        ``races_ap`` is rebound to a copy of that frame. ``races_ap`` must
        exist (e.g. as an empty DataFrame) before the first call.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    KeyError
        If the parsed document lacks the expected keys, or a column named in
        the rename step is missing.
    '''
    global races, races_ap

    # Reading URL and putting into a DF
    url = "http://ergast.com/api/f1/{}".format(season)
    # requests has no default timeout; without one a stalled call blocks forever
    response = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of trying to parse an error page
    response.raise_for_status()
    race_entries = xmltodict.parse(response.text)['MRData']['RaceTable']['Race']
    # xmltodict yields a single dict, not a list, when an element occurs only once
    if isinstance(race_entries, dict):
        race_entries = [race_entries]
    # practices_quali() works on the global `races`, so bind it before calling
    races = pd.DataFrame(race_entries)

    # Converting practices and quali columns
    col_list = ['FirstPractice', 'SecondPractice', 'ThirdPractice', 'Qualifying', 'Sprint']
    for col in col_list:
        practices_quali(col)

    # Splitting 'Circuit' column for circuit information
    circuit = races['Circuit']
    races['CircuitID'] = circuit.map(lambda c: c['@circuitId'])
    races['CircuitName'] = circuit.map(lambda c: c['CircuitName'])
    races['Locality'] = circuit.map(lambda c: c['Location']['Locality'])
    races['Country'] = circuit.map(lambda c: c['Location']['Country'])

    # Renaming columns; errors="raise" makes a missing source column fail loudly
    races = races.rename(
        columns={"@season": "Season", "@round": "round", "Date": "Race"},
        errors="raise",
    )

    # Concatenating data frames: newest season first, earlier results after it
    races = pd.concat([races, races_ap], ignore_index=True)
    races_ap = races.copy()

In [7]:
# Empty DF that season_reader() accumulates into
races_ap = pd.DataFrame()
# Reading seasons from 1950 to 2022 (the end of range() is exclusive).
# A plain loop is used because season_reader() is called only for its side effects.
for season in range(1950, 2023):
    season_reader(season)
# Dropping unnecessary columns
races = races.drop(columns=['@url', 'Circuit', 'Time'])
# Converting date columns to datetime; values that cannot be parsed become NaT
dt_list = ['Race', 'FirstPractice', 'SecondPractice', 'ThirdPractice', 'Qualifying', 'Sprint']
races[dt_list] = races[dt_list].apply(pd.to_datetime, errors='coerce')

In [8]:
# Show the 2022 rows; the comparison value is the string '2022'
races.loc[races['Season'].eq('2022')]

Unnamed: 0,Season,round,RaceName,Race,FirstPractice,SecondPractice,ThirdPractice,Qualifying,Sprint,CircuitID,CircuitName,Locality,Country
0,2022,1,Bahrain Grand Prix,2022-03-20,2022-03-18,2022-03-18,2022-03-19,2022-03-19,NaT,bahrain,Bahrain International Circuit,Sakhir,Bahrain
1,2022,2,Saudi Arabian Grand Prix,2022-03-27,2022-03-25,2022-03-25,2022-03-26,2022-03-26,NaT,jeddah,Jeddah Corniche Circuit,Jeddah,Saudi Arabia
2,2022,3,Australian Grand Prix,2022-04-10,2022-04-08,2022-04-08,2022-04-09,2022-04-09,NaT,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia
3,2022,4,Emilia Romagna Grand Prix,2022-04-24,2022-04-22,2022-04-23,NaT,2022-04-22,2022-04-23,imola,Autodromo Enzo e Dino Ferrari,Imola,Italy
4,2022,5,Miami Grand Prix,2022-05-08,2022-05-06,2022-05-06,2022-05-07,2022-05-07,NaT,miami,Miami International Autodrome,Miami,USA
5,2022,6,Spanish Grand Prix,2022-05-22,2022-05-20,2022-05-20,2022-05-21,2022-05-21,NaT,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain
6,2022,7,Monaco Grand Prix,2022-05-29,2022-05-27,2022-05-27,2022-05-28,2022-05-28,NaT,monaco,Circuit de Monaco,Monte-Carlo,Monaco
7,2022,8,Azerbaijan Grand Prix,2022-06-12,2022-06-10,2022-06-10,2022-06-11,2022-06-11,NaT,baku,Baku City Circuit,Baku,Azerbaijan
8,2022,9,Canadian Grand Prix,2022-06-19,2022-06-17,2022-06-17,2022-06-18,2022-06-18,NaT,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada
9,2022,10,British Grand Prix,2022-07-03,2022-07-01,2022-07-01,2022-07-02,2022-07-02,NaT,silverstone,Silverstone Circuit,Silverstone,UK


In [9]:
# Summarize column dtypes and non-null counts of the combined races frame
races.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1079 entries, 0 to 1078
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Season          1079 non-null   object        
 1   round           1079 non-null   object        
 2   RaceName        1079 non-null   object        
 3   Race            1079 non-null   datetime64[ns]
 4   FirstPractice   44 non-null     datetime64[ns]
 5   SecondPractice  44 non-null     datetime64[ns]
 6   ThirdPractice   38 non-null     datetime64[ns]
 7   Qualifying      44 non-null     datetime64[ns]
 8   Sprint          6 non-null      datetime64[ns]
 9   CircuitID       1079 non-null   object        
 10  CircuitName     1079 non-null   object        
 11  Locality        1079 non-null   object        
 12  Country         1079 non-null   object        
dtypes: datetime64[ns](6), object(7)
memory usage: 109.7+ KB
