In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

import ergast_loader

In [2]:
# Project-local loader instance; find_urls reads its data['races'] table.
# NOTE(review): the path contains a doubled slash ('data//ergast_data') -
# presumably meant as 'data/ergast_data'; confirm before changing it.
ergast = ergast_loader.ErgastLoader('data//ergast_data')

In [3]:
def generate_urls(year, race):
    """Return the five candidate Pirelli press-page URLs for one race.

    The candidates differ only in how many extra hyphens (zero to four) sit
    between the race slug and the ``-race/`` suffix, and are returned in
    increasing-hyphen order.

    Args:
        year: Season year; converted with ``str`` before formatting.
        race: Race slug, e.g. ``'spanish-grand-prix'``.

    Returns:
        A list of five URL strings.
    """
    template = 'http://press.pirelli.com/{}-{}{}-race/'
    year_text = str(year)
    return [template.format(year_text, race, '-' * extra) for extra in range(5)]


def find_urls(year):
    """Probe the Pirelli press site for the race-report URL of every race in a season.

    Race names come from the ``races`` table of the module-level ``ergast``
    loader. For each race the candidates from ``generate_urls`` are requested
    in order, and the first page whose parsed HTML contains ``'PIT 1'`` is
    taken as the race report.

    Progress is printed as the search runs: the race slug, ``'failed'`` when
    no candidate matched, and a line describing any request that errored.

    Args:
        year: Season year, compared against the ``year`` column of the races
            table.

    Returns:
        dict mapping each race name to the matching URL, or to ``''`` when no
        candidate matched.
    """
    # Read-only access, so the races table does not need to be copied.
    races_df = ergast.data['races']
    races = races_df.loc[races_df['year'] == year, 'name'].to_list()

    true_urls = {}
    for race in races:
        race_str = race.lower().replace(' ', '-')
        print(race_str)
        for url in generate_urls(year, race_str):
            try:
                # Without a timeout a stalled connection blocks the search forever.
                r = requests.get(url, timeout=30)
            except requests.RequestException as exc:
                # One unreachable candidate must not abort the whole season;
                # report it and move on to the next candidate.
                print('request error for {}: {}'.format(url, exc))
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            if 'PIT 1' in str(soup):
                true_urls[race] = url
                break
        else:
            # The loop finished without a break: no candidate had the pit table.
            true_urls[race] = ''
            print('failed')
        print()
    return true_urls


def _flatten_header(frame):
    """Drop level 0 of ``frame``'s columns when they are a MultiIndex.

    Frames with a single-level header are returned unchanged.
    """
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.droplevel()
    return frame


def scrape_pirelli(url):
    """Scrape the per-driver tyre stints from a Pirelli race press page.

    Two tables are read from the page: the one matching ``PIT 1`` (one row
    per driver, one column per stop) and the one matching ``LAPS`` (used to
    map each ``C<n>`` compound to its race assignment).

    Args:
        url: Address of the race press page.

    Returns:
        DataFrame with one row per driver and stop, with columns ``driver``,
        ``stop`` (0 for the start tyre), ``value`` (the raw cell text),
        ``condition`` (``'used'``/``'new'``), ``compound``, ``lap`` (integer,
        0 when the cell carries no lap number) and ``race_assignment``
        (missing for compounds that have no row in the ``LAPS`` table).

    Raises:
        requests.RequestException: If the request fails, times out or
            returns an HTTP error status.
        ValueError: Raised by ``pd.read_html`` when no table matches.
    """
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed; this matches find_urls.
    soup = BeautifulSoup(page.content, "html.parser")
    html = str(soup)

    # read_html is given a file-like object because passing literal HTML
    # as a string is deprecated in recent pandas releases.
    stints = _flatten_header(pd.read_html(StringIO(html), match=r'(.*PIT 1.*)')[0])
    stints.columns = [c.lower() for c in stints.columns]
    stints = stints.drop('car', axis=1)

    # Wide -> long: one row per (driver, stop); empty cells mean no such stop.
    stints = stints.melt('driver').dropna(axis=0, subset=['value'])
    stints = stints.rename({'variable': 'stop'}, axis=1)
    stints['stop'] = (
        stints['stop']
        .str.replace('start', '0', regex=False)
        .str.replace('pit ', '', regex=False)
        .astype(int)
    )

    value = stints['value']
    stints.loc[value.str.contains('u'), 'condition'] = 'used'
    stints.loc[value.str.contains('n'), 'condition'] = 'new'

    stints.loc[value.str[0] == 'C', 'compound'] = value.str[:2]
    stints.loc[value.str.contains('I'), 'compound'] = 'I'
    stints.loc[value.str.contains('W'), 'compound'] = 'W'

    # The lap number is the parenthesised figure in the cell; cells without
    # one get 0. Converting to int avoids a column mixing strings with 0.
    lap = value.str.extract(r'.*\(([\d]*)\)', expand=False)
    stints['lap'] = pd.to_numeric(lap, errors='coerce').fillna(0).astype(int)

    compounds = _flatten_header(pd.read_html(StringIO(html), match=r'(.*LAPS.*)')[0])
    # .copy() so the column assignments below act on an independent frame
    # rather than a slice; na=False keeps missing cells out of the mask.
    compounds = compounds[compounds['COMPOUND'].str.contains('C', na=False)].copy()
    compounds['race_assignment'] = (
        compounds['COMPOUND'].str.extract(r'(\S*) ', expand=False).str.lower()
    )
    compounds['compound'] = compounds['COMPOUND'].str.extract(r'\S* (\S*)', expand=False)
    # Repeated compound rows would multiply stint rows in the left merge.
    compounds = compounds[['compound', 'race_assignment']].drop_duplicates()

    return stints.merge(right=compounds, how='left', on='compound')


def get_pit_tyre_data(year, urls):
    """Scrape every race that has a URL and stack the results for one season.

    Args:
        year: Value written to the ``year`` column of every row.
        urls: Mapping of race name to press-page URL; entries with an empty
            URL are skipped.

    Returns:
        The concatenation of the per-race frames returned by
        ``scrape_pirelli``, each tagged with ``race`` and ``year`` columns.
        Each race keeps its own row index. An empty DataFrame is returned
        when no race has a URL.
    """
    frames = []
    for race, url in urls.items():
        if not url:
            # No press page recorded for this race.
            continue
        df = scrape_pirelli(url)
        df['race'] = race
        df['year'] = year
        frames.append(df)

    if not frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(frames)

In [4]:
# Spot-check the scraper on a single race page (performs a live HTTP request).
scrape_pirelli('http://press.pirelli.com/2020-spanish-grand-prix---race/')

Unnamed: 0,driver,stop,value,condition,compound,lap,race_assignment
0,HAM,0,C3u,used,C3,0,soft
1,VER,0,C3u,used,C3,0,soft
2,BOT,0,C3u,used,C3,0,soft
3,STR,0,C3u,used,C3,0,soft
4,PER,0,C3u,used,C3,0,soft
5,SAI,0,C3u,used,C3,0,soft
6,VET,0,C2n,new,C2,0,medium
7,ALB,0,C3u,used,C3,0,soft
8,GAS,0,C3u,used,C3,0,soft
9,NOR,0,C3u,used,C3,0,soft


In [5]:
# Race name -> Pirelli press-page URL for 2020. The URLs were collected once
# with find_urls() and are hardcoded here so the lookup does not have to be
# repeated. An empty string means no URL is recorded for that race;
# get_pit_tyre_data skips those entries.
# NOTE(review): the 70th Anniversary URL does not follow the pattern built by
# generate_urls(), so it was presumably added by hand.

urls_2020 = {
    'Austrian Grand Prix': 'http://press.pirelli.com/2020-austrian-grand-prix---race/',
    'Styrian Grand Prix': 'http://press.pirelli.com/2020-styrian-grand-prix---race/',
    'Hungarian Grand Prix': 'http://press.pirelli.com/2020-hungarian-grand-prix---race/',
    'British Grand Prix': 'http://press.pirelli.com/2020-british-grand-prix---race/',
    '70th Anniversary Grand Prix': 'https://press.pirelli.com/emirates-formula-1-70-anniversary-grand-prix-2020---race/',
    'Spanish Grand Prix': 'http://press.pirelli.com/2020-spanish-grand-prix---race/',
    'Belgian Grand Prix': 'http://press.pirelli.com/2020-belgian-grand-prix--race/',
    'Italian Grand Prix': '',
    'Tuscan Grand Prix': '',
    'Russian Grand Prix': '',
    'Eifel Grand Prix': '',
    'Portuguese Grand Prix': '',
    'Emilia Romagna Grand Prix': '',
    'Turkish Grand Prix': '',
    'Bahrain Grand Prix': '',
    'Sakhir Grand Prix': '',
    'Abu Dhabi Grand Prix': ''
}
   
# Race name -> Pirelli press-page URL for the 2019 season.
# NOTE(review): a few entries (https scheme, '-race-0' suffix, the 'canada'
# slug) do not match what generate_urls() builds - presumably fixed by hand.
urls_2019 = {
    'Australian Grand Prix': 'http://press.pirelli.com/2019-australian-grand-prix--race/',
    'Bahrain Grand Prix': 'http://press.pirelli.com/2019-bahrain-grand-prix--race/',
    'Chinese Grand Prix': 'http://press.pirelli.com/2019-chinese-grand-prix---race/',
    'Azerbaijan Grand Prix': 'http://press.pirelli.com/2019-azerbaijan-grand-prix---race/',
    'Spanish Grand Prix': 'http://press.pirelli.com/2019-spanish-grand-prix---race/',
    'Monaco Grand Prix': 'https://press.pirelli.com/2019-monaco-grand-prix---race-0/',
    'Canadian Grand Prix': 'https://press.pirelli.com/2019-canada-grand-prix---race/',
    'French Grand Prix': 'http://press.pirelli.com/2019-french-grand-prix---race/',
    'Austrian Grand Prix': 'http://press.pirelli.com/2019-austrian-grand-prix--race/',
    'British Grand Prix': 'http://press.pirelli.com/2019-british-grand-prix---race/',
    'German Grand Prix': 'http://press.pirelli.com/2019-german-grand-prix---race/',
    'Hungarian Grand Prix': 'https://press.pirelli.com/2019-hungarian-grand-prix----race-0/',
    'Belgian Grand Prix': 'http://press.pirelli.com/2019-belgian-grand-prix---race/',
    'Italian Grand Prix': 'http://press.pirelli.com/2019-italian-grand-prix---race/',
    'Singapore Grand Prix': 'http://press.pirelli.com/2019-singapore-grand-prix--race/',
    'Russian Grand Prix': 'http://press.pirelli.com/2019-russian-grand-prix---race/',
    'Japanese Grand Prix': 'http://press.pirelli.com/2019-japanese-grand-prix---race/',
    'Mexican Grand Prix': 'http://press.pirelli.com/2019-mexican-grand-prix---race/',
    'United States Grand Prix': 'http://press.pirelli.com/2019-united-states-grand-prix----race/',
    'Brazilian Grand Prix': 'http://press.pirelli.com/2019-brazilian-grand-prix---race/',
    'Abu Dhabi Grand Prix': 'http://press.pirelli.com/2019-abu-dhabi-grand-prix---race/'
}

In [6]:
# Scrape every 2019 race listed in urls_2019 (one page download per race).
pit_tyre_data_2019 = get_pit_tyre_data(2019, urls_2019)

In [7]:
# Scrape the 2020 races that have a URL; entries with an empty URL are skipped.
pit_tyre_data_2020 = get_pit_tyre_data(2020, urls_2020)

In [8]:
# Display the combined 2019 frame (the notebook shows a truncated preview).
pit_tyre_data_2019

Unnamed: 0,driver,stop,value,condition,compound,lap,race_assignment,race,year
0,BOT,0,C4u,used,C4,0,soft,Australian Grand Prix,2019
1,HAM,0,C4u,used,C4,0,soft,Australian Grand Prix,2019
2,VER,0,C4u,used,C4,0,soft,Australian Grand Prix,2019
3,VET,0,C4u,used,C4,0,soft,Australian Grand Prix,2019
4,LEC,0,C4u,used,C4,0,soft,Australian Grand Prix,2019
...,...,...,...,...,...,...,...,...,...
41,VET,2,C4n (38),new,C4,38,medium,Abu Dhabi Grand Prix,2019
42,SAI,2,C4n (41),new,C4,41,medium,Abu Dhabi Grand Prix,2019
43,RIC,2,C5u (42),used,C5,42,soft,Abu Dhabi Grand Prix,2019
44,GIO,2,C4n (26),new,C4,26,medium,Abu Dhabi Grand Prix,2019


In [9]:
# Display the combined 2020 frame (the notebook shows a truncated preview).
pit_tyre_data_2020

Unnamed: 0,driver,stop,value,condition,compound,lap,race_assignment,race,year
0,BOT,0,C4u,used,C4,0,soft,Austrian Grand Prix,2020
1,LEC,0,C4u,used,C4,0,soft,Austrian Grand Prix,2020
2,NOR,0,C4u,used,C4,0,soft,Austrian Grand Prix,2020
3,HAM,0,C4u,used,C4,0,soft,Austrian Grand Prix,2020
4,SAI,0,C4u,used,C4,0,soft,Austrian Grand Prix,2020
...,...,...,...,...,...,...,...,...,...
35,LAT,1,C2n (10),new,C2,10,hard,Belgian Grand Prix,2020
36,MAG,1,C2n (10),new,C2,10,hard,Belgian Grand Prix,2020
37,LEC,2,C3n (24),new,C3,24,medium,Belgian Grand Prix,2020
38,LAT,2,C3n (30),new,C3,30,medium,Belgian Grand Prix,2020


In [10]:
# Write one CSV per season, without the row index.
# Forward slashes are used because they separate directories on Windows and
# POSIX alike; a backslash is only a path separator on Windows, so elsewhere
# the backslash form names a single file in the working directory.
pit_tyre_data_2019.to_csv('data/pirelli/race_tyre_usage_2019.csv', index=False)
pit_tyre_data_2020.to_csv('data/pirelli/race_tyre_usage_2020.csv', index=False)