In [44]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import ergast_loader 

In [45]:
# Shared Ergast loader; find_urls reads the 'races' table from ergast.data.
ergast = ergast_loader.ErgastLoader('data//ergast_data')

In [94]:
def generate_urls(year, race):
    """Return the five candidate Pirelli press URLs for one race.

    The candidates differ only in the number of dashes between the race slug
    and the trailing ``race`` (one to five in total, since the template itself
    already contains one).

    Parameters
    ----------
    year : int or str
        Season, placed at the start of the URL slug.
    race : str
        Race name already converted to a dash-separated slug.

    Returns
    -------
    list of str
        Candidate URLs, ordered from fewest to most dashes.
    """
    template = 'http://press.pirelli.com/{}-{}{}-race/'
    return [template.format(str(year), race, '-' * extra) for extra in range(5)]


def find_urls(year, timeout=30):
    """Probe the Pirelli press site for the race-report URL of every race in a season.

    Race names are read from the ``races`` table of the module-level ``ergast``
    loader. For each race the candidates from ``generate_urls`` are requested in
    order, and the first page whose HTML contains ``PIT 1`` is kept.

    Parameters
    ----------
    year : int
        Season to look up in the ``year`` column of the races table.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on that candidate.

    Returns
    -------
    dict
        Maps race name to the URL that was found, or to ``''`` when no
        candidate matched.
    """
    races_table = ergast.data['races']
    races = races_table.loc[races_table['year'] == year, 'name'].to_list()
    true_urls = {}
    for race in races:
        race_str = race.lower().replace(' ', '-')
        print(race_str)
        # Start from "not found"; overwritten below as soon as a candidate matches.
        true_urls[race] = ''
        for url in generate_urls(year, race_str):
            try:
                r = requests.get(url, timeout=timeout)
            except requests.RequestException as err:
                # One unreachable candidate (timeout, TLS failure, ...) must not
                # abort the scan of the remaining candidates and races.
                print('request failed: {} ({})'.format(url, err))
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            if 'PIT 1' in str(soup):
                true_urls[race] = url
                break
        else:
            # The loop ended without a break: none of the candidates matched.
            print('failed')
        print()
    return true_urls


def scrape_pirelli(url, timeout=30):
    """Scrape the tyre-stint table of one Pirelli race report into long format.

    The first table on the page that matches ``PIT 1`` is used. After its
    headers are lower-cased it must have a ``driver`` column, a ``car`` column
    (dropped) and one column per stint named ``start``, ``pit 1``, ``pit 2``...
    Cells look like ``C4u (53)`` or ``Wn``: a compound code, ``u``/``n`` for
    used/new, and optionally a number in parentheses that is stored as ``lap``.

    Parameters
    ----------
    url : str
        Address of the race-report page.
    timeout : float, optional
        Seconds to wait for the HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per driver and stint, with the columns ``driver``, ``stop``
        (0 for the start, then the pit-stop number), ``value`` (raw cell text),
        ``condition`` (``'used'``/``'new'``), ``compound`` (``'C1'``...,
        ``'I'`` or ``'W'``) and ``lap`` (integer; 0 when the cell has no number).
    """
    # Function-scope import so the notebook's import cell does not need to change.
    from io import StringIO

    page = requests.get(url, timeout=timeout)
    # Name the parser explicitly (the same one find_urls uses); otherwise bs4
    # picks whichever parser happens to be installed and emits a warning.
    soup = BeautifulSoup(page.content, "html.parser")

    # Newer pandas deprecates passing literal HTML; a file-like object is
    # accepted by old and new versions alike.
    df = pd.read_html(StringIO(str(soup)), match=r'(.*PIT 1.*)')[0]

    # Some pages carry a two-row header; keep only the inner level.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.droplevel()

    df.columns = [c.lower() for c in df.columns]
    df = df.drop('car', axis=1)

    # Wide (one column per stint) -> long (one row per driver and stint);
    # drivers with fewer stops leave empty cells, which are discarded.
    df = df.melt('driver').dropna(axis=0, subset=['value'])
    df = df.rename({'variable': 'stop'}, axis=1)

    # 'start' -> 0, 'pit N' -> N
    df['stop'] = (
        df['stop']
        .str.replace('start', '0')
        .str.replace('pit ', '')
        .astype(int)
    )

    df.loc[df['value'].str.contains('u'), 'condition'] = 'used'
    df.loc[df['value'].str.contains('n'), 'condition'] = 'new'

    df.loc[df['value'].str[0] == 'C', 'compound'] = df['value'].str[:2]
    df.loc[df['value'].str.contains('I'), 'compound'] = 'I'
    df.loc[df['value'].str.contains('W'), 'compound'] = 'W'

    # Convert the captured digits to integers so the column has a single dtype
    # (previously digit strings were mixed with an integer 0 fill value).
    laps = df['value'].str.extract(r'.*\(([\d]*)\)', expand=False)
    df['lap'] = pd.to_numeric(laps, errors='coerce').fillna(0).astype(int)

    return df


def get_pit_tyre_data(year, urls):
    """Scrape every race in ``urls`` and stack the results into one table.

    Parameters
    ----------
    year : int
        Season label written to the ``year`` column of every row.
    urls : dict
        Maps race name to its report URL. Races with an empty URL are skipped.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the ``scrape_pirelli`` results, with a ``race`` and a
        ``year`` column added.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no race in ``urls`` has a URL.
    """
    dfs = []
    # Use the arguments rather than the global urls_2019 / a hard-coded 2019,
    # so the function works for whichever season is passed in.
    for race, url in urls.items():
        if not url:
            continue
        df = scrape_pirelli(url)
        df['race'] = race
        df['year'] = year
        dfs.append(df)
    return pd.concat(dfs)

In [101]:
# Spot-check the scraper on a single race page (2019 German Grand Prix).
scrape_pirelli('http://press.pirelli.com/2019-german-grand-prix---race/')

Unnamed: 0,driver,stop,value,condition,compound,lap
0,VER,0,Wn,new,W,0
1,VET,0,Wn,new,W,0
2,KVY,0,Wn,new,W,0
3,STR,0,Wn,new,W,0
4,SAI,0,Wn,new,W,0
...,...,...,...,...,...,...
110,HAM,5,C4u (53),used,C4,53
111,KUB,5,C4u (57),used,C4,57
112,RUS,5,C4u (57),used,C4,57
129,MAG,6,C4u (57),used,C4,57


In [95]:
# These URLs were collected once with find_urls and pasted here so the site does
# not have to be probed again. Races mapped to '' have no URL recorded.
# NOTE(review): the 70th Anniversary entry does not follow the generate_urls
# template, so it was presumably filled in by hand - confirm.

urls_2020 = {
    'Austrian Grand Prix': 'http://press.pirelli.com/2020-austrian-grand-prix---race/',
    'Styrian Grand Prix': 'http://press.pirelli.com/2020-styrian-grand-prix---race/',
    'Hungarian Grand Prix': 'http://press.pirelli.com/2020-hungarian-grand-prix---race/',
    'British Grand Prix': 'http://press.pirelli.com/2020-british-grand-prix---race/',
    '70th Anniversary Grand Prix': 'https://press.pirelli.com/emirates-formula-1-70-anniversary-grand-prix-2020---race/',
    'Spanish Grand Prix': 'http://press.pirelli.com/2020-spanish-grand-prix---race/',
    'Belgian Grand Prix': 'http://press.pirelli.com/2020-belgian-grand-prix--race/',
    'Italian Grand Prix': '',
    'Tuscan Grand Prix': '',
    'Russian Grand Prix': '',
    'Eifel Grand Prix': '',
    'Portuguese Grand Prix': '',
    'Emilia Romagna Grand Prix': '',
    'Turkish Grand Prix': '',
    'Bahrain Grand Prix': '',
    'Sakhir Grand Prix': '',
    'Abu Dhabi Grand Prix': ''
}
   
# Race-report URLs for the 2019 season, keyed by race name.
# NOTE(review): a few entries (https scheme, "-race-0" suffix, "canada" slug) do
# not match the generate_urls template - presumably corrected by hand; confirm.
urls_2019 = {
    'Australian Grand Prix': 'http://press.pirelli.com/2019-australian-grand-prix--race/',
    'Bahrain Grand Prix': 'http://press.pirelli.com/2019-bahrain-grand-prix--race/',
    'Chinese Grand Prix': 'http://press.pirelli.com/2019-chinese-grand-prix---race/',
    'Azerbaijan Grand Prix': 'http://press.pirelli.com/2019-azerbaijan-grand-prix---race/',
    'Spanish Grand Prix': 'http://press.pirelli.com/2019-spanish-grand-prix---race/',
    'Monaco Grand Prix': 'https://press.pirelli.com/2019-monaco-grand-prix---race-0/',
    'Canadian Grand Prix': 'https://press.pirelli.com/2019-canada-grand-prix---race/',
    'French Grand Prix': 'http://press.pirelli.com/2019-french-grand-prix---race/',
    'Austrian Grand Prix': 'http://press.pirelli.com/2019-austrian-grand-prix--race/',
    'British Grand Prix': 'http://press.pirelli.com/2019-british-grand-prix---race/',
    'German Grand Prix': 'http://press.pirelli.com/2019-german-grand-prix---race/',
    'Hungarian Grand Prix': 'https://press.pirelli.com/2019-hungarian-grand-prix----race-0/',
    'Belgian Grand Prix': 'http://press.pirelli.com/2019-belgian-grand-prix---race/',
    'Italian Grand Prix': 'http://press.pirelli.com/2019-italian-grand-prix---race/',
    'Singapore Grand Prix': 'http://press.pirelli.com/2019-singapore-grand-prix--race/',
    'Russian Grand Prix': 'http://press.pirelli.com/2019-russian-grand-prix---race/',
    'Japanese Grand Prix': 'http://press.pirelli.com/2019-japanese-grand-prix---race/',
    'Mexican Grand Prix': 'http://press.pirelli.com/2019-mexican-grand-prix---race/',
    'United States Grand Prix': 'http://press.pirelli.com/2019-united-states-grand-prix----race/',
    'Brazilian Grand Prix': 'http://press.pirelli.com/2019-brazilian-grand-prix---race/',
    'Abu Dhabi Grand Prix': 'http://press.pirelli.com/2019-abu-dhabi-grand-prix---race/'
}

In [96]:
# Scrape every 2019 race listed in urls_2019 into one combined table
# (one HTTP request per race).
pit_tyre_data_2019 = get_pit_tyre_data(2019, urls_2019)

From cffi callback <function _verify_callback at 0x000001CF2DF7CDC8>:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\OpenSSL\SSL.py", line 311, in wrapper
    @wraps(callback)
KeyboardInterrupt


SSLError: ("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])",)

In [None]:
# Same call for the 2020 season, passing the partially filled urls_2020 mapping.
pit_tyre_data_2020 = get_pit_tyre_data(2020, urls_2020)

In [None]:
# Show the 2019 table as the cell output.
pit_tyre_data_2019