In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

import ergast_loader

In [2]:
# Build the loader from the project-local `ergast_loader` module, pointing it
# at 'data//ergast_data' (presumably the directory holding the Ergast data).
# NOTE(review): `ergast` is not referenced in the other visible cells —
# confirm it is still needed here.
ergast = ergast_loader.ErgastLoader('data//ergast_data')

In [3]:
# CSV holding the accumulated tyre-usage rows; it is read and then overwritten
# by the save cell further down. Forward slashes are used because they resolve
# on Windows as well as POSIX systems, whereas backslash separators only work
# on Windows.
data_path = 'data/pirelli/race_tyre_usage_2020.csv'

In [4]:
def _drop_top_header_level(table):
    """Strip the first header level from ``table`` when its columns are a MultiIndex."""
    if isinstance(table.columns, pd.MultiIndex):
        table.columns = table.columns.droplevel()
    return table


def scrape_pirelli(url):
    """Scrape the tyre usage chart from a Pirelli race press release page.

    Two HTML tables are read from the page: the one matching ``PIT 1`` (one
    row per driver, one column per stop) and the one matching ``LAPS`` (used
    to map each ``C*`` compound to its assignment name for the race).

    Parameters
    ----------
    url : str
        Address of the press release page.

    Returns
    -------
    pandas.DataFrame
        One row per driver and stop, with the columns ``driver``, ``stop``
        (0 for the ``start`` column), ``value`` (the raw cell text),
        ``condition`` (``'used'``/``'new'``), ``compound``, ``lap`` (integer
        taken from the parentheses in the cell, 0 when there is none) and
        ``race_assignment`` (lower-cased, NaN for compounds that are not in
        the ``LAPS`` table).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        Raised by ``pandas.read_html`` when no table matches.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # checking the status surfaces HTTP errors directly instead of as a
    # confusing "no tables found" failure further down.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    # NOTE(review): no parser is named, so bs4 picks whichever is installed
    # (and warns about it); pin one once the expected parser is confirmed.
    soup = BeautifulSoup(page.content)
    # Serialise once and reuse for both tables. read_html is given a
    # file-like object because passing literal HTML strings is deprecated.
    html = str(soup)

    # --- per-driver stint table -------------------------------------------
    stints = _drop_top_header_level(
        pd.read_html(StringIO(html), match=r'(.*PIT 1.*)')[0]
    )
    stints.columns = [c.lower() for c in stints.columns]
    stints = stints.drop(columns='car')

    # Wide -> long: one row per (driver, stop); empty cells mean "no such stop".
    stints = stints.melt('driver').dropna(axis=0, subset=['value'])
    stints = stints.rename(columns={'variable': 'stop'})
    stints['stop'] = (
        stints['stop']
        .str.replace('start', '0', regex=False)
        .str.replace('pit ', '', regex=False)
        .astype(int)
    )

    tyre = stints['value']
    # 'new' is assigned after 'used', so it wins if a cell contains both letters.
    stints.loc[tyre.str.contains('u', na=False), 'condition'] = 'used'
    stints.loc[tyre.str.contains('n', na=False), 'condition'] = 'new'

    # Cells starting with 'C' carry a two-character compound code; cells
    # containing 'I' or 'W' are labelled with that letter (presumably the
    # intermediate / wet tyres — confirm).
    stints.loc[tyre.str[0] == 'C', 'compound'] = tyre.str[:2]
    stints.loc[tyre.str.contains('I', na=False), 'compound'] = 'I'
    stints.loc[tyre.str.contains('W', na=False), 'compound'] = 'W'

    # The number in parentheses becomes the lap; cells without one (the start
    # column in the sample output) get 0. Coercing to int keeps the column
    # from ending up as a mix of digit strings and the integer 0.
    laps = tyre.str.extract(r'.*\(([\d]*)\)', expand=False)
    stints['lap'] = pd.to_numeric(laps, errors='coerce').fillna(0).astype(int)

    # --- compound -> race assignment lookup -------------------------------
    compounds = _drop_top_header_level(
        pd.read_html(StringIO(html), match=r'(.*LAPS.*)')[0]
    )
    # .copy() so the column assignments below act on an independent frame
    # rather than a slice (avoids SettingWithCopyWarning); na=False keeps the
    # boolean mask valid when a COMPOUND cell is missing.
    compounds = compounds[compounds['COMPOUND'].str.contains('C', na=False)].copy()
    # COMPOUND cells are split on the first space: the leading token is the
    # assignment name, the following token is the compound code.
    compounds['race_assignment'] = (
        compounds['COMPOUND'].str.extract(r'(\S*) ', expand=False).str.lower()
    )
    compounds['compound'] = compounds['COMPOUND'].str.extract(r'\S* (\S*)', expand=False)
    compounds = compounds[['compound', 'race_assignment']]

    return stints.merge(right=compounds, how='left', on='compound')


def get_pit_tyre_data(year, urls):
    """Scrape tyre usage for every race in ``urls`` and stack the results.

    Parameters
    ----------
    year : int
        Value written to the ``year`` column of every row.
    urls : dict
        Maps a race name to its press release URL. Entries whose URL is
        falsy (``None``, ``''``) are skipped.

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``scrape_pirelli`` for each race, with ``race``
        and ``year`` columns added, concatenated under a fresh 0..n-1 index.
        An empty DataFrame is returned when no race has a URL.
    """
    frames = []
    for race, url in urls.items():
        if not url:
            continue
        race_df = scrape_pirelli(url)
        race_df['race'] = race
        race_df['year'] = year
        frames.append(race_df)

    if not frames:
        # pd.concat raises ValueError on an empty list; returning an empty
        # frame lets callers handle "nothing to scrape" without special-casing.
        return pd.DataFrame()

    # ignore_index avoids repeating the labels 0..k for every race, which
    # would make label-based lookups on the result ambiguous.
    return pd.concat(frames, ignore_index=True)

In [5]:
# Race name -> Pirelli race press release URL, for the races to scrape in this
# run. The key ends up in the `race` column and the URL is passed to
# scrape_pirelli.
# NOTE(review): the original note read "used find urls to collect, no need to
# do it twice" — presumably races already saved to the CSV can be left out
# here; confirm.
urls_2020 = {
    'Tuscan Grand Prix': 'https://press.pirelli.com/2020-tuscan-grand-prix---race/',
}

In [6]:
# Scrape every race listed in urls_2020 and tag the rows with year 2020.
pit_tyre_data = get_pit_tyre_data(2020, urls_2020)

In [7]:
# Display the scraped rows as a visual check before they are merged into the CSV.
pit_tyre_data

Unnamed: 0,driver,stop,value,condition,compound,lap,race_assignment,race,year
0,HAM,0,C3u,used,C3,0,soft,Tuscan Grand Prix,2020
1,BOT,0,C3u,used,C3,0,soft,Tuscan Grand Prix,2020
2,ALB,0,C3u,used,C3,0,soft,Tuscan Grand Prix,2020
3,RIC,0,C3u,used,C3,0,soft,Tuscan Grand Prix,2020
4,PER,0,C3u,used,C3,0,soft,Tuscan Grand Prix,2020
...,...,...,...,...,...,...,...,...,...
67,VET,4,C3u (43),used,C3,43,soft,Tuscan Grand Prix,2020
68,RUS,4,C3u (45),used,C3,45,soft,Tuscan Grand Prix,2020
69,GRO,4,C3u (44),used,C3,44,soft,Tuscan Grand Prix,2020
70,RAI,5,C3u (44),used,C3,44,soft,Tuscan Grand Prix,2020


In [8]:
# Merge the freshly scraped rows into the stored CSV and write it back.
current_data = pd.read_csv(data_path)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and yields the same stacked frame.
updated_data = pd.concat([current_data, pit_tyre_data])

num_cols = ['stop', 'lap', 'year']
str_cols = ['driver', 'value', 'condition', 'compound', 'race_assignment', 'race']
# Give both sources the same dtypes before de-duplicating — presumably so a
# row read back from the CSV compares equal to the same row scraped again.
# Note that astype(str) turns missing values into the string 'nan'.
for col in num_cols:
    updated_data[col] = pd.to_numeric(updated_data[col])
for col in str_cols:
    updated_data[col] = updated_data[col].astype(str)

# Re-running the notebook for an already stored race must not duplicate rows.
updated_data = updated_data.drop_duplicates(ignore_index=True)
updated_data.to_csv(data_path, index=False)

In [9]:
# Spot-check a single saved row by position (461 is an arbitrary row of the
# merged data; adjust it if the CSV contents change).
updated_data.iloc[461].to_list()

['GRO', 3, 'C3u (42)', 'used', 'C3', 42, 'soft', 'Tuscan Grand Prix', 2020]