In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
from unidecode import unidecode

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/F1-Fantasy-Predictor/'
elif platform == "darwin":
    path = '~/Documents/GitHub/F1-Fantasy-Predictor/'

In [2]:
years = list(range(2014,  2023 + 1))

In [3]:
def format_qualifying(x):
    if not any(i in x for i in ['DNF', 'DNS']):
        if ':' in x:
            return round(float(str(x).split(':')[1]) + (60 * float(str(x).split(':')[0])), 3) if x != 0 else 0
        else:
            return(round(float(x), 3))
    else:
        return x

In [4]:
def format_driver(x):
    # givenName, familyName, _ = x.split()
    driver = x.split()
    givenName = unidecode(driver[0])
    familyName = unidecode(driver[1])
    return givenName.lower() + '_' + familyName.lower()

In [5]:
### Final data collection method to scrape final qualifying results

starting_grid = pd.DataFrame()
for year in years:
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, 'html.parser')

    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        if f'/en/results.html/{year}/races/' in link: 
            year_links.append(link)

    year_df = pd.DataFrame()
    new_url = 'https://www.formula1.com{}'
    for n, link in list(enumerate(year_links)):
        link = link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(new_url.format(link))
        df = df[0]
        df['season'] = year
        df['round'] = n + 1
        for col in df:
            if 'Unnamed' in col:
                df.drop(col, axis = 1, inplace = True)

        year_df = pd.concat([year_df, df])

    starting_grid = pd.concat([starting_grid, year_df])

ValueError: No tables found

In [None]:
starting_grid.rename(columns = {'Pos': 'grid', 'Driver': 'driver', 'Car': 'constructor', 'Time': 'final_time', 'No': 'no'}, inplace=True)

starting_grid.drop(['final_time', 'no', 'constructor'], axis=1, inplace=True)

starting_grid.driver = starting_grid.driver.apply(lambda x: format_driver(x))

starting_grid.shape

(3665, 4)

In [None]:
starting_grid.query('season == 2021 & round == 10')

Unnamed: 0,grid,driver,season,round
0,1,max_verstappen,2021,10
1,2,lewis_hamilton,2021,10
2,3,valtteri_bottas,2021,10
3,4,charles_leclerc,2021,10
4,5,lando_norris,2021,10
5,6,daniel_ricciardo,2021,10
6,7,fernando_alonso,2021,10
7,8,sebastian_vettel,2021,10
8,9,esteban_ocon,2021,10
9,10,carlos_sainz,2021,10


In [None]:
starting_grid.query('season == 2022')

Unnamed: 0,grid,driver,season,round
0,1,charles_leclerc,2022,1
1,2,max_verstappen,2022,1
2,3,carlos_sainz,2022,1
3,4,sergio_perez,2022,1
4,5,lewis_hamilton,2022,1
...,...,...,...,...
15,16,kevin_magnussen,2022,22
16,17,pierre_gasly,2022,22
17,18,valtteri_bottas,2022,22
18,19,alexander_albon,2022,22


In [None]:
starting_grid.to_csv(path+'data/starting-grid.csv', index=False)