In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

from sys import platform

# Pick the repository root that output files are written under.
# `path` is later joined by plain string concatenation, so every value
# must be a str that ends with a slash.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    # path = '~/Documents/GitHub/f1-analytics/'
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # Any other platform (e.g. Linux): resolve relative to the current
    # working directory instead of leaving `path` undefined, which would
    # only surface as a NameError at the very end of the notebook.
    path = './'

In [2]:
# Seasons to scrape: 2014 through 2022, both ends included.
years = [season for season in range(2014, 2022 + 1)]

In [3]:
def format_qualifying(x):
    """Convert a lap-time value to seconds, rounded to 3 decimals.

    Parameters
    ----------
    x : str or number
        Either a time string ("M:SS.mmm" or plain seconds such as "83.456"),
        a string containing 'DNF' / 'DNS', or an already-numeric value.

    Returns
    -------
    float, int or the input unchanged
        * strings containing 'DNF' or 'DNS' are returned as-is;
        * "M:SS.mmm" strings become ``60 * M + SS.mmm``;
        * other strings and numbers are converted with ``float`` and rounded;
        * numeric zero returns ``0``; ``None`` and NaN are returned unchanged.

    Raises
    ------
    ValueError
        If a string is neither a status marker nor parseable as a number.
    """
    # Non-string input must be handled before any substring test:
    # `'DNF' in 0` raises TypeError, so a zero check placed later can
    # never be reached.
    if not isinstance(x, str):
        if x is None or x != x:  # None or NaN: nothing to convert
            return x
        return round(float(x), 3) if x != 0 else 0

    # Status markers are passed through untouched.
    if any(marker in x for marker in ('DNF', 'DNS')):
        return x

    if ':' in x:
        # Only the first two fields are used (minutes, seconds).
        parts = x.split(':')
        return round(float(parts[1]) + (60 * float(parts[0])), 3)

    return round(float(x), 3)

In [4]:
### Scrape the starting grid of every race for each season in `years`

url = 'https://www.formula1.com/en/results.html/{}/races.html'
new_url = 'https://www.formula1.com{}'

# Collect one DataFrame per race and concatenate once at the end;
# concatenating inside the loops re-copies all accumulated rows each time.
grid_frames = []
for year in years:
    # A timeout stops a stalled connection from hanging the notebook, and
    # raise_for_status stops an HTTP error page from being parsed as a
    # season with no races.
    r = requests.get(url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Keep only the filter links that point at this season's race pages.
    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        # .get('href') returns None for anchors without an href attribute.
        if link and f'/en/results.html/{year}/races/' in link:
            year_links.append(link)

    # The round number is the 1-based position of the link on the page.
    # NOTE(review): assumes the page lists races in calendar order - confirm.
    for round_number, link in enumerate(year_links, start=1):
        link = link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(new_url.format(link))[0]

        # Drop the placeholder "Unnamed: N" columns in one call rather than
        # mutating the frame while iterating over its columns.
        unnamed_cols = [col for col in df.columns if 'Unnamed' in str(col)]
        df = df.drop(columns=unnamed_cols)

        df['season'] = year
        df['round'] = round_number
        grid_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
# Each race keeps its own 0-based row index (no ignore_index).
starting_grid = pd.concat(grid_frames) if grid_frames else pd.DataFrame()

In [5]:
# Normalise the scraped column names, then discard the lap-time column.
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell re-runnable: on a second run 'final_time' is already gone
# and a strict drop would raise KeyError.
starting_grid = (
    starting_grid
    .rename(columns = {'Pos': 'grid', 'Driver': 'driver', 'Car': 'constructor', 'Time': 'final_time', 'No': 'no'})
    .drop(columns=['final_time'], errors='ignore')
)

starting_grid.shape

(3665, 6)

In [6]:
# Spot check: the grid rows for round 10 of the 2021 season.
starting_grid[(starting_grid['season'] == 2021) & (starting_grid['round'] == 10)]

Unnamed: 0,grid,no,driver,constructor,season,round
0,1,33,Max Verstappen VER,Red Bull Racing Honda,2021,10
1,2,44,Lewis Hamilton HAM,Mercedes,2021,10
2,3,77,Valtteri Bottas BOT,Mercedes,2021,10
3,4,16,Charles Leclerc LEC,Ferrari,2021,10
4,5,4,Lando Norris NOR,McLaren Mercedes,2021,10
5,6,3,Daniel Ricciardo RIC,McLaren Mercedes,2021,10
6,7,14,Fernando Alonso ALO,Alpine Renault,2021,10
7,8,5,Sebastian Vettel VET,Aston Martin Mercedes,2021,10
8,9,31,Esteban Ocon OCO,Alpine Renault,2021,10
9,10,55,Carlos Sainz SAI,Ferrari,2021,10


In [7]:
# Spot check: every grid row scraped for the 2022 season.
starting_grid[starting_grid['season'] == 2022]

Unnamed: 0,grid,no,driver,constructor,season,round
0,1,16,Charles Leclerc LEC,Ferrari,2022,1
1,2,1,Max Verstappen VER,Red Bull Racing RBPT,2022,1
2,3,55,Carlos Sainz SAI,Ferrari,2022,1
3,4,11,Sergio Perez PER,Red Bull Racing RBPT,2022,1
4,5,44,Lewis Hamilton HAM,Mercedes,2022,1
...,...,...,...,...,...,...
15,16,20,Kevin Magnussen MAG,Haas Ferrari,2022,22
16,17,10,Pierre Gasly GAS,AlphaTauri RBPT,2022,22
17,18,77,Valtteri Bottas BOT,Alfa Romeo Ferrari,2022,22
18,19,23,Alexander Albon ALB,Williams Mercedes,2022,22


In [8]:
# Write the cleaned grid table under the repository's data folder,
# leaving the DataFrame index out of the file.
csv_target = path + 'data/starting-grid.csv'
starting_grid.to_csv(csv_target, index=False)