In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

from sys import platform

# Root folder of the local f1-analytics checkout. Every branch ends with '/'
# so a relative file name can be appended with plain string concatenation.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    # path = '~/Documents/GitHub/f1-analytics/'
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # Any other platform (e.g. Linux) used to leave `path` undefined, so the
    # first use raised NameError. Fall back to the current working directory.
    # NOTE(review): assumes the notebook is started from the repo root — confirm.
    path = './'

In [2]:
# Seasons to scrape: 2014 through 2021 inclusive (the range end is exclusive).
years = [*range(2014, 2022)]

In [3]:
def format_qualifying(x):
    """Convert a qualifying lap time to seconds, rounded to 3 decimals.

    Parameters
    ----------
    x : str or number
        A lap time written as 'minutes:seconds' (e.g. '1:23.456') or as plain
        seconds (e.g. '83.456'). Non-string values are converted with str()
        first, so numbers such as 0 or 83.456 are accepted as well.

    Returns
    -------
    float or the original value
        The time in seconds. Values whose text contains 'DNF' or 'DNS' are
        returned unchanged. None and blank strings yield NaN; the text 'nan'
        parses to NaN as well.

    Raises
    ------
    ValueError
        If the text is neither a DNF/DNS marker nor parseable as a number.
    """
    if x is None:
        return float('nan')

    # Work on a string so membership tests never fail on numeric input
    # (previously `'DNF' in 0` raised TypeError).
    text = str(x).strip()
    if not text:
        return float('nan')

    if 'DNF' in text or 'DNS' in text:
        return x

    if ':' in text:
        # Only the first two fields are used: minutes and seconds.
        parts = text.split(':')
        return round(float(parts[1]) + (60 * float(parts[0])), 3)

    return round(float(text), 3)

In [4]:
### Final data collection method to scrape final qualifying results

# Collect one DataFrame per race and concatenate once per level: calling
# pd.concat inside the loops re-copies every accumulated row on each iteration.
season_frames = []
for year in years:
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    # The timeout keeps a stalled connection from hanging the cell, and
    # raise_for_status() stops an HTTP error page being parsed as "no races".
    r = requests.get(url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Keep only the filter links that point at a race of this season.
    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        # .get() returns None for an anchor without an href attribute.
        if link and f'/en/results.html/{year}/races/' in link:
            year_links.append(link)

    race_frames = []
    new_url = 'https://www.formula1.com{}'
    # `round` is the 1-based position of the race link within the season page.
    for n, link in enumerate(year_links, start=1):
        link = link.replace('race-result.html', 'starting-grid.html')
        # read_html returns a list of tables; the first one is used.
        df = pd.read_html(new_url.format(link))[0]
        unnamed_cols = [col for col in df.columns if 'Unnamed' in str(col)]
        df = df.drop(columns=unnamed_cols).assign(season=year, round=n)
        race_frames.append(df)

    # pd.concat raises on an empty list, so keep the empty-frame fallback.
    year_df = pd.concat(race_frames) if race_frames else pd.DataFrame()
    season_frames.append(year_df)

qualifying_results = pd.concat(season_frames) if season_frames else pd.DataFrame()

KeyboardInterrupt: 

In [19]:
# Rename the scraped columns to the lower-case names used in later cells and
# discard the 'No' column. A new frame is built instead of using inplace=True,
# and errors='ignore' makes the cell safe to re-run: a second run used to raise
# KeyError because 'No' had already been dropped.
qualifying_results = (
    qualifying_results
    .rename(columns={'Pos': 'grid', 'Driver': 'driver', 'Car': 'constructor', 'Time': 'final_time'})
    .drop(columns='No', errors='ignore')
)

qualifying_results.shape

(3229, 6)

In [20]:
# Show the rows for season 2021, round 10.
qualifying_results.loc[(qualifying_results['season'] == 2021) & (qualifying_results['round'] == 10)]

Unnamed: 0,grid,driver,constructor,final_time,season,round
0,1,Max Verstappen VER,Red Bull Racing Honda,,2021,10
1,2,Lewis Hamilton HAM,Mercedes,,2021,10
2,3,Valtteri Bottas BOT,Mercedes,,2021,10
3,4,Charles Leclerc LEC,Ferrari,,2021,10
4,5,Lando Norris NOR,McLaren Mercedes,,2021,10
5,6,Daniel Ricciardo RIC,McLaren Mercedes,,2021,10
6,7,Fernando Alonso ALO,Alpine Renault,,2021,10
7,8,Sebastian Vettel VET,Aston Martin Mercedes,,2021,10
8,9,Esteban Ocon OCO,Alpine Renault,,2021,10
9,10,Carlos Sainz SAI,Ferrari,,2021,10


In [10]:
# Run every entry of final_time through format_qualifying. Each value is
# stringified first, so a missing entry (NaN) arrives as the text 'nan'.
qualifying_results['final_time'] = qualifying_results['final_time'].map(
    lambda raw_time: format_qualifying(str(raw_time))
)

qualifying_results['final_time'].describe()

count    3058.000000
mean       88.158361
std        13.864620
min        53.377000
25%        76.969500
50%        88.046500
75%        97.210250
max       141.611000
Name: final_time, dtype: float64

In [11]:
# calculate difference in qualifying times

# Drop rows whose time is the 0 placeholder and order each session by grid
# slot. Building new frames (instead of in-place operations on a filtered
# frame) avoids pandas' SettingWithCopyWarning.
timed_results = (
    qualifying_results[qualifying_results['final_time'] != 0]
    .sort_values(['season', 'round', 'grid'])
)

# Reference time per (season, round): the first recorded time in grid order.
# Subtracting it directly replaces the earlier diff() -> cumsum() chain, where a
# single missing time made the deltas of later drivers in that session NaN
# (then 0 after fillna) and the running sum accumulated float noise.
reference_time = timed_results.groupby(['season', 'round'])['final_time'].transform('first')

# Rows without a time still receive 0, as before, so q_delta never holds NaN.
qualifying_results = timed_results.assign(
    q_delta=(timed_results['final_time'] - reference_time).round(3).fillna(0)
)

qualifying_results.head()

Unnamed: 0,grid,driver,constructor,final_time,season,round,q_delta
0,1,Lewis Hamilton HAM,Mercedes,104.231,2014,1,0.0
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,104.548,2014,1,0.317
2,3,Nico Rosberg ROS,Mercedes,104.595,2014,1,0.364
3,4,Kevin Magnussen MAG,McLaren Mercedes,105.745,2014,1,1.514
4,5,Fernando Alonso ALO,Ferrari,105.819,2014,1,1.588


In [12]:
def format_q_rounds(row):
    """Return the name of the latest qualifying stage with a value in `row`.

    The attributes q3, q2 and q1 are checked in that order; the first one
    whose string form is not 'nan' determines the result ('q3', 'q2' or 'q1').
    When all three are 'nan', the value of row.q1 itself is returned.
    """
    for stage in ('q3', 'q2', 'q1'):
        if str(getattr(row, stage)) != 'nan':
            return stage
    return row.q1

In [13]:
# Label each row with the stage name chosen by format_q_rounds, then remove
# the per-stage q1/q2/q3 columns.
# NOTE(review): `qualifying_times` is not created in any visible cell — it
# must already exist in the kernel before this cell runs.
qualifying_times['stage'] = qualifying_times.apply(format_q_rounds, axis=1)
qualifying_times = qualifying_times.drop(columns=['q1', 'q2', 'q3'])

In [14]:
# Show the rows for season 2014, round 1.
qualifying_times.loc[(qualifying_times['season'] == 2014) & (qualifying_times['round'] == 1)]

Unnamed: 0,pos,driver,constructor,season,round,stage
0,1,Lewis Hamilton HAM,Mercedes,2014,1,q3
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,2014,1,q3
2,3,Nico Rosberg ROS,Mercedes,2014,1,q3
3,4,Kevin Magnussen MAG,McLaren Mercedes,2014,1,q3
4,5,Fernando Alonso ALO,Ferrari,2014,1,q3
5,6,Jean-Eric Vergne VER,STR Renault,2014,1,q3
6,7,Nico Hulkenberg HUL,Force India Mercedes,2014,1,q3
7,8,Daniil Kvyat KVY,STR Renault,2014,1,q3
8,9,Felipe Massa MAS,Williams Mercedes,2014,1,q3
9,10,Valtteri Bottas BOT,Williams Mercedes,2014,1,q3


In [5]:
# Inner-join the grid results with the stage table; only rows whose season,
# round, driver and constructor appear in both frames are kept.
qualy_df = qualifying_results.merge(
    qualifying_times,
    how='inner',
    on=['season', 'round', 'driver', 'constructor'],
)

# Keep the columns of interest, in a fixed order.
qualy_df = qualy_df.loc[
    :,
    ['grid', 'pos', 'season', 'round', 'driver', 'constructor', 'final_time', 'stage', 'q_delta'],
]

qualy_df.loc[(qualy_df['season'] == 2014) & (qualy_df['round'] == 1)]

NameError: name 'qualifying_times' is not defined

In [16]:
# qualy_df.to_csv(path+'data/qualifying.csv', index=False)