In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

from sys import platform

# Root folder of the local repository checkout. It must end with '/' because
# the export cell builds the output file name by plain string concatenation.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '/Users/oliverjcarter/Documents/GitHub/f1-analytics/'
else:
    # Any other platform (e.g. Linux) previously left `path` unbound, so the
    # final export failed with NameError only after all scraping had finished.
    # Fall back to the current working directory.
    path = './'

In [2]:
# Seasons to scrape: 2014 through 2021 inclusive (the upper bound is exclusive).
years = [*range(2014, 2022)]

In [3]:
def format_qualifying(x):
    """Convert a qualifying time to seconds, rounded to three decimals.

    Parameters
    ----------
    x : str or number
        A colon-separated lap time such as ``'1:31.699'`` (most significant
        field first), a plain number of seconds such as ``'91.699'`` or
        ``91.699``, or a status string containing ``'DNF'`` or ``'DNS'``.

    Returns
    -------
    float or str
        The time in seconds rounded to 3 decimals. Strings containing
        ``'DNF'`` or ``'DNS'`` are returned unchanged. ``'nan'`` or a float
        NaN yields NaN.

    Raises
    ------
    ValueError
        If a string is neither a status marker nor parseable as a time.
    """
    # Numeric input (e.g. 0 or a float NaN) has no text to parse, and the
    # substring tests below would raise TypeError on it.
    if not isinstance(x, str):
        return round(float(x), 3)

    if 'DNF' in x or 'DNS' in x:
        return x

    # Fold the colon-separated fields left to right so that 'm:ss.sss' and
    # 'h:mm:ss.sss' both work; a string without ':' is a single field.
    seconds = 0.0
    for field in x.split(':'):
        seconds = seconds * 60 + float(field)
    return round(seconds, 3)

In [4]:
### Data collection method to scrape initial qualifying results

# For every season: read the season's race index page, collect the per-race
# result links, then load the first table of each race's qualifying page.
url = 'https://www.formula1.com/en/results.html/{}/races.html'
new_url = 'https://www.formula1.com{}'

race_frames = []
for year in years:
    # Fail fast on a hung connection or an HTTP error instead of silently
    # parsing an error page that contains no race links.
    r = requests.get(url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        # An anchor without an href yields None, which cannot be substring-tested.
        if link and f'/en/results.html/{year}/races/' in link:
            year_links.append(link)

    # Rounds are numbered from 1 in the order the links appear on the page.
    for n, link in enumerate(year_links, start=1):
        link = link.replace('race-result.html', 'qualifying.html')
        df = pd.read_html(new_url.format(link))[0]
        df['season'] = year
        df['round'] = n
        # Discard every column whose label contains 'Unnamed'.
        df = df.drop(columns=[col for col in df.columns if 'Unnamed' in str(col)])
        race_frames.append(df)

# Concatenate once at the end rather than growing a frame inside the loops;
# pd.concat rejects an empty list, so keep the empty-frame result explicit.
qualifying_times = pd.concat(race_frames) if race_frames else pd.DataFrame()

In [5]:
# Lower-case the scraped column names and drop the car number and lap count.
# Written as a reassignment with errors='ignore' so that re-running the cell
# after the columns are already gone does not raise KeyError.
qualifying_times = (
    qualifying_times
    .rename(columns = {'Pos': 'pos', 'Driver': 'driver', 'Car': 'constructor', 'Q1': 'q1', 'Q2': 'q2', 'Q3': 'q3'})
    .drop(columns=['No', 'Laps'], errors='ignore')
)

qualifying_times.head()

Unnamed: 0,pos,driver,constructor,q1,q2,q3,season,round
0,1,Lewis Hamilton HAM,Mercedes,1:31.699,1:42.890,1:44.231,2014,1
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,1:30.775,1:42.295,1:44.548,2014,1
2,3,Nico Rosberg ROS,Mercedes,1:32.564,1:42.264,1:44.595,2014,1
3,4,Kevin Magnussen MAG,McLaren Mercedes,1:30.949,1:43.247,1:45.745,2014,1
4,5,Fernando Alonso ALO,Ferrari,1:31.388,1:42.805,1:45.819,2014,1


In [6]:
# Rows of round 1 of the 2014 season that have no Q3 value.
qualifying_times.loc[
    lambda frame: (frame['season'] == 2014) & (frame['round'] == 1) & frame['q3'].isnull()
]

Unnamed: 0,pos,driver,constructor,q1,q2,q3,season,round
9,10,Nico Hulkenberg HUL,Force India Mercedes,1:38.165,1:37.333,,2016,3
10,11,Felipe Massa MAS,Williams Mercedes,1:38.016,1:37.347,,2016,3
11,12,Fernando Alonso ALO,McLaren Honda,1:38.451,1:38.826,,2016,3
12,13,Jenson Button BUT,McLaren Honda,1:37.593,1:39.093,,2016,3
13,14,Romain Grosjean GRO,Haas Ferrari,1:38.425,1:39.830,,2016,3
14,15,Marcus Ericsson ERI,Sauber Ferrari,1:38.321,1:40.742,,2016,3
15,16,Felipe Nasr NAS,Sauber Ferrari,1:38.654,1:42.430,,2016,3
16,17,Kevin Magnussen MAG,Renault,1:38.673,,,2016,3
17,18,Esteban Gutierrez GUT,Haas Ferrari,1:38.770,,,2016,3
18,19,Jolyon Palmer PAL,Renault,1:39.528,,,2016,3


In [8]:
# Convert the three session columns to seconds, one column at a time.
# Series.map is used instead of DataFrame.applymap, which is deprecated in
# recent pandas releases; the element-wise conversion itself is unchanged.
session_cols = ['q1', 'q2', 'q3']
qualifying_times[session_cols] = qualifying_times[session_cols].apply(
    lambda col: col.map(lambda x: format_qualifying(str(x)))
)

qualifying_times.query('season == 2014 & round == 1 & q3.isnull()')

Unnamed: 0,pos,driver,constructor,q1,q2,q3,season,round
10,11,Jenson Button BUT,McLaren Mercedes,91.396,104.437,,2014,1
11,12,Kimi Räikkönen RAI,Ferrari,92.439,104.494,,2014,1
12,13,Sebastian Vettel VET,Red Bull Racing Renault,91.931,104.668,,2014,1
13,14,Adrian Sutil SUT,Sauber Ferrari,93.673,105.655,,2014,1
14,15,Kamui Kobayashi KOB,Caterham Renault,94.274,105.867,,2014,1
15,16,Sergio Perez PER,Force India Mercedes,94.141,107.293,,2014,1
16,17,Max Chilton CHI,Marussia Ferrari,94.293,,,2014,1
17,18,Jules Bianchi BIA,Marussia Ferrari,94.794,,,2014,1
18,19,Esteban Gutierrez GUT,Sauber Ferrari,95.117,,,2014,1
19,20,Marcus Ericsson ERI,Caterham Renault,95.157,,,2014,1


In [9]:
### Final data collection method to scrape final qualifying results

# For every season: read the season's race index page, collect the per-race
# result links, then load the first table of each race's starting-grid page.
url = 'https://www.formula1.com/en/results.html/{}/races.html'
new_url = 'https://www.formula1.com{}'

race_frames = []
for year in years:
    # Fail fast on a hung connection or an HTTP error instead of silently
    # parsing an error page that contains no race links.
    r = requests.get(url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        # An anchor without an href yields None, which cannot be substring-tested.
        if link and f'/en/results.html/{year}/races/' in link:
            year_links.append(link)

    # Rounds are numbered from 1 in the order the links appear on the page.
    for n, link in enumerate(year_links, start=1):
        link = link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(new_url.format(link))[0]
        df['season'] = year
        df['round'] = n
        # Discard every column whose label contains 'Unnamed'.
        df = df.drop(columns=[col for col in df.columns if 'Unnamed' in str(col)])
        race_frames.append(df)

# Concatenate once at the end rather than growing a frame inside the loops;
# pd.concat rejects an empty list, so keep the empty-frame result explicit.
qualifying_results = pd.concat(race_frames) if race_frames else pd.DataFrame()

In [13]:
# Normalise the scraped starting-grid column names and drop the car number.
# These steps were commented out, but later cells read the 'grid' and
# 'final_time' columns, so a fresh kernel could not run the notebook top to
# bottom. Reassignment plus errors='ignore' keeps the cell safe to re-run.
qualifying_results = (
    qualifying_results
    .rename(columns = {'Pos': 'grid', 'Driver': 'driver', 'Car': 'constructor', 'Time': 'final_time'})
    .drop(columns='No', errors='ignore')
)

qualifying_results.query('season == 2014 & round == 1')

Unnamed: 0,grid,driver,constructor,final_time,season,round
0,1,Lewis Hamilton HAM,Mercedes,1:44.231,2014,1
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,1:44.548,2014,1
2,3,Nico Rosberg ROS,Mercedes,1:44.595,2014,1
3,4,Kevin Magnussen MAG,McLaren Mercedes,1:45.745,2014,1
4,5,Fernando Alonso ALO,Ferrari,1:45.819,2014,1
5,6,Jean-Eric Vergne VER,STR Renault,1:45.864,2014,1
6,7,Nico Hulkenberg HUL,Force India Mercedes,1:46.030,2014,1
7,8,Daniil Kvyat KVY,STR Renault,1:47.368,2014,1
8,9,Felipe Massa MAS,Williams Mercedes,1:48.079,2014,1
9,10,Jenson Button BUT,McLaren Mercedes,1:44.437,2014,1


In [14]:
# Replace each grid time with its value in seconds; every entry is passed to
# format_qualifying as text.
to_seconds = lambda raw: format_qualifying(str(raw))
qualifying_results['final_time'] = qualifying_results['final_time'].map(to_seconds)

qualifying_results['final_time'].describe()

count    3058.000000
mean       88.158361
std        13.864620
min        53.377000
25%        76.969500
50%        88.046500
75%        97.210250
max       141.611000
Name: final_time, dtype: float64

In [15]:
# calculate difference in qualifying times

# Drop rows whose time is exactly 0 and order each race by grid position.
# Chaining avoids the in-place sort on a filtered frame.
qualifying_results = (
    qualifying_results[qualifying_results['final_time'] != 0]
    .sort_values(['season', 'round', 'grid'])
)

# q_delta is each driver's gap to the first timed car on that race's grid.
# Subtracting the group's first non-null time directly replaces the earlier
# diff().cumsum(), which lost the step across any missing time and so gave
# wrong gaps to every later row of that race.
reference_time = qualifying_results.groupby(['season', 'round'])['final_time'].transform('first')
qualifying_results['q_delta'] = (
    (qualifying_results['final_time'] - reference_time)
    .round(3)    # source times have millisecond precision; removes float noise
    .fillna(0)   # rows without a time keep the previous convention of 0
)

qualifying_results.head()

Unnamed: 0,grid,driver,constructor,final_time,season,round,q_delta
0,1,Lewis Hamilton HAM,Mercedes,104.231,2014,1,0.0
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,104.548,2014,1,0.317
2,3,Nico Rosberg ROS,Mercedes,104.595,2014,1,0.364
3,4,Kevin Magnussen MAG,McLaren Mercedes,105.745,2014,1,1.514
4,5,Fernando Alonso ALO,Ferrari,105.819,2014,1,1.588


In [16]:
# Starting-grid rows for round 1 of the 2014 season.
qualifying_results.loc[
    lambda frame: (frame['season'] == 2014) & (frame['round'] == 1)
]

Unnamed: 0,grid,driver,constructor,final_time,season,round,q_delta
0,1,Lewis Hamilton HAM,Mercedes,104.231,2014,1,0.0
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,104.548,2014,1,0.317
2,3,Nico Rosberg ROS,Mercedes,104.595,2014,1,0.364
3,4,Kevin Magnussen MAG,McLaren Mercedes,105.745,2014,1,1.514
4,5,Fernando Alonso ALO,Ferrari,105.819,2014,1,1.588
5,6,Jean-Eric Vergne VER,STR Renault,105.864,2014,1,1.633
6,7,Nico Hulkenberg HUL,Force India Mercedes,106.03,2014,1,1.799
7,8,Daniil Kvyat KVY,STR Renault,107.368,2014,1,3.137
8,9,Felipe Massa MAS,Williams Mercedes,108.079,2014,1,3.848
9,10,Jenson Button BUT,McLaren Mercedes,104.437,2014,1,0.206


In [17]:
# Inner-join the starting grid with the per-session times on race and driver
# identity, then keep the output columns in a fixed order.
merge_keys = ['season', 'round', 'driver', 'constructor']
column_order = ['grid', 'pos', 'season', 'round', 'driver', 'constructor', 'final_time', 'q_delta', 'q1', 'q2', 'q3']
qualy_df = qualifying_results.merge(qualifying_times, on=merge_keys, how='inner')[column_order]

qualy_df.query('season == 2014 & round == 1')

Unnamed: 0,grid,pos,season,round,driver,constructor,final_time,q_delta,q1,q2,q3
0,1,1,2014,1,Lewis Hamilton HAM,Mercedes,104.231,0.0,91.699,102.89,104.231
1,2,2,2014,1,Daniel Ricciardo RIC,Red Bull Racing Renault,104.548,0.317,90.775,102.295,104.548
2,3,3,2014,1,Nico Rosberg ROS,Mercedes,104.595,0.364,92.564,102.264,104.595
3,4,4,2014,1,Kevin Magnussen MAG,McLaren Mercedes,105.745,1.514,90.949,103.247,105.745
4,5,5,2014,1,Fernando Alonso ALO,Ferrari,105.819,1.588,91.388,102.805,105.819
5,6,6,2014,1,Jean-Eric Vergne VER,STR Renault,105.864,1.633,93.488,103.849,105.864
6,7,7,2014,1,Nico Hulkenberg HUL,Force India Mercedes,106.03,1.799,93.893,103.658,106.03
7,8,8,2014,1,Daniil Kvyat KVY,STR Renault,107.368,3.137,93.777,104.331,107.368
8,9,9,2014,1,Felipe Massa MAS,Williams Mercedes,108.079,3.848,91.228,104.242,108.079
9,10,11,2014,1,Jenson Button BUT,McLaren Mercedes,104.437,0.206,91.396,104.437,


In [None]:
# Write the merged table to data/qualifying.csv under `path`, without the index column.
output_file = path + 'data/qualifying.csv'
qualy_df.to_csv(output_file, index=False)