In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
import requests
import bs4
from bs4 import BeautifulSoup
import time

from sys import platform

# Root folder of the local f1-analytics checkout; the export cell at the end of
# the notebook writes to `path + 'data/qualifying.csv'`.
if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
else:
    # macOS ("darwin") keeps its original value. Every other platform
    # (e.g. Linux) previously left `path` undefined, which only surfaced as a
    # NameError in the final export cell; they now share the home-relative path.
    path = '~/Documents/GitHub/f1-analytics/'

In [2]:
# Seasons to scrape: 2014 through 2021 (the upper bound of range() is exclusive).
years = [season for season in range(2014, 2022)]

In [52]:
def format_qualifying(x):
    """Convert a qualifying lap time to seconds.

    Parameters
    ----------
    x : str or number
        Lap time such as ``'1:31.699'`` (minutes:seconds), a plain seconds
        value such as ``'88.5'``, or a marker containing ``'DNF'`` / ``'DNS'``.
        Non-string values (for example the float NaN that marks an empty
        cell, or an already-numeric time) are converted with ``str`` first
        instead of raising ``TypeError``.

    Returns
    -------
    float or str
        The time in seconds as a float. Inputs containing ``'DNF'`` or
        ``'DNS'`` are returned unchanged. ``'nan'`` becomes ``float('nan')``.

    Raises
    ------
    ValueError
        If the text is neither a marker nor parseable as a number.
    """
    text = x if isinstance(x, str) else str(x)

    # Did-not-finish / did-not-start markers are passed through untouched.
    if any(marker in text for marker in ('DNF', 'DNS')):
        return x

    if ':' in text:
        # minutes:seconds -> seconds. The original `if x != 0 else 0` guard was
        # dead code here: a string containing ':' can never equal 0.
        parts = text.split(':')
        return float(parts[1]) + (60 * float(parts[0]))

    return float(text)

In [3]:
### Data collection method to scrape initial qualifying results
#
# For every season in `years`: read the season's race index page, collect the
# per-race result links, and download each race's "qualifying" table.
# `round` is the 1-based position of the race link on the index page.

season_index_url = 'https://www.formula1.com/en/results.html/{}/races.html'
site_root_url = 'https://www.formula1.com{}'

race_frames = []
for year in years:
    # Fail fast on a hung connection or an HTTP error instead of silently
    # parsing an error page and ending up with a season that has no races.
    r = requests.get(season_index_url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    race_marker = f'/en/results.html/{year}/races/'
    year_links = []
    for anchor in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = anchor.get('href')
        # .get('href') returns None for anchors without an href; None cannot
        # be substring-tested, so skip those.
        if link and race_marker in link:
            year_links.append(link)

    for n, link in enumerate(year_links, start=1):
        link = link.replace('race-result.html', 'qualifying.html')
        df = pd.read_html(site_root_url.format(link))[0]
        df['season'] = year
        df['round'] = n

        # Drop the "Unnamed..." placeholder columns (presumably the table's
        # blank spacer columns - confirm against the page). The list is built
        # first so the frame is not modified while its columns are iterated.
        unnamed_cols = [col for col in df.columns if isinstance(col, str) and 'Unnamed' in col]
        race_frames.append(df.drop(columns=unnamed_cols))

# One concat at the end instead of re-copying the accumulated frame per race.
# Each race keeps its own 0..n-1 index, as before.
qualifying_times = pd.concat(race_frames) if race_frames else pd.DataFrame()

In [13]:
# Normalise the scraped qualifying column names and drop the columns that are
# not used later. The frame is rebuilt rather than mutated in place, and
# errors='ignore' makes the drop a no-op when 'No' / 'Laps' are already gone,
# so re-running this cell no longer raises KeyError.
qualifying_times = (
    qualifying_times
    .rename(columns = {'Pos': 'pos', 'Driver': 'driver', 'Car': 'constructor', 'Q1': 'q1', 'Q2': 'q2', 'Q3': 'q3'})
    .drop(columns = ['No', 'Laps'], errors = 'ignore')
)

qualifying_times.head()

Unnamed: 0,grid,driver,constructor,q1,q2,q3,Laps,season,round
0,1,Lewis Hamilton HAM,Mercedes,1:31.699,1:42.890,1:44.231,22,2014,1
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,1:30.775,1:42.295,1:44.548,20,2014,1
2,3,Nico Rosberg ROS,Mercedes,1:32.564,1:42.264,1:44.595,21,2014,1
3,4,Kevin Magnussen MAG,McLaren Mercedes,1:30.949,1:43.247,1:45.745,19,2014,1
4,5,Fernando Alonso ALO,Ferrari,1:31.388,1:42.805,1:45.819,21,2014,1


In [25]:
# Preview the first five rows of the qualifying-session table.
qualifying_times.head(n=5)

Unnamed: 0,grid,driver,constructor,q1,q2,q3,season,round
0,1,Lewis Hamilton HAM,Mercedes,1:31.699,1:42.890,1:44.231,2014,1
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,1:30.775,1:42.295,1:44.548,2014,1
2,3,Nico Rosberg ROS,Mercedes,1:32.564,1:42.264,1:44.595,2014,1
3,4,Kevin Magnussen MAG,McLaren Mercedes,1:30.949,1:43.247,1:45.745,2014,1
4,5,Fernando Alonso ALO,Ferrari,1:31.388,1:42.805,1:45.819,2014,1


In [37]:
# Spot-check a single race (season 2016, round 3): rows that have no Q3 time.
no_q3_filter = 'season == 2016 & round == 3 & q3.isnull()'
qualifying_times.query(no_q3_filter)

Unnamed: 0,grid,driver,constructor,q1,q2,q3,season,round
9,10,Nico Hulkenberg HUL,Force India Mercedes,1:38.165,1:37.333,,2016,3
10,11,Felipe Massa MAS,Williams Mercedes,1:38.016,1:37.347,,2016,3
11,12,Fernando Alonso ALO,McLaren Honda,1:38.451,1:38.826,,2016,3
12,13,Jenson Button BUT,McLaren Honda,1:37.593,1:39.093,,2016,3
13,14,Romain Grosjean GRO,Haas Ferrari,1:38.425,1:39.830,,2016,3
14,15,Marcus Ericsson ERI,Sauber Ferrari,1:38.321,1:40.742,,2016,3
15,16,Felipe Nasr NAS,Sauber Ferrari,1:38.654,1:42.430,,2016,3
16,17,Kevin Magnussen MAG,Renault,1:38.673,,,2016,3
17,18,Esteban Gutierrez GUT,Haas Ferrari,1:38.770,,,2016,3
18,19,Jolyon Palmer PAL,Renault,1:39.528,,,2016,3


In [51]:
# Convert the Q1/Q2/Q3 lap-time strings to seconds via format_qualifying.
# Each value is passed through str() first, so missing cells arrive as 'nan'.
# Per-column Series.map is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1; the element-wise result is the same.
for session in ['q1', 'q2', 'q3']:
    qualifying_times[session] = qualifying_times[session].map(
        lambda raw_time: format_qualifying(str(raw_time))
    )

qualifying_times.head()

1:31.699
1:30.775
1:32.564
1:30.949
1:31.388
1:33.488
1:33.893
1:33.777
1:31.228
1:31.601
1:31.396
1:32.439
1:31.931
1:33.673
1:34.274
1:34.141
1:34.293
1:34.794
1:35.117
1:35.157
1:36.993
DNF
1:57.202
1:57.654
1:57.183
1:58.889
1:58.913
1:59.257
1:58.883
2:00.358
2:01.689
2:00.889
2:01.175
2:01.134
2:00.047
2:00.076
1:59.709
2:00.202
2:02.074
2:02.131
2:02.702
2:03.595
2:04.388
2:04.407
1:35.439
1:35.323
1:36.220
1:34.934
1:34.998
1:35.234
1:35.699
1:35.085
1:35.288
1:35.251
1:35.549
1:34.874
1:35.395
1:35.815
1:36.567
1:36.654
1:36.663
1:36.840
1:37.085
1:37.310
1:37.875
1:37.913
1:55.516
1:56.641
1:55.926
1:56.058
1:56.961
1:56.850
1:56.501
1:55.913
1:57.477
1:58.411
1:58.279
1:57.783
1:57.261
1:58.138
1:57.369
1:58.362
1:58.988
1:59.260
1:59.326
2:00.646
2:00.865
nan
1:27.238
1:26.764
1:28.053
1:28.198
1:28.472
1:28.308
1:28.329
1:28.279
1:28.061
1:27.958
1:28.155
1:28.469
1:28.074
1:28.374
1:28.389
1:28.194
1:28.563
1:29.586
1:30.177
1:30.312
1:30.375
DNF
1:17.678
1:17.823
1:17.90

Unnamed: 0,grid,driver,constructor,q1,q2,q3,season,round
0,1,Lewis Hamilton HAM,Mercedes,91.699,102.89,104.231,2014,1
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,90.775,102.295,104.548,2014,1
2,3,Nico Rosberg ROS,Mercedes,92.564,102.264,104.595,2014,1
3,4,Kevin Magnussen MAG,McLaren Mercedes,90.949,103.247,105.745,2014,1
4,5,Fernando Alonso ALO,Ferrari,91.388,102.805,105.819,2014,1


In [5]:
### Final data collection method to scrape final qualifying results
#
# For every season in `years`: read the season's race index page, collect the
# per-race result links, and download each race's "starting-grid" table.
# `round` is the 1-based position of the race link on the index page.

season_index_url = 'https://www.formula1.com/en/results.html/{}/races.html'
site_root_url = 'https://www.formula1.com{}'

grid_frames = []
for year in years:
    # Fail fast on a hung connection or an HTTP error instead of silently
    # parsing an error page and ending up with a season that has no races.
    r = requests.get(season_index_url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    race_marker = f'/en/results.html/{year}/races/'
    year_links = []
    for anchor in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = anchor.get('href')
        # .get('href') returns None for anchors without an href; None cannot
        # be substring-tested, so skip those.
        if link and race_marker in link:
            year_links.append(link)

    for n, link in enumerate(year_links, start=1):
        link = link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(site_root_url.format(link))[0]
        df['season'] = year
        df['round'] = n

        # Drop the "Unnamed..." placeholder columns (presumably the table's
        # blank spacer columns - confirm against the page). The list is built
        # first so the frame is not modified while its columns are iterated.
        unnamed_cols = [col for col in df.columns if isinstance(col, str) and 'Unnamed' in col]
        grid_frames.append(df.drop(columns=unnamed_cols))

# One concat at the end instead of re-copying the accumulated frame per race.
# Each race keeps its own 0..n-1 index, as before.
qualifying_results = pd.concat(grid_frames) if grid_frames else pd.DataFrame()

In [6]:
# Normalise the scraped starting-grid column names ('Time' becomes
# 'final_time', the name the q_delta cell reads) and drop the car number.
# The frame is rebuilt rather than mutated in place, and errors='ignore' makes
# the drop a no-op when 'No' is already gone, so re-running this cell no
# longer raises KeyError.
qualifying_results = (
    qualifying_results
    .rename(columns = {'Pos': 'grid', 'Driver': 'driver', 'Car': 'constructor', 'Time': 'final_time'})
    .drop(columns = ['No'], errors = 'ignore')
)

qualifying_results.head()

Unnamed: 0,grid,driver,constructor,qual_time,season,round
0,1,Lewis Hamilton HAM,Mercedes,1:44.231,2014,1
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,1:44.548,2014,1
2,3,Nico Rosberg ROS,Mercedes,1:44.595,2014,1
3,4,Kevin Magnussen MAG,McLaren Mercedes,1:45.745,2014,1
4,5,Fernando Alonso ALO,Ferrari,1:45.819,2014,1


In [8]:
# Convert the starting-grid lap-time strings to seconds via format_qualifying.
# The rename cell stores the scraped 'Time' column as 'final_time', so that is
# the column to read; the previous `.qual_time` attribute access referred to a
# column that does not exist and raised AttributeError.
# Each value is passed through str() first, so missing cells arrive as 'nan'.
qualifying_results['final_time'] = qualifying_results['final_time'].map(
    lambda raw_time: format_qualifying(str(raw_time))
)

qualifying_results['final_time'].describe()

count    3058.000000
mean       88.158361
std        13.864620
min        53.377000
25%        76.969500
50%        88.046500
75%        97.210250
max       141.611000
Name: qual_time, dtype: float64

In [9]:
# Calculate each driver's gap to the first timed driver on the grid (q_delta).
#
# The previous diff() + cumsum() + fillna(0) chain only telescopes to
# "time - first time" when every driver in the race has a time. A single
# missing time made both neighbouring diffs NaN, so the jump across it was
# lost and every later driver in that race got too small a delta.
# Subtracting the race's first non-missing time directly avoids that.

# Filter and sort by chaining, so later column assignments act on a fresh
# frame rather than on a filtered slice.
qualifying_results = (
    qualifying_results[qualifying_results['final_time'] != 0]
    .sort_values(['season', 'round', 'grid'])
)

# First non-missing time per race, in grid order, broadcast to every row.
reference_time = (
    qualifying_results
    .groupby(['season', 'round'])['final_time']
    .transform('first')
)

# NOTE(review): rows without a time still get q_delta == 0, exactly as the
# original fillna(0) produced - confirm whether NaN would suit downstream better.
qualifying_results['q_delta'] = (qualifying_results['final_time'] - reference_time).fillna(0)

In [63]:
# Preview the first five rows of the starting-grid table.
qualifying_results.head(n=5)

Unnamed: 0,grid,driver,constructor,qual_time,season,round,q_delta
0,1,Lewis Hamilton HAM,Mercedes,104.231,2014,1,0.0
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,104.548,2014,1,0.317
2,3,Nico Rosberg ROS,Mercedes,104.595,2014,1,0.364
3,4,Kevin Magnussen MAG,McLaren Mercedes,105.745,2014,1,1.514
4,5,Fernando Alonso ALO,Ferrari,105.819,2014,1,1.588


In [85]:
# Join the starting-grid results with the Q1/Q2/Q3 session times, one row per
# driver per race (inner join: only drivers present in both tables are kept).
qualy_df = pd.merge(qualifying_results, qualifying_times, how='inner', on=['season', 'round', 'driver', 'constructor'])

# The starting-grid time column is called 'final_time' upstream, so selecting
# 'qual_time' directly raised KeyError. Rename it here so the exported column
# keeps the 'qual_time' name shown in this cell's saved output.
qualy_df = qualy_df.rename(columns={'final_time': 'qual_time'})

qualy_df = qualy_df[['grid', 'pos', 'season', 'round', 'driver', 'constructor', 'qual_time', 'q_delta', 'q1', 'q2', 'q3']]

qualy_df.query('season == 2020 & round == 1')

Unnamed: 0,grid,pos,season,round,driver,constructor,qual_time,q_delta,q1,q2,q3
2444,1,1,2020,1,Valtteri Bottas BOT,Mercedes,62.939,0.0,64.111,63.015,62.939
2445,2,3,2020,1,Max Verstappen VER,Red Bull Racing Honda,63.477,0.538,64.024,64.0,63.477
2446,3,4,2020,1,Lando Norris NOR,McLaren Renault,63.626,0.687,64.606,63.819,63.626
2447,4,5,2020,1,Alexander Albon ALB,Red Bull Racing Honda,63.868,0.929,64.661,63.746,63.868
2448,5,2,2020,1,Lewis Hamilton HAM,Mercedes,62.951,0.012,64.198,63.096,62.951
2449,6,6,2020,1,Sergio Perez PER,Racing Point BWT Mercedes,63.868,0.929,64.543,63.86,63.868
2450,7,7,2020,1,Charles Leclerc LEC,Ferrari,63.923,0.984,64.5,64.041,63.923
2451,8,8,2020,1,Carlos Sainz SAI,McLaren Renault,63.971,1.032,64.537,63.971,63.971
2452,9,9,2020,1,Lance Stroll STR,Racing Point BWT Mercedes,64.029,1.09,64.309,63.955,64.029
2453,10,10,2020,1,Daniel Ricciardo RIC,Renault,64.239,1.3,64.556,64.023,64.239


In [None]:
# Write the merged qualifying table to the project's data folder, without the index.
output_file = path + 'data/qualifying.csv'
qualy_df.to_csv(output_file, index=False)