In [34]:
import pandas as pd
import numpy as np
from selenium import webdriver
import requests
import bs4
from bs4 import BeautifulSoup
import time

from sys import platform

if platform == "win32":
    path = 'C:/Users/olive/GitHub/f1-analytics/'
elif platform == "darwin":
    path = '~/Documents/GitHub/f1-analytics/'

In [35]:
years = list(range(2014,  2022))

In [36]:
### Debug webscraper to ascertain if a driver made it into q3

qualifying_times = pd.DataFrame()
for year in years:
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, 'html.parser')

    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        if f'/en/results.html/{year}/races/' in link: 
            year_links.append(link)

    year_df = pd.DataFrame()
    new_url = 'https://www.formula1.com{}'

    for n, link in list(enumerate(year_links)):
        link = link.replace('race-result.html', 'qualifying.html')
        df = pd.read_html(new_url.format(link))
        df = df[0]
        df['season'] = year
        df['round'] = n+1

        for col in df:
            if 'Unnamed' in col:
                df.drop(col, axis = 1, inplace = True)

        year_df = pd.concat([year_df, df])

    qualifying_times = pd.concat([qualifying_times, year_df])

In [37]:
qualifying_times.query('Q3.notnull() & round == 3').head()

Unnamed: 0,Pos,No,Driver,Car,Q1,Q2,Q3,Laps,season,round
0,1,6,Nico Rosberg ROS,Mercedes,1:35.439,1:33.708,1:33.185,12,2014,3
1,2,44,Lewis Hamilton HAM,Mercedes,1:35.323,1:33.872,1:33.464,12,2014,3
2,3,3,Daniel Ricciardo RIC,Red Bull Racing Renault,1:36.220,1:34.592,1:34.051,16,2014,3
3,4,77,Valtteri Bottas BOT,Williams Mercedes,1:34.934,1:34.842,1:34.247,15,2014,3
4,5,11,Sergio Perez PER,Force India Mercedes,1:34.998,1:34.747,1:34.346,19,2014,3


In [38]:
### Web scraping qualifying data

qualifying_results = pd.DataFrame()
for year in years:
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, 'html.parser')

    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        if f'/en/results.html/{year}/races/' in link: 
            year_links.append(link)

    year_df = pd.DataFrame()
    new_url = 'https://www.formula1.com{}'
    for n, link in list(enumerate(year_links)):
        link = link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(new_url.format(link))
        df = df[0]
        df['season'] = year
        df['round'] = n+1
        for col in df:
            if 'Unnamed' in col:
                df.drop(col, axis = 1, inplace = True)

        year_df = pd.concat([year_df, df])

    qualifying_results = pd.concat([qualifying_results, year_df])

In [39]:
qualifying_results.rename(columns = {'Pos': 'grid', 'Driver': 'driver', 'Car': 'constructor', 'Time': 'qual_time'}, inplace=True)

qualifying_results.drop('No', axis=1, inplace=True)

qualifying_results.head()

Unnamed: 0,grid,driver,constructor,qual_time,season,round
0,1,Lewis Hamilton HAM,Mercedes,1:44.231,2014,1
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,1:44.548,2014,1
2,3,Nico Rosberg ROS,Mercedes,1:44.595,2014,1
3,4,Kevin Magnussen MAG,McLaren Mercedes,1:45.745,2014,1
4,5,Fernando Alonso ALO,Ferrari,1:45.819,2014,1


In [40]:
def format_qualifying(x):
    if ':' in x:
        return float(str(x).split(':')[1]) + (60 * float(str(x).split(':')[0])) if x != 0 else 0
    else:
        return(float(x))

In [41]:
qualifying_results['qual_time'] = qualifying_results.qual_time.map(lambda x: format_qualifying(str(x)))

qualifying_results['qual_time'].describe()

count    3058.000000
mean       88.158361
std        13.864620
min        53.377000
25%        76.969500
50%        88.046500
75%        97.210250
max       141.611000
Name: qual_time, dtype: float64

In [42]:
# calculate difference in qualifying times

qualifying_results = qualifying_results[qualifying_results['qual_time'] != 0]
qualifying_results.sort_values(['season', 'round', 'grid'], inplace = True)
qualifying_results['time_difference'] = qualifying_results.groupby(['season', 'round']).qual_time.diff()
qualifying_results['q_delta'] = qualifying_results.groupby(['season', 'round']).time_difference.cumsum().fillna(0)
qualifying_results.drop('time_difference', axis=1, inplace=True)

In [43]:
qualifying_results.head()

Unnamed: 0,grid,driver,constructor,qual_time,season,round,q_delta
0,1,Lewis Hamilton HAM,Mercedes,104.231,2014,1,0.0
1,2,Daniel Ricciardo RIC,Red Bull Racing Renault,104.548,2014,1,0.317
2,3,Nico Rosberg ROS,Mercedes,104.595,2014,1,0.364
3,4,Kevin Magnussen MAG,McLaren Mercedes,105.745,2014,1,1.514
4,5,Fernando Alonso ALO,Ferrari,105.819,2014,1,1.588


In [44]:
### Checking for nulls

qualifying_results[qualifying_results['qual_time'].isnull()]

Unnamed: 0,grid,driver,constructor,qual_time,season,round,q_delta
20,21,Pastor Maldonado MAL,Lotus Renault,,2014,1,0.0
21,22,Pastor Maldonado MAL,Lotus Renault,,2014,4,0.0
13,14,Kevin Magnussen MAG,McLaren Mercedes,,2014,5,0.0
14,15,Sebastian Vettel VET,Red Bull Racing Renault,,2014,5,0.0
20,21,Jean-Eric Vergne VER,STR Renault,,2014,5,0.0
...,...,...,...,...,...,...,...
15,16,Nicholas Latifi LAT,Williams Mercedes,,2021,19,0.0
16,17,George Russell RUS,Williams Mercedes,,2021,19,0.0
17,18,Mick Schumacher MSC,Haas Ferrari,,2021,19,0.0
18,19,Nikita Mazepin MAZ,Haas Ferrari,,2021,19,0.0


In [45]:
nulls

NameError: name 'nulls' is not defined

In [None]:
# qualifying_results.to_csv(path+'data/qualifying.csv', index = False)