In [None]:
import time
from pathlib import Path

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [None]:
### Web scraping qualifying data
# For every season: fetch the season's race index page, collect the per-race
# result links, then read the first HTML table from each race's
# "starting-grid" page. Each table is tagged with its season and round
# (1-based position of the link on the index page).

season_index_url = 'https://www.formula1.com/en/results.html/{}/races.html'
site_root = 'https://www.formula1.com{}'
race_link_class = "resultsarchive-filter-item-link FilterTrigger"

# Collect one frame per race and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all prior rows every time.
race_frames = []
for year in range(2020, 2022):
    # A timeout keeps the cell from hanging forever on a stalled connection,
    # and raise_for_status surfaces HTTP errors here instead of silently
    # producing an empty table that only fails in a later cell.
    r = requests.get(season_index_url.format(year), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    year_links = []
    for page in soup.find_all('a', attrs={'class': race_link_class}):
        link = page.get('href')
        # Anchors without an href return None; skip them rather than crash.
        if link and f'/en/results.html/{year}/races/' in link:
            year_links.append(link)

    for n, link in enumerate(year_links, start=1):
        link = link.replace('race-result.html', 'starting-grid.html')
        df = pd.read_html(site_root.format(link))[0]
        df['season'] = year
        df['round'] = n
        # Remove the "Unnamed: k" filler columns; str() guards against
        # non-string column labels.
        unnamed_cols = [col for col in df.columns if 'Unnamed' in str(col)]
        race_frames.append(df.drop(columns=unnamed_cols))

# ignore_index gives the combined table a unique 0..N-1 index instead of
# repeating each race's own 0..k index. pd.concat rejects an empty list, so
# fall back to an empty frame when nothing was scraped.
qualifying_results = (
    pd.concat(race_frames, ignore_index=True) if race_frames else pd.DataFrame()
)

In [None]:
# Switch the scraped headers to the lowercase names used by the later cells,
# then discard the 'No' column.
qualifying_results = (
    qualifying_results
    .rename(columns={
        'Pos': 'grid',
        'Driver': 'driver',
        'Car': 'constructor',
        'Time': 'qual_time',
    })
    .drop(columns='No')
)

qualifying_results.head()

In [None]:
def format_qualifying(x):
    """Convert a lap-time value to a number of seconds.

    Parameters
    ----------
    x : str or number or None
        Either a colon-separated time such as ``'1:23.456'`` (minutes:seconds)
        or ``'1:02:03.5'`` (hours:minutes:seconds), or a plain number / numeric
        string already expressed in seconds.

    Returns
    -------
    float
        The time in seconds. ``None`` and blank strings yield ``nan``; the
        text ``'nan'`` also yields ``nan`` because ``float('nan')`` parses it.

    Raises
    ------
    ValueError
        If a field of the value is not parseable as a float.
    """
    if x is None:
        return float('nan')

    # Normalise to text so numeric inputs (e.g. 0 or 83.4) are accepted; the
    # membership test on a raw int/float would otherwise raise TypeError.
    text = str(x).strip()
    if not text:
        return float('nan')

    # Fold the fields left to right in base 60: 'm:s' -> m*60 + s and
    # 'h:m:s' -> (h*60 + m)*60 + s. A value without ':' is a single field.
    seconds = 0.0
    for field in text.split(':'):
        seconds = seconds * 60 + float(field)
    return seconds

In [None]:
# Turn every qual_time entry into seconds: each value is stringified first and
# then handed to format_qualifying.
qualifying_results['qual_time'] = qualifying_results['qual_time'].apply(
    lambda raw_time: format_qualifying(str(raw_time))
)

qualifying_results['qual_time'].describe()

In [None]:
# calculate difference in qualifying times

# Discard rows whose time is exactly 0 and order every race by grid position.
# The position column is called 'grid' (it was renamed from 'Pos'); sorting
# by 'grid_pos' raised KeyError because no such column exists.
qualifying_results = (
    qualifying_results[qualifying_results['qual_time'] != 0]
    .sort_values(['season', 'round', 'grid'])
)

# Within each race, time_difference is the step from the previous row's time
# (NaN on the race's first row).
qualifying_results['time_difference'] = (
    qualifying_results.groupby(['season', 'round'])['qual_time'].diff()
)

# q_delta is the running sum of those steps within the race; remaining NaNs
# (such as each race's first row) are set to 0.
qualifying_results['q_delta'] = (
    qualifying_results.groupby(['season', 'round'])['time_difference']
    .cumsum()
    .fillna(0)
)

# The step column was only an intermediate; keep q_delta alone.
qualifying_results = qualifying_results.drop(columns='time_difference')

In [29]:
# Write the processed table to disk without the index column. The target
# folder is created first so the export does not fail when 'data/' is absent.
output_path = Path('data/qualifying.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
qualifying_results.to_csv(output_path, index=False)