In [2]:
import requests
from bs4 import BeautifulSoup
import json
from typing import List
import os
from time import sleep
import pandas as pd
import pathvalidate 
import csv
from os import listdir
from os.path import isfile, join, isdir, dirname, exists
from io import BytesIO
from PIL import Image
import wptools
import zipfile
from datetime import datetime, date, timedelta
import requests
import re
from pathvalidate import sanitize_filename


In [2]:
root_directory = "./notebooks"

In [45]:

def get_circuit_URLs(url, host)-> List[str]:
    '''
    Aimed to recover the URLs for every circuit webpage in the blog, from we want to scrap the circuit
    information
    '''
    response = requests.get(url)
    page = response.text
    
    soups = BeautifulSoup(page, "html.parser").find_all("ul", {"id":"menu-page"}) #returns a resultSet

    url_list = []
    for soup in soups:
        aes = soup.find_all("a") #keeps just "a" elements in "ul"
        for a in aes:
            
            if "Track" in a["title"]:

                url_list.append(host + a["href"])
    
    return url_list

def get_HTML_from_source(URL)-> BeautifulSoup:
    '''
    obtain data from URLs and return a BeautifulSoup object that allows you
    to parse it.
    '''
    requests.encoding = "utf-8" #based on HTML <meta> labels of race-fans web
    response = requests.get(URL)
    page = response.content
    soup = BeautifulSoup(page,"html.parser")
    return soup

def save_HTML_file_from_url(urls:List[str], root_directory=root_directory,folder_name:str="data/circuits_HTML"):
    '''
    stores data taken from an HTTP request to the url in a file
    '''
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
    if len(urls) == 0:
        return

    requests.encoding = "utf-8"
    
    failed_requests = []
    
    for url in urls:
        try:
            response = requests.get(url)
            file_title = [w for w in url.split("/") if w]
            with open(os.path.join(folder_name, file_title[-1] + ".HTML"), mode='wb') as file:
                file.write(response.content)

        except Exception as e:
            
            failed_requests.append(url)
            print(str(url) + ": "+ str(e))
            print("I will sleep ...")
            sleep(2)
    print(f'pending URLs: {len(failed_requests)}')
    save_HTML_file_from_url(failed_requests)


    # Save HTML to file
    
def get_HTLM_from_file(file, folder)-> BeautifulSoup:
    '''
    gather HTML text from file and return it content for future extractions
    '''
    with open(os.path.join(folder, file), encoding='latin-1') as file:
        file_value = file.read()
        soup = BeautifulSoup(file_value, "html.parser")
    
    return soup

def get_circuit_table(soup_object):
    '''
    we want to recover the `circuit table`, with the technical
    info about the circuits we would gather. 
    '''
    
    try:
        circuit_name_key = soup_object.find("h1", {"class":"entry-title"}).text
    except Exception as e:
        circuit_name_key = None
    try:
        dict_data = {}
        for data in soup_object.find_all("table", {"class": "thin"}):
            trs = data.find_all("tr")
            for tr in trs:
                tds = tr.find_all("td")

                if len(tds[0].find_all("strong")) > 0:
                    key = tds[0].text
                    dict_data[key] = {}
                elif len(tds) == 2:
                    dict_data[key][tds[0].text] = tds[1].text
    except Exception as e:
        print(f'{e}')
        dict_data = {}
    return circuit_name_key,dict_data

def build_data(urls):
    '''for url list provided, it parses the html file to recover the technical data related circuits and save it to json file'''
    prepared_data = dict()
    for url in urls:
        try:
            title, data = get_circuit_table(url)
            prepared_data[title] = data
        except Exception as e:
            print(f"fail on {url}")
            print(e)
            print("="*50)
    print(f"success in {len(prepared_data)} urls")
    return prepared_data

def save_to_json(data):
    with open("sample.json", "w") as outfile:
        json.dump(data, outfile, indent=4)
    

## Obtaining circuits information programatically

In [None]:
base_url="https://www.racefans.net/f1-information/going-to-a-race/"
host= "https://www.racefans.net"
urls = get_circuit_URLs(base_url, host)
#print(urls)

save_HTML_file_from_url(urls)


results of execution after about 10 mins of execution

In [160]:
files = os.listdir(os.path.join(root_directory, "data/circuits_HTML"))
print(len(files))

circuits = []
for url in urls:
    circuit_name = [words for words in url.split("/") if words][-1] + ".HTML"
    circuits.append(circuit_name)

    if circuit_name  not in files:
        print(f"name of the circuit that was not possible to retrieve: {circuit_name}")

27
name of the circuit that was not possible to retrieve: paul-ricard.HTML


## obtaining technical data from the HTML pages

In [305]:
files = sorted(os.listdir(os.path.join(root_directory,"data/circuits_HTML")))
soup = get_HTLM_from_file(files[0])

get_circuit_table(soup)

parsing_result = {}
for file in files:
    
    try:
        soup = get_HTLM_from_file(file)
        circuit_name, circuit_info = get_circuit_table(soup)
    except Exception as e:
        print(f"exception for {file}: {e}")
        raise e
    
    if len(circuit_info) > 0: #table technical info from circuit exists
        dict_info = {"circuit": circuit_name,
                    "circuit_info": circuit_info,
                    "scrapping_result":"Success"}
    else:
        dict_info = {"circuit": circuit_name,
                    "circuit_info": None,
                    "scrapping_result":"technical info is not informed"}
    
    parsing_result[circuit_name] = dict_info

parsing_result

import json
with open(os.path.join(root_directory, "data", "circuits_HTML", "circuit_technical_data_from_racefans_result.json"), 'w') as outfile:
    json.dump(parsing_result, outfile, indent=4, ensure_ascii=True)


In [277]:
sorted(urls)

['https://www.racefans.net/f1-information/going-to-a-race/albert-park-melbourne-circuit-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/autodromo-hermanos-rodriguez-circuit-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/autodromo-jose-carlos-pace-carlos-pace-interlagos-circuit-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/autodromo-nazionale-monza-italy-circuit-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/bahrain-international-circuit-track-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/baku-city-circuit-track-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/buddh-international-circuit-track-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/circuit-americas-track-information/',
 'https://www.racefans.net/f1-information/going-to-a-race/circuit-de-catalunya-barcelona-circuit-information/',
 'https://www.racefans

## Circuit information loading and preparation

In [None]:
# circuit table info from HTML files retrieved
df =pd.read_json("./data/outputs/circuit_data_scrapping_v1.json", orient="index")
df.sort_index()

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33 entries, yas_marina to zandvoort
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   type                          33 non-null     object 
 1   Lap length                    33 non-null     float64
 2   Full throttle                 33 non-null     float64
 3   Longest flat-out section      33 non-null     int64  
 4   Downforce level               33 non-null     int64  
 5   Gear changes per lap          27 non-null     object 
 6   Fuel use per lap              15 non-null     object 
 7   Time penalty per lap of fuel  15 non-null     object 
 8   turns                         10 non-null     float64
dtypes: float64(3), int64(2), object(4)
memory usage: 2.6+ KB


In [53]:
df["longest_flat_out/lenght"] = df.apply(lambda x: x['Longest flat-out section']/(x['Lap length']*1000), axis=1)

In [54]:
df.describe()

Unnamed: 0,Lap length,Full throttle,Longest flat-out section,Downforce level,turns,longest_flat_out/lenght
count,33.0,33.0,33.0,33.0,10.0,33.0
mean,5.203848,65.015152,1125.909091,3.272727,17.3,0.216985
std,0.725212,9.814279,301.170152,0.977008,4.877385,0.048624
min,3.337,45.0,669.0,1.0,13.0,0.129175
25%,4.653,59.0,969.0,3.0,14.25,0.183482
50%,5.4,66.0,1073.0,3.0,15.0,0.207259
75%,5.621,70.0,1205.0,4.0,18.25,0.256283
max,7.004,84.0,2015.0,5.0,27.0,0.334833


In [55]:
df.rename(columns={"Lap length": "lap_lenght", "Full throttle": "full_throttle", "Longest flat-out section":"longest_straight","Downforce level":"downforce_level", "longest_flat_out/lenght": "longest_straight/lap"}, inplace=True)

In [56]:
df['longest_straight'] = df.apply(lambda x: x['longest_straight']/1000, axis=1) # pass longitud to km

In [57]:
#saving the info of interest
df = df[["type", "lap_lenght", 'full_throttle', "longest_straight",'downforce_level',"longest_straight/lap"]]

In [58]:
df.shape

(33, 6)

In [59]:
df.to_csv("./data/outputs/final_circuit_data_output.csv")

## circuit images from wikipedia

In [26]:
csv_data_directory = os.path.join(root_directory, "data", "csv_ergast_data")
circuit_df = pd.read_csv(csv_data_directory + "/circuits.csv")
dir_path = os.path.join(root_directory, "data", "circuit_images")
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

In [102]:
from io import BytesIO
import io
page = wptools.page('Nivelles-Baulers', silent=True).get()

img=page.data["image"][0]['url']
print(img)
image_file_format = img.split(".")[-1]
circuit_image = requests.get(img).content
#b = BytesIO(circuit_image)
#i = Image.open(b)
            
##i = Image.open(BytesIO(circuit_image))
#i.save(os.path.join(dir_path, 'Nivelles-Baulers' + "." + image_file_format))
with open(os.path.join(dir_path, 'Nivelles-Baulers' + "." + image_file_format), mode='wb') as file:
    
    file.write(circuit_image)
#Image.open(BytesIO(requests.get(img).content))

https://upload.wikimedia.org/wikipedia/commons/0/0f/Circuit_Nivelles-Baulers.png


In [260]:
circuit_df = pd.read_csv(os.path.join(csv_data_directory, "circuits.csv"))
circuit_df

Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.84970,144.96800,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.73800,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.03250,50.51060,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57000,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.95170,29.40500,130,http://en.wikipedia.org/wiki/Istanbul_Park
...,...,...,...,...,...,...,...,...,...
72,75,portimao,Autódromo Internacional do Algarve,Portimão,Portugal,37.22700,-8.62670,108,http://en.wikipedia.org/wiki/Algarve_Internati...
73,76,mugello,Autodromo Internazionale del Mugello,Mugello,Italy,43.99750,11.37190,255,http://en.wikipedia.org/wiki/Mugello_Circuit
74,77,jeddah,Jeddah Corniche Circuit,Jeddah,Saudi Arabia,21.63190,39.10440,15,http://en.wikipedia.org/wiki/Jeddah_Street_Cir...
75,78,losail,Losail International Circuit,Al Daayen,Qatar,25.49000,51.45420,\N,http://en.wikipedia.org/wiki/Losail_Internatio...


## Obtaining circuit images from wikipedia

By the use of URLs in table circuits, I developed a programatic backup of circuit figures from wikipedia, for future use

### correction of URLs

Some URLS were bad encoded and errors happened during the request process, therefore, I developed a programatic and reproducible way of correct the wrong characters

In [271]:

root_directory = "/Users/mililic/Documents/UMA_big_data/TFM/my_TFM/notebooks"
csv_data_directory = os.path.join(root_directory, "data", "csv_ergast_data")

# Extract all contents from zip file

def df_from_zip_file(zipfile_name, data_directory = csv_data_directory):
    
    try:
        with zipfile.ZipFile(csv_data_directory + zipfile_name, 'r') as myzip:
            data=myzip.read("circuits.csv").decode(encoding="utf-8")
    
    except zipfile.BadZipFile as error:
        print(error)

    df = pd.read_csv(csv_data_directory + "/circuits.csv", encoding='utf-8')
    return df

def correct_URLs_from_circuit_df(circuit_df, to_save_file_name, save_to_data_directory = csv_data_directory):

    replacement_dict = {r'(.*)Aut%C3%B3dromo(.*)':r'\1Autódromo\2', r'(.*)Rodr%C3%ADguez(.*)':r'\1Rodríguez\2', r"(.*)N%C3%BCrburgring(.*)": r"\1Nürburgring\2",
                        r'(.*)G%C3%A1lvez(.*)': r'\1Gálvez\2', r'(.*)Montju%C3%AFc(.*)':r'\1Montjuïc\2', r'(.*)Jos%C3%A9(.*)': r'\1José\2'}

    circuit_df.url.replace(to_replace=replacement_dict, regex=True, inplace=True)
    circuit_df.to_csv(os.path.join(save_to_data_directory,to_save_file_name))
    return circuit_df

def get_circuit_images_from_wiki(df, folder_name:str="circuit_images"):
    
    # creates dir if dir does not exist
    dir_path = os.path.join(root_directory, "data", folder_name)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    image_df_list = []
    image_errors = {}
    headers = {'User-Agent': 'My User Agent 1.0'}
    for index in range(len(df)):
        
        try:
            circuit_name = df.loc[index,"url"].split("/")[-1]
            circuit_id = df.loc[index,"circuitId"]
            page = wptools.page(circuit_name, silent=True).get()
            images = page.data['image']
            circuit_image_URL = images[0]["url"]
            image_file_format = circuit_image_URL.split(".")[-1]
            circuit_image = requests.get(circuit_image_URL,headers).content
            
            if image_file_format == "svg": #saved as binary
                with open(os.path.join(dir_path, circuit_name + "." + image_file_format), mode='wb') as file:
                    
                    file.write(circuit_image)
            
            elif image_file_format == "png":
                b = BytesIO(circuit_image)
                i = Image.open(b)
                i.save(os.path.join(dir_path, circuit_name + "." + image_file_format))
            
            image_df_list.append({"circuitId": circuit_id,
                                    'circuit_name': circuit_name,
                                    'circuit_image_url': circuit_image_URL})
        
        except Exception as e:

            print(str(circuit_id) + "_" + circuit_name + ": " + str(e))
            image_errors[str(circuit_id) + "_" + circuit_name] = images
            
    img_df = pd.DataFrame(image_df_list, columns=["circuitId",'circuit_name','circuit_image_url'])
    return image_errors, img_df


In [272]:
circuits_df = df_from_zip_file('/f1db_csv.zip')
# correct URL names:
circuits_df = correct_URLs_from_circuit_df(circuit_df, "circuits_OK.csv", save_to_data_directory = csv_data_directory)
circuits_df

Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.84970,144.96800,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.73800,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.03250,50.51060,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57000,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.95170,29.40500,130,http://en.wikipedia.org/wiki/Istanbul_Park
...,...,...,...,...,...,...,...,...,...
72,75,portimao,Autódromo Internacional do Algarve,Portimão,Portugal,37.22700,-8.62670,108,http://en.wikipedia.org/wiki/Algarve_Internati...
73,76,mugello,Autodromo Internazionale del Mugello,Mugello,Italy,43.99750,11.37190,255,http://en.wikipedia.org/wiki/Mugello_Circuit
74,77,jeddah,Jeddah Corniche Circuit,Jeddah,Saudi Arabia,21.63190,39.10440,15,http://en.wikipedia.org/wiki/Jeddah_Street_Cir...
75,78,losail,Losail International Circuit,Al Daayen,Qatar,25.49000,51.45420,\N,http://en.wikipedia.org/wiki/Losail_Internatio...


* If you try next cell, can take about 4 minutes, depending on your pc 

In [273]:
errors, img_df = get_circuit_images_from_wiki(circuits_df)

15_Marina_Bay_Street_Circuit: cannot identify image file <_io.BytesIO object at 0x7fcc51791ef0>
80_Las_Vegas_Grand_Prix#Circuit: cannot identify image file <_io.BytesIO object at 0x7fcc301404f0>
24_Yas_Marina_Circuit: cannot identify image file <_io.BytesIO object at 0x7fcc28451b30>
25_Autódromo_Oscar_Alfredo_Gálvez: https://en.wikipedia.org/w/api.php?action=query&exintro&formatversion=2&inprop=url|watchers&list=random&pithumbsize=240&pllimit=500&ppprop=disambiguation|wikibase_item&prop=extracts|info|links|pageassessments|pageimages|pageprops|pageterms|redirects&redirects&rdlimit=500&rnlimit=1&rnnamespace=0&titles=Aut%C3%B3dromo%20Oscar%20y%20Juan%20G%C3%A1lvez&plcontinue=1403866|0|San_Martín_Palace
28_TI_Circuit: cannot identify image file <_io.BytesIO object at 0x7fcc517bb950>
30_Kyalami: cannot identify image file <_io.BytesIO object at 0x7fcc20d07f90>
36_Autódromo_Internacional_Nelson_Piquet: cannot identify image file <_io.BytesIO object at 0x7fcc60597c20>
39_Circuit_Zandvoort: ca

Most of the errors were from *png> I was researching about that, but given most of the images were recovered, I will comeback for specific 
image if I dont have it after this process. 

In [274]:
img_df

Unnamed: 0,circuitId,circuit_name,circuit_image_url
0,1,Melbourne_Grand_Prix_Circuit,https://upload.wikimedia.org/wikipedia/commons...
1,2,Sepang_International_Circuit,https://upload.wikimedia.org/wikipedia/commons...
2,3,Bahrain_International_Circuit,https://upload.wikimedia.org/wikipedia/commons...
3,4,Circuit_de_Barcelona-Catalunya,https://upload.wikimedia.org/wikipedia/commons...
4,5,Istanbul_Park,https://upload.wikimedia.org/wikipedia/commons...
5,6,Circuit_de_Monaco,https://upload.wikimedia.org/wikipedia/commons...
6,7,Circuit_Gilles_Villeneuve,https://upload.wikimedia.org/wikipedia/commons...
7,8,Circuit_de_Nevers_Magny-Cours,https://upload.wikimedia.org/wikipedia/commons...
8,9,Silverstone_Circuit,https://upload.wikimedia.org/wikipedia/commons...
9,10,Hockenheimring,https://upload.wikimedia.org/wikipedia/commons...


## Obtaining Grand Prix Tyres strategies information

### obtain the URLs from race_fans that contains "tyres" as title

In [94]:

def get_tyre_strategy_URLS(date_init:date, how_many_days=365):

    url = 'https://www.racefans.net/'
    counter = 0
    tyre_related_URLS = []
    for day in range(how_many_days):

        new_day = (date_init + timedelta(day))
        d = datetime.strftime(new_day, "%Y/%m/%d/")
        r = requests.get(url + d).content
        article_titles = BeautifulSoup(r, "html.parser").find_all("h2", {"class":"entry-title"}) #article titles
        
        for title in article_titles:
            a = title.find("a")
            if a is not None:
                if "tyre" in a.text:
                    print(a.text)
                    tyre_related_URLS.append(a["href"])
            
    print(len(tyre_related_URLS))
    print(tyre_related_URLS)
    return tyre_related_URLS
    

def get_tyre_strategy_table(soup_object:BeautifulSoup):
    '''
    we want to recover the `tyre strategy table`, type of tyres and tyre changes along the
    race.
    '''
    tables = soup_object.find_all("table", {"class": "thin"})
    return_value = {}
    dict_keys = []
    for table in tables:
        scrapping_result = {}
        ths = table.find_all("th")
        
        if len(ths) !=0:
            #column headers
            dict_keys = [th.text for th in ths]
        
        body = table.find("tbody")

        if body:
            trs_in_body = body.find_all("tr")
        
            for tr in trs_in_body:

                tds_text = [td.text for td in tr.find_all("td")]

                if len(tds_text) < len(dict_keys):
                    tds_text += [None]*(len(dict_keys)-len(tds_text))
                
                json_object = {key:value for key, value in zip(dict_keys[1:], tds_text[1:])}
                
                for key, value in json_object.items():
                    json_object[key] = parse_values(key, value)

                scrapping_result[tds_text[0]] = json_object

        else:
            trs_in_body = table.find_all("tr")
        
            for tr in trs_in_body:
    
                if len(dict_keys) ==0:
                    #column headers
                    dict_keys = [td.text for td in tr.find_all("td")]
                    continue

                tds_text = [td.text for td in tr.find_all("td")]

                if len(tds_text) < len(dict_keys):
                    tds_text += [None]*(len(dict_keys)-len(tds_text))
                
                json_object = {key:value for key, value in zip(dict_keys[1:], tds_text[1:])}
                
                for key, value in json_object.items():
                    json_object[key] = parse_values(key, value)

                scrapping_result[tds_text[0]] = json_object

        
        return_value[table.previous_element] = scrapping_result
    
    return return_value

def parse_values(key:str, text:str):
    if "stint" in key.lower() and text:
        tyre_type, lap_to = split_stint(text)
        return {"tyre_type": tyre_type, "lap_to": int(lap_to)}
    return text
        

def split_stint(text):
    splited = [e.strip() for e in re.split(r'(.*)\((.*)\)', text) if e]
    if len(splited) == 1:
        splited.append(-1)
    return splited


In [321]:

def parse_race_name(file_name):
    name_without_year = [race for race in re.split(r'^([0-9]{4}-)+(.*)', file_name) if race][-1]
    name_without_extention = name_without_year.split('.')[0]
    return name_without_extention.replace("-", " ")
    

def parse_file(file_name, source_directory_name, target_directory_name, header):
    soup = get_HTLM_from_file(file_name, source_directory_name)
    content = get_tyre_strategy_table(soup)
    if not content:
        raise Exception("empty dict")

    #print(content)
    for key, value in content.items():
        final_target_directory = join(target_directory_name, sanitize_filename(key))
        os.makedirs(final_target_directory, exist_ok=True)

        for header_key, header_value in header.items():
            value[header_key] = header_value

        with open(join(final_target_directory, f"{file_name}.json"), 'w', encoding="utf-8") as outfile:
            json.dump(value, outfile, indent=4, ensure_ascii=False)

In [96]:
header = {
    "year": "2013",
    "file_name": "2013-australian-grand-prix-tyre-strategies-pit-stops.HTML",
    "race": parse_race_name("2013-australian-grand-prix-tyre-strategies-pit-stops.HTML")
}
print(header)
parse_file("2013-australian-grand-prix-tyre-strategies-pit-stops.HTML", "./data/tyre_strategy_racefans/2013_tyre_strategy", "./data/tyre_strategy_racefans/json/2013_tyre_strategy", header)

{'year': '2013', 'file_name': '2013-australian-grand-prix-tyre-strategies-pit-stops.HTML', 'race': 'australian grand prix tyre strategies pit stops'}


In [334]:

#root_directory = "./data/tyre_strategy_racefans"

source_directories = ["2015_tyre_strategy"]

source_directories = [dir for dir in listdir(root_directory) if dir != "json"]

for source in source_directories:
    source_directory = join(root_directory, source)
    dest_directory = join(root_directory, "json", source)

    for file in listdir(source_directory):
        os.makedirs(dest_directory, exist_ok=True)

        header = {
            "year": source[0:4],
            "file_name": file,
            "race": parse_race_name(file)
        }
        
        try:
            parse_file(file, source_directory, dest_directory, header)
        except ValueError as e:
            print(f"ValueError FAIL: [{join(source_directory, file)}] {e}")
        except Exception as e:
            print(f"Exception FAIL: [{join(source_directory, file)}] {e}")



Exception FAIL: [./data/tyre_strategy_racefans\2011_tyre_strategy\1410.HTML] empty dict
Exception FAIL: [./data/tyre_strategy_racefans\2011_tyre_strategy\178.HTML] empty dict
Exception FAIL: [./data/tyre_strategy_racefans\2011_tyre_strategy\2011-belgian-grand-prix-ferrari.HTML] empty dict
Exception FAIL: [./data/tyre_strategy_racefans\2011_tyre_strategy\2011-canadian-grand-prix-virgin.HTML] empty dict
Exception FAIL: [./data/tyre_strategy_racefans\2011_tyre_strategy\2011-european-grand-prix-hrt.HTML] empty dict
Exception FAIL: [./data/tyre_strategy_racefans\2011_tyre_strategy\2011-hungarian-grand-prix-prerace-analysis.HTML] empty dict
Exception FAIL: [./data/tyre_strategy_racefans\2011_tyre_strategy\2011-indian-grand-prix-practice-analysis.HTML] [WinError 3] The system cannot find the path specified: './data/tyre_strategy_racefans\\json\\2011_tyre_strategy\\ was quickest in the first session but slipped to fourth in the afternoon. He said “We looked quick this morning. This afternoon, 

In [None]:
tyre_URLS = get_tyre_strategy_URLS(date_init=date(2011,1,1))

In [319]:
len(tyre_URLS)

59

In [320]:
save_HTML_file_from_url(tyre_URLS, folder_name="data/tyre_strategy_racefans/2011_tyre_strategy")

pending URLs: 0


In [339]:

def parse_csv():
    root_directory = join("data","tyre_strategy_racefans","json")
    headers_to_exclude = ['year', 'file_name', 'race']
    headers = [ 'year', 'race', 'name']
    values = []

    for dir in listdir(root_directory):
        inner_dir = join(root_directory, dir, "The tyre strategies for each driver")
        if not exists(inner_dir):
            print(f'NO DIR - {inner_dir}')
            continue

        for file_name in listdir(inner_dir):
            data = json.load(open(join(inner_dir, file_name)))

            for key, value in data.items():
                if key not in headers_to_exclude:
                    try:
                        new_row = [data["year"], data["race"], key] 
                    except Exception as e:
                        print(file_name)
                        raise e

                    for i in range(1,10):
                        stint_key = f'Stint {i}'
                        stint_value = value.get(stint_key, None)
                        
                        if stint_value:
                            try:
                                add_header_if_not_exists(headers, stint_key, stint_value)
                            except Exception as e:
                                print(f'file_name - {file_name}')
                                print(f'header - {header}')
                                print(f'stint_key - {stint_key}')
                                print(f'stint_value - {stint_value}')
                                raise e
                            new_row.append(stint_value["tyre_type"])
                            new_row.append(stint_value["lap_to"])

                    values.append(new_row)
    
    return headers, values

def add_header_if_not_exists(headers:list, new_header:str, internal_value:dict):
    for key, _ in internal_value.items():
        proposed_header = f'{new_header} {key}'
        if proposed_header not in headers:
            headers.append(proposed_header)

header, data = parse_csv()
with open('data/tyre_strategy_racefans/csv/tyre_strategy.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(data)


NO DIR - data\tyre_strategy_racefans\json\2011_tyre_strategy\The tyre strategies for each driver
NO DIR - data\tyre_strategy_racefans\json\csv\The tyre strategies for each driver


### TYRE STRATEGY MERGE 

In [152]:
tyre_strategy_pd = pd.read_csv(join("data", "tyre_strategy_racefans", "csv", "tyre_strategy.csv"))
drivers_pd = pd.read_csv(join("data", "csv_ergast_data", "drivers.csv"))
circuits_pd = pd.read_csv(join("data", "csv_ergast_data", "circuits.csv"))
races_pd = pd.read_csv(join("data", "csv_ergast_data", "races.csv"))

tyre_strategy_pd_2022 = pd.read_csv(join("data", "tyre_strategy_racefans", "csv", "2022-missing.csv"))

In [153]:
drivers_pd["name"] = drivers_pd["forename"] + " " + drivers_pd["surname"]

races_pd = races_pd[races_pd["year"] > 2011]
races_pd["match_country"] = races_pd["name"].str.lower()
# correct the names to same format between tables
tyre_strategy_pd.replace({'Carlos Sainz Jnr':'Carlos Sainz', 'Kimi Raikkonen': "Kimi Räikkönen",
                             "Sergio Perez": "Sergio Pérez", "Nico Hulkenberg": "Nico Hülkenberg", 
                             "Zhou Guanyu": "Guanyu Zhou", "Nyck De Vries": "Nyck de Vries",
                             "Esteban Gutierrez": "Esteban Gutiérrez", "Jean-Eric Vergne": "Jean-Éric Vergne",
                             "Jerome dâ€™Ambrosio": "Jérôme d'Ambrosio"}, inplace=True)

tyre_strategy_pd_2022["match_country"] = tyre_strategy_pd_2022["race"]
drivers_pd_2022 = drivers_pd[~drivers_pd["driverId"].isin([27, 76, 30, 818])] # drop duplicates driver codes 

In [154]:
tyre_strategy_pd = pd.merge(tyre_strategy_pd, drivers_pd[["name", "driverId"]], how="left", on="name")
tyre_strategy_pd_2022 = pd.merge(tyre_strategy_pd_2022, drivers_pd_2022[["code", "name", "driverId"]], how="inner", on="code")

In [155]:
tyre_strategy_pd_2022 = tyre_strategy_pd_2022[['year', 'race', 'name', 'Stint 1 tyre_type', 'Stint 1 lap_to',
       'Stint 2 tyre_type', 'Stint 2 lap_to', 'Stint 3 tyre_type',
       'Stint 3 lap_to', 'Stint 4 tyre_type', 'Stint 4 lap_to',
       'Stint 5 tyre_type', 'Stint 5 lap_to', 'Stint 6 tyre_type',
       'Stint 6 lap_to', 'Stint 7 tyre_type', 'Stint 7 lap_to', 'driverId']]

In [156]:
tyre_strategy_pd = pd.concat([tyre_strategy_pd, tyre_strategy_pd_2022])
tyre_strategy_pd["match_country"] = tyre_strategy_pd["race"].str.split(" grand", expand=True)[0] + " grand prix"
tyre_strategy_pd.replace({
    'sao paulo grand prix':'são paulo grand prix', 
    'australian gp interactive data lap charts times and tyres grand prix':'australian grand prix'}, inplace=True)
    
races_pd = races_pd[["raceId","round","circuitId","name","year", "match_country"]].set_index(["match_country", "year"])
tyre_strategy_pd.set_index(["match_country", "year"], inplace=True)

tyre_strategy_with_id = tyre_strategy_pd.join(races_pd, how="left", rsuffix="_x")

In [157]:
tyre_strategy_with_id.rename(columns={
    "Stint 1 tyre_type" : "stint_1_tyre", 
    'Stint 1 lap_to'    : "stint_1",
    'Stint 2 tyre_type' : "stint_2_tyre", 
    'Stint 2 lap_to'    : "stint_2", 
    'Stint 3 tyre_type' : "stint_3_tyre",
    'Stint 3 lap_to'    : "stint_3", 
    'Stint 4 tyre_type' : "stint_4_tyre", 
    'Stint 4 lap_to'    : "stint_4",
    'Stint 5 tyre_type' : "stint_5_tyre", 
    'Stint 5 lap_to'    : "stint_5",
    'Stint 6 tyre_type' : "stint_6_tyre",
    'Stint 6 lap_to'    : "stint_6", 
    'Stint 7 tyre_type' : "stint_7_tyre", 
    'Stint 7 lap_to'    : "stint_7"
    }, inplace=True)

In [159]:
tyre_strategy_with_id.to_csv(join("data","outputs", "tyre_strategy_with_id_plus_2022.csv"))

In [158]:
tyre_strategy_with_id.reset_index(inplace=True)

In [161]:
tyre_strategy_with_id.shape

(3897, 23)

In [162]:
tyre_strategy_with_id_for_review = []
for stint in range(1, 8):
    tyre_strategy_with_id[f'stint_{stint}_tyre'] = tyre_strategy_with_id[f'stint_{stint}_tyre'].str.lower()
    df_auxiliar = tyre_strategy_with_id[["match_country","year",f'stint_{stint}_tyre']]
    df_auxiliar = df_auxiliar.rename(columns={f'stint_{stint}_tyre' : "tyre"})
    tyre_strategy_with_id_for_review.append(df_auxiliar)

tyre_strategy_with_id_for_review = pd.concat(tyre_strategy_with_id_for_review)
tyre_strategy_with_id_for_review = tyre_strategy_with_id_for_review.dropna()
#tyre_strategy_with_id_for_review = tyre_strategy_with_id_for_review[tyre_strategy_with_id_for_review["tyre"] != "wet"]
#tyre_strategy_with_id_for_review = tyre_strategy_with_id_for_review[tyre_strategy_with_id_for_review["tyre"] != "intermediate"]


In [163]:
tyre_strategy_with_id_for_review

Unnamed: 0,match_country,year,tyre
0,70th anniversary grand prix,2020,c2
1,70th anniversary grand prix,2020,c3
2,70th anniversary grand prix,2020,c3
3,70th anniversary grand prix,2020,c3
4,70th anniversary grand prix,2020,c3
...,...,...,...
2139,hungarian grand prix,2020,c2
2640,malaysian grand prix,2014,hard
3457,spanish grand prix,2013,medium
1982,german grand prix,2019,c4


In [164]:
# combinaciones unicas de neumaticos en todas las carreras
map_to_tyres_combinations = pd.DataFrame(tyre_strategy_with_id_for_review \
        .drop_duplicates() \
        .sort_values(by="tyre") \
        .groupby(["match_country","year"])['tyre'] \
        .apply(','.join)).reset_index()


In [None]:
map_to_tyres_combinations

In [165]:
tyre_strategy_with_id.set_index(["match_country","year"], inplace=True)
map_to_tyres_combinations.set_index(["match_country","year"], inplace=True)

tyre_strategy_df_to_merge = tyre_strategy_with_id.join(map_to_tyres_combinations, how="left", rsuffix="_x")

In [166]:
mapper = {
    'c2,c3,c4': {'c2':'hard', 'c3':'medium', 'c4':'soft'},
    'medium,soft': {'soft':'soft', 'medium':'medium'},
    'soft,super soft,ultra soft': {'super soft':'soft', 'ultra soft':'medium', 'soft':'hard'},
    'super soft,ultra soft': {'super soft':'soft', 'ultra soft':'medium'},
    'hyper soft,super soft,ultra soft': {'hyper soft':'soft', 'super soft':'medium', 'ultra soft':'hard'},
    'c3,c4,c5' : {'c5':'soft','c4':'medium','c3':'hard'},
    'medium,super soft' :  {'super soft':'soft','medium':'medium'},
    'medium,soft,super soft' :  {'super soft':'soft','soft':'medium','medium':'hard'},
    'c2,c3,c5' :  {'c5':'soft','c3':'medium','c2':'hard'},
    'soft,super soft' :  {'super soft':'soft','soft':'medium'},
    'c3,c4' :  {'c4':'soft','c3':'medium'},
    'hard,medium' :  {'medium':'medium','hard':'hard'},
    'c1,c2,c3' :  {'c3':'soft','c2':'medium','c1':'hard'},
    'c2,c3' :  {'c3':'soft','c2':'medium'},
    'hard,intermediate,medium,wet' :  {'medium':'medium','hard':'hard'},
    'intermediate,wet' :  {'soft':'soft'},
    'hard,soft' :  {'soft':'soft','hard':'hard'},
    'intermediate,medium,soft,wet' :  {'soft':'soft','medium':'medium','':'hard'},
    'hyper soft,medium,soft' :  {'hyper soft':'soft','soft':'medium','medium':'hard'},
    'medium,soft,ultra soft' :  {'ultra soft':'soft','soft':'medium','medium':'hard'},
    'c3,c4,intermediate,wet' :  {'c4':'soft','c3':'medium'},
    'c2,c3,c4,intermediate' :  {'c4':'soft','c3':'medium','c2':'hard'},
    'intermediate,medium,soft,ultra soft,wet' :  {'ultra soft':'soft','soft':'medium','medium':'hard'},
    'intermediate,medium,soft' :  {'soft':'soft'},
    'hard,medium,soft' :  {'soft':'soft','medium':'medium','hard':'hard'},
    'hard,intermediate,medium' :  {'soft':'soft','medium':'medium','hard':'hard'},
    'intermediate,soft,super soft' :  {'super soft':'soft','soft':'medium'},
    'intermediate,soft,super soft,ultra soft,wet' :  {'super soft':'soft','ultra soft':'medium','soft':'hard'},
    'c3,c4,c5,intermediate,wet' :  {'c5':'soft','c4':'medium','c3':'hard'},
    'hyper soft,soft,ultra soft' :  {'hyper soft':'soft','ultra soft':'medium','soft':'hard'},
    'c3,c4,c5,intermediate' :  {'c5':'soft','c4':'medium','c3':'hard'},
    'c3,intermediate' :  {'c3':'medium'}
}

In [168]:
#tyre_strategy_df_to_merge.reset_index(inplace=True)
concatenate_me = []
for raceId in tyre_strategy_df_to_merge.raceId.unique():
    try:
        race_to_modify = tyre_strategy_df_to_merge[tyre_strategy_df_to_merge["raceId"] == raceId]
        instantiate_mapper = mapper[race_to_modify["tyre"][0]]
        concatenate_me.append(tyre_strategy_df_to_merge[tyre_strategy_df_to_merge["raceId"] == raceId].replace(instantiate_mapper))
    except:
        print(raceId)

tyre_strategy_normalized = pd.concat(concatenate_me)
tyre_strategy_normalized = tyre_strategy_normalized.reset_index()

In [169]:
tyre_strategy_normalized.shape

(3897, 24)

### Tyre strategies as numerical

Here I will prepare the data to be expressed as columns with numbers: every columns contain the stint (number of laps) for the stint-number-tire-type column

In [170]:
tyre_strategy_normalized.columns

Index(['match_country', 'year', 'race', 'name', 'stint_1_tyre', 'stint_1',
       'stint_2_tyre', 'stint_2', 'stint_3_tyre', 'stint_3', 'stint_4_tyre',
       'stint_4', 'stint_5_tyre', 'stint_5', 'stint_6_tyre', 'stint_6',
       'stint_7_tyre', 'stint_7', 'driverId', 'raceId', 'round', 'circuitId',
       'name_x', 'tyre'],
      dtype='object')

In [171]:
df = pd.DataFrame()

for i in tyre_strategy_normalized.index:
     row = tyre_strategy_normalized.loc[i]

     for n in range(1,8):
          
          df.loc[i, f"stint_{n}_{row[f'stint_{n}_tyre']}"] = row[f"stint_{n}"]
          for c in ['match_country', 'year', 'race', 'name', 'driverId', 'raceId', 'round', 'circuitId','name', 'tyre']:
               df.loc[i, c] = row[c]

In [175]:
df_tyre_stint_as_cat = df.copy()
df_tyre_stint_as_cat = df_tyre_stint_as_cat.astype({'year':'int64', 'driverId': 'int64', 'raceId':'int64', 'round':'int64', 'circuitId':'int64'})

for n in range(1,8):
    df_tyre_stint_as_cat.drop(columns = f"stint_{n}_nan", axis=1, inplace=True)


In [176]:
df_tyre_stint_as_cat.to_csv(join("data","outputs", "tyre_strategy_normalized_and_numeric_plus_2022.csv"))

In [177]:
tyre_strategy_normalized.reset_index(inplace=True)

In [500]:
tyre_strategy_normalized.to_csv(join("data","outputs", "tyre_strategy_normalized_plus_2022.csv"))

In [428]:
# combinaciones unicas de neumaticos en todas las carreras
tyre_strategy_with_id_for_review \
        .drop_duplicates() \
        .sort_values(by="tyre") \
        .groupby(["match_country","year"])['tyre'] \
        .apply(','.join) \
        .reset_index()["tyre"] \
        .unique()

array(['c2,c3,c4', 'medium,soft', 'soft,super soft,ultra soft',
       'super soft,ultra soft', 'hyper soft,super soft,ultra soft',
       'c3,c4,c5', 'medium,super soft', 'medium,soft,super soft',
       'c2,c3,c5', 'soft,super soft', 'c3,c4', 'hard,medium', 'c1,c2,c3',
       'c2,c3', 'hard,intermediate,medium,wet', 'intermediate,wet',
       'hard,soft', 'intermediate,medium,soft,wet',
       'hyper soft,medium,soft', 'medium,soft,ultra soft',
       'c3,c4,intermediate,wet', 'c2,c3,c4,intermediate',
       'intermediate,medium,soft,ultra soft,wet',
       'intermediate,medium,soft', 'hard,medium,soft',
       'hard,intermediate,medium', 'intermediate,soft,super soft',
       'intermediate,soft,super soft,ultra soft,wet',
       'c3,c4,c5,intermediate,wet', 'hyper soft,soft,ultra soft',
       'c3,c4,c5,intermediate', 'c3,intermediate'], dtype=object)

In [None]:
soup = get_HTLM_from_file("2018-abu-dhabi-grand-prix-interactive-data-lap-charts-times-and-tyres.HTML", "data/tyre_strategy_racefans/2018_tyre_strategy")
tables = soup.find_all("table", {"class": "thin"})
for table in tables:

        ths = table.find_all("th")
        if len(ths) !=0:
            dict_keys = [th.text for th in ths]
            print(dict_keys)

In [None]:
get_tyre_strategy_table(soup)

In [5]:
def tyres_df_per_race(html_df):
    
    for stint in html_df.columns[1:]:

        html_df[f"stint_laps_{stint}"] = pd.to_numeric(html_df[stint].str.split("(", expand=True)[1].str.strip(")"))
        html_df[stint] = html_df[stint].str.split("(", expand=True)[0]

        html_df.rename(columns={"Unnamed: 0": "drivers"}, inplace=True)
        html_df.set_index("drivers", inplace=True)
        html_df.dropna(axis = 0, how = 'all', inplace = True)
        html_df.reset_index(inplace=True)
    
    tyre_per_lap = []
    for n in range(1, 53):
        tyre_per_lap.append(pd.DataFrame(columns=['drivers', 'lap', 'tyre_type'], data = {"drivers":html_df.drivers.values, 'lap':n}))

    tyre_per_lap = pd.concat(tyre_per_lap)
    
    columns = int((html_df.shape[1] - 1)/2)
    for i in html_df.index:
        row = html_df.loc[i]

        last_lap = 0
        for stint in range(1,columns + 1):
            laps = row[f'stint_laps_Stint {stint}']
            driver = row['drivers']
            tyre_type = row[f'Stint {stint}']

            if np.isnan(laps):
                continue

            tyre_per_lap.loc[(tyre_per_lap['drivers'] == driver) & (tyre_per_lap['lap'].isin(range(last_lap + 1, int(last_lap + laps) + 1))), 'tyre_type'] = tyre_type
            last_lap = int(last_lap + laps)
            
    tyre_per_lap = tyre_per_lap.set_index(["drivers","lap"])
    return tyre_per_lap

In [8]:
pd.read_html('./data/circuits_HTML/autodromo-jose-carlos-pace-carlos-pace-interlagos-circuit-information.HTML')[0]

Unnamed: 0,0,1
0,Lap data,Lap data
1,Lap length,4.309km (2.677 miles)
2,Race laps,71
3,Race distance,305.909km (190.083 miles)
4,Pole position,Right-hand side of the track
5,Lap record*,"1’11.473 (Juan Pablo Montoya, 2004)"
6,Fastest lap,"1’07.281 (Lewis Hamilton, 2018, qualifying three)"
7,Maximum speed,331kph (205.674 mph)
8,Distance from grid to turn one,195m
9,Car performance,Car performance
