In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### importing the list of all formula one races from wikipedia

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_Formula_One_circuits"
page = requests.get(url)

In [41]:
soup = BeautifulSoup(page.text, "html")
# print(soup.prettify())

In [42]:
tables = soup.find_all("table")
# print(tables)

In [43]:
print(len(tables))

4


In [44]:
for table in tables:
    print(f"Table: <{table.name}>")
    print(f"ID: {table.get('id')}")
    print(f"Class: {table.get('class')}")

Table: <table>
ID: None
Class: ['sidebar', 'sidebar-collapse', 'nomobile', 'nowraplinks', 'plainlist']
Table: <table>
ID: None
Class: ['wikitable']
Table: <table>
ID: None
Class: ['wikitable', 'sortable']
Table: <table>
ID: None
Class: ['nowraplinks', 'mw-collapsible', 'autocollapse', 'navbox-inner']


In [46]:
# print(tables[2])

we got the table we were looking for

In [47]:
circuts = tables[2]
columns = circuts.find_all("th")
column_names = [title.text.strip() for title in columns]
print(column_names)

['Circuit', 'Map', 'Type', 'Direction', 'Location', 'Country', 'Last length used', 'Turns', 'Grands Prix', 'Season(s)', 'Grands Prix held']


we won't use "Map" column as it is an Image and it's useless for our purpose

In [48]:
column_names = [name for name in column_names if name != "Map"]

we have the column names, let's move on to the data and put it in a dataframe

In [49]:
df = pd.DataFrame(columns = column_names)
df

Unnamed: 0,Circuit,Type,Direction,Location,Country,Last length used,Turns,Grands Prix,Season(s),Grands Prix held


In [50]:
data = circuts.find_all("tr")
for row in data[1:]:
    current_row = row.find_all("td")
    current_row_data = [current_row[k].text.strip() for k in range(len(current_row)) if k != 1] # omit image data from Map column
    df.loc[len(df)] = current_row_data

In [51]:
df

Unnamed: 0,Circuit,Type,Direction,Location,Country,Last length used,Turns,Grands Prix,Season(s),Grands Prix held
0,Adelaide Street Circuit,Street circuit,Clockwise,Adelaide,Australia,3.780 km (2.349 mi),16,Australian Grand Prix,1985–1995,11
1,Ain-Diab Circuit,Road circuit,Clockwise,Casablanca,Morocco,7.618 km (4.734 mi),18,Moroccan Grand Prix,1958,1
2,Aintree Motor Racing Circuit,Road circuit,Clockwise,Aintree,United Kingdom,4.828 km (3.000 mi),12,British Grand Prix,"1955, 1957, 1959, 1961–1962",5
3,Albert Park Circuit *,Street circuit,Clockwise,Melbourne,Australia,5.278 km (3.280 mi),16,Australian Grand Prix,"1996–2019, 2022–2025",28
4,Algarve International Circuit,Race circuit,Clockwise,Portimão,Portugal,4.653 km (2.891 mi),15,Portuguese Grand Prix,2020–2021,2
...,...,...,...,...,...,...,...,...,...,...
72,TI Circuit Aida,Race circuit,Clockwise,Mimasaka,Japan,3.703 km (2.301 mi),11,Pacific Grand Prix,1994–1995,2
73,Valencia Street Circuit,Street circuit,Clockwise,Valencia,Spain,5.419 km (3.367 mi),25,European Grand Prix,2008–2012,5
74,Watkins Glen International,Race circuit,Clockwise,Watkins Glen,United States,5.430 km (3.374 mi),10,United States Grand Prix,1961–1980,20
75,Yas Marina Circuit *,Race circuit,Anti-clockwise,Abu Dhabi,United Arab Emirates,5.281 km (3.281 mi),15,Abu Dhabi Grand Prix,2009–2024,16


won't need this column as well

df = df.drop(columns=['Grands Prix held'])

In [52]:
df.to_csv("circuts.csv", index=False)

## transform the table to SCD2 type, by dividing 'Season(s)' column to: year_from, year_to, isActive 

In [4]:
df = pd.read_csv('circuts.csv')

In [52]:
def parse_seasons(seasons_str, isActive): # is active from the table as the seasons date's not always match
    periods = []
    parts = [s.strip() for s in seasons_str.split(',')]
    for part in parts:
        if ']' in part:
            part = part[0:-3]
            print(part)
        if '–' in part:
            start, end = part.split('–')
            if end == "2025":
                periods.append((int(start), int(end), True))
            elif end == "2024" and isActive:
                periods.append((int(start), 2025, True))
            else:
                periods.append((int(start), int(end), False))
        else:
            if part == "2025":
                periods.append((int(part), int(part), True))
            elif part == "2024" and isActive:
                periods.append((2025, 2025, True))
            else:
                periods.append((int(part), int(part), False))
    return periods

In [42]:
df.head(20)

Unnamed: 0,Circuit,Type,Direction,Location,Country,Last length used,Turns,Grands Prix,Season(s),Grands Prix held
0,Adelaide Street Circuit,Street circuit,Clockwise,Adelaide,Australia,3.780 km (2.349 mi),16,Australian Grand Prix,1985–1995,11
1,Ain-Diab Circuit,Road circuit,Clockwise,Casablanca,Morocco,7.618 km (4.734 mi),18,Moroccan Grand Prix,1958,1
2,Aintree Motor Racing Circuit,Road circuit,Clockwise,Aintree,United Kingdom,4.828 km (3.000 mi),12,British Grand Prix,"1955, 1957, 1959, 1961–1962",5
3,Albert Park Circuit *,Street circuit,Clockwise,Melbourne,Australia,5.278 km (3.280 mi),16,Australian Grand Prix,"1996–2019, 2022–2025",28
4,Algarve International Circuit,Race circuit,Clockwise,Portimão,Portugal,4.653 km (2.891 mi),15,Portuguese Grand Prix,2020–2021,2
5,Autódromo do Estoril,Race circuit,Clockwise,Estoril,Portugal,4.360 km (2.709 mi),13,Portuguese Grand Prix,1984–1996,13
6,Autódromo Hermanos Rodríguez *,Race circuit,Clockwise,Mexico City,Mexico,4.304 km (2.674 mi),17,"Mexican Grand Prix,Mexico City Grand Prix","1963–1970, 1986–1992, 2015–2019, 2021–2024",24
7,Autódromo Internacional do Rio de Janeiro,Race circuit,Anti-clockwise,Rio de Janeiro,Brazil,5.031 km (3.126 mi),11,Brazilian Grand Prix,"1978, 1981–1989",10
8,Autodromo Internazionale del Mugello,Race circuit,Clockwise,Scarperia e San Piero,Italy,5.245 km (3.259 mi),14,Tuscan Grand Prix,2020,1
9,Autodromo Internazionale Enzo e Dino Ferrari *,Race circuit,Anti-clockwise,Imola,Italy,4.909 km (3.050 mi),17,"Italian Grand Prix,San Marino Grand Prix,Emili...","1980–2006, 2020–2022, 2024",31


In [33]:
print(df.loc[3, "Season(s)"])
print(parse_seasons(df.loc[3, "Season(s)"]))

1996–2019, 2022–2025
[(1996, 2019, False), (2022, 2025, True)]


In [55]:
def build_new_rows(row):
    periods_str = row['Season(s)']
    isActive = True if '*' in row['Circuit'] else False
    result = []
    periods = parse_seasons(periods_str, isActive)

    # sort by start year
    periods.sort()
    
    result = []
    
    for start, end, active in periods:
        new_row = row.to_dict()
        new_row['from'] = start
        new_row['to'] = end
        new_row['isActivePeriod'] = active
        result.append(new_row)
    return result

In [56]:
transformed_df = []
for _, row in df.iterrows():
    transformed_df.extend(build_new_rows(row))
    print(len(transformed_df))

scd2_df = pd.DataFrame(transformed_df)

1
2
6
8
9
10
14
16
17
20
24
26
31
32
2012–2025
34
36
46
47
48
49
52
53
54
57
58
60
66
74
79
83
85
87
96
101
104
106
107
110
116
117
118
119
121
130
131
133
135
136
137
140
141
142
144
146
147
151
155
157
167
168
169
171
2014–2024
174
175
180
181
182
183
185
1987–2024
202
203
206
207
208
209
210
211


In [63]:
scd2_df.head(15)

Unnamed: 0,Circuit,Type,Direction,Location,Country,Last length used,Turns,Grands Prix,Season(s),Grands Prix held,from,to,isActivePeriod
0,Adelaide Street Circuit,Street circuit,Clockwise,Adelaide,Australia,3.780 km (2.349 mi),16,Australian Grand Prix,1985–1995,11,1985,1995,False
1,Ain-Diab Circuit,Road circuit,Clockwise,Casablanca,Morocco,7.618 km (4.734 mi),18,Moroccan Grand Prix,1958,1,1958,1958,False
2,Aintree Motor Racing Circuit,Road circuit,Clockwise,Aintree,United Kingdom,4.828 km (3.000 mi),12,British Grand Prix,"1955, 1957, 1959, 1961–1962",5,1955,1955,False
3,Aintree Motor Racing Circuit,Road circuit,Clockwise,Aintree,United Kingdom,4.828 km (3.000 mi),12,British Grand Prix,"1955, 1957, 1959, 1961–1962",5,1957,1957,False
4,Aintree Motor Racing Circuit,Road circuit,Clockwise,Aintree,United Kingdom,4.828 km (3.000 mi),12,British Grand Prix,"1955, 1957, 1959, 1961–1962",5,1959,1959,False
5,Aintree Motor Racing Circuit,Road circuit,Clockwise,Aintree,United Kingdom,4.828 km (3.000 mi),12,British Grand Prix,"1955, 1957, 1959, 1961–1962",5,1961,1962,False
6,Albert Park Circuit *,Street circuit,Clockwise,Melbourne,Australia,5.278 km (3.280 mi),16,Australian Grand Prix,"1996–2019, 2022–2025",28,1996,2019,False
7,Albert Park Circuit *,Street circuit,Clockwise,Melbourne,Australia,5.278 km (3.280 mi),16,Australian Grand Prix,"1996–2019, 2022–2025",28,2022,2025,True
8,Algarve International Circuit,Race circuit,Clockwise,Portimão,Portugal,4.653 km (2.891 mi),15,Portuguese Grand Prix,2020–2021,2,2020,2021,False
9,Autódromo do Estoril,Race circuit,Clockwise,Estoril,Portugal,4.360 km (2.709 mi),13,Portuguese Grand Prix,1984–1996,13,1984,1996,False
