In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import datetime 

### Reading the csv files 

In [None]:
# Load the circuit, lap-time, pit-stop, season and status tables straight from the GitHub-hosted CSV files.
circuits = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/circuits.csv')
laptimes = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/lap_times.csv')
pitstops = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/pit_stops.csv')
# parse_dates makes `year` a datetime column rather than a plain number
seasons = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/seasons.csv', parse_dates=['year'])
status = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/status.csv')

In [None]:
# Load the constructor and driver tables together with their championship standings.
constructor_standings = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/constructor_standings.csv')
constructors = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/constructors.csv')
driver_standings = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/driver_standings.csv')
drivers = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/drivers.csv')

In [None]:
# Load the race calendar, per-race results and qualifying tables.
# `year` in races is parsed as a datetime: later cells read it with `.dt.year`
# and compare it against date strings such as '2010-01-01'.
races = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/races.csv',parse_dates=['year'])
constructor_results = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/constructor_results.csv')
results = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/results.csv')
qualifying = pd.read_csv('https://raw.githubusercontent.com/mspasts/f1-hystorical-analysis/main/qualifying.csv')

### Most successful constructors 

F1 isn't always a winning business for carmakers—financially, it often breaks even at best. Traditional investment logic says if the return on equity is lower than the cost of capital, it destroys value. But that's not the whole story. For many manufacturers, F1 is a powerful marketing tool, boosting brand image and top-line growth. Some teams consistently invest in performance to stay at the top—if you're a die-hard fan, you probably already know who they are. Let's dive into the data!

In [None]:
# attach every race result to its constructor; constructors without results are kept (left join)

team = pd.merge(constructors, results, how='left', on='constructorId')

In [None]:
# count the distinct races each constructor entered and keep those with at least 100

races_entered = team[['name', 'points', 'raceId']].groupby('name')['raceId'].nunique()
best = races_entered.sort_values(ascending=False).reset_index(name='races')
best = best.loc[best['races'] >= 100]
best.head()

In [None]:
# points per race = total points scored / number of distinct races entered

def func(x):
    """Return the group's summed points divided by its number of distinct races."""
    return x.points.sum() / x.raceId.nunique()

# only constructors that passed the 100-race filter are rated
eligible = team[team['name'].isin(best.name)]
per_race = eligible.groupby('name').apply(func, include_groups=False)
data = per_race.sort_values(ascending=False).reset_index(name='points_per_race')
data.head(10)

In [None]:
# bar chart of points per race for the constructors rated above

points_bar = go.Bar(x=data.name, y=data['points_per_race'])
fig = go.Figure(data=[points_bar], layout_title_text="Constructor's Points per Race")
fig.update_traces(
    textfont_size=20,
    marker=dict(line=dict(color='#000000', width=2)),  # black outline around each bar
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Mercedes and Red Bull have dominated the past decade, showing their consistency in points per race. Meanwhile, Ferrari hasn’t secured a championship title since 2008. One standout is Force India—a smaller-budget team that held its own against giants like Mercedes and Ferrari, averaging an impressive five points per race.

In [None]:
# all-time points per constructor, ten highest totals only

points_by_team = team.groupby('name').agg({'points': 'sum'})
historic_points = (
    points_by_team
    .sort_values('points', ascending=False)
    .reset_index()
    .head(10)
)
historic_points

In [None]:
# bar chart of the ten highest all-time constructor point totals

history_bar = go.Bar(x=historic_points.name, y=historic_points['points'])
fig = go.Figure(data=[history_bar], layout_title_text="Constructor's Historic Points")
fig.update_traces(
    textfont_size=20,
    marker=dict(line=dict(color='#000000', width=2)),
)
fig.show()

One of the most impressive stats on the chart is Mercedes AMG Petronas securing second place, despite joining Formula 1 only in 2010. In just twelve years, they've amassed two-thirds of Ferrari's all-time points—an incredible achievement.

### Q1: Do higher altitude circuits cause more engine failures?

At high altitudes, the air is thinner, meaning less airflow through radiators and intake valves to cool the brakes and engine. Engines also rely on oxygen for combustion, so lower oxygen levels can lead to performance loss. Key issues include overheating of the transmission and engine components. The Mexican GP’s Autódromo Hermanos Rodríguez, sitting at 2,227 meters above sea level, is the highest F1 track by a wide margin. It's notorious for causing major headaches for teams, forcing engineers to rethink race preparations, knowing performance drops will show up in Friday practice.

In [None]:
# chain circuits -> races -> results (left joins), then keep only rows that match a status entry

df = circuits.merge(races, how='left', on='circuitId')
df2 = df.merge(results, how='left', on='raceId')
status_df = df2.merge(status, how='inner', on='statusId')

In [None]:
# tidy up: remove the suffixed duplicate columns created by the merges and
# give the remaining `_x` columns their plain names back

status_df = (
    status_df
    .drop(columns=['name_y', 'url_y', 'url_x', 'time_y'])
    .rename(columns={'name_x': 'name', 'time_x': 'time'})
)

In [None]:
# keep rows whose status is Transmission, Engine or Overheating, from 2015 onward
# (the window is chosen so that the Mexico GP is included)

failure_statuses = ['Transmission', 'Engine', 'Overheating']
window_start = pd.to_datetime('2015-01-01')

altitude = status_df[status_df['status'].isin(failure_statuses)]
altitude = altitude[altitude['year'] >= window_start]
altitude.head()

In [None]:
# failures per (track name, altitude), ten highest counts, with a descriptive column name

failure_counts = altitude.groupby(['name', 'alt'])['status'].count()
circuit_altitudes = (
    failure_counts
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
    .rename(columns={'status': 'engine & transmission failures'})
)
circuit_altitudes

In [None]:
# bubble chart: altitude on a log x-axis, failure count on y, bubble size grows with altitude

df = circuit_altitudes

fig = px.scatter(
    df,
    x="alt",
    y="engine & transmission failures",
    size="alt",
    color="name",
    log_x=True,
    size_max=80,
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.update_traces(
    textfont_size=20,
    marker=dict(line=dict(color='#000000', width=2)),
)
fig.show()

The Mexican GP has the most overheating-related retirements, followed by the Red Bull Ring. Surprisingly, Bahrain GP—despite being at sea level—faces similar issues, likely due to high track temperatures. These are just observations without statistical proof, but understanding the underlying causes before applying models is crucial. Using supervised algorithms or linear regression to link track temperatures and altitude to driver race status could help uncover causal relationships.

### Q2: The case for the best F1 drivers

In 72 years of F1 history, 34 drivers have claimed the championship. Over the years, cars have evolved dramatically—from the high-pitched roar of early 2000s V10 engines to today's expertly designed and much safer V6 machines. Some drivers thrived in their era before making way for younger talent. Now, let's check out the first chart, showing how drivers are distributed across different nations, before looking at how championship wins are distributed.


#### Distribution by Geography

In [None]:
# count drivers per nationality and plot the ten largest groups as a pie chart

driver_nationality = (
    drivers.groupby('nationality')['nationality']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='number of drivers')
)

# take labels and values from the same ten rows so the two arrays always line up;
# driver_nationality itself stays complete because a later cell merges against it
top_nationalities = driver_nationality.head(10)

fig = go.Figure(data=[go.Pie(labels=top_nationalities['nationality'],
                             values=top_nationalities['number of drivers'])])
fig.update_traces(textfont_size=20,
                  marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(
    title="Historical Driver Nationality Distribution since 1950")
fig.show()

In [None]:
# drivers -> their standings after each race -> the race the standing belongs to (left joins)

standings_by_driver = pd.merge(drivers, driver_standings, how='left', on='driverId')
driver_position = standings_by_driver.merge(races, how='left', on='raceId')

In [None]:
# highest points and wins each driver reached per season; after sorting by points,
# the first row kept for every year is that season's top scorer

season_best = driver_position.groupby(['nationality', 'year', 'surname'])[['points', 'wins']].max()
champions = (
    season_best
    .sort_values('points', ascending=False)
    .reset_index()
    .drop_duplicates(subset=['year'])
)

In [None]:
# number of season-topping finishes per nationality, shown as a pie chart
champion_nations = champions['nationality'].value_counts().to_frame(name='count')

nation_pie = go.Pie(labels=champion_nations.index, values=champion_nations['count'])
fig = go.Figure(data=[nation_pie])
fig.update_layout(title="Distribution of Historic Champions by Nation")
fig.update_traces(textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
fig.show()


To understand the prominence of British drivers and champions in Formula 1, we can trace it back to World War II. During the war, intense aerial battles led to the construction of large airfields in Britain. After the war, these airfields were transformed into race tracks by car enthusiasts, attracting drivers and engineers who had worked on fighter jet engines. One notable airfield became the famous Silverstone Circuit. This influx of racing talent resulted in many Formula 1 teams establishing their headquarters in the UK, with 6 out of 10 constructors based there in 2022.

In [None]:
# highest points and wins per driver and season; driverId is part of the key so that
# different drivers who share a surname and nationality are not merged into one

season_totals = driver_position.groupby(
    ['nationality', 'year', 'driverId', 'surname']
)[['points', 'wins']].max()
champion_drivers = (
    season_totals
    .sort_values('points', ascending=False)
    .reset_index()
    .drop_duplicates(subset=['year'])  # rows are sorted by points, so the first row per year is the top scorer
)

# distinct champions per nation, counted by driverId rather than surname
# (counting surnames would treat two champions with the same family name as one)

final = (
    champion_drivers.groupby('nationality')['driverId']
    .nunique()
    .reset_index(name='champions')
    .sort_values(by='champions', ascending=False)
)

# share of each nation's drivers that became champion, in percent

ratios = final.merge(driver_nationality, on='nationality', how='inner')
ratios['perc_winners'] = (ratios.champions / ratios['number of drivers'] * 100).round(2)
ratios = ratios.sort_values('perc_winners', ascending=False)
ratios.head(5)

In [None]:
# bar chart of champion share per nation, coloured by the nation's total number of drivers

df = ratios
fig = px.bar(
    df,
    x='nationality',
    y='perc_winners',
    hover_data=['champions', 'number of drivers'],
    color='number of drivers',
    height=400,
)
fig.update_layout(title="Champions from a nation with respect to total drivers from the nation")
fig.update_traces(textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

#### Most wins by a driver in a single season

In [None]:
# rebuild the drivers -> standings -> races table used by the cells below

standings_by_driver = pd.merge(drivers, driver_standings, how='left', on='driverId')
driver_position = standings_by_driver.merge(races, how='left', on='raceId')

In [None]:
# standings rows where the driver's standings position is 1, reduced to the
# highest win count per driver and season

leaders = driver_position[driver_position['position'] == 1]
positions = (
    leaders.groupby(['surname', 'year'])['wins']
    .max()
    .sort_values(ascending=False)
    .reset_index(name='Wins')
    .rename(columns={'surname': 'name'})
)
# year was parsed as a datetime when races.csv was loaded; keep just the calendar year
positions['year'] = positions['year'].dt.year
positions['Wins'] = positions['Wins'].astype('int64')

positions.head(20)

In [None]:
#plotting a bubble chart

fig = px.scatter(positions.head(30), x="year", y="Wins", color="name",
                 title="Most wins by a driver in a single season",size = 'Wins')
fig.update_traces(textfont_size=20,
                  marker=dict(line=dict(color='#000000', width=2)))
fig.update_xaxes(showgrid=False)
fig.show()

#### Most competitive seasons by points difference

In [None]:
# five highest season point totals of 1991; the last line shows the surnames of the top two
in_1991 = driver_position['year'].dt.year == 1991
competition = (
    driver_position[in_1991]
    .groupby(['surname', 'year'])
    .points.max()
    .sort_values(ascending=False)
    .reset_index()
    .head(5)
)
competition['year'] = competition['year'].dt.year
competition.iloc[0:2, 0]

In [None]:
def rivalry(x):
    """Rate how close the title fight of season ``x`` was and chart the top scorers.

    Prints a one-line verdict based on the points gap between the two
    highest-scoring drivers of the season, then shows a bar chart of the six
    highest-scoring drivers (hovering a bar shows the driver's win count).

    Parameters
    ----------
    x : int
        Calendar year of the season, e.g. ``2021``.

    Returns
    -------
    None
        The chart is rendered with ``fig.show()``; nothing is returned.

    Raises
    ------
    ValueError
        If fewer than two drivers have standings for season ``x``.
    """
    # `year` is a datetime column, so compare on its calendar year
    season = driver_position[driver_position['year'].dt.year == x]

    # best points and wins each driver reached during the season, highest first;
    # points stay as floats so half-point results are not truncated
    standings = (
        season[['surname', 'points', 'wins']]
        .dropna()
        .groupby('surname')[['points', 'wins']]
        .max()
        .sort_values('points', ascending=False)
        .reset_index()
    )
    if len(standings) < 2:
        raise ValueError(f'need standings for at least two drivers in season {x}')
    standings['wins'] = standings['wins'].astype('int64')

    # points gap between the top two drivers decides the verdict
    gap = standings['points'].iloc[0] - standings['points'].iloc[1]
    if gap <= 10:
        verdict = 'A rivalry in the history books!'
    elif gap <= 20:
        verdict = 'Spicy!'
    elif gap < 30:
        verdict = 'Meh!'
    else:
        verdict = 'Snore Fest!'
    print('\033[1m' + verdict)

    # the chart shows the six highest scorers of the season
    top_scorers = standings.head(6)
    fig = px.bar(top_scorers, x='surname', y='points',
                 hover_data=['wins'], color='points',
                 height=400, color_continuous_scale='turbo')
    fig.update_traces(textfont_size=20,
                      marker=dict(line=dict(color='#000000', width=2)))
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)

    # '\033[0m' switches the bold style off again so later output is not affected
    print('----------------------------------' + '\033[0m')
    fig.show()

**PLAY AROUND WITH THE YEAR BELOW**

In [None]:
# print the title-fight verdict and show the top-scorer chart for 2021; change the year to explore another season
rivalry(2021)      

### Who has the fastest lap time in every circuit?

In [None]:
# join circuits -> races -> results -> drivers and keep the columns needed for lap records

fast = circuits.merge(races, on='circuitId', how='left')
fast = fast.merge(results, on='raceId', how='left')
fast = fast.merge(drivers, on='driverId', how='inner')
fast = fast.rename(columns={'name_x': 'circuit_name'})
fast = fast[['circuit_name', 'country', 'surname', 'fastestLapTime', 'nationality', 'year']]

# drop missing values, then keep only lap times written with a ':' separator;
# .copy() makes the filtered frame independent, so adding a column below
# does not raise a SettingWithCopyWarning
fast = fast.dropna()
fast = fast[fast['fastestLapTime'].str.contains(':', na=False)].copy()


def _lap_to_seconds(lap):
    """Convert a 'minutes:seconds' lap-time string (e.g. '1:27.452') into seconds."""
    minutes, seconds = lap.split(':')[:2]
    return float(minutes) * 60 + float(seconds)


fast['fastestLapTime_seconds'] = fast['fastestLapTime'].apply(_lap_to_seconds)

In [None]:
# fastest lap per circuit, compared numerically through the seconds column;
# taking min() of the raw lap-time text orders values as strings, which is only
# right while every string happens to have the same layout
fast['fastest_recorded_lap'] = fast.groupby('circuit_name')['fastestLapTime_seconds'].transform('min')
is_record = fast['fastestLapTime_seconds'] == fast['fastest_recorded_lap']
fastest = (
    fast[is_record]
    .sort_values('country')
    .drop(columns='fastest_recorded_lap')
    .reset_index(drop=True)
)
# year was parsed as a datetime when races.csv was loaded; keep just the calendar year
fastest['year'] = fastest['year'].dt.year
fastest.head()

In [None]:
# number of circuit lap records held by each driver, shown as a bar chart
record_counts = fastest['surname'].value_counts()
fastest_viz = record_counts.rename_axis('driver').reset_index(name='fastest laps')

df = fastest_viz
fig = px.bar(
    df,
    x='driver',
    y='fastest laps',
    hover_data=['fastest laps'],
    color='fastest laps',
    height=400,
    color_continuous_scale='Blues',
)
fig.update_traces(textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title="Drivers with the most fastest ever laps")
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

####  

### Qualifying Pole = Race Pole? 

In F1, taking pole position on Saturday does not guarantee a win on race day. What is the mark of a great driver? Winning every time he starts from pole, for a perfect ratio of 1? Or winning more races than he has taken pole positions?

##### Calculating grid pole positions

In [None]:
# (1) join results with drivers and (2) count how often each driver started from grid position 1

driver_quali = results.merge(drivers, on='driverId', how='left')
driver_quali['full_name'] = driver_quali['forename'] + ' ' + driver_quali['surname']
driver_quali = driver_quali[['full_name', 'grid', 'position']]

# every kept row has grid == 1, so summing the column counts the pole starts
pole_starts = driver_quali[driver_quali['grid'] == 1]
quali_wins = (
    pole_starts.groupby('full_name')['grid']
    .sum()
    .reset_index(name='grid poles')
    .sort_values('grid poles', ascending=False)
    .reset_index(drop=True)  # renumber rows after sorting
)
quali_wins.head(20)

##### Calculating race wins

In [None]:
# (1) join drivers with results, (2) build a full-name column and (3) keep the columns needed

race_wins = drivers.merge(results, on='driverId', how='left')
race_wins['full_name'] = race_wins['forename'] + ' ' + race_wins['surname']
race_wins = race_wins[['full_name', 'position']].copy()

# make the finishing position numeric before comparing it with 1: if the CSV column
# was read as text, `== 1.0` would silently match nothing. Values that are not numbers become NaN.
# NOTE(review): presumably unclassified finishers are stored as a text placeholder in
# some exports of results.csv; confirm against the data
race_wins['position'] = pd.to_numeric(race_wins['position'], errors='coerce')

# number of first-place finishes per driver, highest first

highest_rw = (
    race_wins[race_wins['position'] == 1]
    .groupby('full_name')
    .count()
    .sort_values('position', ascending=False)
    .reset_index()
)
highest_rw.head()

##### Calculating RacePole / GridPole 

In [None]:
# put each driver's race wins next to the number of starts from grid position 1,
# keeping only drivers with more than 10 such starts (drivers without any drop out as nulls)

racexpole = highest_rw.merge(quali_wins, how='left', on='full_name')
racexpole = racexpole[racexpole['grid poles'] > 10].dropna()

# rename, cast and compute the wins-per-pole ratio, highest ratio first

racexpole = racexpole.rename(columns={'position': 'race poles'})
racexpole['grid poles'] = racexpole['grid poles'].astype(int)
racexpole['racexgrid'] = (racexpole['race poles'] / racexpole['grid poles']).round(2)
racexpole = racexpole.sort_values('racexgrid', ascending=False).reset_index(drop=True)
racexpole.head(15)

In [None]:
# bar chart of the twelve highest wins-per-pole ratios
df = racexpole.head(12)
fig = px.bar(
    df,
    x='full_name',
    y='racexgrid',
    hover_data=['race poles', 'grid poles'],
    color='racexgrid',
    height=400,
    color_continuous_scale='gray',
)
fig.update_traces(textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title="Race Poles/Grid Poles Ratio")
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

The above chart indicates Max Verstappen has more wins (26) than pole positions (16). His innate ability to thrive under pressure, as we witnessed in the last lap of the 2021 season and his aggressive driving stance helps him win races even if he isn't the best in qualifying. This is also a case for why Schumacher would be considered the best ever because he's extracted those race points and championships from more challenging grid positions than Lewis Hamilton. 

### Worst tracks based on overtaking action

There are always a few tracks on the calendar that we know will be a snoozefest. Racing and overtaking tend to be poor on circuits whose layout offers little beyond straight lines. To check for the least overtaking action, I counted the number of times a car finished the race in the same position it started from. A driver can of course overtake, or be overtaken, and still end up back in his initial position, so this method of rating tracks is not perfect, but it gives us a reasonable glimpse of potentially bland circuits.

In [None]:
# circuits -> races -> results; the key column is spelled `circuitId` in both
# circuits and races, exactly as in the other merges of this notebook
tracks = circuits.merge(races, on='circuitId', how='inner')
tracks = tracks.merge(results, on='raceId', how='left')
tracks = tracks[['name_x', 'circuitId', 'driverId', 'position', 'grid', 'raceId', 'year']]
# the following cells address the circuit key as lowercase `circuitid`
tracks = tracks.rename(columns={'name_x': 'circuit', 'circuitId': 'circuitid'})

# make the finishing position numeric so that it can be compared with `grid`;
# values that are not numbers become NaN and are removed by dropna()
# NOTE(review): presumably unclassified finishers are stored as a text placeholder in
# some exports of results.csv; confirm against the data
tracks = tracks.assign(position=pd.to_numeric(tracks['position'], errors='coerce')).dropna()

# `year` is a datetime column, so this keeps the 2010 season onward
tracks = tracks.loc[tracks['year'] >= '2010-01-01']
tracks.head()

In [None]:
# 1 when a driver finished in the same position he started from, otherwise 0
tracks['position_status'] = np.where(tracks['position'].eq(tracks['grid']), 1, 0)

In [None]:
# number of races held at each circuit since 2010, attached to `tracks` as `num_races`

total_races = races.loc[races['year'] >= '2010-01-01']
# name both output columns explicitly: the labels that value_counts().reset_index()
# produces by default differ between pandas versions, so renaming 'index' is not reliable
total_races = (
    total_races['circuitId']
    .value_counts()
    .rename_axis('circuitid')
    .reset_index(name='num_races')
)
# remove a num_races column left by an earlier run, so that re-executing this cell
# does not produce num_races_x / num_races_y duplicates
tracks = tracks.drop(columns='num_races', errors='ignore').merge(total_races, on='circuitid', how='left')
tracks.head()

In [None]:
# for circuits with at least 5 races: count rows per position_status value
# (column 1 = finished where he started, column 0 = position changed)

frequent_tracks = tracks[tracks['num_races'] >= 5]
circuit_rating = frequent_tracks.pivot_table(
    index=['circuit', 'num_races'],
    columns='position_status',
    aggfunc='size',
    fill_value=0,
).reset_index()

In [None]:
# boring_score = unchanged-position finishes per race held; highest score first
circuit_rating['boring_score'] = circuit_rating[1] / circuit_rating['num_races']
circuit_rating = circuit_rating.sort_values(by='boring_score', ascending=False)

circuit_rating

In [None]:
# bar chart of the ten circuits with the highest boring_score
df = circuit_rating.head(10)
fig = px.bar(
    df,
    x='circuit',
    y='boring_score',
    hover_data=['num_races', 1],
    color='num_races',
    height=400,
    color_continuous_scale='ice',
)
fig.update_traces(textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title="Worst Tracks to Overtake")
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

The data is filtered to include only observations after 2009. Monaco, considered an iconic track for the generational history of F1, makes it to this list. Monaco GP is poorly designed for modern-day F1 cars, which are much bigger than the race cars of the 1990s. The lack of room in the track makes it extremely hard for drivers to overtake on race day.