In [1]:
# %load prepareStats.py
#!/usr/bin/env python

"""
Don't repeat yourself, they say. So this is the stuff that goes in the beginning of all my 
Jupyter Notebooks exploring various F1 statistics. https://github.com/michalkasparek/f1-stats/
"""

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Let's define some useful lists first

unknownResult = ["nan0n", "NaN", "NA", "inf"]
crashes = ["Accident", "Collision", "Fatal accident", "Collision damage", "Spun off"]
dnqs = ["Did not qualify", "Did not prequalify", "107% Rule"]

Europe = ["Austria", "Austrian", "Azerbaijan", "Belgian", "Belgium", "British", "Czech", "Danish", "Dutch", "East German", "Finnish", "France", "French", "German", "Germany", "Hungarian", "Hungary", "Irish", "Italian", "Italy", "Liechtensteiner", "Monaco", "Monegasque", "Netherlands", "Polish", "Portugal", "Portuguese", "Russia", "Russian", "Spanish", "Spain", "Sweden", "Swedish", "Swiss", "Switzerland", "Turkey", "UK"]
NAmerica = ["American", "Canada", "Canadian", "Mexican", "Mexico", "USA"]
SAmerica = ["Argentina", "Argentine", "Brazil", "Brazilian", "Chilean", "Colombian", "Uruguayan", "Venezuelan"]
Asia = ["Bahrain", "Chinese", "China", "Hong Kong", "India", "Indian", "Indonesian", "Japan", "Japanese", "Korea", "Malaysia", "Malaysian", "Qatar", "Saudi Arabia", "Singapore", "Thai", "UAE"]
Africa = ["Morocco", "Rhodesian", "South Africa", "South African"]
Oceania = ["Australia", "Australian", "New Zealander"]
multiple = ["American-Italian", "Argentine-Italian"]
westernEurope = ["Austria", "Belgium", "Germany", "France", "Italy", "Monaco", "Netherlands", "Portugal", "Spain", "Sweden", "Switzerland", "UK"]

streetCircuits = ["Melbourne", "Monte-Carlo", "Montreal", "Valencia", "Marina Bay", "Sochi", "Baku", "Jeddah", "Adelaide", "Phoenix", "Detroit", "Dallas", "Nevada", "California", "Oporto", "Lisbon"]

# Now we're gonna load the stats… (source: http://ergast.com/mrd/db/#csv)

drivers = pd.read_csv(os.path.join("data", "drivers.csv"))
results = pd.read_csv(os.path.join("data", "results.csv"))
races = pd.read_csv(os.path.join("data", "races.csv"))
circuits = pd.read_csv(os.path.join("data", "circuits.csv"))
status = pd.read_csv(os.path.join("data", "status.csv"))
constructors = pd.read_csv(os.path.join("data", "constructors.csv"))

# …merge all the tables into a single dataframe…

results = results.merge(drivers, on="driverId", how="right")
races = races.merge(circuits, on="circuitId", how="right")
results = results.merge(races, on="raceId", how="right")
results = results.merge(status, on="statusId", how="right")
results = results.merge(constructors, on="constructorId", how="right")

# …make the columns more useful…

results["fullname"] = results["forename"] + " " + results["surname"]
results["date"] = pd.to_datetime(results["date"], format="%Y-%m-%d")
results["year"] = results["year"].apply(pd.to_numeric, errors = "coerce").astype(np.int64, errors="ignore")
results["round"] = results["round"].apply(pd.to_numeric, errors = "coerce").astype(np.int64, errors="ignore")
results["position"] = results["position"].apply(pd.to_numeric, errors = "coerce").astype(np.int64, errors="ignore")
results["fastestLapSpeed"] = results["fastestLapSpeed"].apply(pd.to_numeric, errors = "coerce").astype(np.int64, errors="ignore")

# …and create some new columns.

results["year_round"] = results["year"].astype(str).str.slice(0,4) + results["round"].astype(str).str.slice(0,-2).str.zfill(2)
results = results[~results.year_round.isin(unknownResult)]
results["year_round"] = results["year_round"].apply(pd.to_numeric, errors = "coerce").astype(int)

results.loc[results.nationality_x.isin(Europe),"driverContinent"]="Europe"
results.loc[results.nationality_x.isin(NAmerica),"driverContinent"]="NAmerica"
results.loc[results.nationality_x.isin(SAmerica),"driverContinent"]="SAmerica"
results.loc[results.nationality_x.isin(Asia),"driverContinent"]="Asia"
results.loc[results.nationality_x.isin(Africa),"driverContinent"]="Africa"
results.loc[results.nationality_x.isin(Oceania),"driverContinent"]="Oceania"
results.loc[results.nationality_x.isin(multiple),"driverContinent"]="multiple"

results.loc[results.location.isin(streetCircuits),"street"]=True

entries = pd.Series(results.groupby(["driverId"]).size(), name="entries")
results = results.merge(entries, on = ["driverId"], how = "right")

# Now split the main dataframe for wins, podiums and top 6 finishes only 

wins = results[results["position"] == 1]
podiums = results[results["position"] < 4]
top6 = results[results["position"] < 7]

# Finally let's give the plots some swag

plt.style.use("_mpl-gallery")
plt.rcParams["figure.figsize"] = (20,3)

  results = results.merge(constructors, on="constructorId", how="right")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results["year_round"] = results["year_round"].apply(pd.to_numeric, errors = "coerce").astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[results.nationality_x.isin(Europe),"driverContinent"]="Europe"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexi

In [2]:
results = results[~results.status.isin(dnqs)]

In [3]:
fathersAndSons = {
"Andrettis": ["Mario Andretti", "Michael Andretti"],
"Brabhams": ["Jack Brabham", "David Brabham"],
"Fittipaldis": ["Wilson Fittipaldi", "Christian Fittipaldi"],
"Hills": ["Graham Hill", "Damon Hill"], 
"Magnussens": ["Jan Magnussen", "Kevin Magnussen"],
"Nakajimas": ["Satoru Nakajima", "Kazuki Nakajima"],
"Palmers": ["Jonathan Palmer", "Jolyon Palmer"],
"Parnells": ["Reg Parnell", "Tim Parnell"],
"Piquets": ["Nelson Piquet", "Nelson Piquet Jr."], 
"Rosbergs": ["Keke Rosberg", "Nico Rosberg"],
"Schumachers": ["Michael Schumacher", "Mick Schumacher"],
"Verstappens": ["Jos Verstappen", "Max Verstappen"], 
"Villeneuves": ["Gilles Villeneuve", "Jacques Villeneuve"],
"Winkelhocks": ["Manfred Winkelhock", "Markus Winkelhock"]
}

In [4]:
sons = []
for key, value in fathersAndSons.items():
    sons.append(value[1])

In [5]:
results.loc[results.fullname.isin(sons),"son"]=True

In [6]:
resultsSons = results[results.fullname.isin(sons)]

## Historical stats

In [7]:
woSons = results.groupby(["year", "round"])["son"].count()
woSons[woSons <1].tail(1)

year    round
2004.0  15.0     0
Name: son, dtype: int64

The last GP without a single son of another F1 driver entering was the 2004 Italian Grand Prix.

In [8]:
woSons = results.groupby(["year"])["son"].count()
woSons[woSons <1].tail(1)

year
1991.0    0
Name: son, dtype: int64

In [9]:
results[results.fullname.isin(sons)].reset_index().groupby("year")["fullname"].nunique()

year
1961.0    1
1990.0    1
1992.0    2
1993.0    3
1994.0    3
1995.0    1
1996.0    2
1997.0    2
1998.0    2
1999.0    2
2000.0    1
2001.0    1
2002.0    1
2003.0    1
2004.0    1
2005.0    1
2006.0    2
2007.0    3
2008.0    3
2009.0    3
2010.0    1
2011.0    1
2012.0    1
2013.0    1
2014.0    2
2015.0    3
2016.0    4
2017.0    3
2018.0    2
2019.0    2
2020.0    2
2021.0    2
2022.0    3
Name: fullname, dtype: int64

## Sons, sorted

In [10]:
resultsSons.groupby("fullname")["points"].sum().sort_values(ascending=False)

fullname
Max Verstappen          1667.5
Nico Rosberg            1594.5
Damon Hill               360.0
Jacques Villeneuve       235.0
Kevin Magnussen          172.0
Nelson Piquet Jr.         19.0
Christian Fittipaldi      12.0
Jolyon Palmer              9.0
Kazuki Nakajima            9.0
Michael Andretti           7.0
David Brabham              0.0
Markus Winkelhock          0.0
Mick Schumacher            0.0
Tim Parnell                0.0
Name: points, dtype: float64

In [11]:
resultsSons.groupby("fullname")["raceId"].count().sort_values(ascending=False)

fullname
Nico Rosberg            206
Jacques Villeneuve      165
Max Verstappen          148
Kevin Magnussen         127
Damon Hill              116
Christian Fittipaldi     40
Jolyon Palmer            37
Kazuki Nakajima          36
Mick Schumacher          29
Nelson Piquet Jr.        28
David Brabham            24
Michael Andretti         13
Tim Parnell               2
Markus Winkelhock         1
Name: raceId, dtype: int64

In [12]:
resultsSons.groupby("fullname")["position"].min().sort_values()

fullname
Damon Hill               1.0
Jacques Villeneuve       1.0
Max Verstappen           1.0
Nico Rosberg             1.0
Kevin Magnussen          2.0
Nelson Piquet Jr.        2.0
Michael Andretti         3.0
Christian Fittipaldi     4.0
Jolyon Palmer            6.0
Kazuki Nakajima          6.0
David Brabham           10.0
Tim Parnell             10.0
Mick Schumacher         11.0
Markus Winkelhock        NaN
Name: position, dtype: float64

## Gaps between father's last GP and son's first

In [13]:
def timedelta():
    timedeltas = {}
    for key, value in fathersAndSons.items():
        delta = results[results["fullname"] == value[1]]["date"].min() - results[results["fullname"] == value[0]]["date"].max()
        delta = delta
        timedeltas[key] = delta
    timedeltas = pd.DataFrame.from_dict(timedeltas, orient="index", columns = ["gap"])    
    return(timedeltas)
        
timedelta().sort_values(by="gap")

Unnamed: 0,gap
Parnells,2555 days
Schumachers,3045 days
Andrettis,3823 days
Verstappens,4172 days
Villeneuves,5054 days
Magnussens,5761 days
Nakajimas,5831 days
Piquets,5978 days
Fittipaldis,5992 days
Hills,6377 days


## Drivers both father and son have raced against

In [14]:
results["adversaries"] = results["year_round"].map(results.groupby("year_round")["fullname"].agg(list))

In [15]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

def adversaries():
    adversaries = {}
    for key, value in fathersAndSons.items():
        father = results[results["fullname"] == value[0]]
        father = father["adversaries"].tolist()
        father = [item for sublist in father for item in sublist]
        father = list(set(father))
        son = results[results["fullname"] == value[1]]
        son = son["adversaries"].tolist()
        son = [item for sublist in son for item in sublist]
        son = list(set(son))        
        inter = intersection(father, son)
        if len(inter) > 0:
            adversaries[key] = ", ".join(inter)
    
    return(adversaries)

adversaries()

{'Andrettis': 'Derek Warwick, Michele Alboreto, Alain Prost, Andrea de Cesaris, Riccardo Patrese',
 'Parnells': 'Stirling Moss, Maurice Trintignant, Roy Salvadori',
 'Schumachers': 'Nico Hülkenberg, Kimi Räikkönen, Fernando Alonso, Daniel Ricciardo, Sebastian Vettel, Lewis Hamilton, Robert Kubica, Sergio Pérez',
 'Verstappens': 'Jenson Button, Kimi Räikkönen, Fernando Alonso'}

In [17]:
def adversariesCount():
    adversariesCount = {}
    for key, value in fathersAndSons.items():
        father = results[results["fullname"] == value[0]]
        father = father["adversaries"].tolist()
        father = [item for sublist in father for item in sublist]
        father = list(set(father))
        son = results[results["fullname"] == value[1]]
        son = son["adversaries"].tolist()
        son = [item for sublist in son for item in sublist]
        son = list(set(son))        
        inter = intersection(father, son)
        adversariesCount[key] = len(inter)
    adversariesCount = pd.DataFrame.from_dict(adversariesCount, orient="index", columns=["drivers they both raced against"])    
    return(adversariesCount)

adversariesCount().sort_values(by="drivers they both raced against", ascending=False)

KeyError: 'common adversaries'