# F1 Elo EDA

This notebook computes Elo ratings for drivers across races from 1986 to 2023 (up to Brazil), and uses the following approaches to identify the F1 greatest of all time (GOAT):

1. Days as highest rated driver
2. Peak score

We also plot a "greatest of their time" graph, because the approaches above have limitations (e.g. older-generation drivers raced less often and their cars were more prone to retirement).

## Environment setup

In [3]:
import datetime as dt

import pandas as pd
import plotly.express as px

### Helper functions

In [4]:
# Points awarded per finishing position (2023 F1 scoring system).
points_map = {
    1: 25,
    2: 18,
    3: 15,
    4: 12,
    5: 10,
    6: 8,
    7: 6,
    8: 4,
    9: 2,
    10: 1,
}


def get_points(race_pos: int, fast_pos: int, points_map: dict = points_map) -> int:
    """Return the points scored under the 2023 F1 scoring system.

    Args:
        race_pos: Finishing position in the race (1 = winner).
        fast_pos: Fastest-lap rank in the race (1 = fastest lap of the race).
        points_map: Mapping of finishing position to points. Positions that
            are not in the map score nothing.

    Returns:
        The points for ``race_pos``, plus one bonus point when the driver set
        the fastest lap and also finished in a points-paying position.
    """
    points = points_map.get(race_pos, 0)

    # The fastest-lap point is only awarded to drivers who finish in a
    # points-paying position (the top ten with the default map).
    if fast_pos == 1 and race_pos in points_map:
        points += 1

    return points

## Data importing

In [5]:
DATA_DIR = "../data/raw"


def _load_table(name, columns=None):
    """Read ``<DATA_DIR>/<name>.csv``, optionally keeping only ``columns`` (in that order)."""
    table = pd.read_csv(f"{DATA_DIR}/{name}.csv")
    return table if columns is None else table[columns]


# Source tables, trimmed to the columns this notebook uses
results_df = _load_table(
    "results",
    ["raceId", "driverId", "constructorId", "position", "time", "milliseconds", "rank", "statusId"],
)
races_df = _load_table("races", ["raceId", "year", "round", "date"])
drivers_df = _load_table("drivers", ["driverId", "driverRef"])
constructors_df = _load_table("constructors", ["constructorId", "constructorRef"])
status_df = _load_table("status")

# Attach each lookup table to the results with a left join on its key,
# so every result row is kept even when a lookup has no match
raw_df = results_df
for lookup_df, key in (
    (races_df, "raceId"),
    (drivers_df, "driverId"),
    (constructors_df, "constructorId"),
    (status_df, "statusId"),
):
    raw_df = raw_df.merge(lookup_df, on=key, how="left")

# Keep the readable reference strings and expose them under the id column names
keep_cols = ["year", "round", "date", "constructorRef", "driverRef",
             "position", "rank", "time", "milliseconds", "status"]
raw_df = raw_df[keep_cols].rename(
    columns={"driverRef": "driverId", "constructorRef": "constructorId"}
)

## Data cleaning

In [6]:
# Keep seasons after 1985 only: teams ran more than 2 drivers in <= 1985.
# The explicit copy makes the column assignments below write to an
# independent frame rather than a slice of raw_df.
res_df = raw_df[raw_df["year"] > 1985].copy()

# Estimated finishing position = 1-based row order within each race.
# NOTE(review): assumes result rows are already in finishing order within a race - confirm
res_df["estPosition"] = res_df.groupby(["year", "round"]).cumcount() + 1

# Coerce the fastest-lap rank to a number before comparing it with 1.
# NOTE(review): if the CSV marks missing ranks with "\N" (as it does for
# milliseconds), the column is read as text and "1" == 1 is never true - confirm
fastest_lap_rank = pd.to_numeric(res_df["rank"], errors="coerce")
res_df["estPoints"] = [
    get_points(pos, rank)
    for pos, rank in zip(res_df["estPosition"], fastest_lap_rank)
]

# "\N" (and anything else that is not a number) becomes NaN. This avoids
# Series.replace(value, None), which some older pandas releases treat as
# "no value given" and forward-fill instead of blanking.
res_df["milliseconds"] = pd.to_numeric(res_df["milliseconds"], errors="coerce")

# .str.lower() leaves missing statuses as NaN instead of raising like apply(str.lower)
res_df["status"] = res_df["status"].str.lower()
res_df["date"] = pd.to_datetime(res_df["date"])

# The raw position / rank columns are superseded by estPosition / estPoints
res_df = res_df.drop(columns=["position", "rank"])

In [7]:
# One group per team and race
win_cols = ["year", "round", "constructorId"]

# The best (lowest) estimated position within each team marks the team-mate
# who "won" the intra-team comparison for that race
win_df = res_df.groupby(win_cols)["estPosition"].min().reset_index()
win_df["win"] = 1

# Join on the team/race keys plus the position so only the best-placed driver
# picks up win == 1. The key list is built explicitly: list.append() returns
# None, which would make merge() silently fall back to "all shared columns"
# and would also mutate win_cols.
res_df = res_df.merge(win_df, on=[*win_cols, "estPosition"], how="left")
res_df["win"] = res_df["win"].fillna(0)

# Wide table: one row per team and race, with the losing driver id(s) in
# column 0 and the winning driver id in column 1; several ids in one cell
# are joined with a space
wl_df = pd.pivot_table(res_df, values="driverId", index=["year", "round", "constructorId"],
                       columns="win", aggfunc=lambda x: ' '.join(x)).reset_index()

In [8]:
# Every driver starts from the same baseline rating
elo_scores = {driver: 1500 for driver in set(res_df["driverId"])}
res_df["score"] = None

k = 32   # weight of the win/loss term (realised result minus expected result)
c = 300  # rating scale: a gap of c rating points means 10:1 expected odds
l = 4    # weight of the bonus for each driver's share of the pair's points

# wl_df comes from a pivot indexed by (year, round, constructorId), so the
# team-mate pairs are visited in chronological order
for _, row in wl_df.iterrows():

    # column 0 holds the team-mate flagged win == 0, column 1 the one flagged win == 1
    d_a = row.loc[0]
    d_b = row.loc[1]

    if not (isinstance(d_a, str) and isinstance(d_b, str)):
        continue # < 2 drivers participated

    if d_a not in elo_scores or d_b not in elo_scores:
        # > 2 drivers: the pivot space-joined several ids into one cell,
        # which matches no single driver, so the pairing is skipped
        continue

    # locate both drivers' result rows for this race (race mask built once)
    in_race = (res_df["year"] == row["year"]) & (res_df["round"] == row["round"])
    a_ix = in_race & (res_df["driverId"] == d_a)
    b_ix = in_race & (res_df["driverId"] == d_b)

    # get previous ratings and points scored in weekend by drivers
    p_a = res_df.loc[a_ix, "estPoints"].iloc[0]
    p_b = res_df.loc[b_ix, "estPoints"].iloc[0]

    r_a = elo_scores[d_a]
    r_b = elo_scores[d_b]

    # calculate expected and realised outcome scores
    if p_a + p_b == 0:
        # neither driver scored: split the points-share bonus evenly
        s_a = 0.5
        s_b = 0.5
    else:
        s_a = p_a / (p_a + p_b)
        s_b = p_b / (p_a + p_b)

    q_a = 10 ** (r_a / c)
    q_b = 10 ** (r_b / c)

    e_a = q_a / (q_a + q_b)
    e_b = q_b / (q_a + q_b)

    # calculate new ratings and insert to results df
    # (driver a is the beaten team-mate -> result 0, driver b -> result 1)
    n_a = r_a + (k * (0 - e_a)) + (l * s_a)
    n_b = r_b + (k * (1 - e_b)) + (l * s_b)

    res_df.loc[a_ix, "score"] = n_a
    elo_scores[d_a] = n_a

    res_df.loc[b_ix, "score"] = n_b
    elo_scores[d_b] = n_b


## Data exploration

In [24]:
# Number of rounds in the shortest season, used as the threshold below
rounds_per_year = res_df.groupby("year")["round"].nunique()
min_races = rounds_per_year.sort_values().iloc[0]

# Top-rated driver after every race: order each race by descending score
# and keep the first row per (year, round)
by_score = res_df.sort_values(["year", "round", "score"], ascending=[True, True, False])
gott_df = by_score.drop_duplicates(['year', 'round'])

# Drivers who were top-rated after more races than the shortest season had
gott_days = gott_df["driverId"].value_counts()
long_leaders = gott_days[gott_days > min_races]
gott_drivers = set(long_leaders.index)

# Full rating history of those drivers, in race order
is_gott = res_df["driverId"].isin(gott_drivers)
gott_df = res_df[is_gott].sort_values(["year", "round"])

px.line(gott_df, x="date", y="score", color="driverId")

In [26]:
# Drivers with at least one result in the 2023 season
in_2023 = res_df["year"] == 2023
curr_drivers = set(res_df.loc[in_2023, "driverId"])

# Whole-career rating history of those drivers, in race order
is_current = res_df["driverId"].isin(curr_drivers)
curr_df = res_df[is_current].sort_values(["year", "round"])

px.line(curr_df, x="date", y="score", color="driverId")

## Data export

In [14]:
OUT_DIR = "../data/prod"

# Write the per-result ratings and the team-mate win/loss table, without the index
for frame, filename in ((res_df, "results.csv"), (wl_df, "win_loss.csv")):
    frame.to_csv(f"{OUT_DIR}/{filename}", index=False)