In [1]:
import datetime as dt

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sn

sn.set_theme()
plt.rcParams.update({
    "figure.figsize": [20, 10],
    "font.size": 16
})

In [2]:
POINTS_MAP = {
    1: 25,
    2: 18,
    3: 15,
    4: 12,
    5: 10,
    6: 8,
    7: 6,
    8: 4,
    9: 2,
    10: 1
}

def get_points(race_pos: int, fast_pos: int) -> int:
    '''Returns points scored with 2023 F1 scoring system'''
    
    points = POINTS_MAP[race_pos] if race_pos >= 1 and race_pos <= 10 else 0
    points += 1 if fast_pos == 1 else 0

    return points

In [3]:
DATA_DIR = "../data"

results_df = pd.read_csv(f"{DATA_DIR}/results.csv")[["raceId", "driverId", "constructorId", "position", "time", "milliseconds", "rank", "statusId"]]
races_df = pd.read_csv(f"{DATA_DIR}/races.csv")[["raceId", "year", "round", "date"]]
drivers_df = pd.read_csv(f"{DATA_DIR}/drivers.csv")[["driverId", "driverRef"]]
constructors_df = pd.read_csv(f"{DATA_DIR}/constructors.csv")[["constructorId", "constructorRef"]]
status_df = pd.read_csv(f"{DATA_DIR}/status.csv")

raw_df = results_df.merge(races_df, on="raceId", how="left")
raw_df = raw_df.merge(drivers_df, on="driverId", how="left")
raw_df = raw_df.merge(constructors_df, on="constructorId", how="left")
raw_df = raw_df.merge(status_df, on="statusId", how="left")

raw_df = raw_df[["year", "round", "date", "constructorRef", "driverRef", "position", "rank", "time", "milliseconds", "status"]]
raw_df = raw_df.rename(columns={
    "driverRef": "driverId",
    "constructorRef": "constructorId"
})

FileNotFoundError: [Errno 2] No such file or directory: 'data/results.csv'

In [None]:
res_df = raw_df.copy()
res_df = res_df[res_df["year"] > 1985] # more than 2 drivers per team <= 1985
res_df["estPosition"] = res_df.groupby(["year", "round"]).cumcount() + 1
res_df["estPoints"] = res_df[["estPosition", "rank"]].apply(
    lambda row: get_points(row["estPosition"], row["rank"]), axis=1
)
res_df["milliseconds"] = res_df["milliseconds"].replace("\\N", None).astype(float)
res_df["status"] = res_df["status"].apply(str.lower)
res_df["date"] = pd.to_datetime(res_df["date"])
res_df = res_df.drop(columns=["position", "rank"])

In [None]:
win_cols = ["year", "round", "constructorId"]
win_df = res_df.groupby(win_cols)["estPosition"].min().reset_index()
win_df["win"] = 1

res_df = res_df.merge(win_df, on=win_cols.append("estPosition"), how="left")
res_df["win"] = res_df["win"].fillna(0)

wl_df = pd.pivot_table(res_df, values="driverId", index=["year", "round", "constructorId"], 
                       columns="win", aggfunc=lambda x: ' '.join(x)).reset_index()

In [None]:
res_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15756 entries, 0 to 15755
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   year           15756 non-null  int64         
 1   round          15756 non-null  int64         
 2   date           15756 non-null  datetime64[ns]
 3   constructorId  15756 non-null  object        
 4   driverId       15756 non-null  object        
 5   time           15756 non-null  object        
 6   milliseconds   5475 non-null   float64       
 7   status         15756 non-null  object        
 8   estPosition    15756 non-null  int64         
 9   estPoints      15756 non-null  int64         
 10  win            15756 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(4), object(4)
memory usage: 1.3+ MB


In [None]:
elo_scores = {id: 1500 for id in set(res_df["driverId"])}
res_df["score"] = None

k = 32
c = 300
l = 4

for _, row in wl_df.iterrows(): 
    
    d_a = row.loc[0]
    d_b = row.loc[1]

    if not (isinstance(row.loc[0], str) and isinstance(row.loc[1], str)):
        continue # < 2 drivers participated

    # get previous ratings and points scored in weekend by drivers
    a_ix = (res_df["year"] == row["year"]) & (res_df["round"] == row["round"]) & (res_df["driverId"] == row.loc[0])
    b_ix = (res_df["year"] == row["year"]) & (res_df["round"] == row["round"]) & (res_df["driverId"] == row.loc[1])
    
    p_a = res_df.loc[a_ix, "estPoints"].iloc[0]
    p_b = res_df.loc[b_ix, "estPoints"].iloc[0]

    r_a = elo_scores[row.loc[0]]
    r_b = elo_scores[row.loc[1]]

    # calculate scaled distance apart
    o_a = res_df.loc[a_ix, "status"].iloc[0]
    o_b = res_df.loc[b_ix, "status"].iloc[0]

    
    m_a = res_df.loc[a_ix, "milliseconds"].iloc[0]
    m_b = res_df.loc[b_ix, "milliseconds"].iloc[0]


    # calculate expected and realised outcome scores
    if p_a + p_b == 0:
        s_a = 0.5
        s_b = 0.5
    else:
        s_a = p_a / (p_a + p_b)
        s_b = p_b / (p_a + p_b)

    q_a = 10 ** (r_a / c)
    q_b = 10 ** (r_b / c)

    e_a = q_a / (q_a + q_b)        
    e_b = q_b / (q_a + q_b)

    # calculate new ratings and insert to results df
    n_a = r_a + (k * (0 - e_a)) + (l * s_a)
    n_b = r_b + (k * (1 - e_b)) + (l * s_b)

    res_df.loc[a_ix, "score"] = n_a
    elo_scores[row.loc[0]] = n_a

    res_df.loc[b_ix, "score"] = n_b
    elo_scores[row.loc[1]] = n_b


In [None]:
min_races = res_df.groupby("year")["round"].nunique().sort_values().iloc[:2].sum() # sum of shortest two seasons
ttl_races = res_df["driverId"].value_counts()
drivers = list(ttl_races[ttl_races > min_races].index)


drivers_2023 = list(res_df.loc[res_df["year"] == 2023, "driverId"])

In [None]:
sub_df = res_df[res_df["driverId"].isin(drivers_2023)].sort_values(["year", "round"])
px.line(sub_df, x="date", y="score", color="driverId")

In [None]:
res_df.groupby("driverId")["score"].max().sort_values(ascending=False).dropna().head(10)

driverId
hamilton              2367.301148
alonso                2294.284175
rosberg               2225.568602
button                2200.330642
bottas                 2187.46776
max_verstappen        2186.395286
vettel                2177.088154
barrichello           2108.770029
russell               2107.421148
michael_schumacher    2104.174836
Name: score, dtype: object