In [1]:
import pandas as pd

In [2]:
# Load the races table and parse its 'date' column into datetimes (the year is filtered on later).
races = pd.read_csv('./ergast//races.csv')
races['date'] = pd.to_datetime(races['date'])

In [None]:
# Spot-check a single race by id.
races[races['raceId'] == 1063]

In [None]:
# Load the results table.
results = pd.read_csv('./ergast//results.csv')

In [None]:
# Lookup tables that are merged onto the results below.
drivers = pd.read_csv('./ergast//drivers.csv')
teams = pd.read_csv('./ergast//constructors.csv')

In [None]:
races

In [None]:
# NOTE: this cell overwrites `results`; re-running it without re-running the load cell
# merges the lookup tables a second time.
results = results.merge(drivers, on='driverId')
results = results.merge(teams, on='constructorId')

In [None]:
# Inspect one race, sorted by positionOrder.
results[results['raceId'] == 1072].sort_values('positionOrder')

In [None]:
# Subset to races from 2014 onwards

In [None]:
# Races whose date falls in 2014 or later.
rel = races[races['date'].dt.year >= 2014]

In [None]:
rel

In [None]:
# Results belonging to the races selected in `rel`.
rel_results = results[results['raceId'].isin(rel['raceId'])]

In [None]:
# Need: races, results, drivers, constructors, status.

In [None]:
# Keep only drivers that have more than two result rows in the selected races.
counts = rel_results['driverRef'].value_counts()
enough = counts.index[(counts > 2).to_numpy()]
rel_results = rel_results.loc[rel_results['driverRef'].isin(enough)]

In [None]:
# Attach the status table; merge() is called without `on`, so it joins on whatever
# columns the two frames have in common.
status = pd.read_csv('./ergast//status.csv')
rel_results = rel_results.merge(status)

In [None]:
# Attach the columns of `rel` (the selected races) to every result row.
rel_results = rel_results.merge(rel, on='raceId')

In [None]:
# Fix up 2015 Italian GP
# Rosberg & Alonso technically finished because they covered more than 90% of the race, but
# had serious car problems.
# Rows of raceId 938 whose status is '+3 Laps' or '+6 Laps' are relabelled 'DNF'.
rel_results.loc[(rel_results['raceId'] == 938) & (rel_results['status'].isin(['+3 Laps', '+6 Laps'])),
                'status'] = 'DNF'

In [None]:
# Team renamings
# List the constructor refs present, to decide which ones to group together.
sorted(rel_results['constructorRef'].unique())

In [None]:
# Maps a constructorRef to the label it is grouped under; refs that are not listed
# keep their own name when replace() is applied below.
renamings = {
    'racing_point': 'aston_martin',
    'force_india': 'aston_martin',
    'lotus_f1': 'alpine',
    'marussia': 'haas',
    'renault': 'alpine',
    'sauber': 'alfa',
    'toro_rosso': 'alphatauri'
}

In [None]:
rel_results['grouped_constructors'] = rel_results['constructorRef'].replace(renamings)

In [None]:
rel_results['grouped_constructors'].value_counts()

In [None]:
# Boolean numpy array aligned with the rows of rel_results: True where the status
# text contains neither 'Finished' nor 'Lap' (this includes the 'DNF' label set above).
dnf = ~rel_results['status'].str.contains('Finished|Lap').values

dnf

In [None]:
# Prepare the model inputs (the cells below use every selected race, not just a single one)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Rows that are not flagged in the `dnf` mask.
finished = rel_results[~dnf]

In [None]:
# Order by race and, within each race, by positionOrder.
finished = finished.sort_values(['raceId', 'positionOrder'])

In [None]:
# Number of distinct races that have at least one row in `finished`.
n_races = finished['raceId'].nunique()

In [None]:
# Integer codes for drivers and (grouped) teams.
# The encoders are fitted on every result row rather than on the finishers only:
# the same encoders are later used to transform the non-finisher rows, and
# LabelEncoder.transform raises on a label it was not fitted on, which would happen
# for a driver or team that never appears among the finishers.
encoder = LabelEncoder()
encoder.fit(rel_results['driverRef'])

driver_id = encoder.transform(finished['driverRef'])

team_encoder = LabelEncoder()
team_encoder.fit(rel_results['grouped_constructors'])

team_id = team_encoder.transform(finished['grouped_constructors'])

In [None]:
# Consecutive integer codes for the races that appear in `finished`.
race_encoder = LabelEncoder().fit(finished['raceId'])

race_ids = race_encoder.transform(finished['raceId'])

In [None]:
# The Stan program walks through the finisher arrays race by race using a running
# start index, so the rows of each race must be contiguous and in race order.
# Fail loudly instead of just displaying the result of the check.
assert (sorted(race_ids) == race_ids).all(), 'finished rows must be ordered by encoded race id'

In [None]:
# Number of finisher rows per encoded race, ordered by encoded race id.
n_finished = pd.Series(race_ids).value_counts().sort_index().values

In [None]:
# Total number of finisher rows across all races.
total_finished = race_ids.shape[0]

In [None]:
# Consecutive integer codes for the seasons present in `finished`.
year_encoder = LabelEncoder()

years = year_encoder.fit_transform(finished['year'])

In [None]:
# Rows flagged in the `dnf` mask.
not_finished = rel_results[dnf]

In [None]:
# Encode the non-finisher rows with the already-fitted encoders.
dnf_race_ids = race_encoder.transform(not_finished['raceId'])

In [None]:
driver_ids_dnf = encoder.transform(not_finished['driverRef'])
team_ids_dnf = team_encoder.transform(not_finished['grouped_constructors'])
season_ids_dnf = year_encoder.transform(not_finished['year'])

In [None]:
not_finished

In [None]:
# Stan program for the race model.
# - Finishing order (conditional on finishing) is scored by compute_log_likelihood:
#   for each position, the skill of the driver in that position minus the
#   log_sum_exp of the skills of all drivers at that position or behind.
# - Whether a car fails to finish is a Bernoulli-logit on
#   driver_risk + team_risk + dnf_intercept.
# - Driver/team skills and team risk follow per-season random walks built from
#   cumulative sums of standard-normal increments.
# Arrays are declared with the `array[N] int x` form; the older `int x[N]` form
# is no longer accepted by current Stan releases.
model_code = """
functions {
    real compute_log_likelihood(vector cur_skills, int n_per_race) {

        real cur_lik = 0;

        for (cur_position in 1:(n_per_race - 1)) {
            vector[n_per_race - cur_position + 1] other_skills;

            for (cur_other_position in cur_position:n_per_race) {
                other_skills[cur_other_position - cur_position + 1] = cur_skills[cur_other_position];
            }

            real cur_numerator = cur_skills[cur_position];
            real cur_denominator = log_sum_exp(other_skills);
            cur_lik += cur_numerator - cur_denominator;
        }

        return cur_lik;

    }
}
data {
    int n_drivers;
    int n_races;
    int n_teams;
    array[n_races] int n_finished_by_race;
    int n_finished_total;
    int n_seasons;

    array[n_finished_total] int driver_placings;
    array[n_finished_total] int team_ids;
    array[n_finished_total] int season_id;

    int n_dnf;

    array[n_dnf] int team_ids_dnf;
    array[n_dnf] int driver_ids_dnf;
    array[n_dnf] int season_ids_dnf;
    array[n_dnf] int race_ids_dnf;
}
parameters {
    vector[n_drivers] driver_init_raw;
    matrix[n_drivers, n_seasons - 1] driver_walk_raw;

    vector[n_drivers] driver_risk_init_raw;
    real<lower=0> driver_risk_init_sd;

    vector[n_teams] team_risk_init_raw;
    real<lower=0> team_risk_init_sd;

    matrix[n_teams, n_seasons - 1] team_risk_walk_raw;
    real<lower=0> team_risk_walk_sd;

    real dnf_intercept;

    real<lower=0> driver_init_sd;
    real<lower=0> driver_season_sd;

    vector[n_teams] team_init_raw;
    matrix[n_teams, n_seasons - 1] team_walk_raw;

    real<lower=0> team_init_sd;
    real<lower=0> team_season_sd;
}
transformed parameters {
    matrix[n_drivers, n_seasons] driver_skills;
    matrix[n_teams, n_seasons] team_skills;

    vector[n_drivers] driver_risk;
    matrix[n_teams, n_seasons] team_risk;

    for (cur_driver in 1:n_drivers) {
        vector[n_seasons - 1] cur_offsets = cumulative_sum(driver_walk_raw[cur_driver])' * driver_season_sd;
        driver_skills[cur_driver, 1] = driver_init_raw[cur_driver] * driver_init_sd;
        driver_skills[cur_driver, 2:n_seasons] = driver_skills[cur_driver, 1] + cur_offsets';

        // DNF risk
        driver_risk[cur_driver] = driver_risk_init_raw[cur_driver] * driver_risk_init_sd;
    }

    for (cur_team in 1:n_teams) {
        vector[n_seasons - 1] cur_offsets = cumulative_sum(team_walk_raw[cur_team])' * team_season_sd;
        team_skills[cur_team, 1] = team_init_raw[cur_team] * team_init_sd;
        team_skills[cur_team, 2:n_seasons] = team_skills[cur_team, 1] + cur_offsets';

        // DNF risk
        cur_offsets = cumulative_sum(team_risk_walk_raw[cur_team])' * team_risk_walk_sd;
        team_risk[cur_team, 1] = team_risk_init_raw[cur_team] * team_risk_init_sd;
        team_risk[cur_team, 2:n_seasons] = team_risk[cur_team, 1] + cur_offsets';
    }

}
model {
    int cur_start_index = 1;

    dnf_intercept ~ normal(0, 1);

    driver_risk_init_sd ~ normal(0, 1);
    driver_risk_init_raw ~ std_normal();

    team_risk_init_raw ~ std_normal();
    to_vector(team_risk_walk_raw) ~ std_normal();
    team_risk_init_sd ~ normal(0, 1);
    team_risk_walk_sd ~ normal(0, 1);

    driver_init_raw ~ std_normal();
    to_vector(driver_walk_raw) ~ std_normal();

    team_init_raw ~ std_normal();
    to_vector(team_walk_raw) ~ std_normal();

    team_init_sd ~ normal(0, 1);
    driver_init_sd ~ normal(0, 1);

    team_season_sd ~ normal(0, 1);
    driver_season_sd ~ normal(0, 1);

    // Conditional on finishing
    for (cur_race in 1:n_races) {

        int cur_finished = n_finished_by_race[cur_race];

        vector[cur_finished] cur_skills;

        array[cur_finished] int cur_placements = driver_placings[cur_start_index:cur_start_index+cur_finished-1];
        array[cur_finished] int cur_teams = team_ids[cur_start_index:cur_start_index+cur_finished-1];
        array[cur_finished] int cur_seasons = season_id[cur_start_index:cur_start_index+cur_finished-1];

        for (i in 1:cur_finished) {
            cur_skills[i] = driver_skills[cur_placements[i], cur_seasons[i]] +
            team_skills[cur_teams[i], cur_seasons[i]];
            0 ~ bernoulli_logit(driver_risk[cur_placements[i]] +
            team_risk[cur_teams[i], cur_seasons[i]] + dnf_intercept);
        }

        target += compute_log_likelihood(cur_skills, cur_finished);

        cur_start_index += cur_finished;

    }

    // Conditional on not finishing
    for (cur_dnf in 1:n_dnf) {
        real cur_logit_prob_dnf = driver_risk[driver_ids_dnf[cur_dnf]] +
        team_risk[team_ids_dnf[cur_dnf], season_ids_dnf[cur_dnf]]
            + dnf_intercept;
        1 ~ bernoulli_logit(cur_logit_prob_dnf);
    }

}
generated quantities {

    int cur_start_index = 1;

    vector[n_races] log_likelihood;

    // Conditional on finishing:
    for (cur_race in 1:n_races) {

        int cur_finished = n_finished_by_race[cur_race];

        vector[cur_finished] cur_skills;

        array[cur_finished] int cur_placements = driver_placings[cur_start_index:cur_start_index+cur_finished-1];
        array[cur_finished] int cur_teams = team_ids[cur_start_index:cur_start_index+cur_finished-1];
        array[cur_finished] int cur_seasons = season_id[cur_start_index:cur_start_index+cur_finished-1];

        log_likelihood[cur_race] = 0;

        for (i in 1:cur_finished) {
            cur_skills[i] = driver_skills[cur_placements[i], cur_seasons[i]] +
            team_skills[cur_teams[i], cur_seasons[i]];

            log_likelihood[cur_race] += bernoulli_logit_lpmf(
                0 | driver_risk[cur_placements[i]] +
                team_risk[cur_teams[i], cur_seasons[i]] + dnf_intercept);
        }

        log_likelihood[cur_race] += compute_log_likelihood(cur_skills, cur_finished);

        cur_start_index += cur_finished;

    }

    // Conditional on not finishing:
    for (cur_dnf in 1:n_dnf) {
        real cur_logit_prob_dnf = driver_risk[driver_ids_dnf[cur_dnf]] +
           team_risk[team_ids_dnf[cur_dnf], season_ids_dnf[cur_dnf]] + dnf_intercept;
        log_likelihood[race_ids_dnf[cur_dnf]] += bernoulli_logit_lpmf(1 |cur_logit_prob_dnf);
    }
}
"""

# Write the Stan program to disk. The context manager closes (and therefore flushes)
# the file before CmdStan is asked to compile it; an anonymous open() handle is only
# closed whenever the interpreter happens to collect it.
with open('./f1_model.stan', 'w') as stan_file:
    print(model_code, file=stan_file)

In [None]:
from cmdstanpy import cmdstan_path, CmdStanModel
import cmdstanpy

In [None]:
# Inputs for the Stan program. Every index array gets +1 because the label encoders
# produce 0-based codes while Stan indexes from 1.
data = {'n_drivers': len(encoder.classes_), 
        'n_finished_by_race': n_finished,
        'n_finished_total': total_finished,
        'n_races': n_races, 
        'driver_placings': driver_id + 1,
        'season_id': years + 1,
        'n_seasons': len(year_encoder.classes_),
        'team_ids': team_id + 1, 'n_teams': len(team_encoder.classes_),
        'n_dnf': not_finished.shape[0],
        'driver_ids_dnf': driver_ids_dnf + 1,
        'team_ids_dnf': team_ids_dnf + 1,
        'season_ids_dnf': season_ids_dnf + 1,
        'race_ids_dnf': dnf_race_ids + 1}

In [None]:
data['season_id']

In [None]:
# Compile the Stan file and sample with CmdStanPy's default sampler settings
# (no seed is passed, so draws differ between runs).
model = CmdStanModel(stan_file='./f1_model.stan')

posterior = model.sample(data=data)

In [None]:
import arviz as az 

# Alias for the sampler result; `fit` is rebound to a dict of draws a few cells below.
fit = posterior

In [None]:

arviz_version = az.from_cmdstanpy(fit, log_likelihood='log_likelihood')

In [None]:
az.rhat(arviz_version)

In [None]:
# From here on `fit` is the dict returned by stan_variables(), not the sampler result.
fit = posterior.stan_variables()

In [None]:
fit['driver_season_sd'].mean()

In [None]:
fit['team_season_sd'].mean()

In [None]:
fit['dnf_intercept'].mean()

In [None]:
# Drivers and teams with at least one finish in 2022.
# Teams are read from 'grouped_constructors' because that is the column team_encoder
# was fitted on, and the team tables indexed by team_encoder.classes_ are later
# selected with .loc[current_teams]; a raw constructorRef that was renamed would be
# a missing label there.
current_drivers = finished[finished['year'] == 2022]['driverRef'].unique()
current_teams = finished[finished['year'] == 2022]['grouped_constructors'].unique()

In [None]:
current_drivers

In [None]:
# Drivers with at least one finish in 2016.
older_drivers = finished[finished['year'] == 2016]['driverRef'].unique()

In [None]:
fit['driver_skills'].shape

In [None]:
# Skill averaged over axis 0 of the draws array, laid out as drivers x seasons;
# shown for the current drivers, ranked by their 2022 value.
pd.DataFrame(fit['driver_skills'].mean(axis=0), index=encoder.classes_,
             columns=year_encoder.classes_).loc[current_drivers].round(3).sort_values(2022, ascending=False)[2022]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from ml_tools.plotting import add_legend_on_right


# Drivers that appear in both current_drivers and older_drivers.
long_drivers = np.intersect1d(current_drivers, older_drivers)

f, ax = plt.subplots(1, 1)

# NOTE: the ordering produced by sort_values() is superseded by the final
# .loc[long_drivers], which returns the rows in long_drivers order.
skills = pd.DataFrame(fit['driver_skills'].mean(axis=0), index=encoder.classes_,
             columns=year_encoder.classes_).loc[current_drivers].round(3).sort_values(
    2021, ascending=False).loc[long_drivers]

# Transposed so that seasons run along the x axis, one line per driver.
skills.T.plot(legend=False, ax=ax, marker='o')

f.set_size_inches(8, 4)

add_legend_on_right(ax)

#f.tight_layout()

plt.title('Driver ratings since 2014')
#plt.savefig('/home/martin/projects/martiningram.github.io/images/f1_post/driver_ratings.png', dpi=300)

In [None]:
from ml_tools.plotting import add_legend_on_right

f, ax = plt.subplots(1, 1)

# Team skill averaged over axis 0 of the draws array (teams x seasons), restricted to
# current_teams, ordered by the 2021 value and transposed so seasons run along x.
pd.DataFrame(fit['team_skills'].mean(axis=0), index=team_encoder.classes_,
             columns=year_encoder.classes_).round(3).loc[current_teams].sort_values(2021, ascending=False).T.plot(
    ax=ax, legend=False, marker='o')

ax.grid(alpha=0.5, linestyle='--')

f.set_size_inches(10, 4)

add_legend_on_right(ax)

plt.title('Team ratings since 2014')
#plt.savefig('/home/martin/projects/martiningram.github.io/images/f1_post/team_ratings.png', dpi=300)

In [None]:
# Driver skill averaged over axis 0 of the draws array: rows are drivers, columns are seasons.
driver_df = pd.DataFrame(
    fit['driver_skills'].mean(axis=0), index=encoder.classes_, columns=year_encoder.classes_)

In [None]:
# NOTE: redefines current_drivers to cover everyone with a finish in 2021 or later.
current_drivers = finished[finished['year'] >= 2021]['driverRef'].unique()

In [None]:
driver_df.loc[current_drivers][2022].sort_values(ascending=False).round(3)

In [None]:
# Map each driver's encoded id to the encoded id of the team they finished for in 2021
# (if a driver has several rows, the last one wins, as in a dict comprehension).
# Teams are looked up through 'grouped_constructors', the column team_encoder was
# fitted on; a raw constructorRef that was renamed would be an unseen label.
# Both columns are encoded in a single transform() call instead of one call per row.
cur_team_lookup = dict(zip(
    encoder.transform(finished.loc[finished['year'] == 2021, 'driverRef']),
    team_encoder.transform(finished.loc[finished['year'] == 2021, 'grouped_constructors']),
))

In [None]:
# Team skill averaged over axis 0 of the draws array: rows are teams, columns are seasons.
team_df = pd.DataFrame(
    fit['team_skills'].mean(axis=0), index=team_encoder.classes_, columns=year_encoder.classes_)

In [None]:
import matplotlib.pyplot as plt

# Combined driver + team skill per season for the two title contenders.
(driver_df.loc['max_verstappen'] + team_df.loc['red_bull']).plot(marker='o', label='Verstappen + Red Bull')
(driver_df.loc['hamilton'] + team_df.loc['mercedes']).plot(marker='o', label='Hamilton + Mercedes')
plt.legend()

In [None]:
from scipy.special import expit

# Despite the names, these hold logits: driver risk + intercept + the team's risk in
# the last season column. expit() below turns them into probabilities.
dnf_prob_mv = fit['driver_risk'][:, encoder.transform(['max_verstappen'])[0]] + fit['dnf_intercept'] + (
fit['team_risk'][:, team_encoder.transform(['red_bull'])[0], -1])

dnf_prob_lh = fit['driver_risk'][:, encoder.transform(['hamilton'])[0]] + fit['dnf_intercept'] + (
fit['team_risk'][:, team_encoder.transform(['mercedes'])[0], -1])

expit(dnf_prob_mv).mean(), expit(dnf_prob_lh).mean()

In [None]:
fit['driver_risk'].shape

In [None]:
f, ax = plt.subplots(1, 1)

# DNF probability from intercept + team risk only (no driver term), averaged over axis 0.
# The reshape lets the 1-D intercept broadcast against the 3-D team_risk array.
team_risks = pd.DataFrame(expit((fit['team_risk'] + fit['dnf_intercept'].reshape(-1, 1, 1))).mean(axis=0), 
                          index=team_encoder.classes_, columns=year_encoder.classes_)

team_risks.loc[current_teams].T.plot(ax=ax, legend=False, marker='o')

ax.grid(alpha=0.5, linestyle='--')

f.set_size_inches(12, 5)

add_legend_on_right(ax)

ax.set_ylabel('DNF probability')

plt.title('DNF probability over time')
#plt.savefig('/home/martin/projects/martiningram.github.io/images/f1_post/dnf_probs.png', dpi=300)

In [None]:
# Driver risk term averaged over axis 0; values are on the logit scale (no expit applied).
driver_risks = pd.Series((fit['driver_risk']).mean(axis=0), 
                          index=encoder.classes_)

driver_risks.loc[current_drivers].sort_values(ascending=False).round(3)

In [None]:
team_df[2022].loc[current_teams].sort_values(ascending=False).round(3)

In [None]:

f, ax = plt.subplots(1, 1)

# Team skill per season, one line per current team.
team_df.loc[current_teams].T.plot(ax=ax, marker='o')

add_legend_on_right(ax)

f.set_size_inches(10, 4)
f.tight_layout()

In [None]:
# Look at probability that Red Bull have the better car

In [None]:
# Team skill draws for each team: axis 0 is kept whole and axis 1 is indexed with the
# team code, leaving a 2-D array (assumes axis 0 holds the draws, as in the
# .mean(axis=0) summaries used elsewhere in this notebook).
merc_draws = fit['team_skills'][:, team_encoder.transform(['mercedes'])[0]]
rb_draws = fit['team_skills'][:, team_encoder.transform(['red_bull'])[0]]

In [None]:
# Share of posterior draws in which Red Bull's skill exceeds Mercedes', per season.
# rb_draws / merc_draws are (draws, seasons), so the average has to run over axis 0;
# averaging over the last axis would give one value per draw, which does not match
# the season labels on the x axis.
plt.plot(team_df.columns, (rb_draws > merc_draws).mean(axis=0), marker='o')

In [None]:
# Per-season share of draws with Red Bull ahead of Mercedes (draws are on axis 0).
(rb_draws > merc_draws).mean(axis=0)

In [None]:
# Last row of the finisher table.
finished.iloc[-1]

In [None]:
# Number of rows flagged as DNF, per driver.
n_dnf = rel_results[dnf]['driverRef'].value_counts()

In [None]:
# Each driver's skill averaged across the season columns of driver_df.
mean_skill = driver_df.mean(axis=1)

In [None]:
# Drivers present in both series. Kept as a sorted list rather than a set: pandas does
# not accept a set as an indexer, and a list gives a stable order for plots and fits.
to_show = sorted(set(n_dnf.index) & set(mean_skill.index))

In [None]:
import matplotlib.pyplot as plt
import plotly.express as pxe
import numpy as np

# DNF count against mean skill, one point per driver.
# Index with a list: pandas does not accept a set as an indexer. The same list is used
# for x, y and the hover labels so the three stay aligned.
shown_drivers = list(to_show)
pxe.scatter(x=n_dnf[shown_drivers], y=mean_skill[shown_drivers], hover_name=shown_drivers)

In [None]:
from scipy.stats import linregress

# Linear regression of mean skill on DNF count.
# Index with a list (pandas does not accept a set as an indexer); the same list is used
# for both series so the pairs line up.
regression_drivers = list(to_show)
linregress(n_dnf[regression_drivers], mean_skill[regression_drivers])

In [None]:
n_dnf

In [None]:
# Mean of the driver_init_sd draws.
fit['driver_init_sd'].mean(axis=-1)

In [None]:
# Mean of the team_init_sd draws.
fit['team_init_sd'].mean(axis=-1)

In [None]:
import arviz as az

In [None]:
#az.from_pystan(fit, log_likelihood='log_likelihood').to_netcdf('/media/martin/big_extra_space/f1_fits/with_extra_teams.netcdf')

In [None]:
# Build the ArviZ InferenceData from the CmdStanPy sampler result. By this point `fit`
# has been rebound to the plain dict returned by stan_variables(), and the model was
# never fitted with PyStan, so from_pystan(fit, ...) cannot work here.
arviz_version = az.from_cmdstanpy(posterior, log_likelihood="log_likelihood")

In [None]:
#az.to_netcdf(arviz_version, '/media/martin/big_extra_space/f1_fits/constant_driver_risk_2014.netcdf')

In [None]:
#other_model = az.from_netcdf('/media/martin/big_extra_space/f1_fits/constant_driver_skills_2014.netcdf')

In [None]:
#az.compare({'dynamic_driver_skills': arviz_version, 'fixed_driver_skills': other_model})

In [None]:
# LOO estimate based on the stored log_likelihood, which the Stan program records per race.
az.loo(arviz_version)

In [None]:
# Average log-likelihood of each race over the posterior draws.
# fit['log_likelihood'] is (draws, races), so the average runs over axis 0; the result
# has one entry per encoded race, which is what the assignment by race id below needs.
race_liks = pd.Series(fit['log_likelihood'].mean(axis=0))

In [None]:
races

In [None]:
race_encoder.classes_

In [None]:
# Assign race_liks positionally to the rows selected by race_encoder.classes_
# (the two must have the same length); all other races keep NaN in 'race_liks'.
with_liks = races.set_index('raceId')
with_liks.loc[race_encoder.classes_, 'race_liks'] = race_liks.values

In [None]:
# Sanity check that the lengths agree.
race_liks.shape, len(race_encoder.classes_)

In [None]:
# The 20 modelled races with the lowest 'race_liks' value.
# Only rows without a 'race_liks' value are dropped; a bare dropna() would also discard
# modelled races that merely have a missing value in some unrelated column.
with_liks.dropna(subset=['race_liks']).sort_values('race_liks').head(20)

In [None]:
# Finisher rows of race 1063 with their position and status.
finished[finished['raceId'] == 1063][['position', 'driverRef', 'status']]

In [None]:
# Can I simulate points?

In [None]:
fit['driver_skills'].shape

In [None]:
current_drivers

In [None]:
# Driver -> team pairing taken from the 2021 finisher rows; when a driver has several
# rows, the team of the last one is kept.
cur_races = finished[finished['year'] == 2021]
cur_combos = dict(zip(cur_races['driverRef'], cur_races['grouped_constructors']))

combo_series = pd.Series(cur_combos)

# Same pairing expressed in encoder codes: the index holds driver codes and the
# values hold team codes.
index_version = combo_series.copy()
index_version.index = encoder.transform(index_version.index)
index_version = index_version.map(lambda team_name: team_encoder.transform([team_name])[0])

index_version

In [None]:
# Conditional on finishing
cur_combo_skills = (fit['driver_skills'][index_version.index, -1, :] + 
                    fit['team_skills'][index_version.values, -1, :])

# Probability of not finishing
not_finish = (fit['driver_risk'][index_version.index, :] + 
              fit['team_risk'][index_version.values, -1, :] +
              fit['dnf_intercept'])

In [None]:
# Histograms of the combined driver + team skill for Verstappen and Hamilton,
# taking each row of cur_combo_skills as one driver/team pairing.
pd.DataFrame(cur_combo_skills, index=combo_series.index).loc['max_verstappen'].hist()
pd.DataFrame(cur_combo_skills, index=combo_series.index).loc['hamilton'].hist()

In [None]:
# Last-season skill draws, one row per posterior draw and one column per driver/team.
# The stan_variables() arrays are (draws, entity, season), so the last season is
# selected on the final axis and no transpose is needed.
driver_draws = pd.DataFrame(fit['driver_skills'][:, :, -1], columns=encoder.classes_)
team_draws = pd.DataFrame(fit['team_skills'][:, :, -1], columns=team_encoder.classes_)

In [None]:
# Probability of beating a driver with skill 0, per driver and season:
# exp(s) / (exp(s) + exp(0)), averaged over the posterior draws.
# The draws are on axis 0 of fit['driver_skills'], so that is the axis to average over
# to end up with a drivers x seasons table.
driver_probs = pd.DataFrame((np.exp(fit['driver_skills']) / (np.exp(fit['driver_skills']) + np.exp(0))).mean(axis=0),
             index=encoder.classes_, columns=year_encoder.classes_)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from ml_tools.plotting import add_legend_on_right

# Drivers that appear in both current_drivers and older_drivers.
long_drivers = np.intersect1d(current_drivers, older_drivers)

f, ax = plt.subplots(1, 1)

# The final .loc[long_drivers] determines the row order (it supersedes the sort).
skills = driver_probs.loc[current_drivers].round(3).sort_values(
    2021, ascending=False).loc[long_drivers]

# Transposed so that seasons run along the x axis, one line per driver.
skills.T.plot(legend=False, ax=ax, marker='o')

f.set_size_inches(8, 4)

add_legend_on_right(ax)
ax.grid(linestyle='--', alpha=0.5)
ax.set_xlabel('Year')
ax.set_ylabel('Win probability')

#f.tight_layout()

plt.title('Probability of beating average driver in a race')
# Saved next to the notebook, like the other figures written by this notebook; an
# absolute path into one user's home directory fails on any other machine.
plt.savefig('./driver_probs.png', dpi=300)

In [None]:
# Average over draws of exp(s_verstappen) / (exp(s_verstappen) + exp(s_hamilton)),
# using driver skill only (no team term).
(np.exp(driver_draws['max_verstappen']) / (np.exp(driver_draws['max_verstappen']) + np.exp(driver_draws['hamilton']))).mean()

In [None]:
# Joint scatter of the skill draws of two drivers.
plt.scatter(driver_draws['stroll'], driver_draws['vettel'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

f, ax = plt.subplots(1, 1)

# NOTE(review): sns.distplot is deprecated in recent seaborn releases - consider
# histplot/kdeplot when this cell is next touched.
sns.distplot(driver_draws['hamilton'], ax=ax, label='Lewis Hamilton')
sns.distplot(driver_draws['max_verstappen'], ax=ax, label='Max Verstappen')

ax.legend()
ax.set_xlabel('Skill (logit scale)')

f.set_size_inches(7, 4)
f.tight_layout()

plt.savefig('./max_vs_lewis_kde.png', dpi=300)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

f, ax = plt.subplots(1, 1)

# Same plot as the previous cell, for the two teams instead of the two drivers.
sns.distplot(team_draws['mercedes'], ax=ax, label='Mercedes')
sns.distplot(team_draws['red_bull'], ax=ax, label='Red Bull')

ax.legend()
ax.set_xlabel('Skill (logit scale)')

f.set_size_inches(7, 4)
f.tight_layout()

plt.savefig('./merc_rb_draws.png', dpi=300)

In [None]:
# 2.5th, 50th and 97.5th percentiles of the Verstappen skill draws.
np.round(np.percentile(driver_draws['max_verstappen'], [2.5, 50., 97.5]), 2)

In [None]:
# Share of draws in which Verstappen's skill exceeds Hamilton's.
np.mean(driver_draws['max_verstappen'] > driver_draws['hamilton'])

In [None]:
# Points awarded for positions 1 to 10.
points = np.array([25, 18, 15, 12, 10, 8, 6, 4, 2, 1])
len(points)

In [None]:
import numpy as np

all_results = list()

# Simulate: walk along the last axis of not_finish / cur_combo_skills, treating each
# slice as one posterior draw, and decide for every pairing whether it finishes.
for cur_draw in range(not_finish.shape[-1]):

    # Copy, because the DNF entries are overwritten below and the array of posterior
    # draws must stay intact.
    cur_skills = cur_combo_skills[..., cur_draw].copy()
    # expit() already returns a new array, so no copy is needed here.
    cur_prob_not_finish = expit(not_finish[..., cur_draw])

    # A pairing finishes when its uniform draw exceeds its DNF probability.
    finish = np.random.uniform(size=cur_prob_not_finish.shape[0]) > cur_prob_not_finish

    # Non-finishers are marked with -inf so they always rank behind every finisher.
    cur_skills[~finish] = -np.inf

    cur_result = pd.Series(cur_skills, index=combo_series.index)

    all_results.append(cur_result)
    


In [None]:
# One row per simulated race, one column per driver; -inf marks a simulated DNF.
all_results = pd.DataFrame(all_results)

In [None]:
# Simulated DNF rate per driver.
(all_results == -np.inf).mean().sort_values()

In [None]:
# Gumbel: Maybe I can assume scale is 1? Check this.
np.random.gumbel()

In [None]:
points = np.array([25, 18, 15, 12, 10, 8, 6, 4, 2, 1])
# Finishing position (1-based) -> points; positions beyond the table are absent.
point_dict = {i+1: x for i, x in enumerate(points)}

point_dict

In [None]:
# Turn each simulated set of skills into a finishing order by adding independent
# Gumbel(scale=1) noise to every skill and sorting in descending order.
# Drivers marked -inf (simulated DNF) stay at -inf and therefore rank last.

all_points = list()
all_orders = list()

for i, cur_result in all_results.iterrows():

    cur_draw = pd.Series(np.random.gumbel(loc=cur_result, scale=1),
                          index = cur_result.index)

    # Positions 1..n in descending order of the noisy skill.
    cur_order = cur_draw.sort_values(ascending=False)
    cur_order = pd.Series(np.arange(cur_order.shape[0]) + 1, index=cur_order.index)
    all_orders.append(cur_order)

    # Use a name of its own so the module-level `points` table is not overwritten, and
    # cast to float up front so that NaN can be stored without changing the dtype.
    race_points = cur_order.apply(lambda x: point_dict.get(x, 0)).astype(float)

    # DNFs get NaN rather than 0 so they can be told apart from finishing outside the points.
    race_points[cur_draw == -np.inf] = np.nan

    all_points.append(race_points)

all_points = pd.DataFrame(all_points)

In [None]:
# Distribution of Verstappen's simulated finishing position.
pd.DataFrame(all_orders)['max_verstappen'].hist()

In [None]:
all_points['max_verstappen'].hist()
all_points['perez'].hist()

In [None]:
# This doesn't make sense. Mazepin shouldn't be so high.
# And the Verstappen thing doesn't make sense either. Perez shouldn't finish _that_ much more often. Hm.
# Expected points per race, counting a DNF (NaN) as zero points.
all_points.fillna(0).mean().sort_values(ascending=False).round(2)

In [None]:
# NOTE: the two drivers are summed before fillna, so a race in which either of them
# did not finish contributes 0 for the pair.
(all_points['sainz'] + all_points['leclerc']).fillna(0).mean()

In [None]:
(all_points['norris'] + all_points['ricciardo']).fillna(0).mean()

In [None]:
15 * 17, 15 * 10.11

In [None]:
all_points

In [None]:
def split_given_size(a, size):
    """Split ``a`` along its first axis into consecutive chunks of ``size`` items.

    The cut positions are ``size, 2 * size, ...`` below ``len(a)``, so the last
    chunk is shorter whenever ``len(a)`` is not a multiple of ``size``.
    Returns the list of sub-arrays produced by ``np.split``.
    """
    boundaries = np.arange(start=size, stop=len(a), step=size)
    chunks = np.split(a, boundaries)
    return chunks


# Chunk the simulated races (DNFs counted as 0 points) into groups of 1 row each.
point_splits = split_given_size(all_points.fillna(0).values, 1)

In [None]:
# Keep only the chunks that have the full chunk length. Only the last chunk can be
# short, but it is complete whenever the chunk size divides the number of simulated
# races, so slicing it off unconditionally would throw away a valid simulation.
full_size = len(point_splits[0])
only_full = [x for x in point_splits if len(x) == full_size]

assert(len(set([len(x) for x in only_full])) == 1)

In [None]:
# Total points per driver within each chunk of simulated races.
point_sims = [pd.Series(x.sum(axis=0), index=all_points.columns) for x in only_full]

point_sims = pd.DataFrame(point_sims)

point_sims['max_verstappen'].hist()
point_sims['hamilton'].hist()

In [None]:
point_sims['hamilton'].describe()

In [None]:
np.percentile(point_sims['bottas'], [2.5, 50, 97.5])

In [None]:
# 2.5 / 50 / 97.5 percentiles of the simulated totals per driver, ordered by the median (row label 1).
point_sims.apply(lambda x: np.percentile(x, [2.5, 50, 97.5]), axis=0).T.sort_values(1, ascending=False)

In [None]:
# Share of simulations in which Hamilton has the highest total.
(point_sims.idxmax(axis=1) == 'hamilton').mean()

In [None]:
all_points['max_verstappen'].fillna(0).hist()

In [None]:
# Share of simulations in which Hamilton outscores Verstappen.
((point_sims['hamilton'] - point_sims['max_verstappen']) > 0).mean()

In [None]:
all_points['max_verstappen']

In [None]:
1 / (1 - 0.2933)

In [None]:
all_points['hamilton'].fillna(0).hist()

In [None]:
# Mean DNF probability per driver/team pairing, taking rows of not_finish as pairings.
pd.DataFrame(expit(not_finish), index=combo_series.index).mean(axis=1).sort_values().round(3)

In [None]:
pd.DataFrame(expit(not_finish), index=combo_series.index).loc['max_verstappen'].hist()
pd.DataFrame(expit(not_finish), index=combo_series.index).loc['hamilton'].hist()

In [None]:
cur_combo_skills

In [None]:
# Average log-likelihood of each race over the posterior draws, labelled by race id.
# fit['log_likelihood'] is (draws, races), so the average runs over axis 0; averaging
# over axis 1 gives one value per draw, which cannot be labelled with the race ids.
liks = pd.Series(fit['log_likelihood'].mean(axis=0), index=race_encoder.classes_)

In [None]:
# Races ordered from lowest to highest value in `liks`.
liks.sort_values()

In [None]:
races[races['raceId'] == 1063]

In [None]:
# Markdown table of race 1063, ordered by positionOrder.
print(results[results['raceId'] == 1063].sort_values('positionOrder')[['position', 'driverRef']].reset_index(drop=True).to_markdown())

In [None]:
# Probability that skill 4 beats skill 2 under the exp(a) / (exp(a) + exp(b)) rule.
np.exp(4) / (np.exp(4) + np.exp(2))

In [None]:
# Gasly check

In [None]:
races[races['name'].str.contains('Azerbaijan')]

In [None]:
finished.iloc[-1]