In [None]:
# Import statements
import logging
import pandas as pd
from pathlib import Path
from datetime import date
from tdf_pool.cycling_calendar import get_calendar
from tdf_pool.race import Race
from tdf_pool.score import get_score_template, score_race
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level='INFO')

In [None]:
# Load the Tour de France of every edition in the analysis window.
# 2014-2023 inclusive, i.e. the ten editions compared in the rest of this
# notebook. A single year is not enough: the per-year normalisation and the
# correlation table further down are undefined for one data point.
years = list(range(2014, 2024))
tdf_races = []
for year in years:
    calendar = get_calendar(year)
    tdf_rows = calendar[calendar['Name'] == "Tour de France"]
    if tdf_rows.empty:
        # Name the offending year instead of failing with an opaque IndexError on .iloc[0].
        raise ValueError(f"No 'Tour de France' entry found in the {year} calendar")
    tdf_entry = tdf_rows.iloc[0]
    tdf = Race(tdf_entry['Name'], tdf_entry['Start'], tdf_entry['Type'], tdf_entry['PartialURL'])
    tdf_races.append(tdf)

In [None]:
# Score every loaded race twice: once with the template that
# get_score_template() returns by default and once with the 2023 template, so
# the two point systems can be compared on the same races.
current_score_template = get_score_template()
tdf_scores = []
for race in tdf_races:
    tdf_scores.append(score_race(race, current_score_template))

score_template_2023 = get_score_template(Path("../2023_score_template.toml"))
tdf_scores_2023 = []
for race in tdf_races:
    tdf_scores_2023.append(score_race(race, score_template_2023))

# Puntendistributies
We kijken naar de puntendistributie van de afgelopen 10 jaar.
- Is de distributie veranderd over de tijd?
- Wat is het effect op de puntentelling als dit jaar de KOM-punten tijdens de etappe worden meegenomen?

In [None]:
# Distributie van punten per jaar
%matplotlib widget
all_scores = []
bins = np.arange(0, 1000, 50)
fig, ax = plt.subplots(1,1, figsize=(8,4))
colors = plt.cm.rainbow(np.linspace(0,1,len(years)))
for year, score, color in zip(years, tdf_scores, colors):
    ax.hist(score['Total'], bins=bins, label=year, density=True, histtype='step', linewidth=1, color=color)
ax.set_xlabel("Punten")
ax.set_ylabel("Percentage van renners")
ax.set_title("Distributie van TDF punten per jaar")
ax.legend()

In [None]:
# Cumulatieve distributie van punten per jaar
%matplotlib widget
all_scores = []
bins = np.arange(0, 1000, 50)
fig, ax = plt.subplots(1,1, figsize=(8,4))
colors = plt.cm.rainbow(np.linspace(0,1,len(years)))
for year, score, color in zip(years, tdf_scores, colors):
    ax.hist(score['Total'], bins=bins, label=year, density=True, histtype='step', linewidth=1, color=color, cumulative=1)
ax.set_xlabel("Punten")
ax.set_ylabel("Percentage van renners")
ax.set_title("Cumulative distributie van TDF punten per jaar")
ax.legend()

De punten distributies van de afgelopen 10 jaar zijn vergelijkbaar.

In [None]:
# Cumulative distributions per category
%matplotlib widget
all_scores = []
fig, axs = plt.subplots(2,2, figsize=(9,9))

classifications = ['GC', 'Sprint', 'Youth', 'KOM']
for ax, classification in zip(axs.flatten(), classifications):
    bin_max = np.max(list(score[classification].max() for score in tdf_scores))
    bins = np.linspace(1, bin_max, 10)
    colors = plt.cm.rainbow(np.linspace(0,1,len(years)))
    for year, score, color in zip(years, tdf_scores, colors):
        ax.hist(score[classification], bins=bins, label=year, density=True, histtype='step', linewidth=1, color=color, cumulative=1)
    ax.set_xlabel("Punten")
    ax.set_ylabel("Percentage van renners")
    ax.set_title(f"Cumulative {classification}")
    ax.legend()

In [None]:
# Ratio between cumulative distributions
%matplotlib widget
all_scores = []
fig, ax = plt.subplots(1,1, figsize=(6,4))

nbins = 20
classifications = ['GC', 'Sprint', 'Youth', 'KOM']
a = 'GC'
b = 'Sprint'

colors = plt.cm.rainbow(np.linspace(0,1,len(years)))
for year, score, color in zip(years, tdf_scores, colors):
    sprint_bins = np.linspace(1, score[a].max(), nbins)
    sprint_h, _ = np.histogram(score[a], bins=sprint_bins, density=True)
    sprint_cd = np.cumsum(sprint_h)*(sprint_bins[1]-sprint_bins[0])

    kom_bins = np.linspace(1, score[b].max(), nbins)
    kom_h, _ = np.histogram(score[b], bins=kom_bins, density=True)
    kom_cd = np.cumsum(kom_h)*(kom_bins[1]-kom_bins[0])

    ax.plot(sprint_cd/kom_cd, label=year, color=color)
    ax.set_xlabel("bin")
    ax.set_ylabel(f"CDF({a})/CDF({b})")
    ax.set_title(f"Ratio of sprint and kom cumulative distribution")
    ax.legend(bbox_to_anchor=(1,1))

fig.tight_layout()

When a few sprinters dominate, the ratio line will lie higher.

In [None]:
# Distributie van punten voor huidige en 2023 puntentelling
%matplotlib widget
all_scores = []
bins = np.arange(0, 1000, 25)
fig, ax = plt.subplots(1,1, figsize=(8,4))

ax.hist(pd.concat(tdf_scores)['Total'], bins=bins, label="huidige puntentelling", density=True, histtype='step', linewidth=2)
ax.hist(pd.concat(tdf_scores_2023)['Total'], bins=bins, label="2023 puntentelling", density=True, histtype='step', linewidth=2, linestyle='dotted')

ax.set_xlabel("Punten")
ax.set_ylabel("Percentage van renners")
ax.set_title("Distributie van punten gedurende TDF")
ax.legend()

In [None]:
# Cumulatieve distributie van punten voor huidige en 2023 puntentelling
%matplotlib widget
all_scores = []
bins = np.arange(0, 1000, 25)
fig, ax = plt.subplots(1,1, figsize=(8,4))

ax.hist(pd.concat(tdf_scores)['Total'], bins=bins, label="huidige puntentelling", density=True, histtype='step', linewidth=2, cumulative=1)
ax.hist(pd.concat(tdf_scores_2023)['Total'], bins=bins, label="2023 puntentelling", density=True, histtype='step', linewidth=2, linestyle='dotted', cumulative=1)
ax.set_xlabel("Punten")
ax.set_ylabel("Percentage van renners")
ax.set_title("Cumulatieve distributie van punten gedurende TDF")
ax.legend()

De cumulatieve distributie van punten gebaseerd op de 2023-puntentelling ligt in het begin iets hoger dan die van de huidige puntentelling. Dit is logisch, omdat er simpelweg meer punten worden uitgedeeld. De distributie behoudt echter dezelfde vorm, dus we kunnen stellen dat we dezelfde renners willen selecteren.

In [None]:
%matplotlib widget
plt.figure()
huidige_punten = pd.concat(tdf_scores)['Total']
candidates = [stats.exponweib, stats.levy, stats.norminvgauss]

plt.hist(huidige_punten, bins=50, density=True)
for dist in candidates:
    # dist = stats.levy
    res = stats.fit(dist, huidige_punten, bounds=[(-10, 100), (-10, 100)])
    print(res.nllf())
    # res.plot()
    x = np.arange(900)
    plt.plot(x, dist.pdf(x, *res.params))

plt.ylim(0, 0.03)

De distributie wordt het beste gefit met een Lévy-distributie.

In [None]:
# Cumulative sums per category
%matplotlib widget
all_scores = []
fig, axs = plt.subplots(2,2, figsize=(9,9))

classifications = ['GC', 'Sprint', 'Youth', 'KOM']
for ax, classification in zip(axs.flatten(), classifications):
    colors = plt.cm.rainbow(np.linspace(0,1,len(years)))
    for year, score, color in zip(years, tdf_scores, colors):
        cumsum_score = score.sort_values(by=classification, ascending=False).reset_index()[classification].cumsum()/score[classification].sum()
        ax.plot(cumsum_score, label=year, linewidth=1, color=color)
    ax.set_xlabel("Number of riders")
    ax.set_ylabel("Percentage of points")
    ax.set_title(f"Cumulative sum of {classification}")
    ax.set_xlim(0, 40)
    ax.legend()

In [None]:
# Dominance score per classification and year.
# Riders are ranked by points and the cumulative share of all points is
# computed; the raw score is the mean of that curve over the riders ranked
# before the one at which the share reaches 100 %. The scores are then centred
# on the mean over the years and rescaled so the lowest year sits at -1.
data = []
classifications = ['GC', 'Sprint', 'Youth', 'KOM', 'Total']
for year, score in zip(years, tdf_scores):
    row = []
    for classification in classifications:
        ranked = score.sort_values(by=classification, ascending=False).reset_index()[classification]
        cumsum_score = ranked.cumsum()/score[classification].sum()
        # Floating-point rounding can leave the final share a hair away from 1,
        # so compare against a tolerance instead of testing exact equality.
        complete = np.flatnonzero(cumsum_score.to_numpy() >= 1 - 1e-9)
        x = int(complete[0]) if complete.size else 0
        # x == 0 means a single rider holds every point, or no points were
        # awarded at all; the mean over zero riders is undefined, so store NaN
        # explicitly rather than dividing by zero.
        row.append(cumsum_score.iloc[:x].sum()/x if x > 0 else np.nan)
    data.append([year] + row)

raw_dominance = pd.DataFrame(data=data, columns=['year']+classifications).set_index('year')

# Centre each classification on its mean over the years.
centered = raw_dominance - raw_dominance.mean()
# Rescale so that the most negative deviation becomes -1.
scale = -centered.min()
# A zero scale means every year has the same value (or only one year is
# loaded); divide by 1 so the column stays at 0 instead of turning into NaN.
tdf_category_dominance = centered / scale.where(scale > 0, 1)

tdf_category_dominance

Er lijkt een sterke anticorrelatie te zijn tussen de AUC's (de oppervlakte onder de cumulatieve-somcurve) van GC en Sprint. Dit betekent dat wanneer de GC-punten door maar een paar renners worden gedomineerd, de sprintpunten verdeeld zijn over meerdere verschillende renners. En als de GC-punten verdeeld zijn over veel renners, zijn er meer sprinters dominant.

Deze observatie komt overeen met empirische observaties.

In [None]:
# Count how many stages of each profile type every race has, then derive the
# number of climbing and sprint stages per year.
data = []
for race in tdf_races:
    # Strip the icon prefix so only the profile code remains, e.g. 'icon profile p4' -> '4'.
    profiles = [stage.profile.replace('icon profile p','') for stage in race.stages]
    unique_profiles, profile_counts = np.unique(profiles, return_counts=True)
    # Plain str/int keys and values keep the resulting column labels and dtypes simple.
    race_profile_counts = {str(profile): int(count) for profile, count in zip(unique_profiles, profile_counts)}
    race_profile_counts["year"] = race.date.year
    data.append(race_profile_counts)

# A profile type that does not occur in a race has no key in that race's dict,
# which pandas turns into NaN. It really means "zero stages", so fill with 0;
# otherwise the sums below and any later correlation silently become NaN.
tdf_profiles = pd.DataFrame(data=data).set_index('year').fillna(0).astype(int)
# NOTE(review): presumably profile codes 4 and 5 are the mountain stages and
# 1 and 2 the flat/sprint stages — confirm against the data source.
tdf_profiles['climb'] = tdf_profiles['4'] + tdf_profiles['5']
tdf_profiles['sprint'] = tdf_profiles['1'] + tdf_profiles['2']

In [None]:
# Put the stage-profile counts and the dominance scores side by side (both are
# indexed by year) and show the pairwise Pearson correlation of all columns.
tdf_race_stats = tdf_profiles.join(tdf_category_dominance, how="left")
tdf_race_stats.corr(method="pearson")

In [None]:
# NOTE(review): presumably the number of stages per profile type on the 2024
# route, ordered as profile codes 1-5 (the 'icon profile p<N>' codes parsed
# elsewhere in this notebook) — confirm the source and the ordering.
tdf_2024_profile_counts = [8, 4, 1, 3, 5]

There are more sprint stages than normal in 2024. Hence, we may expect a slightly flatter sprint point distribution and a slightly more concentrated total point distribution.

There are slightly fewer mountain stages than normal; hence, we may expect a slightly more concentrated total point distribution.