In [1]:
import biathlonresults
import itertools
import pandas
import requests
import shutil
from datetime import timedelta, datetime
from pathlib import Path
from tika import parser

In [2]:
events = biathlonresults.events(1920, 1)
competitions = {}
for event in events:
    competitions[event["EventId"]] = biathlonresults.competitions(event["EventId"])

We limit ourselves to men's competitions. Relays are a bit harder to compare than other competitions so we remove them as well:

In [3]:
men_non_relays = []
for _, comps in competitions.items():
    for comp in comps:
        desc = comp["Description"]
        if desc.startswith("Men") and not "Relay" in desc:
            men_non_relays.append(comp)

Download the analysis pdfs, which contain course times per athlete:

In [4]:
pdf_dir = Path("pdfs")
pdf_dir.mkdir(exist_ok=True)
pdfs = {}
if pdf_dir.exists(): # use cached pdfs
    for pdf in pdf_dir.iterdir():
        pdfs[pdf.stem] = pdf
else:
    for comp in men_non_relays:
        reports = biathlonresults.api._request("Reports2", {"RaceId": comp["RaceId"]})
        for report in reports["Reports"]:
            if report["Description"] == "COMPETITION ANALYSIS":
                break
        else:
            continue
        pdf_file = pdf_dir / f"{comp['RaceId']}.pdf"

        r = requests.get(report["URL"], stream=True)
        r.raise_for_status()
        with pdf_file.open("wb") as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        pdfs[comp["RaceId"]] = pdf_file

Now parse the pdfs (this is ugly, but I don't know a better source for the course times):

In [5]:
def parse_pdf(pdf_file):
    raw_data = parser.from_file(str(pdf_file))
    next_rank = 1
    last_rank = None
    current_athlete = None
    course_times = {}
    for line in raw_data["content"].split("\n"):
        line = line.split()
        if not line:
            continue
        if line[0] == "Did":
            # Did not start and Did not finish
            break
        elif line[0] == f"{next_rank}" or line[0] == f"{last_rank}":
            if line[0] == f"{next_rank}":
                last_rank = next_rank
            next_rank += 1
            current_athlete = " ".join(line[2:-5])
        elif line[0] == "Course" and line[1] == "Time":
            if not current_athlete:
                assert current_athlete
            time = line[-3]
            try:
                time = datetime.strptime(time, "%H:%M:%S.%f")
            except ValueError:
                time = datetime.strptime(time, "%M:%S.%f")
            time = timedelta(hours=time.hour, minutes=time.minute, seconds=time.second, microseconds=time.microsecond)
            course_times[current_athlete] = time
            current_athlete = None
    return course_times
            
course_times = {}
for raceid, pdf in pdfs.items():
    course_times[raceid] = parse_pdf(pdf)

We want to compare the difference of the top athletes from the mean. For the athletes we are interested in, this difference would be higher in competitions with a large number of athletes, so we wouldn't be able to compare differences across competitions.

Therefore, we have to limit the analysis to only the top 30 athletes in each competition:

In [6]:
top30 = {}
for comp, times in course_times.items():
    # works only for python 3.7+ since the dict must be ordered
    top30[comp] = dict(itertools.islice(times.items(), 30))

Now, we convert to pandas for easier analysis (athletes in rows and competitions in columns):

In [7]:
athletes = set()
athletes = athletes.union(*[athletes for _, athletes in top30.items()])
athletes = list(athletes) # we need an order
table = {}
athlete_rows = {}
for comp, times in top30.items():
    col = []
    for athlete in athletes:
        if athlete in times:
             # working with timedelta is a bit annoying in pandas, so we just use seconds
            time = times[athlete].total_seconds()
            col.append(time)
        else:
            col.append(None)
    table[comp] = col
df = pandas.DataFrame(table, index=athletes)
df

Unnamed: 0,BT1920SWRLCP02SMSP,BT1920SWRLCP06SMIN,BT1920SWRLCP01SMIN,BT1920SWRLCP01SMSP,BT1920SWRLCH__SMSP,BT1920SWRLCP03SMPU,BT1920SWRLCP05SMPU,BT1920SWRLCP04SMSP,BT1920SWRLCP05SMSP,BT1920SWRLCP02SMPU,BT1920SWRLCP06SMMS,BT1920SWRLCP03SMMS,BT1920SWRLCP04SMMS,BT1920SWRLCP03SMSP
BJOENTEGAARD Erlend,1437.5,,3068.9,1346.3,,1600.8,1676.1,1431.4,1279.7,1646.5,1970.7,2387.0,2186.6,1319.6
LOGINOV Alexander,1409.2,2724.7,3063.1,1358.3,1260.7,1679.5,1666.6,1449.0,1305.5,1686.7,2010.3,2381.7,2213.3,1349.4
STRELTSOV Kirill,,2844.7,,,,,,,,,2057.4,,,
CLAUDE Florent,,,3117.9,1381.3,,1668.1,,,,,,2471.2,,1350.8
FAK Jakov,1433.0,2744.5,,1390.4,,,1683.6,1500.0,1307.8,1676.1,2052.5,2391.8,2230.2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BAUER Klemen,1465.1,,,,,,,,,,,,,
DUDCHENKO Anton,1465.1,,,,,1658.1,1728.4,,,,,,,
ELISEEV Matvey,1417.3,,3121.0,1368.6,1309.5,1729.7,1701.4,1447.0,1290.1,1699.4,2047.6,2655.7,2218.2,1361.6
LEITNER Felix,1424.9,,3040.5,,1304.4,,,,,1674.5,,2373.6,2269.8,


Construct the relative distance from the mean in each competition per athlete and then the mean per athlete across all competitions:

In [8]:
diff_from_mean = df-df.mean()
rel_per_comp_and_athlete = diff_from_mean / df.mean()
rel_mean_per_athlete = rel_per_comp_and_athlete.mean(axis=1)

For example, Johannes is on average 2.8% faster than the mean athlete:

In [9]:
rel_mean_per_athlete.sort_values()

BOE Johannes Thingnes    -0.027684
FILLON MAILLET Quentin   -0.022864
FOURCADE Martin          -0.022447
KUEHN Johannes           -0.016629
BOE Tarjei               -0.016537
                            ...   
LANDERTINGER Dominik      0.032851
GUZIK Grzegorz            0.035970
LAPSHIN Timofei           0.035998
BAUER Klemen              0.036923
BABIKOV Anton             0.044892
Length: 73, dtype: float64

Now, let's see how much have they differed from their own mean per competition:

In [10]:
rel_improv = (-1*rel_per_comp_and_athlete).add(rel_mean_per_athlete, axis=0)

For example, Matvey Eliseev was 1.3% better than his average in BT1920SWRLCP02SMSP (sprint in Hochfilzen):

In [11]:
pandas.set_option('display.max_rows', len(rel_improv))
rel_improv

Unnamed: 0,BT1920SWRLCP02SMSP,BT1920SWRLCP06SMIN,BT1920SWRLCP01SMIN,BT1920SWRLCP01SMSP,BT1920SWRLCH__SMSP,BT1920SWRLCP03SMPU,BT1920SWRLCP05SMPU,BT1920SWRLCP04SMSP,BT1920SWRLCP05SMSP,BT1920SWRLCP02SMPU,BT1920SWRLCP06SMMS,BT1920SWRLCP03SMMS,BT1920SWRLCP04SMMS,BT1920SWRLCP03SMSP
BJOENTEGAARD Erlend,-0.030432,,-0.011725,0.001876,,0.01343,-0.005497,0.015568,-0.006031,0.00632,0.01314,-0.006189,0.007691,0.00185
LOGINOV Alexander,-0.00137,0.00113,-0.000804,0.002129,0.027041,-0.025398,0.009161,0.012658,-0.017017,-0.00859,0.002605,0.00505,0.004767,-0.011363
STRELTSOV Kirill,,-0.011008,,,,,,,,,0.011008,,,
CLAUDE Florent,,,0.000642,0.00458,,0.000814,,,,,,-0.012909,,0.006872
FAK Jakov,-0.007233,0.004882,,-0.010377,,,0.010076,-0.010972,-0.007821,0.008705,-0.007267,0.011828,0.008179,
MALYSHKO Dmitry,0.002582,,,,,,,,,-0.002582,,,,
STROLIA Vytautas,,,,,0.001653,,,0.001042,-0.002695,,,,,
NAWRATH Philipp,,0.006778,,,,,0.004702,,-0.003107,,-0.008372,,,
LANGER Thierry,0.002027,,,,,,,,-0.000767,-0.00126,,,,
LATYPOV Eduard,,-0.006687,,0.006687,,,,,,,,,,


And these are the top improvers per competition (BT1920SWRLCH__SMSP is the one where Loginov won):

In [12]:
pandas.reset_option('display.max_rows')
pandas.concat((rel_improv.idxmax(), rel_improv.max()), axis=1, keys=["athlete", "improvement"])

Unnamed: 0,athlete,improvement
BT1920SWRLCP02SMSP,LAPSHIN Timofei,0.020378
BT1920SWRLCP06SMIN,DALE Johannes,0.014242
BT1920SWRLCP01SMIN,HOFER Lukas,0.038638
BT1920SWRLCP01SMSP,ILIEV Vladimir,0.017587
BT1920SWRLCH__SMSP,LAPSHIN Timofei,0.031309
BT1920SWRLCP03SMPU,WINDISCH Dominik,0.015215
BT1920SWRLCP05SMPU,JACQUELIN Emilien,0.014161
BT1920SWRLCP04SMSP,ELISEEV Matvey,0.034005
BT1920SWRLCP05SMSP,ELISEEV Matvey,0.014923
BT1920SWRLCP02SMPU,CLAUDE Fabien,0.011783


In particular, Loginov was only 2.7% faster than his average in that race:

In [13]:
rel_improv["BT1920SWRLCH__SMSP"]["LOGINOV Alexander"]

0.02704111923107036