In [1]:
import biathlonresults
import itertools
import pandas
import requests
import shutil
from datetime import timedelta, datetime
from pathlib import Path
from tika import parser

In [2]:
events = biathlonresults.events(1920, 1)
competitions = {}
for event in events:
    competitions[event["EventId"]] = biathlonresults.competitions(event["EventId"])

We limit ourselves to men's competitions. Relays are a bit harder to compare than other competitions so we remove them as well:

In [3]:
men_non_relays = []
for _, comps in competitions.items():
    for comp in comps:
        desc = comp["Description"]
        if desc.startswith("Men") and not "Relay" in desc:
            men_non_relays.append(comp)

Download the analysis pdfs, which contain course times per athlete:

In [4]:
pdf_dir = Path("pdfs")
pdf_dir.mkdir(exist_ok=True)
pdfs = {}
if pdf_dir.exists(): # use cached pdfs
    for pdf in pdf_dir.iterdir():
        pdfs[pdf.stem] = pdf
else:
    for comp in men_non_relays:
        reports = biathlonresults.api._request("Reports2", {"RaceId": comp["RaceId"]})
        for report in reports["Reports"]:
            if report["Description"] == "COMPETITION ANALYSIS":
                break
        else:
            continue
        pdf_file = pdf_dir / f"{comp['RaceId']}.pdf"

        r = requests.get(report["URL"], stream=True)
        r.raise_for_status()
        with pdf_file.open("wb") as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        pdfs[comp["RaceId"]] = pdf_file

Now parse the pdfs (this is ugly, but I don't know a better source for the course times):

In [5]:
def parse_time(time):
    try:
        time = datetime.strptime(time, "%H:%M:%S.%f")
    except ValueError:
        try:
            time = datetime.strptime(time, "%M:%S.%f")
        except ValueError:
            return None
    return timedelta(hours=time.hour, minutes=time.minute, seconds=time.second, microseconds=time.microsecond)

def parse_pdf(pdf_file):
    raw_data = parser.from_file(str(pdf_file))
    next_rank = 1
    last_rank = None
    current_athlete = None
    course_times = {}
    for line in raw_data["content"].split("\n"):
        line = line.split()
        if not line:
            continue
        if line[0] == "Did":
            # Did not start and Did not finish
            break
        elif line[0] == f"{next_rank}" or line[0] == f"{last_rank}":
            if line[0] == f"{next_rank}":
                last_rank = next_rank
            next_rank += 1
            current_athlete = " ".join(line[2:-5])
        elif line[0] == "Course" and line[1] == "Time":
            if not current_athlete:
                assert current_athlete
            times = []
            for elem in line:
                time = parse_time(elem)
                if time:
                    times.append(time)
            course_times[current_athlete] = times
            current_athlete = None
    return course_times
            
course_times = {}
for raceid, pdf in pdfs.items():
    course_times[raceid] = parse_pdf(pdf)

Now we select which times we want to analyze. For mass starts and pursuits, we remove the final lap since not all athletes have to give their best then. For the other formats, we use the official total time:

In [6]:
from pprint import pprint
for comp, athletes in course_times.items():
    if comp.endswith("MS") or comp.endswith("PU"):
        course_times[comp] = {athlete: sum(times[:-2], timedelta()) for athlete, times in athletes.items()}
    else:
        course_times[comp] = {athlete: times[-1] for athlete, times in athletes.items()}

We want to compare the difference of the top athletes from the mean. For the athletes we are interested in, this difference would be higher in competitions with a large number of athletes, so we wouldn't be able to compare differences across competitions.

Therefore, we have to limit the analysis to only the top 30 athletes in each competition:

In [7]:
top30 = {}
for comp, times in course_times.items():
    # works only for python 3.7+ since the dict must be ordered
    top30[comp] = dict(itertools.islice(times.items(), 30))

Now, we convert to pandas for easier analysis (athletes in rows and competitions in columns):

In [8]:
athletes = set()
athletes = athletes.union(*[athletes for _, athletes in top30.items()])
athletes = list(athletes) # we need an order
table = {}
athlete_rows = {}
for comp, times in top30.items():
    col = []
    for athlete in athletes:
        if athlete in times:
             # working with timedelta is a bit annoying in pandas, so we just use seconds
            time = times[athlete].total_seconds()
            col.append(time)
        else:
            col.append(None)
    table[comp] = col
df = pandas.DataFrame(table, index=athletes)
df

Unnamed: 0,BT1920SWRLCP02SMSP,BT1920SWRLCP06SMIN,BT1920SWRLCP01SMIN,BT1920SWRLCP01SMSP,BT1920SWRLCH__SMSP,BT1920SWRLCP03SMPU,BT1920SWRLCP05SMPU,BT1920SWRLCP04SMSP,BT1920SWRLCP05SMSP,BT1920SWRLCP02SMPU,BT1920SWRLCP06SMMS,BT1920SWRLCP03SMMS,BT1920SWRLCP04SMMS,BT1920SWRLCP03SMSP
BIRKELAND Lars Helge,,,,1405.9,,,1374.9,,,,,,,
BORMOLINI Thomas,,,3163.2,1401.5,,,,,,,,,,
MORAVEC Ondrej,,2774.7,3156.7,,,,1387.9,1503.0,1320.5,,,,1928.3,
CHRISTIANSEN Vetle Sjaastad,1417.3,2727.4,,,,1298.5,1360.5,1447.4,1261.7,,1593.4,1921.2,1855.4,1329.8
OTCENAS Martin,,,,,1327.6,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PONSILUOMA Martin,,,3079.7,,1303.7,1347.9,,,,,,1974.7,,1344.4
DOHERTY Sean,,,,,,,,1512.3,1302.8,,,,,
GUIGONNAT Antonin,1424.1,2739.7,,1383.2,,,1354.7,,1292.4,1365.1,1636.0,,,
JACQUELIN Emilien,1413.8,,3020.3,1347.1,1274.8,1300.2,1331.6,1434.9,1291.5,1327.9,1616.0,1865.5,1932.3,1339.4


Construct the relative distance from the mean in each competition per athlete and then the mean per athlete across all competitions:

In [9]:
diff_from_mean = df-df.mean()
rel_per_comp_and_athlete = diff_from_mean / df.mean()
rel_mean_per_athlete = rel_per_comp_and_athlete.mean(axis=1)

For example, Johannes is on average 3.0% faster than the mean athlete:

In [10]:
rel_mean_per_athlete.sort_values()

BOE Johannes Thingnes    -0.029561
FILLON MAILLET Quentin   -0.024592
FOURCADE Martin          -0.023705
BOE Tarjei               -0.017247
DALE Johannes            -0.015299
                            ...   
TRSAN Rok                 0.033569
GUZIK Grzegorz            0.035970
WIESTNER Sera n           0.036534
BAUER Klemen              0.036923
BABIKOV Anton             0.044892
Length: 73, dtype: float64

Now, let's see how much have they differed from their own mean per competition:

In [11]:
rel_improv = (-1*rel_per_comp_and_athlete).add(rel_mean_per_athlete, axis=0)

For example, Matvey Eliseev was 1.1% better than his average in BT1920SWRLCP02SMSP (sprint in Hochfilzen):

In [12]:
pandas.set_option('display.max_rows', len(rel_improv))
rel_improv

Unnamed: 0,BT1920SWRLCP02SMSP,BT1920SWRLCP06SMIN,BT1920SWRLCP01SMIN,BT1920SWRLCP01SMSP,BT1920SWRLCH__SMSP,BT1920SWRLCP03SMPU,BT1920SWRLCP05SMPU,BT1920SWRLCP04SMSP,BT1920SWRLCP05SMSP,BT1920SWRLCP02SMPU,BT1920SWRLCP06SMMS,BT1920SWRLCP03SMMS,BT1920SWRLCP04SMMS,BT1920SWRLCP03SMSP
BIRKELAND Lars Helge,,,,-0.007846,,,0.007846,,,,,,,
BORMOLINI Thomas,,,-0.001949,0.001949,,,,,,,,,,
MORAVEC Ondrej,,0.008434,-0.005704,,,,-0.001025,0.001571,-0.003097,,,,-0.000179,
CHRISTIANSEN Vetle Sjaastad,-0.013345,-0.006098,,,,0.009859,-0.012639,0.007501,0.010727,,0.004359,-0.004036,0.006645,-0.002973
OTCENAS Martin,,,,,0.0,,,,,,,,,
SAMUELSSON Sebastian,,,-0.020939,0.003038,0.006635,,0.004191,,0.004599,,,,,0.002476
HOFER Lukas,0.000924,-0.010249,0.037008,-0.020932,-0.008526,0.006365,,,,-0.010365,0.011083,0.001602,-0.012508,0.0056
PIDRUCHNYI Dmytro,0.008244,-0.009311,0.008754,,0.010047,0.009167,,,0.001547,0.006945,-0.018147,-0.027983,-0.002836,0.013573
HORN Philipp,0.001407,0.009081,-0.014874,,0.021478,-0.002951,,-0.000845,-0.003387,0.006549,0.005503,-0.032426,0.008206,0.00226
BOE Tarjei,0.002733,0.00038,0.005386,0.010916,0.00189,-0.006041,-0.00453,-0.004855,-0.008838,-0.011555,0.000394,0.022643,-0.009676,0.001154


And these are the top improvers per competition (BT1920SWRLCH__SMSP is the one where Loginov won):

In [13]:
pandas.reset_option('display.max_rows')
pandas.concat((rel_improv.idxmax(), rel_improv.max()), axis=1, keys=["athlete", "improvement"])

Unnamed: 0,athlete,improvement
BT1920SWRLCP02SMSP,LAPSHIN Timofei,0.016968
BT1920SWRLCP06SMIN,DALE Johannes,0.014633
BT1920SWRLCP01SMIN,HOFER Lukas,0.037008
BT1920SWRLCP01SMSP,ILIEV Vladimir,0.016278
BT1920SWRLCH__SMSP,LAPSHIN Timofei,0.0279
BT1920SWRLCP03SMPU,SCHEMPP Simon,0.016954
BT1920SWRLCP05SMPU,FAK Jakov,0.012332
BT1920SWRLCP04SMSP,ELISEEV Matvey,0.032213
BT1920SWRLCP05SMSP,ELISEEV Matvey,0.013131
BT1920SWRLCP02SMPU,CLAUDE Fabien,0.012331


In particular, Loginov was only 2.8% faster than his average in that race:

In [14]:
rel_improv["BT1920SWRLCH__SMSP"]["LOGINOV Alexander"]

0.02782688256576994