In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors as pc

In [2]:
# Load the subject metadata and the treadmill measurements, each sorted by test
subjects = pd.read_csv("../data/treadmill/subject-info.csv").sort_values(by=["ID_test"])
measures = pd.read_csv("../data/treadmill/test_measure.csv").sort_values(by=["ID_test", "time"])

# Copy the raw gas-exchange columns under descriptive names
# (O2 and CO2 rates in ml/min, VE in l/min)
for rate_col, raw_col in (("O2_rate", "VO2"), ("CO2_rate", "VCO2"), ("air_rate", "VE")):
    measures[rate_col] = measures[raw_col]

# Volume for the given breath: rate divided by the respiratory rate (RR)
for vol_col, rate_col in (("O2_vol", "O2_rate"), ("CO2_vol", "CO2_rate"), ("air_vol", "air_rate")):
    measures[vol_col] = measures[rate_col] / measures["RR"]

# Cumulative volume within each test (running sum over the rows of one ID_test)
for cum_col, vol_col in (("O2_cum", "O2_vol"), ("CO2_cum", "CO2_vol"), ("air_cum", "air_vol")):
    measures[cum_col] = measures.groupby("ID_test")[vol_col].cumsum()

# The raw columns now live on as the *_rate columns
measures = measures.drop(columns=["VO2", "VCO2", "VE"])

# Names of the numerical features (measurement columns plus subject attributes)
measures_numerical = {
    'time', 'Speed', 'HR', 'RR',
    'O2_rate', 'CO2_rate', 'air_rate',
    'O2_vol', 'CO2_vol', 'air_vol',
    'O2_cum', 'CO2_cum', 'air_cum',
    'Age', 'Weight', 'Height', 'Humidity', 'Temperature',
}

In [3]:
# Normalizes the start times of the test from start speed of 5 km/h.
# The explicit copy makes measures_test an independent frame, so the column
# writes below do not raise SettingWithCopyWarning.
measures_test = measures[measures["Speed"] > 5].copy()
normal_starts = measures_test.groupby("ID_test")["time"].transform("first")
measures_test.loc[:, "time"] = measures_test["time"] - normal_starts

# Normalizes the start times of the test from start speed of 6.1 km/h:
# tests still at 6.1 km/h after 100 s are shifted by the time of their first
# sample above 6.1 km/h; every test then gets the same +60 s offset.
ids_slow = measures_test[(measures_test["Speed"] == 6.1) & (measures_test["time"] > 100)]["ID_test"].unique()
first_steps = (
    measures_test[measures_test["ID_test"].isin(ids_slow) & (measures_test["Speed"] > 6.1)]
    .groupby("ID_test")["time"]
    .first()
)
# Map the per-test shift from measures_test itself (not from the unfiltered
# `measures` frame) so the offsets line up row-for-row without relying on
# index alignment; tests that are not in ids_slow get a shift of 0.
slow_offsets = measures_test["ID_test"].map(first_steps).fillna(0)
measures_test.loc[:, "time"] = measures_test["time"] - slow_offsets + 60
measures_test = measures_test[measures_test["time"] > 0]

# Remove tests with non-standard speeds increase rates
outlier_ids = list(measures_test[(measures_test["time"].between(276, 301)) & (measures_test["Speed"] > 11.4)]["ID_test"].unique())
outlier_ids += ["715_1", "796_1", "593_1", "789_1", "479_1", "73_1", "667_1", "318_1", "376_1", "695_1"]
measures_test = measures_test[(measures_test["time"] > 0) & (~measures_test["ID_test"].isin(outlier_ids))]

# Remove cooldown periods (slow samples recorded after 600 s); copy again so
# the final in-place write operates on an independent frame
measures_test = measures_test[~((measures_test["time"] > 600) & (measures_test["Speed"] < 12))].copy()

# Start all tests at time 0
measures_test.loc[:, "time"] = measures_test["time"] - measures_test.groupby("ID_test")["time"].transform("min")

In [4]:
# Average measures for each second
measure_inter = measures_test.groupby(["ID_test", "time"]).mean()

# Get max time achieved by each participant
max_secs = measures_test.groupby("ID_test")["time"].agg("max")

# Generate new index: one row per test and per second from 0 to the test's max time
tuples = [
    (id_test, second)
    for id_test, max_second in max_secs.items()
    for second in range(int(max_second) + 1)  # int() keeps range() happy with NumPy scalars
]
index = pd.MultiIndex.from_tuples(tuples, names=["ID_test", "time"])

# Reindex and interpolate. The interpolation runs per test: on the whole frame,
# limit_direction="both" would fill a test's leading/trailing gaps (e.g. missing
# HR at the start) with values taken from the neighbouring test.
measures_test = (
    measure_inter.reindex(index)
    .groupby(level="ID_test", group_keys=False)
    .apply(lambda test: test.interpolate(method="linear", limit_direction="both"))
    .reset_index()
)

# The group mean turned ID into floats. Assign the column directly: writing
# through .loc[:, "ID"] sets values in place and would keep the float dtype.
measures_test["ID"] = measures_test["ID"].astype("int")

In [5]:
# Attach the subject attributes to every measurement row
measures_test = pd.merge(measures_test, subjects, how="left", on=["ID_test", "ID"])

# Performance quintile per test. Note: the bins are built from the last
# recorded time of each test, which is stored under the name max_speed_quintile.
last_time_per_test = measures_test.groupby("ID_test")["time"].max()
quintile_per_test = pd.qcut(last_time_per_test, 5, labels=False)
measures_test["max_speed_quintile"] = measures_test["ID_test"].map(quintile_per_test)

# Computes cumulative distance traveled
def compute_distance(group):
    """Return the cumulative distance (km) covered within one test.

    Integrates speed over time with the trapezoidal rule: each step
    contributes the mean of the current and previous speed multiplied by
    the elapsed time.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single test, ordered by time, with a "time" column in
        seconds and a "Speed" column in km/h.

    Returns
    -------
    pandas.Series
        Cumulative distance in km, indexed like ``group``. The first row is
        always 0, including when the group has only one row.
    """
    t = group["time"]
    v = group["Speed"]  # km/h

    dt = t.diff().fillna(0) / 3600  # seconds → hours; first row has no elapsed time

    # Mean of the current and previous speed. min_periods=1 lets the first row
    # fall back to its own speed, so a single-row group yields 0 km instead of
    # NaN (there is no later value to back-fill from).
    v_avg = v.rolling(2, min_periods=1).mean()

    dist = v_avg * dt  # km
    return dist.cumsum()

# Compute cumulative distance traveled
measures_test = measures_test.sort_values(["ID_test", "time"])

# The per-test results keep the row labels of measures_test, so the assignment
# aligns by index. Renumbering the result with reset_index(drop=True) would
# only be correct when the frame's index is already 0..n-1 in sorted order.
measures_test["dist_km"] = (
    measures_test.groupby("ID_test", group_keys=False)[["time", "Speed"]]
                 .apply(compute_distance)
)

In [6]:
# Display the measurements frame, including the derived *_rate, *_vol and *_cum columns
measures

Unnamed: 0,time,Speed,HR,RR,ID_test,ID,O2_rate,CO2_rate,air_rate,O2_vol,CO2_vol,air_vol,O2_cum,CO2_cum,air_cum
68847,0,5.0,,20,100_1,100,386.0,256.0,10.1,19.300000,12.800000,0.505000,19.300000,12.800000,0.505000
68848,2,5.0,,26,100_1,100,617.0,423.0,14.2,23.730769,16.269231,0.546154,43.030769,29.069231,1.051154
68849,5,5.0,54.0,20,100_1,100,386.0,255.0,10.1,19.300000,12.750000,0.505000,62.330769,41.819231,1.556154
68850,7,5.0,,23,100_1,100,488.0,329.0,11.9,21.217391,14.304348,0.517391,83.548161,56.123579,2.073545
68851,9,5.0,91.0,25,100_1,100,506.0,344.0,12.8,20.240000,13.760000,0.512000,103.788161,69.883579,2.585545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5139,885,5.0,141.0,32,9_1,9,1174.0,1371.0,43.9,36.687500,42.843750,1.371875,22479.722523,24863.865321,705.088592
5140,888,5.0,141.0,28,9_1,9,1194.0,1395.0,44.3,42.642857,49.821429,1.582143,22522.365380,24913.686750,706.670735
5141,890,5.0,140.0,27,9_1,9,1197.0,1390.0,43.9,44.333333,51.481481,1.625926,22566.698713,24965.168231,708.296661
5142,893,5.0,140.0,27,9_1,9,1166.0,1352.0,42.2,43.185185,50.074074,1.562963,22609.883898,25015.242306,709.859624


In [7]:
# Display the subject metadata, sorted by ID_test
subjects

Unnamed: 0,Age,Weight,Height,Humidity,Temperature,Sex,ID,ID_test
879,42.1,76.0,167.0,39.0,17.60,0,100,100_1
852,41.0,81.0,173.0,39.0,17.60,0,101,101_1
639,31.9,108.0,179.0,47.0,18.09,0,102,102_1
664,33.0,101.0,199.0,47.0,18.09,0,103,103_1
622,31.4,78.0,176.0,48.0,21.50,0,104,104_1
...,...,...,...,...,...,...,...,...
804,38.6,80.0,179.0,53.0,19.60,0,98,98_1
816,39.1,50.0,165.4,57.0,21.40,1,99,99_1
862,41.4,49.5,165.4,38.0,23.20,1,99,99_47
869,41.7,49.5,165.4,49.0,24.30,1,99,99_54


In [8]:

# Number of distinct tests that still have data at each second
tests_per_second = measures_test.groupby("time")["ID_test"].nunique()
runner_counts = tests_per_second.reset_index(name="runner_count")

runner_axis_labels = {"time": "Time (seconds)", "runner_count": "Number of Runners"}
fig = px.line(
    runner_counts,
    x="time",
    y="runner_count",
    title="Runners Remaining on Treadmill Over Time",
    labels=runner_axis_labels,
)
fig.show()

In [9]:
# Mean heart rate per quintile and second
hr_by_group = measures_test.groupby(["max_speed_quintile", "time"])["HR"]
hr_avg = hr_by_group.mean().reset_index()

hr_axis_labels = {
    "time": "Time (seconds)",
    "HR": "Heart Rate (bpm)",
    "max_speed_quintile": "Quintile",
}
fig = px.line(
    hr_avg,
    x="time",
    y="HR",
    color="max_speed_quintile",
    title="Average Heart Rate Over Time by Performance Quintile",
    labels=hr_axis_labels,
)
fig.show()

In [10]:
# Mean respiratory rate per quintile and second
rr_by_group = measures_test.groupby(["max_speed_quintile", "time"])["RR"]
rr_avg = rr_by_group.mean().reset_index()

rr_axis_labels = {
    "time": "Time (seconds)",
    "RR": "Breaths per Minute",
    "max_speed_quintile": "Quintile",
}
fig = px.line(
    rr_avg,
    x="time",
    y="RR",
    color="max_speed_quintile",
    title="Average Respiratory Rate Over Time by Performance Quintile",
    labels=rr_axis_labels,
)
fig.show()


In [11]:
# NOTE(review): this cell repeats the respiratory-rate plot of the previous
# cell (same column, same title) — confirm whether a different measure was
# intended here or whether the cell can be removed.
rr_mean_series = measures_test.groupby(["max_speed_quintile", "time"])["RR"].mean()
rr_avg = rr_mean_series.reset_index()

fig = px.line(
    rr_avg,
    x="time",
    y="RR",
    color="max_speed_quintile",
    title="Average Respiratory Rate Over Time by Performance Quintile",
    labels={
        "time": "Time (seconds)",
        "RR": "Breaths per Minute",
        "max_speed_quintile": "Quintile",
    },
)
fig.show()


In [12]:
# Mean oxygen uptake rate per quintile and second
o2_by_group = measures_test.groupby(["max_speed_quintile", "time"])["O2_rate"]
o2_avg = o2_by_group.mean().reset_index()

o2_axis_labels = {
    "time": "Time (seconds)",
    "O2_rate": "O₂ Uptake (ml/min)",
    "max_speed_quintile": "Quintile",
}
fig = px.line(
    o2_avg,
    x="time",
    y="O2_rate",
    color="max_speed_quintile",
    title="Average Oxygen Uptake Over Time by Performance Quintile",
    labels=o2_axis_labels,
)
fig.show()


In [13]:

# Final row of every test (the row holding that test's largest time)
last_indices = measures_test.groupby("ID_test")["time"].idxmax()
last_vals = measures_test.loc[last_indices].copy()

# Oxygen efficiency: cumulative O2 divided by the distance covered
last_vals["O2_per_km"] = last_vals["O2_cum"].div(last_vals["dist_km"])

# Highest CO2 rate observed in each test, joined onto the final rows
vco2_max = measures_test.groupby("ID_test")["CO2_rate"].max().reset_index()
vco2_max.columns = ["ID_test", "VCO2_max"]
last_vals = pd.merge(last_vals, vco2_max, how="left", on="ID_test")

# Plot a reproducible 10% sample to prevent over plotting
sampled = last_vals.sample(frac=0.10, random_state=1)

efficiency_labels = {
    "dist_km": "Distance (km)",
    "O2_per_km": "Oxygen per km (ml/km)",
    "VCO2_max": "Max VCO₂ (ml/min)",
}
fig = px.scatter(
    sampled,
    x="dist_km",
    y="O2_per_km",
    color="VCO2_max",
    trendline="ols",
    title="Oxygen Efficiency vs. Distance Traveled (Colored by VCO₂ Max, Sampled 10%)",
    labels=efficiency_labels,
    color_continuous_scale="Plasma",
)

fig.update_traces(marker=dict(size=6, opacity=0.5))
fig.update_layout(template="plotly_white")
fig.show()


In [None]:


# Create two age groups: Under 40 and 40+
# (rows whose Age is missing fail the "< 40" comparison and land in "40 and Over")
measures_test["age_group"] = np.where(measures_test["Age"] < 40, "Under 40", "40 and Over")


fig = go.Figure()

# (O2 line colour, CO2 line colour) for each age group
group_colors = {
    "Under 40": ("blue", "red"),
    "40 and Over": ("green", "orange")
}

# Loop through each age group and plot O2/CO2 rates
for group in ["Under 40", "40 and Over"]:
    group_data = measures_test[measures_test["age_group"] == group]
    if group_data.empty:
        continue

    # Mean O2 and CO2 rate of the group at each second
    avg = group_data.groupby("time")[["O2_rate", "CO2_rate"]].mean().reset_index()

    # Find ventilatory threshold: first time CO2_rate > O2_rate
    vt_row = avg[avg["CO2_rate"] > avg["O2_rate"]].head(1)
    vt_time = vt_row["time"].values[0] if not vt_row.empty else None

    # Add O2 line
    fig.add_trace(go.Scatter(
        x=avg["time"],
        y=avg["O2_rate"],
        mode="lines",
        name=f"O₂ ({group})",
        line=dict(color=group_colors[group][0])
    ))

    # Add CO2 line
    fig.add_trace(go.Scatter(
        x=avg["time"],
        y=avg["CO2_rate"],
        mode="lines",
        name=f"CO₂ ({group})",
        line=dict(color=group_colors[group][1])
    ))

    # Add vertical line for VT with offset label to avoid overlap.
    # Compare against None explicitly: a crossover at time 0 is a valid
    # threshold but is falsy, so a plain truthiness test would skip it.
    if vt_time is not None:
        fig.add_vline(
            x=vt_time,
            line_dash="dash",
            line_color=group_colors[group][1],
            annotation=dict(
                text=f"VT ({group})",
                showarrow=False,
                font=dict(size=12),
                y=1.05 if group == "Under 40" else 1.15,
                yref="paper"
            ),
            opacity=0.6
        )


fig.update_layout(
    title="O₂ and CO₂ Rates Over Time: Under 40 vs Over 40",
    xaxis_title="Time (seconds)",
    yaxis_title="Rate (ml/min)",
    template="plotly_white",
    legend_title="Variable (Age Group)"
)

fig.show()
