In [None]:
This notebook preprocesses the treadmill test data and exports the measurements for runner 840 for the introduction visualization.

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors as pc
from plotly.subplots import make_subplots


In [None]:
# Load both treadmill tables and put them in a deterministic row order
subjects = pd.read_csv("../data/treadmill/subject-info.csv").sort_values(by=["ID_test"])
measures = pd.read_csv("../data/treadmill/test_measure.csv").sort_values(by=["ID_test", "time"])

# Raw column -> prefix of the derived columns
# (O2 and CO2 rates in ml/min, VE in l/min)
raw_to_prefix = {"VO2": "O2", "VCO2": "CO2", "VE": "air"}

# Expose each raw rate under a descriptive name
for raw_col, prefix in raw_to_prefix.items():
    measures[f"{prefix}_rate"] = measures[raw_col]

# Volume for the given breath: rate divided by RR
# NOTE(review): a row with RR == 0 would produce inf here and in the running totals — confirm the data has none
for prefix in raw_to_prefix.values():
    measures[f"{prefix}_vol"] = measures[f"{prefix}_rate"] / measures["RR"]

# Running total of the per-breath volumes within each test
for prefix in raw_to_prefix.values():
    measures[f"{prefix}_cum"] = measures.groupby("ID_test")[f"{prefix}_vol"].cumsum()

# The raw columns are now duplicated by the *_rate columns
measures = measures.drop(columns=list(raw_to_prefix))

# Define numerical features
measures_numerical = {
    "time", "Speed", "HR", "RR",
    "O2_rate", "CO2_rate", "air_rate",
    "O2_vol", "CO2_vol", "air_vol",
    "O2_cum", "CO2_cum", "air_cum",
    "Age", "Weight", "Height", "Humidity", "Temperature",
}

FileNotFoundError: [Errno 2] No such file or directory: '../gxt-physiology/data/treadmill/subject-info.csv'

In [None]:
# Normalizes the start times of the test from start speed of 5 km/h.
# .copy() makes measures_test an independent frame, so the column assignments
# below never write through a slice of `measures` (no SettingWithCopyWarning).
measures_test = measures[measures["Speed"] > 5].copy()
normal_starts = measures_test.groupby("ID_test")["time"].transform("first")
measures_test["time"] = measures_test["time"] - normal_starts

# Normalizes the start times of the test from start speed of 6.1 km/h:
# tests still running at exactly 6.1 km/h more than 100 s in are re-aligned on
# their first row recorded above 6.1 km/h.
ids_slow = measures_test.loc[
    (measures_test["Speed"] == 6.1) & (measures_test["time"] > 100), "ID_test"
].unique()
first_steps = (
    measures_test[measures_test["ID_test"].isin(ids_slow) & (measures_test["Speed"] > 6.1)]
    .groupby("ID_test")["time"]
    .first()
)
# Look the offset up from measures_test itself (not from `measures`); tests that
# are not in ids_slow get an offset of 0. The cast restores the dtype of "time",
# which the NaN-producing map() would otherwise turn into float.
step_offsets = measures_test["ID_test"].map(first_steps).fillna(0).astype(first_steps.dtype)
# Every test is shifted by +60; rows that end up at or before 0 are discarded.
measures_test["time"] = measures_test["time"] - step_offsets + 60
measures_test = measures_test[measures_test["time"] > 0]

# Remove tests with non-standard speeds increase rates: tests already faster than
# 11.4 km/h between 276 and 301 on the shifted timeline, plus a hard-coded list of IDs.
outlier_ids = list(
    measures_test.loc[
        measures_test["time"].between(276, 301) & (measures_test["Speed"] > 11.4), "ID_test"
    ].unique()
)
outlier_ids += ["715_1", "796_1", "593_1", "789_1", "479_1", "73_1", "667_1", "318_1", "376_1", "695_1"]
measures_test = measures_test[~measures_test["ID_test"].isin(outlier_ids)]

# Remove cooldown periods (rows after 600 with a speed below 12)
in_cooldown = (measures_test["time"] > 600) & (measures_test["Speed"] < 12)
# Copy again so the final assignment works on an independent frame.
measures_test = measures_test[~in_cooldown].copy()

# Start all tests at time 0
measures_test["time"] = measures_test["time"] - measures_test.groupby("ID_test")["time"].transform("min")

In [None]:
# Average measures for each second
measure_inter = measures_test.groupby(["ID_test", "time"]).mean()

# Get max time achieved by each participant
max_secs = measures_test.groupby("ID_test")["time"].agg("max")

# Generate new index: one (ID_test, second) pair for every second from 0 to the
# test's last second. int() keeps range() working when "time" is stored as float.
tuples = [
    (id_test, second)
    for id_test, max_second in max_secs.items()
    for second in range(int(max_second) + 1)
]
index = pd.MultiIndex.from_tuples(tuples, names=["ID_test", "time"])

# Reindex and interpolate. The interpolation runs per test so that leading or
# trailing gaps in one test are never filled with values from a neighbouring test.
measures_test = (
    measure_inter.reindex(index)
    .groupby(level="ID_test", group_keys=False)
    .apply(lambda test: test.interpolate(method="linear", limit_direction="both"))
    .reset_index()
)
# Replace the column outright: writing through .loc[:, "ID"] sets the values into
# the existing float column on pandas >= 2.0 and would leave the dtype as float.
measures_test["ID"] = measures_test["ID"].astype("int")

In [None]:
# Attach the subject information to every measurement row
measures_test = pd.merge(measures_test, subjects, on=["ID_test", "ID"], how="left")

# Quintile label (0-4) per test, broadcast back onto the rows of that test.
# NOTE(review): the column is named after max speed but is binned on each test's
# maximum time — presumably a proxy for the speed reached; confirm.
longest_time = measures_test.groupby("ID_test")["time"].max()
quintile_by_test = pd.qcut(longest_time, 5, labels=False)
measures_test["max_speed_quintile"] = measures_test["ID_test"].map(quintile_by_test)

# Computes cumulative distance traveled
def compute_distance(group):
    """Return the cumulative distance (km) reached at each row of one test.

    The distance covered between two consecutive rows is the mean of their two
    speeds multiplied by the elapsed time (trapezoidal rule).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single test, ordered by time, with a "time" column (seconds)
        and a "Speed" column (km/h).

    Returns
    -------
    pandas.Series
        Cumulative distance in km, indexed like ``group``. The first row is
        always 0.
    """
    t = group["time"]
    v = group["Speed"]  # km/h

    dt = t.diff().fillna(0) / 3600  # seconds → hours
    v_avg = v.rolling(2).mean().bfill()

    dist = v_avg * dt  # km
    if len(dist) > 0:
        # Nothing has been covered at the first sample. Setting it explicitly
        # avoids NaN * 0 = NaN when the group has a single row (no next value
        # for bfill) or when the first averaged speed is missing.
        dist.iloc[0] = 0.0
    return dist.cumsum()

# Compute cumulative distance traveled
measures_test = measures_test.sort_values(["ID_test", "time"])
# The per-test results keep the row labels of measures_test, so the assignment
# aligns each distance with its own row. Resetting the result's index first would
# attach distances to the wrong rows whenever the sort above reorders the frame.
measures_test["dist_km"] = (
    measures_test.groupby("ID_test", group_keys=False)[["time", "Speed"]]
                 .apply(compute_distance)
)

In [4]:
# Display the processed measurements for a visual check
measures_test

NameError: name 'measures_test' is not defined

Select the rows for runner 840 (test ID `840_1`) and export them.

In [None]:
# Function to round numeric columns to the hundredths place
def round_to_hundredths(df, decimals=2):
    """Round every numeric column of ``df`` and return ``df``.

    Rounding is applied unconditionally: it leaves values that already have at
    most ``decimals`` decimal places unchanged, and it also catches values such
    as 1.00004 whose extra digits only appear beyond the third decimal place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to round. It is modified in place.
    decimals : int, default 2
        Number of decimal places to keep.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col in df.select_dtypes(include="number").columns:
        df[col] = df[col].round(decimals)
    return df


In [3]:
runner_id = "840_1"
runner_df = measures_test[measures_test["ID_test"] == runner_id].copy()

# Stop here rather than silently exporting a header-only CSV when the test ID
# is not present in the processed data.
if runner_df.empty:
    raise ValueError(f"No rows found for test {runner_id!r}; nothing to export")

# Round values (works on the copy made above, so measures_test is untouched)
runner_df = round_to_hundredths(runner_df)
# Export for D3 animation
runner_df.to_csv("clean_runner_840.csv", index=False)

NameError: name 'measures_test' is not defined