## This notebook explores the strengths and weaknesses of the raw data

In [10]:
# Read the semicolon-separated raw CSV into a pandas DataFrame named `data`
import pandas as pd

data = pd.read_csv(
    filepath_or_buffer='../data/raw-data-kaggle.csv',
    sep=';',
)

In [11]:
# Next we look at the first few rows, the column schema, and summary statistics.
# A notebook only renders the LAST expression of a cell automatically, so the
# head() table has to be shown explicitly; otherwise its result is discarded.
# `display` is provided by the Jupyter/IPython kernel without an import.
display(data.head())

# info() prints its report directly (it returns None)
data.info()

# Last expression of the cell: rendered as a rich table
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42116 entries, 0 to 42115
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   athlete                   42116 non-null  int64  
 1   gender                    41761 non-null  object 
 2   timestamp                 42116 non-null  object 
 3   distance (m)              42116 non-null  float64
 4   elapsed time (s)          42116 non-null  int64  
 5   elevation gain (m)        42116 non-null  float64
 6   average heart rate (bpm)  23732 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 2.2+ MB


Unnamed: 0,athlete,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm)
count,42116.0,42116.0,42116.0,42116.0,23732.0
mean,15038770.0,11028.352189,4245.974,242.222516,147.652225
std,8652464.0,7480.264547,15839.39,451.664624,17.291873
min,771514.0,0.0,0.0,0.0,0.0
25%,6583234.0,7189.6,2489.75,31.8,138.6
50%,13881860.0,10136.75,3442.0,86.0,148.4
75%,20653960.0,13236.3,4593.25,240.8,158.1
max,46817580.0,218950.0,2971531.0,12078.5,237.0


### There are a couple of issues that need to be addressed with the data so far.
> - **Missing values:** nearly half of the entries are missing heart rate values. \
> - **Data quality:** the minimum values for distance, elapsed time, elevation gain, and average heart rate are all 0. For distance, elapsed time, and heart rate this is likely an error and should be fixed; an elevation gain of 0 can be legitimate (e.g. a completely flat run).



In [12]:
# Data cleaning steps
#
# Builds `data_clean` from the raw `data` frame. The raw frame is never
# modified, so this cell can be re-run safely at any time.

# Column names and thresholds used by the steps below
DIST_COL = 'distance (m)'
TIME_COL = 'elapsed time (s)'
ELEV_COL = 'elevation gain (m)'
HR_COL = 'average heart rate (bpm)'
MAX_ELAPSED_S = 43200          # 12 hours, in seconds
MAX_ELEVATION_M = 3000         # elevation gain above this is treated as suspicious
MILES_PER_METER = 0.000621371  # metre -> mile conversion factor

# 1. Check the extent of zero and missing values
print("Rows with 0 distance:", (data[DIST_COL] == 0).sum())
print("Rows with 0 elapsed time:", (data[TIME_COL] == 0).sum())
print("Rows with 0 heart rate:", (data[HR_COL] == 0).sum())
print("Missing heart rate values:", data[HR_COL].isna().sum())

print('\n')

# 2. Remove invalid runs (0 distance or 0 time)
valid_runs = data[(data[DIST_COL] > 0) & (data[TIME_COL] > 0)]
print(f"Removed {len(data) - len(valid_runs)} invalid runs")
print(f"Remaining runs: {len(valid_runs)}")

print('\n')

# 3. Remove extreme outliers based on elapsed time and elevation gain:
#    runs longer than 12 hours, or with more than 3000 m of elevation gain.
within_limits = (
    (valid_runs[TIME_COL] <= MAX_ELAPSED_S)
    & (valid_runs[ELEV_COL] <= MAX_ELEVATION_M)
)
# Take the copy AFTER the last row filter: every assignment below then writes
# to an independent frame instead of a slice of `data`, which avoids
# SettingWithCopyWarning and any ambiguity about what is being modified.
data_clean = valid_runs[within_limits].copy()
print(f"After outlier removal: {len(data_clean)} runs remaining")

print('\n')

# 4. Handle heart rate: a recorded 0 bpm is treated as "missing", so replace it
#    with NaN (an explicit float NaN rather than None, since the column is float)
data_clean.loc[data_clean[HR_COL] == 0, HR_COL] = float('nan')

# 5. Convert the timestamp strings (day/month/year hour:minute) to datetime
data_clean['timestamp'] = pd.to_datetime(data_clean['timestamp'], format='%d/%m/%Y %H:%M')

# 6. Create derived features
#    Division is safe here: step 2 guarantees distance > 0 and elapsed time > 0.

# Metric (original)
data_clean['pace_min_per_km'] = (data_clean[TIME_COL] / 60) / (data_clean[DIST_COL] / 1000)
data_clean['speed_kmh'] = (data_clean[DIST_COL] / 1000) / (data_clean[TIME_COL] / 3600)

# Imperial (for US users)
data_clean['distance_miles'] = data_clean[DIST_COL] * MILES_PER_METER
data_clean['pace_min_per_mile'] = (data_clean[TIME_COL] / 60) / data_clean['distance_miles']
data_clean['speed_mph'] = data_clean['distance_miles'] / (data_clean[TIME_COL] / 3600)




Rows with 0 distance: 154
Rows with 0 elapsed time: 54
Rows with 0 heart rate: 24
Missing heart rate values: 18384


Removed 179 invalid runs
Remaining runs: 41937


After outlier removal: 41821 runs remaining




In [13]:
# The next step is to sanity-check the cleaned data via its summary statistics
# (quartiles spelled out explicitly; these are the describe() defaults)
data_clean.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,athlete,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace_min_per_km,speed_kmh,distance_miles,pace_min_per_mile,speed_mph
count,41821.0,41821,41821.0,41821.0,41821.0,23622.0,41821.0,41821.0,41821.0,41821.0,41821.0
mean,15047430.0,2018-02-16 08:44:24.821501184,10903.46677,4048.173047,231.721205,147.869351,6.438491,10.801813,6.775098,10.36175,6.711933
min,771514.0,2000-01-04 12:06:00,0.1,1.0,0.0,26.8,0.00833,0.048874,6.2e-05,0.013406,0.030369
25%,6583234.0,2017-05-02 18:41:00,7220.2,2498.0,32.0,138.7,5.06268,9.451135,4.486423,8.147596,5.872661
50%,13881860.0,2018-05-21 09:44:00,10141.0,3443.0,86.4,148.4,5.626222,10.66435,6.301323,9.05453,6.626518
75%,20653960.0,2019-03-17 13:51:00,13215.1,4582.0,239.7,158.2,6.348444,11.85143,8.21148,10.216833,7.364135
max,46817580.0,2020-01-06 05:24:00,216736.0,43090.0,2997.0,237.0,1227.642276,7202.777143,134.673465,1975.699343,4475.596836
std,8658894.0,,6482.422381,3107.580202,383.320548,16.553191,11.717165,35.986005,4.027989,18.856954,22.36066


### There are still some issues with a few of the values.
>   The minimum values of distance, elapsed time, and heart rate are very low. \
>   The pace has extreme outliers, such as 1,227 min/km and 0.008 min/km.

In [14]:
# 7. Additional filtering for realistic runs
run_distance = data_clean['distance (m)']
run_pace = data_clean['pace_min_per_km']
run_hr = data_clean['average heart rate (bpm)']

# Very short runs (< 500m) are likely GPS errors
long_enough = run_distance.ge(500)

# Keep paces from 2 to 15 min/km (bounds included)
plausible_pace = run_pace.between(2, 15)

# Keep heart rates from 60 to 220 bpm (bounds included); rows with no
# heart rate reading at all are kept as well
plausible_hr = run_hr.isna() | run_hr.between(60, 220)

data_clean = data_clean[long_enough & plausible_pace & plausible_hr]

print(f"Final cleaned dataset: {len(data_clean)} runs")
data_clean.describe()


Final cleaned dataset: 41056 runs


Unnamed: 0,athlete,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace_min_per_km,speed_kmh,distance_miles,pace_min_per_mile,speed_mph
count,41056.0,41056,41056.0,41056.0,41056.0,23278.0,41056.0,41056.0,41056.0,41056.0,41056.0
mean,15087060.0,2018-02-18 15:40:33.257501952,11003.784711,4018.695513,229.628227,148.246967,5.989878,10.584585,6.837433,9.639777,6.576954
min,771514.0,2000-01-04 12:06:00,503.5,96.0,0.0,62.0,2.005533,4.0,0.31286,3.227593,2.485484
25%,6583234.0,2017-05-08 07:36:00,7360.6,2517.0,33.0,139.1,5.05912,9.510996,4.573663,8.141866,5.909857
50%,13892790.0,2018-05-23 15:36:30,10182.85,3443.0,87.0,148.6,5.614101,10.687374,6.327328,9.035023,6.640824
75%,20653960.0,2019-03-18 10:36:45,13262.725,4556.25,237.8,158.3,6.308487,11.859771,8.241073,10.152529,7.369318
max,46817580.0,2020-01-06 05:24:00,153534.0,43090.0,2997.0,219.0,15.0,29.917241,95.401575,24.140167,18.589706
std,8651761.0,,6343.31176,2980.105447,380.007258,15.99757,1.652162,2.245234,3.94155,2.658898,1.395123


In [15]:
# 8. Save the cleaned dataset
# The path is defined once so the confirmation message always reports the
# location that was actually written (relative to this notebook's directory).
cleaned_path = '../data/cleaned-data.csv'

# index=False: the row index is not written to the CSV
data_clean.to_csv(cleaned_path, index=False)
print(f"Cleaned dataset saved to '{cleaned_path}'")

Cleaned dataset saved to 'data/cleaned-data.csv'
