In [41]:
import pandas as pd 

# Read the weight log and the running log from their CSV files into DataFrames
weight_df, running_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/combined_weight_data.csv", "data/cleaned/running_data.csv")
)

In [42]:
# Parse the 'date' column of each frame (unparseable values become NaT),
# then order the rows chronologically and renumber them from 0.
running_df = (
    running_df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .sort_values('date')
    .reset_index(drop=True)
)

weight_df = (
    weight_df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .sort_values('date')
    .reset_index(drop=True)
)

In [59]:
# Build a gap-free daily weight table covering every date seen in either data set.

# Global min / max across both sets (Series.min / .max skip the NaT values
# that errors='coerce' may have produced).
min_date = min(running_df['date'].min(), weight_df['date'].min())
max_date = max(running_df['date'].max(), weight_df['date'].max())

# Put weights on their date index.
# groupby drops rows whose date is NaT and collapses repeated dates to their
# mean, so the resulting index is unique and sorted. A plain set_index('date')
# keeps duplicate labels, and Series.reindex raises ValueError on those.
# When every date is already unique the values are unchanged.
weight_series = weight_df.groupby('date')['weight'].mean()

# Daily grid spanning min-max
full_range = pd.date_range(min_date, max_date, freq='D')

# Re-index → NaNs → interpolate → edge-fill
weight_daily = (
    weight_series
        .reindex(full_range)                     # days without a measurement → NaN
        .interpolate(method='time')              # fill gaps proportionally to elapsed time
        .ffill().bfill()                         # fill any remaining leading / trailing NaNs
        .rename_axis('date')                     # name the index so reset_index yields a 'date' column
        .reset_index()                           # back to a DataFrame with columns: date, weight
)

weight_daily


Unnamed: 0,date,weight
0,2015-05-22,79.800000
1,2015-05-23,79.799393
2,2015-05-24,79.798785
3,2015-05-25,79.798178
4,2015-05-26,79.797571
...,...,...
3691,2025-06-29,83.800000
3692,2025-06-30,83.800000
3693,2025-07-01,83.800000
3694,2025-07-02,83.800000


In [63]:
# Attach the daily weight to each run by date.
# running_df is the left table of a left join, so every run row is kept and
# days without a run are not carried over from weight_daily.
merged_df = pd.merge(running_df, weight_daily, how='left', on='date')
merged_df

Unnamed: 0,date,Calories Burned (kCal),Distance (km),Workout Time (seconds),Avg Pace (min/km),Max Pace (min/km),Avg Speed (km/h),Steps,Notes,Link,weight
0,2016-10-27,349,3.869056,1691,7.284272,3.416317,8.236896,4442.0,,http://www.mapmyfitness.com/workout/1837545095,79.381141
1,2016-10-29,357,4.073909,826,3.379234,2.311153,17.755570,2394.0,,http://www.mapmyfitness.com/workout/1855748714,79.373217
2,2016-11-11,56,59.857619,130,0.036197,0.111775,1657.592133,6466.0,,http://www.mapmyfitness.com/workout/1882720331,79.321712
3,2016-11-24,522,5.811148,1711,4.907229,0.927738,12.226846,4690.0,,http://www.mapmyfitness.com/workout/1882757897,79.270206
4,2019-03-28,329,3.790617,1702,7.483422,3.388921,8.017736,4460.0,,http://www.mapmyfitness.com/workout/3451593310,75.886688
...,...,...,...,...,...,...,...,...,...,...,...
451,2025-06-28,436,5.337695,2315,7.223378,0.261140,8.306339,6086.0,,http://www.mapmyfitness.com/workout/8622257110,83.800000
452,2025-06-29,437,5.401184,2227,6.869196,0.735604,8.734618,5882.0,,http://www.mapmyfitness.com/workout/8622832763,83.800000
453,2025-07-01,421,5.380616,2446,7.575571,1.830106,7.920161,6403.0,,http://www.mapmyfitness.com/workout/8624078907,83.800000
454,2025-07-02,414,5.339546,2473,7.719543,0.400474,7.772504,6464.0,,http://www.mapmyfitness.com/workout/8624764540,83.800000


In [64]:
# Write the merged runs-with-weights table to CSV, omitting the row index
merged_df.to_csv(path_or_buf='data/cleaned/runs_with_weights.csv', index=False)
