In [20]:
import json
import pandas as pd
import os
from datetime import datetime, timezone
import pytz


In [21]:
DATA_DIR = "data/"
garmin_file = os.path.join(DATA_DIR, "raw/garmin_health_data.json")
csv_filename = os.path.join(DATA_DIR, "processed/garmin_data.csv")

In [22]:
print(garmin_file)

data/raw/garmin_health_data.json


In [23]:
with open(garmin_file, "r") as f:
    garmin_data = json.load(f)
print("✅ Loaded Garmin health data")

✅ Loaded Garmin health data


In [24]:
processed_data = []

# Iterate through Garmin data (by date)
for date, health in garmin_data.items():
    # Safely get heart rate values
    heart_rate_values = health.get("heart_rate", [])
    if not heart_rate_values:
        continue  # Skip days with no heart rate data

    # Convert heart rate timestamps to UTC
    hr_data = {
        datetime.fromtimestamp(ts / 1000, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"): hr
        for ts, hr in heart_rate_values
    }

    # Helper function for extracting time series data
    def extract_time_series(data, key):
        if not data or key not in data or data[key] is None:
            return {}
        return {
            datetime.fromtimestamp(ts / 1000, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"): value
            for ts, value in data[key]
        }

    # Extract all health metrics
    stress_data = extract_time_series(health.get("stress", {}), "stressValuesArray")
    respiration_data = extract_time_series(health.get("respiration", {}), "respirationValuesArray")
    body_battery_data = extract_time_series(health.get("body_battery", [{}])[0], "bodyBatteryValuesArray")
    spo2_data = extract_time_series(health.get("spo2"), "spO2HourlyAverages")
    hrv_avg = health.get("hrv_avg", {})

    # Extract HRV readings
    hrv_data = {
        entry["readingTimeGMT"]: entry["hrvValue"]
        for entry in health.get("hrvReadings", [])
    }

    # Get sleep score
    sleep_score = health.get("sleep_score")

    # Initialize last known values
    last_stress = last_resp = last_body_battery = last_spo2 = last_hrv = None

    # Merge data by timestamps
    for timestamp, heart_rate in hr_data.items():
        # Update last known values
        last_stress = stress_data.get(timestamp, last_stress)
        last_resp = respiration_data.get(timestamp, last_resp)
        last_body_battery = body_battery_data.get(timestamp, last_body_battery)
        last_spo2 = spo2_data.get(timestamp, last_spo2)
        last_hrv = hrv_data.get(timestamp, last_hrv)

        # Store processed data
        processed_data.append({
            "timestamp": timestamp,
            "heart_rate": heart_rate,
            "stress": last_stress,
            "respiration": last_resp,
            "body_battery": last_body_battery,
            "spo2": last_spo2,
            # "hrv": last_hrv,
            "sleep_score": sleep_score,
            "hrv_avg": hrv_avg
        })

print("✅ Processed Garmin health data")

✅ Processed Garmin health data


In [25]:
df = pd.DataFrame(processed_data)

# Add local time column
df['local_time'] = df['timestamp'].apply(lambda x: 
    datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ")
    .replace(tzinfo=timezone.utc)
    .astimezone(pytz.timezone('Europe/Madrid'))
    .strftime("%Y-%m-%dT%H:%M:%S")  # Removed %z to exclude the timezone offset
)

display(df.head())
print("\nDataset shape:", df.shape)

Unnamed: 0,timestamp,heart_rate,stress,respiration,body_battery,spo2,sleep_score,hrv_avg,local_time
0,2025-04-12T23:00:00Z,56.0,25.0,,26.0,89.0,78.0,61.0,2025-04-13T01:00:00
1,2025-04-12T23:02:00Z,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:02:00
2,2025-04-12T23:04:00Z,56.0,25.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:04:00
3,2025-04-12T23:06:00Z,56.0,24.0,16.0,26.0,89.0,78.0,61.0,2025-04-13T01:06:00
4,2025-04-12T23:08:00Z,58.0,24.0,17.0,26.0,89.0,78.0,61.0,2025-04-13T01:08:00



Dataset shape: (26495, 9)


In [26]:
print("Summary statistics:")
display(df.describe())

print("\nMissing values:")
display(df.isnull().sum())

Summary statistics:


Unnamed: 0,heart_rate,stress,respiration,body_battery,spo2,sleep_score,hrv_avg
count,26457.0,26487.0,26445.0,23000.0,21104.0,23119.0,19975.0
mean,70.534301,27.505758,12.51072,43.788609,94.058378,74.72927,73.030688
std,18.442031,25.765325,6.079722,27.562547,2.432446,14.878259,16.719357
min,40.0,-2.0,-2.0,5.0,85.0,35.0,46.0
25%,57.0,8.0,12.0,23.0,93.0,63.0,60.0
50%,66.0,22.0,14.0,39.0,94.0,80.0,70.0
75%,79.0,42.0,16.0,62.0,96.0,85.0,85.0
max,199.0,99.0,25.0,100.0,100.0,95.0,107.0



Missing values:


timestamp          0
heart_rate        38
stress             8
respiration       50
body_battery    3495
spo2            5391
sleep_score     3376
hrv_avg         6520
local_time         0
dtype: int64

In [27]:
df.to_csv(csv_filename, index=False)
print(f"✅ Garmin health data saved to {csv_filename}")

✅ Garmin health data saved to data/processed/garmin_data.csv
