In [12]:
import sys, os
from pathlib import Path

print("python:", sys.executable)
print("cwd:", os.getcwd())
print("in fit_adv repo:", (Path(".") / "pyproject.toml").exists())


python: /home/grifin/projects/fit_adv/.venv/bin/python
cwd: /home/grifin/projects/fit_adv/notebooks
in fit_adv repo: False


In [13]:
import os
print(os.getcwd())


/home/grifin/projects/fit_adv/notebooks


In [4]:
# Cell 1

import json, pathlib
import pandas as pd

raw_dir = pathlib.Path("/home/grifin/projects/fit_adv/data/raw")
assert raw_dir.exists(), "data/raw not found"

def latest(prefix: str) -> pathlib.Path:
    files = sorted(raw_dir.glob(f"{prefix}_*.json"))
    assert files, f"No {prefix}_*.json in {raw_dir}"
    return files[-1]

paths = {
    "recovery": latest("recovery"),
    "sleep": latest("sleep"),
    "workout": latest("workout"),
    "cycle": latest("cycle"),
}
paths



{'recovery': PosixPath('/home/grifin/projects/fit_adv/data/raw/recovery_20251229_090628.json'),
 'sleep': PosixPath('/home/grifin/projects/fit_adv/data/raw/sleep_20251229_090628.json'),
 'workout': PosixPath('/home/grifin/projects/fit_adv/data/raw/workout_20251229_090628.json'),
 'cycle': PosixPath('/home/grifin/projects/fit_adv/data/raw/cycle_20251229_090628.json')}

In [5]:
# Cell 2

def read_list(p: pathlib.Path):
    return json.loads(p.read_text())

recovery = read_list(paths["recovery"])
sleep    = read_list(paths["sleep"])
workout  = read_list(paths["workout"])
cycle    = read_list(paths["cycle"])

len(recovery), len(sleep), len(workout), len(cycle)



(25, 25, 25, 25)

In [6]:
# Cell 3

#def flatten_score(records):
#    rows = []
#    for r in records:
#        base = {k: v for k, v in r.items() if k != "score"}
#        score = r.get("score") or {}
#        for k, v in score.items():
#            base[f"score_{k}"] = v
#        rows.append(base)
#    return pd.DataFrame(rows)

#df_recovery = flatten_score(recovery)
#df_sleep    = flatten_score(sleep)
#df_workout  = flatten_score(workout)
#df_cycle    = flatten_score(cycle)

#df_recovery.shape, df_sleep.shape, df_workout.shape, df_cycle.shape

from fit_adv.io_records import records_to_frames

df_recovery, df_sleep, df_workout, df_cycle = records_to_frames(
    recovery=recovery,
    sleep=sleep,
    workout=workout,
    cycle=cycle,
)

df_recovery.shape, df_sleep.shape, df_workout.shape, df_cycle.shape



((25, 12), (25, 17), (25, 20), (25, 12))

In [7]:
# Cell 4

#df_cycle["start_dt_utc"] = pd.to_datetime(df_cycle["start"], utc=True, errors="coerce")
#df_cycle["date"] = df_cycle["start_dt_utc"].dt.date.astype(str)

# Keep just the cycle fields we care about
#cycle_keep = [
#    "id", "date", "start", "end", "timezone_offset", "score_state",
#    "score_strain", "score_kilojoule", "score_average_heart_rate", "score_max_heart_rate"
#]
#df_daily = df_cycle[[c for c in cycle_keep if c in df_cycle.columns]].copy()
#df_daily = df_daily.rename(columns={
#    "id": "cycle_id",
#    "score_strain": "cycle_strain",
#    "score_kilojoule": "cycle_kilojoule",
#    "score_average_heart_rate": "cycle_avg_hr",
#    "score_max_heart_rate": "cycle_max_hr",
#})
#df_daily.head(3)

from fit_adv.build_daily_core import build_daily_from_cycle_recovery_sleep

# Guardrails so you get a clear message if upstream cells haven't run
assert "df_cycle" in globals(), "df_cycle not defined yet—run the earlier load cell(s) first."
assert "df_recovery" in globals(), "df_recovery not defined yet—run the earlier load cell(s) first."
assert "df_sleep" in globals(), "df_sleep not defined yet—run the earlier load cell(s) first."

df_daily = build_daily_from_cycle_recovery_sleep(df_cycle, df_recovery, df_sleep)
df_daily.head(3)



Unnamed: 0,cycle_id,date,cycle_start,cycle_end,timezone_offset,cycle_score_state,cycle_strain,cycle_kilojoule,cycle_avg_hr,cycle_max_hr,...,stage_total_slow_wave_sleep_time_milli,stage_total_rem_sleep_time_milli,stage_sleep_cycle_count,stage_disturbance_count,stage_total_in_bed_time_hours,stage_total_awake_time_hours,stage_total_light_sleep_time_hours,stage_total_slow_wave_sleep_time_hours,stage_total_rem_sleep_time_hours,sleep_asleep_hours_est
0,1226078030,2025-12-29,2025-12-29T02:15:56.750Z,,-06:00,SCORED,0.585624,4490.063,55,94,...,4205060.0,7445420.0,5.0,13.0,11.528286,1.125636,7.166406,1.168072,2.068172,10.40265
1,1224327645,2025-12-28,2025-12-28T03:37:24.810Z,2025-12-29T02:15:56.750Z,-06:00,SCORED,4.78461,8136.666,58,138,...,3092230.0,8677350.0,7.0,16.0,9.230114,0.997814,4.962972,0.858953,2.410375,8.2323
2,1222741445,2025-12-27,2025-12-27T02:29:52.680Z,2025-12-28T03:37:24.810Z,-06:00,SCORED,5.269515,9110.884,59,134,...,4414290.0,8914610.0,9.0,11.0,10.762169,1.054756,6.004942,1.226192,2.476281,9.707414


In [8]:
# Cell 5 — Join recovery

# --- ENSURE JOIN KEY TYPES MATCH ---
df_daily["cycle_id"] = df_daily["cycle_id"].astype(str)
df_recovery["cycle_id"] = df_recovery["cycle_id"].astype(str)

# Rename recovery columns
df_recovery = df_recovery.rename(columns={
    "cycle_id": "cycle_id",
    "score_recovery_score": "recovery_score",
    "score_hrv_rmssd_milli": "hrv_rmssd_milli",
    "score_resting_heart_rate": "resting_hr",
    "score_spo2_percentage": "spo2_pct",
    "score_skin_temp_celsius": "skin_temp_c",
})

recovery_keep = [
    "cycle_id",
    "recovery_score",
    "hrv_rmssd_milli",
    "resting_hr",
    "spo2_pct",
    "skin_temp_c",
    "score_user_calibrating"
]

df_daily = df_daily.merge(
    df_recovery[[c for c in recovery_keep if c in df_recovery.columns]],
    on="cycle_id",
    how="left"
)

df_daily.head(3)



Unnamed: 0,cycle_id,date,cycle_start,cycle_end,timezone_offset,cycle_score_state,cycle_strain,cycle_kilojoule,cycle_avg_hr,cycle_max_hr,...,stage_total_light_sleep_time_hours,stage_total_slow_wave_sleep_time_hours,stage_total_rem_sleep_time_hours,sleep_asleep_hours_est,recovery_score_y,hrv_rmssd_milli_y,resting_hr_y,spo2_pct_y,skin_temp_c_y,score_user_calibrating_y
0,1226078030,2025-12-29,2025-12-29T02:15:56.750Z,,-06:00,SCORED,0.585624,4490.063,55,94,...,7.166406,1.168072,2.068172,10.40265,44.0,59.602093,55.0,95.77273,32.927,False
1,1224327645,2025-12-28,2025-12-28T03:37:24.810Z,2025-12-29T02:15:56.750Z,-06:00,SCORED,4.78461,8136.666,58,138,...,4.962972,0.858953,2.410375,8.2323,76.0,80.22881,48.0,96.07895,32.924667,False
2,1222741445,2025-12-27,2025-12-27T02:29:52.680Z,2025-12-28T03:37:24.810Z,-06:00,SCORED,5.269515,9110.884,59,134,...,6.004942,1.226192,2.476281,9.707414,70.0,75.46285,49.0,96.36364,32.689667,False


In [9]:
# Cell 6 — Join sleep
# --- ENSURE JOIN KEY TYPES MATCH ---
df_daily["cycle_id"] = df_daily["cycle_id"].astype(str)
df_sleep["cycle_id"] = df_sleep["cycle_id"].astype(str)

def expand_dict_column(df, col, prefix):
    if col not in df.columns:
        return df
    expanded = df[col].apply(lambda x: x if isinstance(x, dict) else {}).apply(pd.Series)
    expanded = expanded.add_prefix(prefix)
    return pd.concat([df.drop(columns=[col]), expanded], axis=1)

# Expand nested dict columns from score
df_sleep = expand_dict_column(df_sleep, "score_sleep_needed", "sleep_needed_")
df_sleep = expand_dict_column(df_sleep, "score_stage_summary", "stage_")

# Rename key sleep metrics
df_sleep = df_sleep.rename(columns={
    "score_sleep_performance_percentage": "sleep_perf_pct",
    "score_sleep_efficiency_percentage": "sleep_eff_pct",
    "score_sleep_consistency_percentage": "sleep_consistency_pct",
    "score_respiratory_rate": "resp_rate",
})

sleep_keep = ["cycle_id", "start", "end", "nap", "score_state",
              "sleep_perf_pct", "sleep_eff_pct", "sleep_consistency_pct", "resp_rate"] + \
             [c for c in df_sleep.columns if c.startswith("sleep_needed_") or c.startswith("stage_")]

# Prefer the main sleep (nap == False). For ties, keep the latest start.
df_sleep["start_dt"] = pd.to_datetime(df_sleep["start"], utc=True, errors="coerce")
df_sleep_sorted = df_sleep.sort_values(["cycle_id", "nap", "start_dt"], ascending=[True, True, False])
df_sleep_one = df_sleep_sorted.drop_duplicates("cycle_id", keep="first")

# Merge into daily
df_daily = df_daily.merge(
    df_sleep_one[[c for c in sleep_keep if c in df_sleep_one.columns]],
    on="cycle_id",
    how="left"
)

# Rename merge suffixes (cycle start/end vs sleep start/end)
df_daily = df_daily.rename(columns={
    "start_x": "cycle_start",
    "end_x": "cycle_end",
    "score_state_x": "cycle_score_state",
    "start_y": "sleep_start",
    "end_y": "sleep_end",
    "score_state_y": "sleep_score_state",
})

# Convert sleep stage millis → hours AFTER merge
def ms_to_hours(s):
    return s / 1000 / 60 / 60

for col in [
    "stage_total_in_bed_time_milli",
    "stage_total_awake_time_milli",
    "stage_total_light_sleep_time_milli",
    "stage_total_slow_wave_sleep_time_milli",
    "stage_total_rem_sleep_time_milli",
]:
    if col in df_daily.columns:
        df_daily[col.replace("_milli", "_hours")] = ms_to_hours(df_daily[col])

if "stage_total_in_bed_time_hours" in df_daily.columns and "stage_total_awake_time_hours" in df_daily.columns:
    df_daily["sleep_asleep_hours_est"] = (
        df_daily["stage_total_in_bed_time_hours"] - df_daily["stage_total_awake_time_hours"]
    )

print("Sleep columns now present:", [c for c in df_daily.columns if "sleep_" in c][:10])



Sleep columns now present: ['sleep_start', 'sleep_end', 'sleep_score_state', 'sleep_perf_pct_x', 'sleep_eff_pct_x', 'sleep_consistency_pct_x', 'sleep_needed_baseline_milli_x', 'sleep_needed_need_from_sleep_debt_milli_x', 'sleep_needed_need_from_recent_strain_milli_x', 'sleep_needed_need_from_recent_nap_milli_x']


In [10]:
# Cell 7

# df_workout["start_dt_utc"] = pd.to_datetime(df_workout["start"], utc=True, errors="coerce")
# df_workout["end_dt_utc"] = pd.to_datetime(df_workout["end"], utc=True, errors="coerce")
# df_workout["date"] = df_workout["start_dt_utc"].dt.date.astype(str)
# df_workout["minutes"] = (df_workout["end_dt_utc"] - df_workout["start_dt_utc"]).dt.total_seconds() / 60.0
# workout_daily = df_workout.groupby("date", as_index=False).agg(
#     workout_count=("id", "count"),
#     workout_minutes=("minutes", "sum"),
#     workout_strain_sum=("score_strain", "sum"),
#     workout_kj_sum=("score_kilojoule", "sum"),
#     workout_avg_hr_mean=("score_average_heart_rate", "mean"),
#     workout_max_hr_max=("score_max_heart_rate", "max"),
# )
# df_daily = df_daily.merge(workout_daily, on="date", how="left")
# df_daily.head(3)

from fit_adv.build_daily_dataset import add_workout_daily_metrics

df_daily = add_workout_daily_metrics(df_daily, df_workout)
df_daily.head(3)




Unnamed: 0,cycle_id,date,cycle_start,cycle_end,timezone_offset,cycle_score_state,cycle_strain,cycle_kilojoule,cycle_avg_hr,cycle_max_hr,...,stage_total_slow_wave_sleep_time_milli_y,stage_total_rem_sleep_time_milli_y,stage_sleep_cycle_count_y,stage_disturbance_count_y,workout_count,workout_minutes,workout_strain_sum,workout_kj_sum,workout_avg_hr_mean,workout_max_hr_max
0,1226078030,2025-12-29,2025-12-29T02:15:56.750Z,,-06:00,SCORED,0.585624,4490.063,55,94,...,4205060.0,7445420.0,5.0,13.0,,,,,,
1,1224327645,2025-12-28,2025-12-28T03:37:24.810Z,2025-12-29T02:15:56.750Z,-06:00,SCORED,4.78461,8136.666,58,138,...,3092230.0,8677350.0,7.0,16.0,1.0,54.1536,2.374022,317.47568,78.0,107.0
2,1222741445,2025-12-27,2025-12-27T02:29:52.680Z,2025-12-28T03:37:24.810Z,-06:00,SCORED,5.269515,9110.884,59,134,...,4414290.0,8914610.0,9.0,11.0,,,,,,


In [None]:
# Cell 8

#out_dir = pathlib.Path("data/processed")
#out_dir.mkdir(parents=True, exist_ok=True)

# Sort by date
#df_daily = df_daily.sort_values("date").reset_index(drop=True)
#
#csv_path = out_dir / "daily_full.csv"
#df_daily.to_csv(csv_path, index=False)


#csv_path, df_daily.shape



In [11]:
#out_dir = pathlib.Path("data/processed")
#out_dir.mkdir(parents=True, exist_ok=True)

#core = [
#    "date",
#    "recovery_score","hrv_rmssd_milli","resting_hr","spo2_pct","skin_temp_c",
#    "sleep_perf_pct","sleep_eff_pct","sleep_consistency_pct","resp_rate","sleep_asleep_hours_est",
#    "cycle_strain","cycle_kilojoule","cycle_avg_hr","cycle_max_hr",
#    "workout_count","workout_minutes","workout_strain_sum","workout_kj_sum",
#]
#df_v1 = df_daily[[c for c in core if c in df_daily.columns]].copy()
#df_v1.to_csv(out_dir / "daily_v1.csv", index=False, lineterminator="\n", float_format="%.6f")

from fit_adv.config import get_settings
from fit_adv.build_daily_dataset import write_daily_outputs

s = get_settings()
outputs = write_daily_outputs(df_daily, out_dir=s.processed_dir)
outputs, df_daily.shape



(DailyBuildOutputs(out_dir=PosixPath('/home/grifin/projects/fit_adv/data/processed'), daily_full_csv=PosixPath('/home/grifin/projects/fit_adv/data/processed/daily_full.csv'), daily_v1_csv=PosixPath('/home/grifin/projects/fit_adv/data/processed/daily_v1.csv')),
 (25, 74))