##### Hirten: "We also did not collect self-reported sleep data, preventing us from assessing how objective sleep impairment relates to perceived sleep quality."

### Imports

In [2]:
import warnings
import pandas as pd
import numpy as np
import ast
import re
from datetime import date
from scipy.stats import linregress
from google.colab import drive
from scipy.optimize import curve_fit
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import ipywidgets as widgets

drive.mount('/content/drive')
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Mounted at /content/drive


### Preprocessing

In [None]:
# Read the raw self-report export and immediately discard every column the
# sleep analysis below never touches (a missing column raises KeyError).
subjective = pd.read_csv(
    '/content/drive/My Drive/coreway_ml/Thesis - Mika/Data/raw_subjective_data_2025-11-06.csv'
).drop(columns=[
    'hasOtherHealthProblems', 'hrvMeasurement', 'stressType',
    'otherSymptoms_x', 'otherHealthProblems', 'mentalStressLevel',
    'medication', 'otherMedication_x', 'otherSymptoms_y',
    'alcoholPortions', 'notes', 'diseaseRelapsesEvaluation',
    'generalCondition', 'complications', 'stoolsPerDay',
    'stoolsPerNight', 'urgencyOfDefecation', 'bloodInStool',
    'stomachPain', 'unformedStoolsPerDay', 'abdominalResistance',
    'terraUserId', 'additionalIllnesses', 'otherKnownCatalysts',
    'flaresPerYear', 'hrvMeasurementMethod', 'backendSymptoms',
    'hrvMeasurementMethodName', 'hasAskedToConnectWearable',
    'hasConnectedWearable', 'knownCatalysts', 'backendKnownCatalysts',
    'connectedWearableName', 'hasStoma', 'otherAdditionalIllnesses',
])

# Helper Functions
def year_to_age(year_of_birth):
    """Convert a year of birth into an age in whole years.

    The age is measured against the calendar year in which the code runs
    (``date.today().year``), not against the date of any record.

    Parameters
    ----------
    year_of_birth : anything accepted by ``float()``
        Typically an int, float or numeric string.

    Returns
    -------
    int or None
        ``current_year - year_of_birth`` truncated to int, or ``None`` when
        the value cannot be parsed, is not a positive year, or lies in the
        future.
    """
    this_year = date.today().year
    try:
        birth_year = float(year_of_birth)
        # NaN fails this chained comparison and therefore yields None as well.
        plausible = 0 < birth_year <= this_year
        return int(this_year - birth_year) if plausible else None
    except (ValueError, TypeError):
        return None

# Mappings
# Activity-duration buckets, encoded by their position on the ordinal scale
# (shortest bucket -> 0, longest -> 6).
activity_mapping = {
    bucket: rank
    for rank, bucket in enumerate(
        ['zero', 'below30min', 'below1h', 'below2h', 'below4h', 'below8h', 'above8h']
    )
}

# Raw diagnosis labels -> short codes; both Crohn's spellings collapse to "CD".
diagnosis_mapping = dict([
    ("colitisUlcerosa", "UC"),
    ("crohnsDisease", "CD"),
    ("Crohn's disease", "CD"),
])

gender_mapping = dict(female="F", male="M")

# Alcohol in the last 24 h as an ordinal score; "Unsure" becomes missing.
alcohol_mapping = {"No": 0, "A little": 1, "Yes": 2, "Unsure": np.nan}

# NOTE(review): "Unsure" is coded 0 here but NaN in alcohol_mapping, and the
# string "False" is coded as missing — confirm both are intended.
period_mapping = {"Yes": 1, "No": 0, "Unsure": 0, "False": np.nan}

# Normalise dates to midnight, derive age, and recode the categorical answers.
# Values missing from a mapping become NaN via Series.map.
subjective = subjective.assign(
    date=pd.to_datetime(subjective['date'], format='mixed').dt.normalize(),
    age=subjective['yearOfBirth'].apply(year_to_age),
    hasConsumedAlcoholInLast24Hours=(
        subjective['hasConsumedAlcoholInLast24Hours'].map(alcohol_mapping)
    ),
    activity_dur=subjective['physicalEffort'].map(activity_mapping),
    diagnosis=subjective['diagnosis'].map(diagnosis_mapping),
    isOnPeriod=subjective['isOnPeriod'].map(period_mapping),
    gender=subjective['gender'].map(gender_mapping),
)

# Rows whose gender or diagnosis could not be mapped are discarded.
subjective = subjective.dropna(subset=["gender", "diagnosis"])

# Rename to snake_case analysis names, then keep only the analysis columns
# in their presentation order.
subjective = subjective.rename(columns={
    'userId': 'user_id',
    'sleepQualityDegree': 'sleep',
    'stressLevelDegree': 'stress',
    'physicalActivityExertionDegree': 'activity_deg',
    'symptomDegree': 'symptom_deg',
    'hasConsumedAlcoholInLast24Hours': 'alcohol_last_24h',
    'isOnPeriod': 'on_period',
    'rateAsFlare': 'rate_as_flare',
})[[
    'user_id', 'date',
    'gender', 'age', 'diagnosis',
    'symptoms', 'alcohol_last_24h', 'on_period',
    'sleep', 'stress',
    'activity_dur', 'activity_deg',
    'symptom_deg', 'rate_as_flare',
]]

subjective.head()

In [None]:
def fit_full_cosinor(hrv_values, sampling_interval_minutes=5.0):
    """Fit a single-component cosinor whose period equals the recording length.

    The model is ``mesor + amplitude * cos(omega * t + acrophase)`` with
    ``omega = 2*pi / T`` held fixed, where ``T`` is the duration of the whole
    recording (number of samples times the sampling interval).

    Parameters
    ----------
    hrv_values : array-like of float
        Equally spaced samples; NaN entries mark missing samples. Missing
        samples are excluded from the fit but keep their slot on the time
        axis, so gaps neither shift later samples nor shorten the period.
    sampling_interval_minutes : float, default 5.0
        Spacing between consecutive samples, in minutes. Must be positive.

    Returns
    -------
    tuple (mesor, acrophase, amplitude, peak_time)
        ``amplitude`` is non-negative, ``acrophase`` is in radians wrapped to
        [0, 2*pi), and ``peak_time`` is the time of the fitted maximum in
        hours within [0, T). All four are NaN when fewer than 4 non-missing
        samples are available or the optimiser fails.

    Raises
    ------
    ValueError
        If ``sampling_interval_minutes`` is not a positive number.
    """
    if not sampling_interval_minutes > 0:
        raise ValueError("sampling_interval_minutes must be positive")

    no_fit = (np.nan, np.nan, np.nan, np.nan)

    samples = np.asarray(hrv_values, dtype=float).ravel()
    observed = ~np.isnan(samples)
    y = samples[observed]

    # Need enough points to fit 3 parameters
    if y.size < 4:
        return no_fit

    # Time vector in hours, built from the ORIGINAL sample positions so that
    # dropping NaNs does not compress the time axis.
    dt = sampling_interval_minutes / 60.0
    t = np.flatnonzero(observed).astype(float) * dt

    # Total duration T (hours) = length of the full recording, gaps included
    T = samples.size * dt
    omega_fixed = 2.0 * np.pi / T

    # Model with fixed period T (omega fixed)
    def cosinor_model_fixed(t, mesor, amplitude, acrophase):
        return mesor + amplitude * np.cos(omega_fixed * t + acrophase)

    # Initial guesses: mean level, half the observed range, zero phase
    p0 = (y.mean(), (y.max() - y.min()) / 2.0, 0.0)

    try:
        params, _ = curve_fit(cosinor_model_fixed, t, y, p0=p0, maxfev=5000)
        mesor, amplitude, acrophase = params
    except Exception:
        # Deliberate best-effort: a failed fit yields missing features
        # rather than aborting the whole preprocessing pass.
        return no_fit

    # Enforce amplitude >= 0 by flipping sign (equivalent to a pi phase shift)
    if amplitude < 0:
        amplitude = -amplitude
        acrophase = acrophase + np.pi

    # Always wrap the phase so equivalent fits report the same acrophase,
    # regardless of which sign the optimiser converged to.
    acrophase = acrophase % (2.0 * np.pi)

    # Peak time (max of the cosine) in hours, constrained to [0, T)
    # peak when omega*t + acrophase = 0 mod 2π -> t = -acrophase/omega
    peak_time = (-acrophase / omega_fixed) % T

    return mesor, acrophase, amplitude, peak_time

# -------------------------------------------------------------------
# Load data
# -------------------------------------------------------------------
def _parse_hrv_list(cell):
    """Evaluate a stringified Python literal; pass non-string cells through."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# -------------------------------------------------------------------
# Load the per-night summary export and the flattened sample export
# -------------------------------------------------------------------
raw_summary_wearable_data = pd.read_csv('/content/drive/My Drive/coreway_ml/Thesis - Mika/Data/raw_summary_wearable_data_2025-09-26.csv')
raw_flattened_wearable_data = pd.read_csv('/content/drive/My Drive/coreway_ml/Thesis - Mika/Data/raw_flattened_wearable_data_2025-09-26.csv')

# -------------------------------------------------------------------
# hrv_rmssd arrives as text; turn it back into Python lists
# -------------------------------------------------------------------
raw_flattened_wearable_data['hrv_rmssd'] = (
    raw_flattened_wearable_data['hrv_rmssd'].apply(_parse_hrv_list)
)

# -------------------------------------------------------------------
# Parse the first 19 characters of 'date' in both frames so the merge keys
# compare equal; unparseable values become NaT
# -------------------------------------------------------------------
for _frame in (raw_summary_wearable_data, raw_flattened_wearable_data):
    _frame['date'] = pd.to_datetime(_frame['date'].str.slice(0, 19), errors='coerce')

objective = raw_summary_wearable_data.merge(
    raw_flattened_wearable_data, on=['userId', 'date'], how='inner'
)

# -------------------------------------------------------------------
# Sleep date = calendar date of end_time (first 10 characters);
# start/end are reduced to the HH:MM that follows the 'T'
# -------------------------------------------------------------------
objective = objective.assign(
    date=pd.to_datetime(objective['end_time'].str.slice(0, 10), errors='coerce'),
    **{
        time_col: objective[time_col]
        .astype('string')
        .str.extract(r'T(\d{2}:\d{2})', expand=False)
        for time_col in ('start_time', 'end_time')
    },
)

# -------------------------------------------------------------------
# Drop unneeded columns
# -------------------------------------------------------------------
# Discard the columns the analysis does not use, then switch the remaining
# ones to short analysis names (single chained expression).
# NOTE(review): 'timestamp_intervals_seconds_hrv_rmssd' is dropped here, and
# the cosinor step later assumes a fixed 5-minute interval — confirm that is
# valid for every provider.
objective = objective.drop(columns=[
    # identifiers / provider duplicates / scores
    'terra_user_id', 'provider_y', 'sleep_score', 'delta_temperature',
    'user_max_hr_bpm', 'on_demand_reading',
    # breathing extremes and timestamps
    'breaths_start_time', 'breaths_end_time',
    'max_breaths_per_min', 'min_breaths_per_min',
    # event counts, interruptions, latencies
    'duration_in_bed_seconds', 'num_REM_events',
    'duration_long_interruption_seconds', 'duration_short_interruption_seconds',
    'num_out_of_bed_events', 'num_wakeup_events',
    'sleep_latency_seconds', 'wake_up_latency_seconds',
    # heart-rate extremes
    'max_hr_bpm', 'min_hr_bpm',
    # array bookkeeping columns
    'bpm_array_length', 'timestamp_intervals_seconds_hrv_rmssd',
    'hrv_rmssd_array_length', 'level', 'timestamp_intervals_seconds_level',
    'level_array_length', 'percentage_array_length',
    'breaths_per_min_array_length', 'timestamp_intervals_seconds_hrv_sdnn',
    'timestamp_intervals_seconds_bpm', 'timestamp_intervals_seconds_percentage',
    'timestamp_intervals_seconds_breaths_per_min', 'hrv_sdnn_array_length',
]).rename(columns={
    'userId': 'user_id',
    'provider_x': 'provider',
    'start_time': 'start',
    'end_time': 'end',
    'avg_hr_bpm': 'avg_bpm',
    'resting_hr_bpm': 'rhr',
    'sleep_efficiency': 'sleep_eff',
    'breaths_per_min': 'breaths',
    'avg_breaths_per_min': 'avg_breaths',
    'avg_saturation_percentage': 'avg_SpO2',
    'duration_REM_sleep_state_seconds': 'dur_REM',
    'duration_asleep_state_seconds': 'dur_asleep',
    'duration_deep_sleep_state_seconds': 'dur_deep',
    'duration_light_sleep_state_seconds': 'dur_light',
    'duration_awake_state_seconds': 'dur_awake',
    'percentage': 'SpO2',
})

# -------------------------------------------------------------------
# Infer sleep length (HH:MM) and keep numeric duration in hours
# -------------------------------------------------------------------
# Parse the HH:MM clock times; anything unparseable (including missing
# values) becomes NaT.
start_dt = pd.to_datetime(objective['start'], format='%H:%M', errors='coerce')
end_dt   = pd.to_datetime(objective['end'],   format='%H:%M', errors='coerce')

# Handle crossing midnight: an end time earlier than the start time belongs
# to the next day, so add one day to it (1 = crossed, 0 = same day).
mask = (end_dt < start_dt).astype(int)
end_dt_corrected = end_dt + pd.to_timedelta(mask, unit="D")

diff = end_dt_corrected - start_dt
length_hours = diff.dt.total_seconds() / 3600.0

# Build the HH:MM label from integer minutes. Going through integers matters:
# when any row is NaT the timedelta accessors return floats, and formatting
# those directly would render every label like "7.0:30.0" (and missing rows
# as "nan:nan"). Rows without a valid duration get NaN instead of a label.
has_length = diff.notna()
whole_minutes = (diff.dt.total_seconds() // 60).where(has_length, 0).astype(int)
length_label = (
    (whole_minutes // 60).astype(str).str.zfill(2) + ":" +
    (whole_minutes % 60).astype(str).str.zfill(2)
)

objective['length'] = length_label.where(has_length)
objective['length_hours'] = length_hours

# -------------------------------------------------------------------
# Remove duplicates & aggregate; keep longest sleep per day
# -------------------------------------------------------------------
def _first_non_null(values):
    """Return the first non-null entry of a Series, or NaN if there is none."""
    present = values.dropna()
    return present.iloc[0] if len(present) else np.nan


# Collapse rows sharing the same user / date / start / end into one row,
# taking the first non-null value of every other column.
objective = objective.groupby(
    ['user_id', 'date', 'start', 'end'], as_index=False
).agg(_first_non_null)

# Of the remaining sleeps per user and date, keep only the longest one.
idx_longest = objective.groupby(['user_id', 'date'])['length_hours'].idxmax()
objective = objective.loc[idx_longest].reset_index(drop=True)

# -------------------------------------------------------------------
# Convert durations from seconds to hours & compute sleep stage percentages
# -------------------------------------------------------------------
# Stage durations: seconds -> hours
objective = objective.assign(**{
    dur_col: objective[dur_col] / 3600.0
    for dur_col in ('dur_asleep', 'dur_REM', 'dur_deep', 'dur_light', 'dur_awake')
})

# Each stage as a percentage of the time asleep
objective = objective.assign(**{
    f'{stage}_pct': objective[f'dur_{stage}'].div(objective['dur_asleep']) * 100.0
    for stage in ('REM', 'deep', 'light')
})

# sleep_eff is rescaled by 100 (the range check below treats it as a percentage)
objective = objective.assign(sleep_eff=objective['sleep_eff'] * 100.0)

# -------------------------------------------------------------------
# Row filters: drop nights with implausible sleep metrics
# -------------------------------------------------------------------
# at least two hours asleep
objective = objective[objective['dur_asleep'] >= 2]

# every stage share inside its allowed band (NaN fails the check)
stage_ok = (
    objective['deep_pct'].between(0, 40)
    & objective['light_pct'].between(0, 90)
    & objective['REM_pct'].between(0, 40)
)
objective = objective[stage_ok]

# the three stage shares must add up to roughly 100 %
total_pct = objective['REM_pct'] + objective['light_pct'] + objective['deep_pct']
objective = objective[total_pct.between(90, 110)]

# -------------------------------------------------------------------
# Value filters: out-of-range measurements become NaN, rows are kept
# -------------------------------------------------------------------
objective = objective.assign(**{
    metric: objective[metric].where(objective[metric].between(low, high), np.nan)
    for metric, (low, high) in {
        'sleep_eff': (25, 100),
        'avg_hrv_sdnn': (5, 300),
        'avg_hrv_rmssd': (5, 300),
        'avg_SpO2': (85, 100),
        'avg_bpm': (30, 150),
        'rhr': (30, 150),
    }.items()
})

# -------------------------------------------------------------------
# Derive HRV features + cosinor features from hrv_rmssd (single pass)
# -------------------------------------------------------------------
# Feature columns produced per night, in the order they are added below.
_RMSSD_FEATURE_COLUMNS = [
    'std_rmssd', 'cv_rmssd', 'min_rmssd', 'max_rmssd', 'slope_rmssd',
    'mesor_rmssd', 'acrophase_rmssd', 'amplitude_rmssd', 'peak_time_rmssd',
]


def _rmssd_night_features(samples):
    """Summarise one night's RMSSD series into scalar features.

    ``samples`` is expected to be a non-empty list / ndarray / Series of
    numbers; anything else yields all-NaN features. Missing samples (NaN or
    None) are ignored for every feature, matching ``fit_full_cosinor``, which
    already skips NaNs — previously one missing sample turned std / cv /
    min / max / slope into NaN while the cosinor features were still fitted.

    Returns a dict keyed by ``_RMSSD_FEATURE_COLUMNS``.
    """
    features = dict.fromkeys(_RMSSD_FEATURE_COLUMNS, np.nan)

    if not (isinstance(samples, (list, np.ndarray, pd.Series)) and len(samples) > 0):
        return features

    arr = np.asarray(samples, dtype='float64')
    observed = ~np.isnan(arr)
    valid = arr[observed]
    if valid.size == 0:
        return features

    # Basic HRV stats over the observed samples (population std, ddof=0)
    std_val = valid.std()
    mean_val = valid.mean()
    features['std_rmssd'] = std_val
    features['cv_rmssd'] = std_val / mean_val if mean_val != 0 else np.nan
    features['min_rmssd'] = valid.min()
    features['max_rmssd'] = valid.max()

    # Linear trend per sample step; x uses the original sample positions so
    # gaps do not distort the slope.
    if valid.size > 1:
        features['slope_rmssd'] = linregress(np.flatnonzero(observed), valid).slope

    # Cosinor with period exactly matching this recording's length.
    # NOTE(review): a 5-minute sampling interval is assumed for every row —
    # confirm this holds for all providers.
    mesor, acrophase, amplitude, peak_time = fit_full_cosinor(
        arr, sampling_interval_minutes=5.0
    )
    features['mesor_rmssd'] = mesor
    features['acrophase_rmssd'] = acrophase
    features['amplitude_rmssd'] = amplitude
    features['peak_time_rmssd'] = peak_time

    return features


# One feature row per night, aligned to objective's index (single pass).
_rmssd_feature_table = pd.DataFrame(
    [_rmssd_night_features(series) for series in objective['hrv_rmssd']],
    columns=_RMSSD_FEATURE_COLUMNS,
    index=objective.index,
)

objective['std_rmssd'] = _rmssd_feature_table['std_rmssd']
objective['cv_rmssd'] = _rmssd_feature_table['cv_rmssd']
objective['min_rmssd'] = _rmssd_feature_table['min_rmssd']
objective['max_rmssd'] = _rmssd_feature_table['max_rmssd']
objective['range_rmssd'] = objective['max_rmssd'] - objective['min_rmssd']
objective['slope_rmssd'] = _rmssd_feature_table['slope_rmssd']

objective['mesor_rmssd'] = _rmssd_feature_table['mesor_rmssd']
objective['acrophase_rmssd'] = _rmssd_feature_table['acrophase_rmssd']
objective['amplitude_rmssd'] = _rmssd_feature_table['amplitude_rmssd']
objective['peak_time_rmssd'] = _rmssd_feature_table['peak_time_rmssd']

# -------------------------------------------------------------------
# Reordering columns (now including cosinor features)
# -------------------------------------------------------------------
# Keep the analysis columns in presentation order (identifiers, sleep
# timing, sleep stages, RMSSD features incl. cosinor, other vitals), then
# sort chronologically per user and renumber the rows.
objective = (
    objective[[
        'user_id', 'date', 'provider',
        'start', 'end', 'length', 'length_hours',
        'sleep_eff', 'dur_asleep',
        'dur_REM', 'REM_pct',
        'dur_deep', 'deep_pct',
        'dur_light', 'light_pct',
        'dur_awake',
        'hrv_rmssd', 'avg_hrv_rmssd',
        'std_rmssd', 'cv_rmssd', 'min_rmssd', 'max_rmssd', 'range_rmssd',
        'slope_rmssd',
        'mesor_rmssd', 'acrophase_rmssd', 'amplitude_rmssd', 'peak_time_rmssd',
        'avg_hrv_sdnn', 'avg_bpm', 'rhr', 'avg_SpO2', 'avg_breaths',
    ]]
    .sort_values(by=['user_id', 'date', 'start', 'end'])
    .reset_index(drop=True)
)

objective.head()

In [None]:
# Pair each self-report with the wearable night of the same user and date
# (inner join), keep the modelling columns, and drop exact duplicate rows.
merged = (
    subjective.merge(objective, on=['user_id', 'date'], how='inner')[[
        'user_id', 'date', 'gender', 'age', 'provider', 'diagnosis',
        'sleep',
        'dur_asleep', 'sleep_eff', 'REM_pct', 'deep_pct', 'light_pct',
        'avg_hrv_sdnn', 'avg_hrv_rmssd',
        'std_rmssd', 'cv_rmssd', 'min_rmssd', 'max_rmssd', 'range_rmssd',
        'slope_rmssd', 'mesor_rmssd', 'acrophase_rmssd', 'amplitude_rmssd',
        'peak_time_rmssd',
    ]]
    .drop_duplicates()
)

merged.head()

### Mixed Effects Models Analysis

In [None]:
sns.set(style="whitegrid")

# ============================================
# 1. Configuration
# ============================================

# Wearable-derived predictors; each is modelled separately further down.
objective_vars = [
    'dur_asleep', 'sleep_eff', 'REM_pct', 'deep_pct', 'light_pct',
    'avg_hrv_sdnn', 'avg_hrv_rmssd', 'slope_rmssd', 'mesor_rmssd',
    'acrophase_rmssd', 'amplitude_rmssd', 'peak_time_rmssd',
]

# provider kept in data, but NOT used as a fixed effect
base_cols = ['user_id', 'date', 'sleep', 'gender', 'age', 'provider']

# ============================================
# 2. Basic cleaning / typing
# ============================================

# Order-preserving de-duplication of the column list.
cols_to_keep = list(dict.fromkeys(base_cols + objective_vars))

# Numeric outcome/covariate, categorical grouping columns, and only rows
# that have both a user and a subjective sleep score.
merged = (
    merged[cols_to_keep]
    .copy()
    .assign(
        sleep=lambda frame: pd.to_numeric(frame['sleep'], errors='coerce'),
        age=lambda frame: pd.to_numeric(frame['age'], errors='coerce'),
    )
    .astype({'gender': 'category', 'provider': 'category', 'user_id': 'category'})
    .dropna(subset=['user_id', 'sleep'])
)

# ============================================
# 3. Standardize objective predictors (z-scores)
# ============================================

# Pooled z-scores across all rows; a predictor without usable variance
# gets an all-NaN z column (and is reported).
for v in objective_vars:
    center = merged[v].mean(skipna=True)
    spread = merged[v].std(skipna=True)
    if np.isnan(spread) or spread == 0:
        merged[f'{v}_z'] = np.nan
        print(f" {v}: zero or NaN variance → z-scores set to NaN.")
        continue
    merged[f'{v}_z'] = (merged[v] - center) / spread

z_vars = [f"{v}_z" for v in objective_vars]

# ============================================
# 4. Descriptive table by subjective sleep score
# ============================================

desc_table = merged.groupby('sleep')[objective_vars].agg(['mean', 'std', 'count'])

print("\n=== Descriptive statistics by subjective sleep score (0–5) ===")
display(desc_table.style.set_caption("Objective metrics by subjective sleep score"))

# ============================================
# 5. Mixed-effects models: one per objective variable
# ============================================
# Model: sleep ~ objective_z + age + gender + (1 | user_id)

# One random-intercept model per predictor; each successful fit contributes
# one record to `results`.
results = []

for v, zv in zip(objective_vars, z_vars):
    print(f"\n--- Processing {v} ---")

    # complete cases for this predictor only
    df_model = merged[['sleep', 'age', 'gender', 'user_id', zv]].dropna()
    n_rows = df_model.shape[0]
    n_users = df_model['user_id'].nunique()

    # guard: enough users and rows to estimate anything
    if n_users < 2 or n_rows < 20:
        print(f"Skipping {v}: insufficient data (users or rows).")
        continue

    # guard: predictor must still vary after dropping incomplete rows
    if df_model[zv].std() < 1e-6:
        print(f"Skipping {v}: {zv} has ~zero variance in this subset.")
        continue

    # fixed effects: the predictor plus whichever covariates actually vary
    terms = [zv]
    if df_model['age'].std() > 0:
        terms.append("age")
    if df_model['gender'].nunique() > 1:
        terms.append("C(gender)")
    formula = "sleep ~ " + " + ".join(terms)

    print(f"Fitting model: {formula}")

    try:
        mfit = smf.mixedlm(
            formula=formula,
            data=df_model,
            groups=df_model["user_id"],
            re_formula="1",
        ).fit(method="lbfgs", reml=False, maxiter=200, disp=False)

        conf = mfit.conf_int()
        ci_low, ci_high = conf.loc[zv] if zv in conf.index else (np.nan, np.nan)

        results.append(dict(
            variable=v,
            coef_z=mfit.params.get(zv, np.nan),
            ci_low=ci_low,
            ci_high=ci_high,
            p_value=mfit.pvalues.get(zv, np.nan),
            n_rows=n_rows,
            n_users=n_users,
            converged=mfit.converged,
        ))

    except Exception as e:
        # best-effort: report the failure and move on to the next predictor
        print(f" Model for {v} failed: {e}")
        continue

results_df = pd.DataFrame(results)

# ============================================
# 6. Add significance stars & pretty summary table
# ============================================

def sig_stars(p):
    """Map a p-value to a significance marker.

    Returns "***" below 0.001, "**" below 0.01, "*" below 0.05, "·" below
    0.10, and an empty string otherwise or when ``p`` is missing.
    """
    if pd.isna(p):
        return ""
    # thresholds are checked from strictest to loosest; first hit wins
    for cutoff, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*"), (0.10, "·")):
        if p < cutoff:
            return marker
    return ""

if results_df.empty:
    print("No models were successfully fitted.")
else:
    # annotate with significance markers and order by p-value
    results_df = results_df.assign(
        sig=results_df['p_value'].apply(sig_stars)
    ).sort_values('p_value')

    # "coef [low, high] stars", two decimals each
    results_df['effect (z-scaled)'] = [
        f"{coef:.2f} [{low:.2f}, {high:.2f}] {stars}"
        for coef, low, high, stars in zip(
            results_df['coef_z'], results_df['ci_low'],
            results_df['ci_high'], results_df['sig'],
        )
    ]

    summary_table = results_df[
        ['variable', 'effect (z-scaled)', 'p_value', 'n_rows', 'n_users', 'converged']
    ]

    print("\n=== Mixed-effects model results ===")
    print("sleep ~ objective_z + age + gender + (1 | user_id)")
    styled_summary = summary_table.style.format({'p_value': '{:.3g}'}).set_caption(
        "Effect of objective sleep metrics on subjective sleep (no provider fixed effect)"
    )
    display(styled_summary)

# ============================================
# 7. Forest plot of mixed-effects estimates
# ============================================

# Mapping from raw variable names to pretty labels (for y-tick labels only)
# Human-readable y-tick labels for the raw variable names.
var_label_map = dict(
    dur_asleep='Total Time Asleep (h)',
    sleep_eff='Sleep Efficiency (%)',
    REM_pct='REM Sleep (%)',
    deep_pct='Deep Sleep (%)',
    light_pct='Light Sleep (%)',
    avg_hrv_sdnn='Mean SDNN',
    avg_hrv_rmssd='Mean RMSSD',
    slope_rmssd='Slope RMSSD',
    mesor_rmssd='MESOR RMSSD',
    acrophase_rmssd='Acrophase RMSSD',
    amplitude_rmssd='Amplitude RMSSD',
    peak_time_rmssd='Peak Time RMSSD',
)

if results_df.empty:
    print("No results to plot.")
else:
    plot_df = results_df.dropna(subset=['coef_z']).copy()
    if plot_df.empty:
        print("No coefficients available for plotting.")
    else:
        plot_df = plot_df.sort_values('coef_z')

        # pretty label where one exists, raw variable name otherwise
        plot_df['var_label'] = (
            plot_df['variable'].map(var_label_map).fillna(plot_df['variable'])
        )

        # asymmetric error bars: distance from estimate to each CI bound
        err_below = plot_df['coef_z'] - plot_df['ci_low']
        err_above = plot_df['ci_high'] - plot_df['coef_z']

        plt.figure(figsize=(8, max(5, 0.4 * len(plot_df))))
        plt.errorbar(
            x=plot_df['coef_z'],
            y=plot_df['var_label'],
            xerr=[err_below, err_above],
            fmt='o',
            capsize=3,
        )
        plt.axvline(0, linestyle='--', linewidth=1)  # zero-effect reference
        plt.xlabel("Effect on subjective Sleep-Score (per 1 SD change)")
        plt.title(r"$\bf{Mixed\text{-}Effects\ Estimates}$""\n(adjusted for age, gender (fixed-effects) and user (random-effect))")
        plt.tight_layout()
        plt.show()

### Cluster-Robust OLS

In [None]:
sns.set(style="whitegrid")

# ============================================
# 1. Configuration
# ============================================

# Wearable-derived predictors; each is regressed separately further down.
objective_vars = [
    'dur_asleep', 'sleep_eff', 'REM_pct', 'deep_pct', 'light_pct',
    'avg_hrv_sdnn', 'avg_hrv_rmssd', 'slope_rmssd', 'mesor_rmssd',
    'acrophase_rmssd', 'amplitude_rmssd', 'peak_time_rmssd',
]

base_cols = ['user_id', 'date', 'sleep', 'gender', 'age', 'provider']

# ============================================
# 2. Basic cleaning / typing
# ============================================

# Order-preserving de-duplication of the column list.
cols_to_keep = list(dict.fromkeys(base_cols + objective_vars))

# Work on a copy of `merged`: numeric outcome/covariate, categorical
# grouping columns, and only rows with both a user and a sleep score.
df = (
    merged[cols_to_keep]
    .copy()
    .assign(
        sleep=lambda frame: pd.to_numeric(frame['sleep'], errors='coerce'),
        age=lambda frame: pd.to_numeric(frame['age'], errors='coerce'),
    )
    .astype({'gender': 'category', 'provider': 'category', 'user_id': 'category'})
    .dropna(subset=['user_id', 'sleep'])
)

# ============================================
# 3. Standardize objective predictors
# ============================================

# Pooled z-scores; a predictor without usable variance gets an all-NaN column.
for v in objective_vars:
    center = df[v].mean(skipna=True)
    spread = df[v].std(skipna=True)
    if np.isnan(spread) or spread == 0:
        df[f"{v}_z"] = np.nan
        continue
    df[f"{v}_z"] = (df[v] - center) / spread

z_vars = [f"{v}_z" for v in objective_vars]

# ============================================
# 4. Descriptive table
# ============================================

desc_table = df.groupby('sleep')[objective_vars].agg(['mean', 'std', 'count'])
print("\n=== Descriptive statistics by subjective sleep score ===")
display(desc_table.style.set_caption("Objective metrics by subjective sleep score"))

# ============================================
# 5. Cluster-robust OLS regressions
# ============================================

# One OLS regression per predictor, with standard errors clustered by user;
# each successful fit contributes one record to `results`.
results = []

for v, zv in zip(objective_vars, z_vars):
    print(f"\n--- Processing {v} ---")

    # complete cases for this predictor only
    model_df = df[['sleep', 'age', 'gender', 'user_id', zv]].dropna()
    n_rows = model_df.shape[0]
    if n_rows < 20:
        print(f"Skipping {v}: insufficient data.")
        continue

    # regressors: the predictor plus whichever covariates actually vary
    terms = [zv]
    if model_df['age'].std() > 0:
        terms.append("age")
    if model_df['gender'].nunique() > 1:
        terms.append("C(gender)")
    formula = "sleep ~ " + " + ".join(terms)

    try:
        model = smf.ols(formula, data=model_df).fit(
            cov_type="cluster",
            cov_kwds={"groups": model_df["user_id"]},
        )

        coef = model.params.get(zv, np.nan)
        pval = model.pvalues.get(zv, np.nan)
        ci_low, ci_high = model.conf_int().loc[zv]

        results.append(dict(
            variable=v,
            coef_z=coef,
            ci_low=ci_low,
            ci_high=ci_high,
            p_value=pval,
            n_rows=n_rows,
            n_users=model_df['user_id'].nunique(),
        ))

    except Exception as e:
        # best-effort: report the failure and move on to the next predictor
        print(f" Model for {v} failed: {e}")
        continue

results_df = pd.DataFrame(results)

# ============================================
# 6. Significance stars + pretty summary table
# ============================================

def sig_stars(p):
    """Map a p-value to a significance marker.

    Returns "***" below 0.001, "**" below 0.01, "*" below 0.05, "·" below
    0.10, and an empty string otherwise or when ``p`` is missing.
    """
    if pd.isna(p):
        return ""
    # thresholds are checked from strictest to loosest; first hit wins
    for cutoff, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*"), (0.10, "·")):
        if p < cutoff:
            return marker
    return ""

if results_df.empty:
    print("No models were successfully fitted.")
else:
    # annotate with significance markers and order by p-value
    results_df = results_df.assign(
        sig=results_df['p_value'].apply(sig_stars)
    ).sort_values('p_value')

    # "coef [low, high] stars", two decimals each
    results_df['effect (z-scaled)'] = [
        f"{coef:.2f} [{low:.2f}, {high:.2f}] {stars}"
        for coef, low, high, stars in zip(
            results_df['coef_z'], results_df['ci_low'],
            results_df['ci_high'], results_df['sig'],
        )
    ]

    summary_table = results_df[
        ['variable', 'effect (z-scaled)', 'p_value', 'n_rows', 'n_users']
    ]

    print("\n=== OLS results (clustered by user_id) ===")
    styled_summary = summary_table.style.format({'p_value': '{:.3g}'}).set_caption(
        "Cluster-robust effects of objective sleep metrics on subjective sleep"
    )
    display(styled_summary)

# ============================================
# 7. Forest plot of effect sizes
# ============================================

# Mapping from raw variable names to pretty labels (for y-tick labels only)
# Human-readable y-tick labels for the raw variable names.
var_label_map = dict(
    dur_asleep='Total Time Asleep',
    sleep_eff='Sleep Efficiency',
    REM_pct='REM Sleep (%)',
    deep_pct='Deep Sleep (%)',
    light_pct='Light Sleep (%)',
    avg_hrv_sdnn='Mean SDNN',
    avg_hrv_rmssd='Mean RMSSD',
    slope_rmssd='Slope RMSSD',
    mesor_rmssd='MESOR RMSSD',
    acrophase_rmssd='Acrophase RMSSD',
    amplitude_rmssd='Amplitude RMSSD',
    peak_time_rmssd='Peak Time RMSSD',
)

if not results_df.empty:
    # estimates ordered from most negative to most positive
    plot_df = results_df.dropna(subset=['coef_z']).copy().sort_values('coef_z')

    # pretty label where one exists, raw variable name otherwise
    plot_df['var_label'] = (
        plot_df['variable'].map(var_label_map).fillna(plot_df['variable'])
    )

    # asymmetric error bars: distance from estimate to each CI bound
    err_below = plot_df['coef_z'] - plot_df['ci_low']
    err_above = plot_df['ci_high'] - plot_df['coef_z']

    plt.figure(figsize=(8, max(5, 0.4 * len(plot_df))))
    plt.errorbar(
        x=plot_df['coef_z'],
        y=plot_df['var_label'],
        xerr=[err_below, err_above],
        fmt='o',
        capsize=3,
    )
    plt.axvline(0, linestyle='--', linewidth=1)  # zero-effect reference
    plt.xlabel("Effect on subjective Sleep-Score (per 1 SD change)")
    plt.title(r"$\bf{Cluster\text{-}robust\ OLS\ Estimates}$""\n(clustered by user_id)")
    plt.tight_layout()
    plt.show()

### Within-User OLS Regression (User Level)

In [None]:
sns.set(style="whitegrid")

# ============================================
# 1. Configuration
# ============================================

# Wearable-derived predictors evaluated for each individual user.
objective_vars = [
    'dur_asleep', 'sleep_eff', 'REM_pct', 'deep_pct', 'light_pct',
    'avg_hrv_sdnn', 'avg_hrv_rmssd', 'slope_rmssd', 'mesor_rmssd',
    'acrophase_rmssd', 'amplitude_rmssd', 'peak_time_rmssd',
]

base_cols = ['user_id', 'date', 'sleep', 'gender', 'age', 'provider']

# ============================================
# 2. Basic cleaning / typing (from merged)
# ============================================

# Work on a copy of `merged`: numeric sleep score, categorical grouping
# columns, and only rows with both a user and a sleep score.
df = (
    merged[base_cols + objective_vars]
    .copy()
    .assign(sleep=lambda frame: pd.to_numeric(frame['sleep'], errors='coerce'))
    .astype({'gender': 'category', 'provider': 'category', 'user_id': 'category'})
    .dropna(subset=['user_id', 'sleep'])
)

# How many nights per user?
user_counts = df['user_id'].value_counts()

# ============================================
# 3. Helpers: per-user models & display
# ============================================

def sig_stars(p):
    """Map a p-value to a significance marker.

    Returns "***" below 0.001, "**" below 0.01, "*" below 0.05, "·" below
    0.10, and an empty string otherwise or when ``p`` is missing.
    """
    if pd.isna(p):
        return ""
    # thresholds are checked from strictest to loosest; first hit wins
    for cutoff, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*"), (0.10, "·")):
        if p < cutoff:
            return marker
    return ""


def fit_user_models(df_user, objective_vars, min_nights=10):
    """Fit one simple OLS ``sleep ~ <predictor>_z`` per predictor for one user.

    Each predictor is z-scored using this user's own mean and standard
    deviation; a predictor with zero or undefined spread is treated as
    entirely missing. The input frame is not modified.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Rows of a single user; must contain 'sleep', 'user_id' and every
        column named in ``objective_vars``.
    objective_vars : list of str
        Predictor column names.
    min_nights : int, default 10
        Minimum number of complete (sleep, predictor) rows needed to fit.

    Returns
    -------
    pandas.DataFrame
        One row per fitted predictor with columns variable, coef_z, ci_low,
        ci_high, p_value, n_rows; empty (same columns) if nothing was fitted.
    """
    result_columns = ["variable", "coef_z", "ci_low", "ci_high", "p_value", "n_rows"]
    fitted = []

    for v in objective_vars:
        zv = f"{v}_z"

        # within-user standardisation of this predictor
        center = df_user[v].mean(skipna=True)
        spread = df_user[v].std(skipna=True)
        if spread == 0 or np.isnan(spread):
            z_values = np.nan
        else:
            z_values = (df_user[v] - center) / spread

        model_df = df_user[['sleep']].assign(**{zv: z_values}).dropna()

        # require at least min_nights for this user & variable
        if model_df.shape[0] < min_nights:
            continue

        # skip if predictor has ~0 variance even after z-scoring
        if model_df[zv].std() < 1e-6:
            continue

        try:
            model = smf.ols(f"sleep ~ {zv}", data=model_df).fit()
            ci_low, ci_high = model.conf_int().loc[zv]
            fitted.append(dict(
                variable=v,
                coef_z=model.params.get(zv, np.nan),
                ci_low=ci_low,
                ci_high=ci_high,
                p_value=model.pvalues.get(zv, np.nan),
                n_rows=model_df.shape[0],
            ))
        except Exception as e:
            # best-effort: report and continue with the next predictor
            print(f"User {df_user['user_id'].iloc[0]} – model for {v} failed: {e}")

    if fitted:
        return pd.DataFrame(fitted)
    return pd.DataFrame(columns=result_columns)

# Mapping from raw variable names to pretty labels (for y-tick labels only)
# Human-readable y-tick labels for the raw variable names.
var_label_map = dict(
    dur_asleep='Total Time Asleep',
    sleep_eff='Sleep Efficiency',
    REM_pct='REM Sleep (%)',
    deep_pct='Deep Sleep (%)',
    light_pct='Light Sleep (%)',
    avg_hrv_sdnn='Mean SDNN',
    avg_hrv_rmssd='Mean RMSSD',
    slope_rmssd='Slope RMSSD',
    mesor_rmssd='MESOR RMSSD',
    acrophase_rmssd='Acrophase RMSSD',
    amplitude_rmssd='Amplitude RMSSD',
    peak_time_rmssd='Peak Time RMSSD',
)


def show_user_dashboard(user_id, min_nights=10):
    """
    Show the effect table and forest plot for a single user.

    Filters the module-level ``df`` to the rows of ``user_id``, obtains the
    per-variable results from ``fit_user_models`` (called with the
    module-level ``objective_vars``), displays a styled summary table sorted
    by p-value and draws a forest plot sorted by coefficient.

    Parameters
    ----------
    user_id : hashable
        Value compared against ``df['user_id']``. It does not have to be a
        string; it is converted with ``str`` only for the plot title.
    min_nights : int, default 10
        Minimum number of rows the user must have. Also forwarded to
        ``fit_user_models``.

    Returns
    -------
    None
        Output is printed / displayed. If the user has fewer than
        ``min_nights`` rows, or no model results come back, a message is
        printed and nothing is plotted.
    """
    df_user = df[df['user_id'] == user_id].copy()
    n_nights = df_user.shape[0]

    if n_nights < min_nights:
        print(f"User {user_id} has only {n_nights} nights (< {min_nights}).")
        return

    res_df = fit_user_models(df_user, objective_vars, min_nights=min_nights)
    if res_df.empty:
        print(f"No valid models for user {user_id} with min_nights={min_nights}.")
        return

    # Significance marker as produced by sig_stars (defined elsewhere in the notebook).
    res_df['sig'] = res_df['p_value'].apply(sig_stars)
    res_df = res_df.sort_values('p_value')

    # One compact text column: "coef [ci_low, ci_high] marker".
    res_df['effect (z-scaled)'] = res_df.apply(
        lambda r: f"{r['coef_z']:.2f} [{r['ci_low']:.2f}, {r['ci_high']:.2f}] {r['sig']}",
        axis=1
    )

    summary = res_df[['variable','effect (z-scaled)','p_value','n_rows']]

    print(f"\n=== Per-user effects for {user_id} ===")
    print(f"Nights (all): {n_nights}")
    display(
        summary.style
        .format({'p_value': '{:.3g}'})
        .set_caption(f"Effect of objective metrics on sleep for user {user_id}")
    )

    # Forest plot (sort_values returns a new frame, so adding a column is safe)
    plot_df = res_df.sort_values('coef_z')

    # Use pretty labels for y-axis ticks, fall back to raw name if missing
    plot_df['var_label'] = plot_df['variable'].map(var_label_map).fillna(plot_df['variable'])

    # Shorten the id for the title. Converting with str() first keeps
    # non-string ids (e.g. integers) from raising on slicing, and the
    # ellipsis is only added when something was actually cut off.
    uid_text = str(user_id)
    uid_short = f"{uid_text[:7]}..." if len(uid_text) > 7 else uid_text

    plt.figure(figsize=(8, max(5, 0.4 * len(plot_df))))
    plt.errorbar(
        x=plot_df['coef_z'],
        y=plot_df['var_label'],
        # Asymmetric error bars: distance from the estimate to each CI bound.
        xerr=[plot_df['coef_z'] - plot_df['ci_low'],
              plot_df['ci_high'] - plot_df['coef_z']],
        fmt='o',
        capsize=3
    )
    plt.axvline(0, linestyle='--', linewidth=1)
    plt.xlabel("Effect on subjective Sleep-Score (per 1 SD change, within-user)")
    plt.title(r"$\bf{Per\text{-}user\ Effect\ Estimates}$"f"\nUser: {uid_short}")
    plt.tight_layout()
    plt.show()

# ============================================
# 4. Interactive controls: min-nights input + user dropdown
# ============================================

# Integer text box holding the minimum-nights threshold (starts at 20).
min_nights_widget = widgets.IntText(
    value=20, description='Min Nights:', layout=widgets.Layout(width="200px"),
)

# User selector; created empty and filled in by update_user_options().
user_widget = widgets.Dropdown(
    options=[], description='User:', layout=widgets.Layout(width="300px"),
)

# Output area that the callbacks print messages and the dashboard into.
out = widgets.Output()

def update_user_options(change=None):
    """
    Rebuild the user dropdown for the current min-nights value and re-render.

    Reads the threshold from ``min_nights_widget``, keeps the users whose
    entry in the module-level ``user_counts`` is at least that threshold,
    and offers them in ``user_widget``. The previously selected user stays
    selected when still eligible; otherwise the first eligible user is
    chosen. Finishes by calling ``update_display()``.

    Parameters
    ----------
    change : dict or None, optional
        Change payload passed by the widget observer; unused. Defaults to
        ``None`` so the function can also be called directly.

    Returns
    -------
    None
    """
    try:
        min_nights = int(min_nights_widget.value)
    except Exception:
        min_nights = 0

    if min_nights < 1:
        # Empty the dropdown BEFORE writing the message: clearing the options
        # changes the dropdown value, and the value observer (update_display)
        # clears `out`, which would otherwise wipe the message printed below.
        user_widget.options = []
        user_widget.value = None
        with out:
            out.clear_output()
            print("Please enter a positive integer for min nights.")
        return

    eligible = user_counts[user_counts >= min_nights]

    if eligible.empty:
        user_widget.options = []
        user_widget.value = None
        with out:
            out.clear_output()
            print(f"No users with at least {min_nights} nights.")
        return

    # Capture the selection before replacing the options: assigning new
    # options makes the dropdown select its first entry, so reading the
    # value afterwards would never reflect the user's previous choice.
    previous_user = user_widget.value

    options = [(str(uid), uid) for uid in eligible.index]
    user_widget.options = options

    eligible_ids = [v for _, v in options]
    if previous_user in eligible_ids:
        # Still eligible: restore the previous choice.
        user_widget.value = previous_user
    elif user_widget.value not in eligible_ids:
        user_widget.value = options[0][1]

    update_display()

def update_display(change=None):
    """
    Render the dashboard for the currently selected user into ``out``.

    Prints "No user selected." when the dropdown has no value; otherwise
    clears the output area and calls ``show_user_dashboard`` with the
    current min-nights value (0 if that value cannot be converted to int).

    Parameters
    ----------
    change : dict or None, optional
        Change payload passed by the widget observer; unused.
    """
    selected_user = user_widget.value

    # Guard: nothing to render without a selection.
    if selected_user is None:
        with out:
            out.clear_output()
            print("No user selected.")
        return

    try:
        threshold = int(min_nights_widget.value)
    except Exception:
        threshold = 0

    with out:
        # wait=True defers the clear until new output arrives.
        out.clear_output(wait=True)
        show_user_dashboard(selected_user, min_nights=threshold)

# Attach callbacks: a change of the min-nights value rebuilds the user list,
# a change of the selected user re-renders the dashboard.
min_nights_widget.observe(update_user_options, names='value')
user_widget.observe(update_display, names='value')

# Initialize: fill the dropdown for the starting min-nights value and render
# once. Done after the observers are attached and before the widgets are shown.
update_user_options()

# Show controls + output area (input box, dropdown, then the rendered dashboard)
display(min_nights_widget, user_widget, out)