In [21]:
import pandas as pd
from variables.variables import *

df = pd.read_csv(DATAFRAME_PATH)

In [27]:

import seaborn as sns

subset = df[(df['hours_since_last_caffeine_target_day'] >= 0) & (df['hours_since_last_caffeine_target_day'] <= 16)]

def treated(value, median):
    """Assign treatment based on a global median threshold
    Returns:
        True if value >= median, False if less, or None if value is None.
    """
    if value is None:
        return None
    return value >= median

subset['treated'] = [treated(i, 6) for i in subset['hours_since_last_caffeine_target_day']]

subset = subset.dropna(subset=['treated'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['treated'] = [treated(i, 6) for i in subset['hours_since_last_caffeine_target_day']]


In [28]:
subset['treated'].value_counts()

treated
True     2341
False    1075
Name: count, dtype: int64

In [36]:
subset[['hours_since_last_caffeine_target_day', 'gender', 'treated']].groupby('treated').mean()

Unnamed: 0_level_0,hours_since_last_caffeine_target_day,gender
treated,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3.605754,0.467907
True,10.930633,0.466894


In [35]:
subset[['hours_since_last_caffeine_target_day', 'gender', 'treated']].groupby('treated').std()

Unnamed: 0_level_0,hours_since_last_caffeine_target_day,gender
treated,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1.52815,0.499201
True,3.13533,0.499009


### Baseline characteristics

In [7]:
# Baseline characteristics
unique_id = df['RegistrationCode'].nunique()
male, female = df[['RegistrationCode', 'gender']].drop_duplicates()['gender'].value_counts()
age_mean, age_std = df['age'].mean(), df['age'].std()
bmi_mean, bmi_std = df['bmi'].mean(), df['bmi'].std()

min_date, max_date = df['Date'].min(), df['Date'].max()
min_date = pd.to_datetime(min_date).strftime("%B %Y")
max_date = pd.to_datetime(max_date).strftime("%B %Y")

baseline_characteristics_text = f"""
We analyzed {len(df)} person-nights with concurrent dietary logs and WatchPAT-measured sleep recordings 
from {unique_id} adults enrolled in the HPP, a deeply phenotyped longitudinal cohort in Israel (Shilo et al. 2021). 
The cohort consisted of {male} ({male / unique_id * 100:.1f}%) male and {female} ({female / unique_id * 100:.1f}%) female participants 
and included sleep and nutrition data collected between {min_date} and {max_date}. 
Participants were predominantly healthy, with a mean age of {age_mean:.1f} ± {age_std:.1f} years and a mean BMI of {bmi_mean:.1f} ± {bmi_std:.1f} kg/m². 
Days with implausible total energy intake (<500 or >5,000 kcal), fewer than three logged food items, or total sleep duration outside 300–600 minutes were excluded. 
This dataset provides one of the largest real-world resources for studying how day-to-day nutritional variation influences objectively measured sleep architecture. 
"""

print(baseline_characteristics_text)


We analyzed 4793 person-nights with concurrent dietary logs and WatchPAT-measured sleep recordings 
from 3598 adults enrolled in the HPP, a deeply phenotyped longitudinal cohort in Israel (Shilo et al. 2021). 
The cohort consisted of 1829 (50.8%) male and 1769 (49.2%) female participants 
and included sleep and nutrition data collected between February 2020 and April 2025. 
Participants were predominantly healthy, with a mean age of 52.7 ± 7.9 years and a mean BMI of 26.0 ± 4.1 kg/m². 
Days with implausible total energy intake (<500 or >5,000 kcal), fewer than three logged food items, or total sleep duration outside 300–600 minutes were excluded. 
This dataset provides one of the largest real-world resources for studying how day-to-day nutritional variation influences objectively measured sleep architecture. 



### Global landscape of nutrition and sleep associations


In [8]:
tst_mean = df['total_sleep_time_minutes'].mean()
deep_mean = df['percent_of_deep_sleep_time'].mean()
rem_mean = df['percent_of_rem_sleep_time'].mean()
light_mean = df['percent_of_light_sleep_time'].mean()
se = df['sleep_efficiency'].mean()
sol = df['sleep_latency_minutes'].mean()

total_energy_kcal = df['total_energy_kcal'].mean()
protein_g = df['protein_g'].mean()
fat_g = df['fat_g'].mean()
carbs_g = df['carbs_g'].mean()

habits_text = f"""
The analysis included {len(df)} logged days from {unique_id} participants. 
On average, participants slept {tst_mean:.1f} hours per night, with sleep composition 
consisting of {deep_mean:.1f}% deep sleep, {rem_mean:.1f}% REM sleep, and {light_mean:.1f}% light sleep. 
Mean sleep efficiency was {se:.1f}%, and average sleep latency was {sol:.0f} minutes. 
Mean daily energy intake was {total_energy_kcal:.0f} kcal, with an average macronutrient intake 
of  {protein_g:.1f} g protein, {fat_g:.1f} g fat, and {carbs_g:.1f} g carbohydrates, corresponding to 
{protein_g * 4 / total_energy_kcal * 100:.1f}%, {fat_g * 9 / total_energy_kcal * 100:.1f}%, and {carbs_g * 4 / total_energy_kcal * 100:.1f}% of total energy, respectively.
"""

print(habits_text)


The analysis included 4793 logged days from 3598 participants. 
On average, participants slept 378.3 hours per night, with sleep composition 
consisting of 17.6% deep sleep, 24.0% REM sleep, and 58.5% light sleep. 
Mean sleep efficiency was 88.8%, and average sleep latency was 15 minutes. 
Mean daily energy intake was 1815 kcal, with an average macronutrient intake 
of  79.8 g protein, 77.4 g fat, and 200.2 g carbohydrates, corresponding to 
17.6%, 38.4%, and 44.1% of total energy, respectively.



### Table 1. Baseline characteristics of study participants.

In [11]:
import pandas as pd
import numpy as np
from typing import Sequence, Mapping


def baseline_characteristics_table(
    df: pd.DataFrame,
    features: Sequence[str],
    gender_col: str = "gender",
    gender_labels = None,
    digits: int = 2,
) -> pd.DataFrame:
    """
    Create a baseline characteristics table (mean ± SD),
    split by gender, suitable for publication.

    Returns a DataFrame with rows = variables,
    columns = Overall / Male / Female.
    """
    gender_labels = gender_labels or {
        "male": "Male",
        "female": "Female",
        "M": "Male",
        "F": "Female",
    }

    def _mean_sd(x: pd.Series) -> str:
        x = pd.to_numeric(x, errors="coerce").dropna()
        if x.empty:
            return "NA"
        return f"{x.mean():.{digits}f} ± {x.std():.{digits}f}"

    rows = []

    for var in features:
        if var not in df.columns:
            continue

        row = {
            "Variable": var,
            "Overall": _mean_sd(df[var]),
        }

        for g_raw, g_label in gender_labels.items():
            mask = df[gender_col] == g_raw
            if mask.any():
                row[g_label] = _mean_sd(df.loc[mask, var])

        rows.append(row)

    out = (
        pd.DataFrame(rows)
        .set_index("Variable")
        #.sort_index()
    )

    return out

In [13]:
features = [
"age",
"bmi",
"protein_g",
"fat_g",
"carbs_g",
"sugars_g",
"fiber_g",
"sat_fat_g",
"hour_first_meal",
"hour_last_meal",
"night_calories_pct",
"last_meal_calories",
"last_meal_calories_pct",
"unique_foods_count",
"unique_plant_based_foods_count",
"fiber_g_per_1000_kcal",
"whole_foods_categories_energy",
"plant_based_whole_foods_categories_energy",
"whole_dairy_categories_energy",
"processed_categories_energy",
"fish_meat_eggs_categories_energy",
"total_energy_kcal",

"whole_food_categories_ratio",
"processed_categories_ratio",

"prot_pct",
"fat_pct",
"carb_pct",

"sleep_hour",
"percent_of_deep_sleep_time",
"percent_of_rem_sleep_time",
"percent_of_light_sleep_time",
"number_of_wakes",
"heart_rate_mean_during_sleep",
"sleep_efficiency",
"total_sleep_time_minutes",
"sleep_latency_minutes",
"total_wake_time_after_sleep_onset_minutes",

"med_Aspirin",
"med_Atozet",
"med_Cardiloc",
"med_Cipralex",
"med_Curcumin",
"med_Eltroxin",
"med_Euthyrox",       
"med_Lipitor",
"med_Litorva",
"med_Magnesium",
"med_Micropirin",
"med_Nexium",
"med_Omega 3",
"med_Stator",
"med_Tritace",
"med_Vitamin B12",
"med_Vitamin C",
"med_Vitamin D",
"med_calcium",
"med_gentle iron",

"Spinal pain",
"Haemorrhoids",
"Attention deficit hyperactivity disorder",
"Essential hypertension",
"hypothyroidism",
"Obesity",
"Migraine",
"Anxiety or fear-related disorders",
"Depressive disorders",
"Obstructive sleep apnoea",
"Iron Deficiency Anemia",
"Atopic eczema",
"Polycystic ovary syndrome",
"Heart valve diseases, unspecified",
"Endometriosis",
"Atrial fibrillation",
"Central sleep apnoeas",
"Chronic insomnia",
"Restless legs syndrome",
"Generalised anxiety disorder",
"Bipolar or related disorders",
"Post traumatic stress disorder",
"Chronic obstructive pulmonary disease",
"Asthma",
"Gastro-oesophageal reflux disease",
"Chronic widespread pain",
"Osteoarthritis",
]

baseline_table = baseline_characteristics_table(
    df=df,
    features=features,
    gender_col="gender",
    gender_labels={1:"Male", 0:"Female"},
    digits=2,
)
baseline_table

Unnamed: 0_level_0,Overall,Male,Female
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
age,52.68 ± 7.91,52.06 ± 7.95,53.26 ± 7.84
bmi,25.99 ± 4.05,26.56 ± 3.79,25.45 ± 4.22
protein_g,79.75 ± 38.27,89.64 ± 40.74,70.39 ± 33.16
fat_g,77.38 ± 38.00,84.83 ± 40.38,70.32 ± 34.15
carbs_g,200.22 ± 100.46,216.77 ± 103.26,184.55 ± 95.14
...,...,...,...
Chronic obstructive pulmonary disease,0.00 ± 0.05,0.00 ± 0.05,0.00 ± 0.05
Asthma,0.05 ± 0.22,0.05 ± 0.22,0.05 ± 0.22
Gastro-oesophageal reflux disease,0.01 ± 0.10,0.01 ± 0.11,0.01 ± 0.10
Chronic widespread pain,0.01 ± 0.11,0.01 ± 0.07,0.02 ± 0.13


In [14]:
baseline_table.to_csv(f"{paper_tables_folder}/table_1_baseline_characteristics.csv")