# Download the data summary table (same as "Click here to download a csv of the current summary statistics for your study.")


In [58]:
import sys
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
#Use this cell if you've moved this notebook somewhere else
#sys.path.insert(0, "/path/to/repo/beiwe/code")


In [59]:
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [60]:
import data_summaries

In [61]:
# Load credentials from keyring file
import data_summaries
kr = data_summaries.read_keyring("/Users/zhusiyao/Documents/Beiwe Projects/data_volume_summaries/keyring_studies.py")

# Extract credentials from keyring
ACCESS_KEY = os.environ.get('BEIWE_ACCESS_KEY')
SECRET_KEY = os.environ.get('BEIWE_SECRET_KEY')
API_URL_BASE = os.environ.get('BEIWE_URL', 'https://studies.beiwe.org/')

print(f"Using API URL: {API_URL_BASE}")
print(f"Access key loaded: {ACCESS_KEY[:10]}..." if ACCESS_KEY else "No access key found")


Using API URL: https://studies.beiwe.org/
Access key loaded: GQVW5/ybRd...


In [62]:
data_summaries_file_path = "data_volume.csv"

In [63]:
# Functions to download and filter participant data
import json

def make_session(total_retries=3, backoff=0.5):
    """Create a requests session with retry logic"""
    s = requests.Session()
    retry = Retry(
        total=total_retries,
        connect=total_retries,
        read=total_retries,
        backoff_factor=backoff,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["POST"]),
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry)
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    return s

def get_participant_table_data(study_id, 
                               data_format="csv",   # "csv", "json", "json_table"
                               out_path=None,
                               timeout=120):
    """
    Calls /get-participant-table-data/v1 and returns or saves the result.
    """
    if not API_URL_BASE.endswith("/"):
        base = API_URL_BASE + "/"
    else:
        base = API_URL_BASE
    url = base + "get-participant-table-data/v1"

    payload = {
        "access_key": ACCESS_KEY,
        "secret_key": SECRET_KEY,
        "study_id": study_id,
        "data_format": data_format,
    }

    session = make_session()
    resp = session.post(url, data=payload, timeout=timeout, allow_redirects=False)

    if resp.status_code != 200:
        raise RuntimeError(f"HTTP {resp.status_code}: {resp.text[:300]}")

    # Handle CSV
    if data_format == "csv":
        out_path = out_path or "participant_table.csv"
        with open(out_path, "wb") as f:
            f.write(resp.content)
        print("Saved CSV to:", os.path.abspath(out_path))
        return out_path

    # Handle JSON
    obj = resp.json()
    if out_path:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=2)
        print("Saved JSON to:", os.path.abspath(out_path))
        return out_path
    else:
        return obj

def get_active_participants(study_id):
    """
    Download participant data and filter out 'Not Registered' and 'Permanently Retired' participants.
    Returns a list of active participant IDs.
    """
    print("Downloading participant data...")
    
    # Get participant data as JSON
    participants_data = get_participant_table_data(study_id, data_format="json")
    
    # Filter out inactive participants
    active_participants = []
    excluded_count = 0
    
    for participant in participants_data:
        status = participant.get('Status', '')
        participant_id = participant.get('Patient ID', '')
        
        if status in ['Not Registered', 'Permanently Retired']:
            excluded_count += 1
            print(f"Excluding participant {participant_id} with status: {status}")
        else:
            active_participants.append(participant_id)
    
    print(f"\nFiltered out {excluded_count} participants with 'Not Registered' or 'Permanently Retired' status")
    print(f"Active participants remaining: {len(active_participants)}")
    
    return active_participants

In [64]:
# Get active participants (filtered)
study_id = "m4z54N5SU7Eqq2LbwmxQd2UN"
active_participants = get_active_participants(study_id)

Downloading participant data...
Excluding participant 1anurea6 with status: Permanently Retired
Excluding participant 1bllhfi7 with status: Permanently Retired
Excluding participant 1f8ujz41 with status: Permanently Retired
Excluding participant 1ib9r56g with status: Not Registered
Excluding participant 1sshhk6u with status: Permanently Retired
Excluding participant 29dam7cc with status: Permanently Retired
Excluding participant 33yib9v1 with status: Permanently Retired
Excluding participant 3hpam6bh with status: Not Registered
Excluding participant 3syylhuo with status: Permanently Retired
Excluding participant 3vsmm961 with status: Permanently Retired
Excluding participant 3xwxyr8p with status: Permanently Retired
Excluding participant 44ggnee7 with status: Permanently Retired
Excluding participant 4a517z7e with status: Not Registered
Excluding participant 4ahuk17u with status: Permanently Retired
Excluding participant 4ggpbwfj with status: Permanently Retired
Excluding participant 4

In [65]:
import os
study_id =  "m4z54N5SU7Eqq2LbwmxQd2UN"
data_summaries.get_data_summaries(study_id,
        output_file_path = data_summaries_file_path,
        keyring = kr)

# Downstream Analysis


In [66]:
# Please set lookback window
x = 7  # lookback window in days

In [67]:
import pandas as pd
from pathlib import Path

# ========= user parameters =========
CSV_PATH = "data_volume.csv"
tz = "America/New_York"

# ========= load =========
csv_path = Path(CSV_PATH)
df = pd.read_csv(csv_path)

required = {"date", "study_id", "beiwe_accelerometer_bytes", "beiwe_gps_bytes", "participant_id"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"CSV missing required columns: {sorted(missing)}")

df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"])
df["beiwe_accelerometer_bytes"] = pd.to_numeric(df["beiwe_accelerometer_bytes"], errors="coerce").fillna(0)
df["beiwe_gps_bytes"] = pd.to_numeric(df["beiwe_gps_bytes"], errors="coerce").fillna(0)

if study_id is not None:
    df = df[df["study_id"] == study_id]

# ========= FILTER OUT INACTIVE PARTICIPANTS =========
print(f"Total participants in data: {df['participant_id'].nunique()}")
df = df[df["participant_id"].isin(active_participants)]
print(f"Active participants after filtering: {df['participant_id'].nunique()}")

# ========= time window =========
# Exclude today - look back from yesterday and back x days
yesterday = pd.Timestamp.now(tz=tz).normalize() - pd.Timedelta(days=1)
start = yesterday - pd.Timedelta(days=x - 1)  # inclusive [start, yesterday]

# normalize to local calendar dates (naive) for day-level logic
# Handle both naive and timezone-aware datetimes
if df["date"].dt.tz is not None:
    # If timezone-aware, convert to naive and normalize
    df["local_date"] = df["date"].dt.tz_localize(None).dt.normalize()
else:
    # If already naive, just normalize (sets time to 00:00:00)
    df["local_date"] = df["date"].dt.normalize()

# Convert start and yesterday to naive datetimes for comparison
# Convert start and yesterday to naive datetimes for comparison
# Normalize to midnight (00:00:00) to ensure exact date matching
if start.tz is not None:
    start_naive = start.tz_localize(None).normalize()
else:
    start_naive = start.normalize()

if yesterday.tz is not None:
    yesterday_naive = yesterday.tz_localize(None).normalize()
else:
    yesterday_naive = yesterday.normalize()


mask = (df["local_date"] >= start_naive) & (df["local_date"] <= yesterday_naive)
dfw = df.loc[mask, ["participant_id", "local_date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes"]].copy()

# ========= aggregate to per-participant per-day =========
daily = (
    dfw.groupby(["participant_id", "local_date"], as_index=False)[
        ["beiwe_accelerometer_bytes", "beiwe_gps_bytes"]
    ].sum()
    .rename(columns={"local_date": "date"})
)

# (ok to keep; will be recomputed later anyway)
daily["has_accel"] = daily["beiwe_accelerometer_bytes"] > 0
daily["has_gps"]   = daily["beiwe_gps_bytes"] > 0
daily["has_both"]  = daily["has_accel"] & daily["has_gps"]
daily["missing_any"] = ~daily["has_both"]

# ========= ensure full date grid per participant =========
# Use naive datetimes for the date range (already computed above)
# Normalize all dates to ensure they are at midnight (00:00:00) for proper matching
all_days = pd.date_range(start=start_naive, end=yesterday_naive, freq="D").normalize()
all_participants = df["participant_id"].dropna().drop_duplicates().sort_values()
grid = pd.MultiIndex.from_product([all_participants, all_days], names=["participant_id", "date"]).to_frame(index=False)

daily_full = grid.merge(daily, on=["participant_id", "date"], how="left")

# fill bytes first
daily_full[["beiwe_accelerometer_bytes", "beiwe_gps_bytes"]] = (
    daily_full[["beiwe_accelerometer_bytes", "beiwe_gps_bytes"]].fillna(0)
)

# ========= recompute booleans after fillna =========
daily_full["has_accel"] = daily_full["beiwe_accelerometer_bytes"] > 0
daily_full["has_gps"]   = daily_full["beiwe_gps_bytes"] > 0
daily_full["has_both"]  = daily_full["has_accel"] & daily_full["has_gps"]
daily_full["missing_any"] = ~daily_full["has_both"]
# ================================================

daily_full = daily_full.sort_values(["participant_id", "date"]).reset_index(drop=True)

# ========= save =========
out_path = f"daily_sensor_collection_last_{x}_days.csv"
daily_full.to_csv(out_path, index=False)

print(
    f"Saved day-level collection matrix for last {x} days to: {out_path}\n"
    f"Columns: participant_id, date, beiwe_accelerometer_bytes, beiwe_gps_bytes, has_accel, has_gps, has_both, missing_any"
)


Total participants in data: 341
Active participants after filtering: 182
Saved day-level collection matrix for last 7 days to: daily_sensor_collection_last_7_days.csv
Columns: participant_id, date, beiwe_accelerometer_bytes, beiwe_gps_bytes, has_accel, has_gps, has_both, missing_any


In [68]:
# ========= DEBUG: Check data for participant 15l9hyfu =========
import pandas as pd

# Load raw data
debug_df = pd.read_csv("data_volume.csv")
debug_df["date"] = pd.to_datetime(debug_df["date"], errors="coerce")
debug_df = debug_df.dropna(subset=["date"])

# Filter for participant 15l9hyfu
participant_id_debug = "15l9hyfu"
debug_pid_data = debug_df[debug_df["participant_id"] == participant_id_debug].copy()

print(f"=== DEBUG: Data for participant {participant_id_debug} ===\n")
print(f"Total rows in data_volume.csv for {participant_id_debug}: {len(debug_pid_data)}")

if len(debug_pid_data) > 0:
    # Show last 10 days of data
    debug_pid_data_sorted = debug_pid_data.sort_values("date", ascending=False).head(10)
    print(f"\nLast 10 days of data:")
    print(debug_pid_data_sorted[["date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes", "timezone", "study_id"]].to_string())
    
    # Check last 7 days specifically
    from datetime import datetime, timedelta
    tz = "America/New_York"
    yesterday = pd.Timestamp.now(tz=tz).normalize() - pd.Timedelta(days=1)
    start = yesterday - pd.Timedelta(days=6)  # last 7 days (inclusive)
    
    print(f"\n=== Checking last 7 days (from {start.strftime('%Y-%m-%d')} to {yesterday.strftime('%Y-%m-%d')}) ===")
    
    # Normalize dates for comparison
    if debug_pid_data["date"].dt.tz is not None:
        debug_pid_data["local_date"] = debug_pid_data["date"].dt.tz_localize(None).dt.normalize()
    else:
        debug_pid_data["local_date"] = debug_pid_data["date"].dt.normalize()
    
    start_naive = start.tz_localize(None) if start.tz is not None else start
    yesterday_naive = yesterday.tz_localize(None) if yesterday.tz is not None else yesterday
    
    debug_last7 = debug_pid_data[
        (debug_pid_data["local_date"] >= start_naive) & 
        (debug_pid_data["local_date"] <= yesterday_naive)
    ].copy()
    
    print(f"\nRows in last 7 days: {len(debug_last7)}")
    if len(debug_last7) > 0:
        print("\nData in last 7 days:")
        print(debug_last7[["local_date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes", "timezone"]].to_string())
        
        # Aggregate by date
        daily_debug = debug_last7.groupby("local_date", as_index=False).agg({
            "beiwe_accelerometer_bytes": "sum",
            "beiwe_gps_bytes": "sum"
        })
        print(f"\nAggregated by date (last 7 days):")
        print(daily_debug.to_string())
        
        # Check what the daily_full dataframe has
        print(f"\n=== Checking daily_sensor_collection_last_7_days.csv ===")
        try:
            daily_full_check = pd.read_csv("daily_sensor_collection_last_7_days.csv")
            daily_full_pid = daily_full_check[daily_full_check["participant_id"] == participant_id_debug].copy()
            print(f"\nRows for {participant_id_debug} in daily_sensor_collection_last_7_days.csv: {len(daily_full_pid)}")
            if len(daily_full_pid) > 0:
                print("\nData in daily_sensor_collection_last_7_days.csv:")
                print(daily_full_pid[["date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes", "has_accel", "has_gps", "has_both"]].to_string())
            else:
                print(f"\n⚠️ NO DATA found for {participant_id_debug} in daily_sensor_collection_last_7_days.csv!")
        except FileNotFoundError:
            print("\n⚠️ daily_sensor_collection_last_7_days.csv not found. Run the previous cell first.")
    else:
        print(f"\n⚠️ NO DATA found for {participant_id_debug} in last 7 days!")
        print(f"\nAll available dates for this participant:")
        all_dates = debug_pid_data.sort_values("date")["local_date"].dt.strftime("%Y-%m-%d").unique()
        print(f"Dates available: {', '.join(all_dates[:20])}")
        if len(all_dates) > 20:
            print(f"... and {len(all_dates) - 20} more dates")
else:
    print(f"\n⚠️ NO DATA found for participant {participant_id_debug} in data_volume.csv at all!")
    print(f"\nAvailable participants in data_volume.csv:")
    all_participants = debug_df["participant_id"].unique()
    print(f"Total participants: {len(all_participants)}")
    similar = [p for p in all_participants if "15l9" in str(p).lower() or "l9hy" in str(p).lower()]
    if similar:
        print(f"\nSimilar participant IDs: {similar[:10]}")


=== DEBUG: Data for participant 15l9hyfu ===

Total rows in data_volume.csv for 15l9hyfu: 760

Last 10 days of data:
           date  beiwe_accelerometer_bytes  beiwe_gps_bytes timezone                  study_id
78   2025-11-03                 14192962.0          37841.0      EST  m4z54N5SU7Eqq2LbwmxQd2UN
224  2025-11-02                 21589688.0         170500.0      EST  m4z54N5SU7Eqq2LbwmxQd2UN
318  2025-11-01                 35481957.0         195529.0      EDT  m4z54N5SU7Eqq2LbwmxQd2UN
476  2025-10-31                 38910140.0         153011.0      EDT  m4z54N5SU7Eqq2LbwmxQd2UN
640  2025-10-30                 29072095.0         150739.0      EDT  m4z54N5SU7Eqq2LbwmxQd2UN
801  2025-10-29                  3271561.0          77761.0      EDT  m4z54N5SU7Eqq2LbwmxQd2UN
913  2025-10-28                 20953559.0          84376.0      EDT  m4z54N5SU7Eqq2LbwmxQd2UN
1065 2025-10-27                 25954154.0          88637.0      EDT  m4z54N5SU7Eqq2LbwmxQd2UN
1212 2025-10-26             

In [69]:
# ========= DEBUG: Check merge issue for participant 15l9hyfu =========
# Re-run the processing logic step by step to see where the merge fails
import pandas as pd
from pathlib import Path

# Replicate the exact logic from the main cell
CSV_PATH = "data_volume.csv"
tz = "America/New_York"
x = 7  # lookback window
study_id = "m4z54N5SU7Eqq2LbwmxQd2UN"

# Load data
df_debug = pd.read_csv(CSV_PATH)
df_debug["date"] = pd.to_datetime(df_debug["date"], errors="coerce")
df_debug = df_debug.dropna(subset=["date"])
df_debug["beiwe_accelerometer_bytes"] = pd.to_numeric(df_debug["beiwe_accelerometer_bytes"], errors="coerce").fillna(0)
df_debug["beiwe_gps_bytes"] = pd.to_numeric(df_debug["beiwe_gps_bytes"], errors="coerce").fillna(0)
df_debug = df_debug[df_debug["study_id"] == study_id]
df_debug = df_debug[df_debug["participant_id"].isin(active_participants)]

# Time window
yesterday = pd.Timestamp.now(tz=tz).normalize() - pd.Timedelta(days=1)
start = yesterday - pd.Timedelta(days=x - 1)

# Normalize dates
if df_debug["date"].dt.tz is not None:
    df_debug["local_date"] = df_debug["date"].dt.tz_localize(None).dt.normalize()
else:
    df_debug["local_date"] = df_debug["date"].dt.normalize()

start_naive = start.tz_localize(None) if start.tz is not None else start
yesterday_naive = yesterday.tz_localize(None) if yesterday.tz is not None else yesterday

# Filter
mask = (df_debug["local_date"] >= start_naive) & (df_debug["local_date"] <= yesterday_naive)
dfw_debug = df_debug.loc[mask, ["participant_id", "local_date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes"]].copy()

# Filter for participant 15l9hyfu
participant_debug = "15l9hyfu"
dfw_pid = dfw_debug[dfw_debug["participant_id"] == participant_debug].copy()
print(f"=== Step 1: Filtered data for {participant_debug} ===")
print(f"Rows after filtering: {len(dfw_pid)}")
if len(dfw_pid) > 0:
    print("\nFiltered data:")
    print(dfw_pid[["participant_id", "local_date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes"]])
    print(f"\nDate types:")
    print(f"local_date dtype: {dfw_pid['local_date'].dtype}")
    print(f"local_date sample: {dfw_pid['local_date'].iloc[0]}")
    print(f"local_date type: {type(dfw_pid['local_date'].iloc[0])}")

# Aggregate
daily_debug = (
    dfw_debug.groupby(["participant_id", "local_date"], as_index=False)[
        ["beiwe_accelerometer_bytes", "beiwe_gps_bytes"]
    ].sum()
    .rename(columns={"local_date": "date"})
)

daily_pid = daily_debug[daily_debug["participant_id"] == participant_debug].copy()
print(f"\n=== Step 2: Aggregated data for {participant_debug} ===")
print(f"Rows in daily: {len(daily_pid)}")
if len(daily_pid) > 0:
    print("\nAggregated data:")
    print(daily_pid[["participant_id", "date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes"]])
    print(f"\nDate types in daily:")
    print(f"date dtype: {daily_pid['date'].dtype}")
    print(f"date sample: {daily_pid['date'].iloc[0]}")
    print(f"date type: {type(daily_pid['date'].iloc[0])}")

# Create grid
all_days_debug = pd.date_range(start=start_naive, end=yesterday_naive, freq="D")
all_participants_debug = df_debug["participant_id"].dropna().drop_duplicates().sort_values()
grid_debug = pd.MultiIndex.from_product([all_participants_debug, all_days_debug], names=["participant_id", "date"]).to_frame(index=False)

grid_pid = grid_debug[grid_debug["participant_id"] == participant_debug].copy()
print(f"\n=== Step 3: Grid data for {participant_debug} ===")
print(f"Rows in grid: {len(grid_pid)}")
if len(grid_pid) > 0:
    print("\nGrid data:")
    print(grid_pid[["participant_id", "date"]])
    print(f"\nDate types in grid:")
    print(f"date dtype: {grid_pid['date'].dtype}")
    print(f"date sample: {grid_pid['date'].iloc[0]}")
    print(f"date type: {type(grid_pid['date'].iloc[0])}")

# Try merge
print(f"\n=== Step 4: Merge attempt ===")
if len(daily_pid) > 0 and len(grid_pid) > 0:
    # Check if dates match
    print("\nDate comparison:")
    for idx, row in daily_pid.iterrows():
        date_daily = row["date"]
        # Check if this date exists in grid
        matching = grid_pid[grid_pid["date"] == date_daily]
        if len(matching) > 0:
            print(f"  ✓ {date_daily} matches in grid")
        else:
            print(f"  ✗ {date_daily} NOT found in grid!")
            print(f"    Available grid dates: {grid_pid['date'].tolist()}")
    
    # Try the merge
    daily_full_debug = grid_pid.merge(daily_pid, on=["participant_id", "date"], how="left", indicator=True)
    print(f"\nMerge result:")
    print(f"Rows after merge: {len(daily_full_debug)}")
    print(f"Merge indicator value_counts:")
    print(daily_full_debug["_merge"].value_counts())
    
    if len(daily_full_debug) > 0:
        print("\nMerged data:")
        print(daily_full_debug[["participant_id", "date", "beiwe_accelerometer_bytes", "beiwe_gps_bytes", "_merge"]])
else:
    print("Cannot test merge - missing data in daily or grid")


=== Step 1: Filtered data for 15l9hyfu ===
Rows after filtering: 6

Filtered data:
    participant_id local_date  beiwe_accelerometer_bytes  beiwe_gps_bytes
224       15l9hyfu 2025-11-02                 21589688.0         170500.0
318       15l9hyfu 2025-11-01                 35481957.0         195529.0
476       15l9hyfu 2025-10-31                 38910140.0         153011.0
640       15l9hyfu 2025-10-30                 29072095.0         150739.0
801       15l9hyfu 2025-10-29                  3271561.0          77761.0
913       15l9hyfu 2025-10-28                 20953559.0          84376.0

Date types:
local_date dtype: datetime64[ns]
local_date sample: 2025-11-02 00:00:00
local_date type: <class 'pandas._libs.tslibs.timestamps.Timestamp'>

=== Step 2: Aggregated data for 15l9hyfu ===
Rows in daily: 6

Aggregated data:
  participant_id       date  beiwe_accelerometer_bytes  beiwe_gps_bytes
0       15l9hyfu 2025-10-28                 20953559.0          84376.0
1       15l9hyfu 2025

In [70]:
# ========= CATEGORIZE PARTICIPANTS BY MISSING DATA TYPE (ACTIVE PARTICIPANTS ONLY) =========
# Check if data is missing on ALL of the last X days (not just any day)

# Group by participant and check if missing on ALL days
participant_summary = daily_full.groupby("participant_id").agg({
    "has_accel": "all",  # True only if has_accel is True on ALL days
    "has_gps": "all",    # True only if has_gps is True on ALL days
    "has_both": "all"    # True only if has_both is True on ALL days
}).reset_index()

# Check missing patterns - missing on ALL days
missing_accel_all_days = ~participant_summary["has_accel"]  # True if accel missing on ALL days
missing_gps_all_days = ~participant_summary["has_gps"]      # True if GPS missing on ALL days
has_accel_all_days = participant_summary["has_accel"]        # True if accel present on ALL days
has_gps_all_days = participant_summary["has_gps"]           # True if GPS present on ALL days

# Categorize by missing data type (must be missing on ALL X days)
missing_accel_gps = []  # Missing both on ALL days
missing_only_gps = []   # Has accel on ALL days, missing GPS on ALL days
missing_only_accel = [] # Has GPS on ALL days, missing accel on ALL days

for idx, row in participant_summary.iterrows():
    participant_id = row["participant_id"]
    
    # Missing both accelerometer and GPS on ALL days
    if missing_accel_all_days.iloc[idx] and missing_gps_all_days.iloc[idx]:
        missing_accel_gps.append(participant_id)
    # Has accelerometer on ALL days but missing GPS on ALL days
    elif has_accel_all_days.iloc[idx] and missing_gps_all_days.iloc[idx]:
        missing_only_gps.append(participant_id)
    # Has GPS on ALL days but missing accelerometer on ALL days
    elif has_gps_all_days.iloc[idx] and missing_accel_all_days.iloc[idx]:
        missing_only_accel.append(participant_id)

# Get total participants with issues (missing something on all days)
participants_with_missing = missing_accel_gps + missing_only_gps + missing_only_accel

print(f"Active participants with missing data on ALL {x} days: {len(participants_with_missing)}")
print(f"(Note: This excludes 'Not Registered' and 'Permanently Retired' participants)")
print(f"(Analysis period: from {start.strftime('%Y-%m-%d')} to {yesterday.strftime('%Y-%m-%d')})\n")

# Display categorized results
print("=" * 60)
print("CATEGORIZED RESULTS (Missing on ALL days):")
print("=" * 60)

print(f"\n1. Missing Accel & GPS on ALL {x} days: {len(missing_accel_gps)} participants")
if missing_accel_gps:
    print(f"   Participants: {missing_accel_gps}")

print(f"\n2. Missing Only GPS on ALL {x} days (has Accel on all days): {len(missing_only_gps)} participants")
if missing_only_gps:
    print(f"   Participants: {missing_only_gps}")

print(f"\n3. Missing Only Accelerometer on ALL {x} days (has GPS on all days): {len(missing_only_accel)} participants")
if missing_only_accel:
    print(f"   Participants: {missing_only_accel}")

print(f"\n" + "=" * 60)
print(f"TOTAL ACTIVE PARTICIPANTS WITH DATA ISSUES (missing on ALL {x} days): {len(participants_with_missing)}")
print(f"=" * 60)

# ========= SAVE RESULTS TO CSV FILES =========
import os
from datetime import datetime

# Create output directory
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Generate timestamp for file naming
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save each category as a separate CSV file
def save_participant_list_to_csv(participant_list, category_name, filename_prefix):
    """Save a list of participants to CSV with metadata"""
    if participant_list:
        df = pd.DataFrame({
            'participant_id': participant_list,
            'category': category_name,
            'count': len(participant_list),
            'analysis_date': datetime.now().strftime("%Y-%m-%d"),
            'lookback_days': x
        })
        
        filename = f"{filename_prefix}_{timestamp}.csv"
        filepath = os.path.join(output_dir, filename)
        df.to_csv(filepath, index=False)
        print(f"Saved {category_name}: {filepath}")
        return filepath
    else:
        print(f"No participants in {category_name} category")
        return None

# Save categorized results
print(f"\n" + "=" * 60)
print("SAVING RESULTS TO CSV FILES:")
print("=" * 60)

files_saved = []

# Save each category
files_saved.append(save_participant_list_to_csv(
    missing_accel_gps, 
    f"Missing Accel & GPS on ALL {x} days", 
    "missing_both_sensors"
))

files_saved.append(save_participant_list_to_csv(
    missing_only_gps, 
    f"Missing Only GPS on ALL {x} days (has Accel on all days)", 
    "missing_only_gps"
))

files_saved.append(save_participant_list_to_csv(
    missing_only_accel, 
    f"Missing Only Accelerometer on ALL {x} days (has GPS on all days)", 
    "missing_only_accelerometer"
))

# Save summary file with all categories
summary_data = []
if missing_accel_gps:
    summary_data.extend([{'participant_id': pid, 'category': f'Missing Accel & GPS on ALL {x} days'} for pid in missing_accel_gps])
if missing_only_gps:
    summary_data.extend([{'participant_id': pid, 'category': f'Missing Only GPS on ALL {x} days (has Accel on all days)'} for pid in missing_only_gps])
if missing_only_accel:
    summary_data.extend([{'participant_id': pid, 'category': f'Missing Only Accelerometer on ALL {x} days (has GPS on all days)'} for pid in missing_only_accel])

if summary_data:
    summary_df = pd.DataFrame(summary_data)
    summary_df['analysis_date'] = datetime.now().strftime("%Y-%m-%d")
    summary_df['lookback_days'] = x
    
    summary_filename = f"data_quality_summary_{timestamp}.csv"
    summary_filepath = os.path.join(output_dir, summary_filename)
    summary_df.to_csv(summary_filepath, index=False)
    print(f"Saved summary file: {summary_filepath}")
    files_saved.append(summary_filepath)

print(f"\n" + "=" * 60)
print(f"EXPORT COMPLETE - {len([f for f in files_saved if f])} files saved to '{output_dir}' folder")
print("=" * 60)

Active participants with missing data on ALL 7 days: 55
(Note: This excludes 'Not Registered' and 'Permanently Retired' participants)
(Analysis period: from 2025-10-27 to 2025-11-02)

CATEGORIZED RESULTS (Missing on ALL days):

1. Missing Accel & GPS on ALL 7 days: 51 participants
   Participants: ['11hfsajc', '1czziou5', '3284n9os', '32v16fk5', '33vc2l25', '3xhi1mj3', '45mrp24c', '6c5fwr29', '6qmqfkeq', '7kknukw5', '8ucpk1qn', 'ajqv1ibq', 'b4paoqw8', 'bdl5r3es', 'bvavxgux', 'ckfvlrem', 'dpvsqpqf', 'e1kcbop5', 'fk2v9sst', 'i9sd5p7g', 'iadeeya3', 'jei5zxc9', 'jeipst7r', 'khy1h93g', 'l65wznsj', 'mmorx98z', 'mrfmuhvj', 'msj113qe', 'n3soi9u5', 'nudob5v2', 'nwmp8vip', 'o8ga8skc', 'oday9ezi', 'qwpf5ivi', 'rropj9pm', 's14yyl3j', 's63659si', 'sbxh8tts', 'sdnzt337', 'sr2s9fpt', 'swlxsbws', 'teuan5j2', 'thgx9byc', 'txo6lahs', 'v6yrqsb8', 'vyokddrj', 'x6svw2ps', 'xgrb4ude', 'zhcaek8a', 'ztcichn8', 'zwvlgzh4']

2. Missing Only GPS on ALL 7 days (has Accel on all days): 4 participants
   Participan