In [None]:
import os
import pandas as pd

In [None]:
from api_utils import *
import apple_health as AppleHealth
import health_connect as HealthConnect

Select whether to query the pilot study conducted in English for CAS participants or the main study in German:

## Specify project for export

In [None]:
# Study to export. Later cells compare PROJECT against 'PILOT_EN' to choose the
# project ID and the segment IDs; any other value takes the main-study branch.
# PROJECT = 'PILOT_EN'
PROJECT = "MAIN_DE"

In [None]:
# load_dotenv is not imported explicitly in this notebook; presumably it is
# python-dotenv's loader re-exported through the api_utils star import.
load_dotenv()

# Read environment variables
private_key_path = os.getenv('RKS_PRIVATE_KEY_PATH')
service_account_name = os.getenv('RKS_SERVICE_ACCOUNT')
# The pilot study uses its own project ID; every other PROJECT value uses the main one.
if PROJECT == 'PILOT_EN':
    project_id = os.getenv('RKS_PROJECT_ID_CAS')
else:
    project_id = os.getenv('RKS_PROJECT_ID')
base_url = os.getenv('BASE_URL')

# Fail fast on missing configuration. Without this check an unset BASE_URL would
# silently produce the URL "None/identityserver/connect/token" and the error
# would only surface later inside the API helpers.
_required_settings = {
    'RKS_SERVICE_ACCOUNT': service_account_name,
    ('RKS_PROJECT_ID_CAS' if PROJECT == 'PILOT_EN' else 'RKS_PROJECT_ID'): project_id,
    'BASE_URL': base_url,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

token_url = f'{base_url}/identityserver/connect/token'

In [None]:
# Request an access token for the service account from the token endpoint.
# get_service_access_token is not defined in this notebook (presumably it comes
# from the api_utils star import).
access_token = get_service_access_token(service_account_name, token_url)

In [None]:
# Fetch the participant IDs of the selected project. The bare `ids` expression
# on the last line displays the result as the cell output.
ids = get_participant_ids(access_token, base_url, project_id)
ids

Check that this worked: `ids` should contain a list of all participant IDs for this project.

In [None]:
# Segment IDs per platform, keyed by platform name. The pilot project defines
# three segments; every other PROJECT value gets the two main-study segments.
segment_ids = (
    {
        "iOS": "fd09bd40-a26b-42b3-86af-4a59cbba489a",
        "Android": "2c3457ae-3c5b-4616-8480-e1e4ac750cdd",
        "Garmin": "df0accf3-49ac-4436-a07c-26c2dc8a0319",
    }
    if PROJECT == 'PILOT_EN'
    else {
        "iOS": "d06bb52f-fecb-4625-94ee-26fddbbec8d6",
        "Android": "126ab0db-2207-47ac-afbc-f8925270c4e4",
    }
)

## Specify data for export

Specify in the `data_specs` dictionary which measurement types and which date range you want to export.

In [None]:
# Export specification passed to both fetch_measurements calls and to export_results.
# "dates" is [start, end] in YYYY-MM-DD form (export_results treats the end date
# as inclusive); "measurements" holds the measurement names, presumably the ones
# the fetch helpers request.
data_specs = {
    "dates": ["2025-10-07", "2025-11-30"],
    "measurements": [
        "active_calories", "active_calories_daily",
        "blood_glucose",
        "blood_pressure_sys", "blood_pressure_dia",
        "body_temp",
        "distance", "distance_daily",
        "exercise_segments",
        "exercise_lat", "exercise_lon", "exercise_alt",
        "exercise_hacc", "exercise_vacc",
        "exercise_laps", "exercise_time",
        "heart_rate", "heart_rate_min", "heart_rate_max",
        "oxygen_saturation",
        "respiratory_rate",
        "resting_hr",
        "sleep",
        "steps", "steps_daily", "steps_hourly", "steps_half_hourly",
        "total_calories", "total_calories_daily",
        "vo2_max",
        "weight",
    ],
}

If needed, check additionally available data types, e.g.:

In [None]:
# HC_datatypes = HealthConnect.get_all_datatypes(access_token, project_id, base_url)
# iOS_datatypes = AppleHealth.get_all_datatypes(access_token, project_id, base_url)

## Get all AppleHealth data

In [None]:
# Look up the participants of the iOS segment (segment_ids['iOS']) in the selected
# project. page_size=500 is passed through to the helper; presumably it sets how
# many participants are requested per API page.
iOS_ids = get_participants_in_segment(access_token, base_url, project_id, segment_ids['iOS'], page_size=500)

In [None]:
# iOS_ids is produced by the preceding cell. It is deliberately not fetched again
# here: repeating the identical segment lookup only costs a second API request.
apple_results = AppleHealth.fetch_measurements(
    service_access_token=access_token,
    project_id=project_id,
    ids=iOS_ids,
    base_url=base_url,
    data_specs=data_specs
)

# Display the fetched results as the cell output.
apple_results

## Get all HealthConnect data

In [None]:
# Look up the participants of the Android segment in the selected project.
android_ids = get_participants_in_segment(
    access_token,
    base_url,
    project_id,
    segment_ids['Android'],
    page_size=500,
)
print(android_ids)

# Fetch the Health Connect measurements described by data_specs for those participants.
android_results = HealthConnect.fetch_measurements(
    data_specs=data_specs,
    ids=android_ids,
    project_id=project_id,
    base_url=base_url,
    service_access_token=access_token,
)

In [None]:
def export_results(results, by="pID", export_dir="./export", data_specs=None):
    """
    Export a results dict to CSV files, optionally restricted to a date range.

    Frames that contain an "observationDate" column are filtered to the
    requested range and written with that column parsed to UTC datetimes;
    frames without the column are exported unfiltered. Frames that are empty
    after filtering are skipped, so a participant / measurement without any
    remaining rows produces no file. The input DataFrames are never modified.

    Args:
        results (dict): {participantID: {measurement: DataFrame}}
        by (str): "pID" for one CSV per participant,
                  "data_type" for one CSV per measurement type
        export_dir (str): folder to save CSVs into (created if it does not exist)
        data_specs (dict): may contain 'dates': ['YYYY-MM-DD', 'YYYY-MM-DD'].
            Both dates are interpreted in UTC; the end date is inclusive
            (the whole day is exported). Either entry may be empty/None to
            leave that side of the range open.

    Raises:
        ValueError: if `by` is neither "pID" nor "data_type". Raised before
            anything is written to disk.
    """
    # Validate first so that a typo in `by` does not leave an empty export folder behind.
    if by not in ("pID", "data_type"):
        raise ValueError("Invalid value for 'by'. Use 'pID' or 'data_type'.")

    os.makedirs(export_dir, exist_ok=True)

    # --- Date filter setup ---
    dates = (data_specs or {}).get("dates") or []
    start_date = None
    end_exclusive = None
    if len(dates) >= 1 and dates[0]:
        # utc=True handles both naive strings (taken as UTC) and strings that
        # already carry an offset, where tz_localize would raise.
        start_date = pd.to_datetime(dates[0], utc=True)
    if len(dates) >= 2 and dates[1]:
        # Exclusive bound at the following midnight: this keeps the complete end
        # day, including timestamps such as 23:59:59.5 that an inclusive
        # "23:59:59" bound would drop.
        end_exclusive = pd.to_datetime(dates[1], utc=True) + pd.Timedelta(days=1)

    def _apply_date_filter(df):
        """Return the rows of df inside the date range without touching df itself."""
        if "observationDate" not in df.columns:
            return df
        # Unparseable values become NaT; NaT compares False, so such rows are
        # dropped whenever a bound is active.
        observed = pd.to_datetime(df["observationDate"], errors="coerce", utc=True)
        keep = pd.Series(True, index=df.index)
        if start_date is not None:
            keep &= observed >= start_date
        if end_exclusive is not None:
            keep &= observed < end_exclusive
        # assign() works on a copy, so the caller's frame keeps its original column.
        return df.assign(observationDate=observed).loc[keep]

    # --- Export logic ---
    # Group the filtered frames by output file: the participant ID or the measurement name.
    frames_by_file = {}
    for pid, meas_dict in results.items():
        for meas, df in meas_dict.items():
            df_filtered = _apply_date_filter(df)
            if df_filtered.empty:
                continue
            file_key = pid if by == "pID" else meas
            frames_by_file.setdefault(file_key, []).append(df_filtered)

    for file_key, df_list in frames_by_file.items():
        merged = pd.concat(df_list, ignore_index=True)
        out_path = os.path.join(export_dir, f"{file_key}.csv")
        merged.to_csv(out_path, index=False)


Select a type of export. You can either export all data for a participant into a separate CSV file per participant (i.e., you will have all data for a person in a single file), or you can export each type of data (steps, RHR, glucose) into a separate file containing that measurement for all participants. Use the keyword argument `by="pID"` or `by="data_type"`, respectively.

## Export iOS Data

In [None]:
# Export one CSV per participant
# Export one CSV per participant
export_results(apple_results, by="pID", export_dir="./export_participants/applehealth", data_specs=data_specs)

# Export one CSV per measurement type
export_results(apple_results, by="data_type", export_dir="./export_datatypes/applehealth", data_specs=data_specs)


## Export Android Data

In [None]:
# Export one CSV per participant
# Export one CSV per participant
export_results(android_results, by="pID", export_dir="./export_participants/healthconnect", data_specs=data_specs)

# Export one CSV per measurement type
export_results(android_results, by="data_type", export_dir="./export_datatypes/healthconnect", data_specs=data_specs)
