# Libraries

In [None]:
import os
import math
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats

# Statistical Analysis

In [None]:
# Column names and labels shared by the statistical-analysis cells below.
str_baseline_acro = 'baseline_acro'  # compared with str_correct_acro to flag a rhythm change
str_correct_acro = 'correct_acrophase'
str_dependent_var = 'percentage_of_window_days_of_change'  # per-participant outcome column
str_difference = 'Difference'  # holds the 0/1 "rhythm changed" flag once computed
str_mean = 'mean'
str_pid = 'p_id'  # participant identifier column
str_rhythm_detect_p = 'rhythm_detect_P'  # p-value column used to keep windows with p < 0.05
str_std = 'std'

In [None]:
# Build the per-participant "change of rhythm" table from the rhythm workbook
# (one sheet per participant) and save it next to the other rhythm findings.
loc_root = '/Users/wyd2hu/Documents/SA39/Hridita/'
loc_findings_rhythm = os.path.join(loc_root, 'Findings/Rhythms')

period = 24
loc_root_features = loc_root + '/Findings/Rhythms/' + str(period)+ '_Tiles18_HeartRatePPG.xlsx' # '_K-EmoPhone HR_bpm.xlsx' #

df_rhy_tiles = pd.ExcelFile(loc_root_features)
id_list = df_rhy_tiles.sheet_names

print('Should be equal', len(id_list), len(set(id_list)))

# The output file name is derived from the dataset tag found in the workbook path.
if 'Tiles18' in loc_root_features:
    change_file_name = 'tiles_18_change_of_rhythm.xlsx'
elif 'Tiles19' in loc_root_features:
    change_file_name = 'tiles_19_change_of_rhythm.xlsx'
elif 'EmoPhone' in loc_root_features:
    change_file_name = 'EmoPhone_change_of_rhythm.xlsx'
elif 'Breast Cancer' in loc_root_features:
    change_file_name = 'breast_cancer_change_of_rhythm.xlsx'
else:
    # Fail here with a clear message instead of a NameError at the final to_excel call.
    raise ValueError('Cannot derive the change-of-rhythm file name from: ' + loc_root_features)
data_list_n_change = []

for p_id in id_list:
    # Reuse the already opened workbook instead of re-opening the file for every sheet.
    df_baseline_window_data = pd.read_excel(df_rhy_tiles, sheet_name=p_id)
    # Keep only windows with a significant rhythm; .copy() makes the column
    # assignment below act on an independent frame (no SettingWithCopyWarning).
    df_baseline_window_data = df_baseline_window_data[df_baseline_window_data[str_rhythm_detect_p] < 0.05].copy()
    # A window counts as "changed" (1) when the two acrophase columns differ by at least 2.
    acrophase_shift = abs(df_baseline_window_data[str_baseline_acro] - df_baseline_window_data[str_correct_acro])
    df_baseline_window_data[str_difference] = (acrophase_shift >= 2).astype(int)

    n_windows = df_baseline_window_data.shape[0]
    if n_windows > 0:
        # Fraction (0-1) of the retained windows in which the rhythm changed.
        percentage = int(df_baseline_window_data[str_difference].sum()) / n_windows
        data_list_n_change.append([p_id, percentage])
        print(p_id, percentage)
    else:
        # No window passed the significance filter for this participant.
        print('Papi🔥🔥🔥🔥', p_id)

df_change_rhythm_counter = pd.DataFrame(columns=[str_pid, str_dependent_var], data = data_list_n_change)
df_change_rhythm_counter.to_excel(os.path.join(loc_findings_rhythm, change_file_name), index=False)

In [None]:
def detect_outlier_z_score(value_list):
    """Return the positions of values whose absolute z-score exceeds 3.

    The z-score uses the mean and the sample standard deviation (``ddof=1``)
    of ``value_list``.

    :param value_list: Sequence of numeric values (list, array or Series).
    :return: List of 0-based positions of the outliers; empty when the input
        is empty or when the standard deviation is zero or undefined, because
        no z-score can be computed in those cases.
    """
    threshold = 3  # A previous study remarks to use this threshold to detect outliers

    if len(value_list) == 0:
        return []

    mean = np.mean(value_list)
    std = np.std(value_list, ddof=1)

    # Constant data (std == 0) or a single value (std is NaN) cannot contain
    # z-score outliers; returning early avoids dividing by zero.
    if not np.isfinite(std) or std == 0:
        return []

    z_scores = (np.asarray(value_list, dtype=float) - mean) / std
    return np.flatnonzero(np.abs(z_scores) > threshold).tolist()

In [None]:
def calculate_correlation(ds_name, depen_var, ind_var, n_missing, data_list1, data_list2):
    """Correlate two samples and log the result once per variable combination.

    Pearson correlation is used only when there are more than 7 observations,
    both samples pass ``stats.normaltest`` (p > 0.05) and
    ``detect_outlier_z_score`` finds no outlier in either sample; otherwise
    Spearman correlation is used.

    The first time a given ``ds_name + depen_var + ind_var`` key is seen, a
    result row is appended to the module-level ``relation_list_df`` and the
    key is stored in the module-level ``relation_data_tracker``.

    :return: Tuple ``(r_text, cc, cc_p_value, n)`` where ``r_text`` is "r="
        (Pearson) or "rs=" (Spearman) and ``n`` is ``len(data_list1)``.
    """
    sample_size = len(data_list1)
    use_pearson = False

    # normaltest needs N >= 8, so smaller samples always fall back to Spearman.
    if sample_size > 7:
        outlier_total = len(detect_outlier_z_score(data_list1)) + len(detect_outlier_z_score(data_list2))

        _, p_normal_first = stats.normaltest(data_list1)
        _, p_normal_second = stats.normaltest(data_list2)

        # True means both samples look normally distributed and have no outlier.
        use_pearson = p_normal_first > 0.05 and p_normal_second > 0.05 and outlier_total == 0

    if use_pearson:
        r_text = "r="
        cc, cc_p_value = stats.pearsonr(data_list1, data_list2)
    else:
        r_text = "rs="
        cc, cc_p_value = stats.spearmanr(data_list1, data_list2)

    tracker_key = ds_name + depen_var + ind_var
    if tracker_key not in relation_data_tracker:
        relation_list_df.append([ds_name, depen_var, ind_var, n_missing, r_text, cc, cc_p_value, sample_size])
        relation_data_tracker.add(tracker_key)

    return r_text, cc, cc_p_value, sample_size

In [None]:
def relation_with_objective_data():
    """Correlate the rhythm-change outcome with Fitbit sleep and step data.

    For every sleep stage, each of the four TILES-18/TILES-19 sleep and
    step-count folders is read. A per-participant value is computed (mean
    ``StepCount`` for step folders, number of records of the current sleep
    stage for sleep folders) and right-joined onto the matching
    change-of-rhythm table. The result is passed to ``calculate_correlation``
    per dataset and then for both datasets combined.

    Results are printed; ``calculate_correlation`` also records them in the
    module-level accumulators. Nothing is returned.
    """
    for sleep_var in ['deep', 'light', 'rem', 'wake', 'restless']:
        df_step = pd.DataFrame()
        df_sleep = pd.DataFrame()

        # NOTE(review): the step-count results do not depend on sleep_var, so they
        # are recomputed on every pass of the outer loop.
        for loc_data in (loc_root +'/Data/Tiles18/fitbit/sleep/', loc_root + '/Data/Tiles18/fitbit/step-count',
                         loc_root + '/Data/Tiles19/fitbit/sleep-data/', loc_root + '/Data/Tiles19/fitbit/step-count'):
            print(loc_data)
            dic_id_sleep = {}
            ind_var = ''

            # os.path.basename() of a path ending in '/' is '', which left the sleep
            # labels without their folder name; normpath removes the trailing slash.
            folder_name = os.path.basename(os.path.normpath(loc_data))

            for p_id in os.listdir(loc_data):
                if p_id.endswith('.gz'):
                    df_temp = pd.read_csv(os.path.join(loc_data, p_id), compression='gzip')
                    df_temp.dropna(inplace=True)
                    # Only the first 25 characters of the file name are used as participant key.
                    if 'step' in loc_data:
                        ind_var = "step_" + str_mean
                        dic_id_sleep[p_id[:25]] = df_temp['StepCount'].mean()
                    else:
                        ind_var = folder_name +'_count_'+ sleep_var
                        if 'Tiles18' in loc_data:
                            # dic_id_sleep[p_id[:25]] = np.std(df_temp['duration'], ddof=1)
                            dic_id_sleep[p_id[:25]] = df_temp['sleep_phase'].tolist().count(sleep_var)
                        else:
                            # dic_id_sleep[p_id[:25]] = np.std(df_temp['seconds'], ddof=1)
                            dic_id_sleep[p_id[:25]] = df_temp['level'].tolist().count(sleep_var)

            if 'Tiles18' in loc_data:
                ds_name = 'Tiles18'
                change_file = 'tiles_18_change_of_rhythm.xlsx'
            else:
                ds_name = 'Tiles19'
                change_file = 'tiles_19_change_of_rhythm.xlsx'

            df_data = pd.DataFrame(dic_id_sleep.items(), columns=['p_id', 'data'])
            df_change = pd.read_excel(os.path.join(loc_findings_rhythm, change_file))

            print((set(df_change['p_id']) - set(df_data['p_id']))) # Participants whose data are in rhythm change, but missing in sleep, or step ...

            df_change.set_index('p_id', inplace=True)
            df_data.set_index('p_id', inplace=True)

            # Right join: keep every participant of the rhythm table, then count
            # and drop the ones without sleep/step data.
            df_data_change = pd.merge(df_data, df_change, left_index=True, right_index=True, how='right')
            n_missing_values = df_data_change.isna().sum().sum()
            df_data_change.dropna(inplace=True)

            if "step" in loc_data:
                df_step = pd.concat([df_step, df_data_change])
            elif "sleep" in loc_data:
                df_sleep = pd.concat([df_sleep, df_data_change])

            print(n_missing_values, ds_name, str_dependent_var, ind_var)
            print(calculate_correlation(ds_name, str_dependent_var, ind_var, n_missing_values, df_data_change['data'], df_data_change[str_dependent_var]))

        # Pooled TILES-18 + TILES-19 correlations.
        print(calculate_correlation("both dataset", str_dependent_var, "sleep"+sleep_var, "sum(tiles 18 & 19)", df_sleep['data'], df_sleep[str_dependent_var]))
        print(calculate_correlation("both dataset", str_dependent_var, "step", "sum(tiles 18 & 19)", df_step['data'], df_step[str_dependent_var]))

# Module-level accumulators that calculate_correlation() reads and mutates:
# the set de-duplicates dataset/dependent/independent combinations and the
# list collects one result row per new combination.
relation_data_tracker = set()
relation_list_df = []

# Run the TILES-18 / TILES-19 sleep and step-count analysis.
relation_with_objective_data()

In [None]:
def relation_with_subjective_data():
    """Correlate the rhythm-change outcome with summed EMA survey scores.

    Each survey file is reduced to one summed score per participant,
    right-joined onto the matching change-of-rhythm table and passed to
    ``calculate_correlation``. Files whose variable name contains "stress"
    are also pooled for a combined TILES-18 + TILES-19 correlation.

    Results are printed; ``calculate_correlation`` also records them in the
    module-level accumulators. Nothing is returned.

    :raises ValueError: If a survey path names neither Tiles18 nor Tiles19.
    """
    df_stress = pd.DataFrame()

    for loc_explore_var in (
                            loc_root +"/Data/Tiles18/surveys/scored/EMAs/stressd.csv.gz",
                            loc_root +"/Data/Tiles18/surveys/scored/EMAs/anxiety.csv.gz",
                            loc_root +"/Data/Tiles18/surveys/scored/EMAs/engage_psycap_is_cs_hs.csv.gz",
                            loc_root +"/Data/Tiles18/surveys/scored/EMAs/pand.csv.gz",
                            loc_root +"/Data/Tiles19/surveys/scored/EMA/daily_ema.csv.gz"):
        # Dataset-specific settings: label, participant-id column and rhythm table.
        if 'Tiles18' in loc_explore_var:
            ds_name = 'Tiles18'
            id_column = 'participant_id'
            change_file = 'tiles_18_change_of_rhythm.xlsx'
        elif "Tiles19" in loc_explore_var:
            ds_name = 'Tiles19'
            id_column = 'id'
            change_file = 'tiles_19_change_of_rhythm.xlsx'
        else:
            # Previously this case surfaced later as an unbound df_change.
            raise ValueError('Unknown dataset in survey path: ' + loc_explore_var)

        # Survey column to analyse, derived from the file name.
        if 'engage_psycap_is_cs_hs' in loc_explore_var:
            explore_var = 'psycap'
        elif "pand" in loc_explore_var:
            explore_var = "pand_NegAffect"
        elif "daily_ema" in loc_explore_var:
            explore_var = "stress"
        else:
            explore_var = os.path.basename(loc_explore_var).replace('.csv.gz', '')

        ind_var = explore_var + '_sum'
        df_explore_var = pd.read_csv(loc_explore_var, compression='gzip')
        df_explore_var.rename(columns={id_column: 'p_id', explore_var: 'explore_var'}, inplace=True)
        df_change = pd.read_excel(os.path.join(loc_findings_rhythm, change_file)) #.groupby(['p_id']).std(ddof=1).reset_index()

        # NOTE(review): dropna runs before the column selection, so a row with a
        # missing value in any column of the survey file is discarded.
        df_explore_var.dropna(inplace=True)
        # .copy() so the p_id assignment below writes to an independent frame.
        df_explore_var = df_explore_var[['p_id', 'explore_var']].copy()
        df_explore_var['p_id'] = df_explore_var['p_id'].str[:25] # we are taking the first 25 characters since we saved the rhythm info using 25 characters. The reason is the char limit in write_xlsx up to 31 char (as far as I can remember)
        df_explore_var = df_explore_var.groupby(['p_id']).sum().reset_index()

        print('Missing participants for which data are available in df_change, not in df_explore_var', (set(df_change['p_id']) - set(df_explore_var['p_id'])))

        df_change.set_index('p_id', inplace=True)
        df_explore_var.set_index('p_id', inplace=True)
        # Right join keeps every participant of the rhythm table.
        df_explore_var_change = pd.merge(df_explore_var, df_change, left_index=True, right_index=True, how='right')

        n_missing_values = df_explore_var_change.isna().sum().sum()
        df_explore_var_change.dropna(inplace=True)
        print('n_missing_values: ', n_missing_values)
        # print(df_change)
        # print(df_explore_var_change['explore_var'], df_explore_var_change[str_dependent_var])

        df_explore_var_change = df_explore_var_change.astype(np.float64)
        print(loc_explore_var)
        print(calculate_correlation(ds_name, str_dependent_var, ind_var, n_missing_values, df_explore_var_change['explore_var'], df_explore_var_change[str_dependent_var]))

        if "stress" in ind_var:
            df_stress = pd.concat([df_stress, df_explore_var_change])

    # Pooled stress correlation over all files whose variable name contains "stress".
    print(calculate_correlation("both dataset", str_dependent_var, 'stress', "sum(tiles 18 & 19)", df_stress['explore_var'], df_stress[str_dependent_var]))

relation_with_subjective_data()

## Save the Findings of Statistical Analysis

In [None]:
def save_stat_findings():
    """Write the rows collected in ``relation_list_df`` to Excel and LaTeX.

    The Excel file goes to the "Findings" folder under ``loc_root`` and is
    named after the rhythm workbook; the LaTeX table is written to
    "output.tex" in the current working directory.
    """
    result_columns = ['ds_name', 'depen_var', 'ind_var', 'n_missing',
                      'r_text', 'cc', 'cc_p_value', 'len(data_list1)']
    excel_path = loc_root+"/Findings/"+ os.path.basename(loc_root_features)

    df_stat_find = pd.DataFrame(relation_list_df, columns=result_columns)
    df_stat_find.to_excel(excel_path, index=False)
    df_stat_find.to_latex("output.tex")
save_stat_findings()

# K-EmoPhone dataset (used for additional exploration of robustness, after exploring the TILES-18 and TILES-19 datasets as per the proposal)

In [None]:
# import os 
# import zipfile

# zip_file_path = "/Users/wyd2hu/Documents/SA39/Hridita/Data/K-EmoPhone/"
# file_list = os.listdir(zip_file_path)
# abs_path = []
# for a in file_list:
#     abs_path.append(zip_file_path+'//'+a)
# for f in abs_path:
#     if not os.path.exists(f.replace('.zip', '')):
#         os.makedirs(f.replace('.zip', ''))
#         zip=zipfile.ZipFile(f)
#         zip.extractall(f.replace('.zip', ''))

In [None]:
# Locations of the K-EmoPhone inputs and outputs.
loc_root_kEmo  = '/Users/wyd2hu/Documents/SA39/Hridita/Data/K-EmoPhone/'  # one sub-folder per participant plus SubjData
loc_hr_kEmo = '/Users/wyd2hu/Documents/SA39/Hridita/Data/K-EmoPhone HR/'
loc_kemo_subjective_data = '/Users/wyd2hu/Documents/SA39/Hridita/Data/K-EmoPhone/SubjData/EsmResponse.csv'
loc_kemo_scales_data = '/Users/wyd2hu/Documents/SA39/Hridita/Data/K-EmoPhone/SubjData/UserInfo.csv'  # read for the PSS/PHQ/GHQ columns
loc_step_kEmo = '/Users/wyd2hu/Documents/SA39/Hridita/Data/K-EmoPhone StepCount/'  # destination of the per-participant StepCount copies
loc_rhythm_k_emophone = '/Users/wyd2hu/Documents/SA39/Hridita/Findings/Rhythms/EmoPhone_change_of_rhythm.xlsx'

In [None]:
def create_folder_containing_heart_rate_data_only():
    """Copy every participant's ``StepCount.csv`` into ``loc_step_kEmo``.

    Despite its name, this function handles step-count files, not heart-rate
    files. Each participant folder under ``loc_root_kEmo`` (every directory
    whose name does not contain "SubjData") must contain a ``StepCount.csv``;
    it is re-written as ``<participant>_StepCount.csv`` in ``loc_step_kEmo``.
    """
    # Create the destination folder so to_csv does not fail on a fresh machine.
    os.makedirs(loc_step_kEmo, exist_ok=True)

    for p_id in sorted(os.listdir(loc_root_kEmo)):
        participant_dir = os.path.join(loc_root_kEmo, p_id)
        if os.path.isdir(participant_dir) and "SubjData" not in p_id:
            df_step_count = pd.read_csv(os.path.join(participant_dir, 'StepCount.csv'))
            df_step_count.to_csv(os.path.join(loc_step_kEmo, p_id +'_'+ 'StepCount.csv'), index=False)

create_folder_containing_heart_rate_data_only()

In [None]:
def kemo_find_relation_with_subjective_data():
    """Correlate the K-EmoPhone rhythm-change outcome with PSS, PHQ and GHQ.

    The rhythm table is inner-joined with the per-participant mean of the
    columns of ``loc_kemo_scales_data``, and each of the three columns is
    passed to ``calculate_correlation``. Results are printed.
    """
    df_kemo_rhythm = pd.read_excel(loc_rhythm_k_emophone)
    # regex=False: '_HR.csv' is a literal suffix. The default of `regex` differs
    # between pandas versions, and as a pattern '.' would match any character.
    df_kemo_rhythm[str_pid] = df_kemo_rhythm[str_pid].str.replace('_HR.csv', '', regex=False)
    columns_to_drop = ['participationStartDate', 'age', 'gender'] # ['responseTime', 'scheduledTime']

    df_kemo_subjective_data = pd.read_csv(loc_kemo_scales_data)
    df_kemo_subjective_data.rename(columns={'pcode': str_pid}, inplace=True)
    df_kemo_subjective_data = df_kemo_subjective_data.drop(columns= columns_to_drop)
    # NOTE(review): mean() assumes every remaining column is numeric - confirm against UserInfo.csv.
    df_kemo_subjective_data = df_kemo_subjective_data.groupby(by=[str_pid]).mean().reset_index()
    df_rhythm_kemo_subject = pd.merge(df_kemo_rhythm, df_kemo_subjective_data, on=[str_pid], how='inner')

    dependent_values = df_rhythm_kemo_subject[str_dependent_var].tolist()
    for ind_var in ['PSS', 'PHQ','GHQ']: # ['valence', 'arousal', 'attention', 'stress', 'duration', 'disturbance', 'change']:
        print(ind_var, calculate_correlation('kEmoPhone', str_dependent_var, ind_var, 0, dependent_values, df_rhythm_kemo_subject[ind_var].tolist()))

kemo_find_relation_with_subjective_data()


# Breast Cancer dataset (used for additional exploration of robustness, after exploring the TILES-18 and TILES-19 datasets as per the proposal)

In [None]:
# Folder with the per-participant Fitbit exports of the breast cancer (BCH) dataset.
loc_bch_dataset = '/Users/wyd2hu/Documents/SA39/Medication Adherence/Fitbit_data_101_120/'
df_rhythm_bch = pd.read_excel('/Users/wyd2hu/Documents/SA39/Hridita/Findings/Rhythms/breast_cancer_change_of_rhythm.xlsx')
# Strip the file-name remainder so that only the participant number is left ...
df_rhythm_bch[str_pid] = df_rhythm_bch[str_pid].str.replace('_heartrate_seconds_202', '')
# ... and store it as int so it can be merged with the integer ids 101-120.
df_rhythm_bch[str_pid] = df_rhythm_bch[str_pid].astype(int)

def process_bch_data_find_relation():
    """Correlate BCH sleep-stage counts with the rhythm-change outcome.

    For participants 101-120 the number of records per sleep stage is counted
    from the "Level" column of the 30-second sleep-stage file, merged with
    ``df_rhythm_bch`` and passed to ``calculate_correlation`` stage by stage.
    The rows collected in ``relation_list_df`` are then written to Excel and
    to "output.tex" (the same LaTeX file name that ``save_stat_findings`` uses).
    """
    sleep_stages = ['deep', 'light', 'rem', 'wake', 'restless']
    data_list_df = []

    for pid in range(101, 121):
        df_sleep = pd.read_csv(os.path.join(loc_bch_dataset, str(pid)+'_30secondSleepStages_20230101_20240509.csv'))
        # Count every level once per file instead of scanning the column for each stage.
        level_counts = df_sleep['Level'].value_counts()
        for sleep_stage in sleep_stages:
            data_list_df.append([pid, sleep_stage, int(level_counts.get(sleep_stage, 0))])

    df_data_for_relation = pd.DataFrame(data= data_list_df, columns=[str_pid, 'sleep_stage', 'count'])
    df_bch_sleep_rhythm = pd.merge(df_data_for_relation, df_rhythm_bch, on=str_pid, how='inner')

    for sleep_stage in sleep_stages:
        sub_df_sleep_rhythm = df_bch_sleep_rhythm[df_bch_sleep_rhythm['sleep_stage'] == sleep_stage].copy()
        print(sleep_stage, calculate_correlation(ds_name='BCH', depen_var=str_dependent_var, ind_var=sleep_stage, n_missing=0,
                                    data_list1=sub_df_sleep_rhythm[str_dependent_var].tolist(), data_list2=sub_df_sleep_rhythm['count'].tolist()))

    df_stat_find = pd.DataFrame(data=relation_list_df, columns = ['ds_name', 'depen_var', 'ind_var', 'n_missing',
                                                                'r_text', 'cc', 'cc_p_value', 'len(data_list1)'])
    df_stat_find.to_excel(loc_root+"/Findings/BCH_relation_with_sleep.xlsx", index=False)
    df_stat_find.to_latex("output.tex")
    
# Start from empty accumulators so that the files written by
# process_bch_data_find_relation() contain only the BCH results.
relation_data_tracker = set()
relation_list_df = []

process_bch_data_find_relation()

# SensingText

# Based on a single day's data (as per the proposal)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Placeholder for the Hugging Face access token; replace it locally and do not commit a real token.
access_token = '....'
model_id = "meta-llama/Llama-3.2-1B"

# Module-level tokenizer and model used by dict_to_sentence_zero_shot().
# NOTE(review): recent transformers releases take `token=` in place of
# `use_auth_token=` - confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=access_token)


In [None]:
# Scratch cell: a bare tuple of day-level feature names; it is not assigned to anything.
'Cardio_caloriesOut', 'Cardio_minutes', 'Fat Burn_caloriesOut', 'SleepMinutesAsleep', 'SleepMinutesInBed', 'SleepPerDay'

In [None]:

import pandas as pd

def dict_to_sentence_zero_shot(data_list, data_name, last_part_of_prompt):
    """
    Generate a free-text summary of the given data with the module-level
    ``tokenizer`` and ``model``.

    :param data_list: Data to summarise; it is interpolated into the prompt as-is.
    :param data_name: Name of the data, inserted after "which presents ".
    :param last_part_of_prompt: Text appended directly after ``data_name``.
    :return: The generated continuation (without the prompt) as a string.
    """
    # Prepare the prompt
    # NOTE(review): there is no separator between the first sentence and "Data:".
    prompt = (
    "Analyze the data which presents "+data_name + last_part_of_prompt +"."
    f"Data: {data_list}\n\n"
    "Steps to follow:\n"
    "1. Look for patterns or trends.\n"
    "2. Highlight any subtle changes or outliers.\n"
    "3. Summarize the insights in a paragraph.\n\n"
    "Insightful summary:"
)

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt")

    # NOTE(review): max_length is passed as 350 alongside the prompt tokens -
    # confirm that long prompts still leave room for generated text.
    outputs = model.generate(
        inputs.input_ids, attention_mask=inputs.attention_mask,
        max_length=350, temperature=0.30, do_sample=True, top_p = 0.9,
        num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

    # Drop the prompt by token count rather than by len(prompt) characters:
    # decoding may not reproduce the prompt text exactly (e.g. whitespace
    # clean-up), which would misalign a character-based slice.
    prompt_token_count = inputs.input_ids.shape[1]
    sentence = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
    return sentence

In [None]:
# Scratch cell: prints the character length of the prompt suffix text.
print(len('of a single person at different times of the day.'))

In [None]:
import pandas as pd
import os
from openpyxl import load_workbook

# Column names and settings for the SensingText narration cell.
str_date = 'date'
str_start_date = 'start_date'
str_end_date = 'end_date'
str_difference  = 'Difference'
str_baseline_acro = 'baseline_acro'
str_correct_acro = 'correct_acrophase'
dataset = 'Tiles18'
period = 24
# NOTE: this rebinds str_pid, which the statistical-analysis cells set to 'p_id';
# re-running those cells after this one would use 'PID' instead.
str_pid = 'PID'

def str_from_list(list_data):
    """Return the items of ``list_data`` as one string separated by ' , '."""
    pieces = [str(item) for item in list_data]
    return ' , '.join(pieces)

def sense2Text(date):
    """Build a narration of one day from the current participant's data.

    Reads the module-level frames ``df_step``, ``df_sleep``, ``df_phone`` and
    ``df_daily_summary`` and asks ``dict_to_sentence_zero_shot`` for one
    paragraph per data stream. Each serialised stream is truncated to 250
    characters before it is put into the prompt.

    :param date: Value compared against the ``str_date`` column of each frame.
    :return: The concatenated paragraphs, or "" when there is no step data
        for ``date``.
    """
    day_steps = df_step[df_step[str_date] == date]
    if day_steps.shape[0] == 0:
        return ""

    single_person_suffix = " of a single person at different times of the day."
    day_sleep = df_sleep[df_sleep[str_date] == date]
    day_phone = df_phone[df_phone[str_date] == date]
    day_summary = df_daily_summary[df_daily_summary[str_date] == date]

    step_para = dict_to_sentence_zero_shot(str_from_list(day_steps['StepCount'].tolist())[:250], 'Step Count', single_person_suffix)
    sleep_para = dict_to_sentence_zero_shot(str_from_list(day_sleep['sleep_phase'].tolist())[:250], 'Sleep phases', single_person_suffix)
    sleep_duration_para = dict_to_sentence_zero_shot(str_from_list(day_sleep['duration'].tolist())[:250], 'Sleep duration', single_person_suffix)
    phone_usage_pattern = dict_to_sentence_zero_shot(str_from_list(day_phone['SecondsOnPhone'].tolist())[:250], 'App usage duration', single_person_suffix)

    str_day_level = ''
    # For these data (e.g., Cardio_caloriesOut), there were only the aggregated features, no raw data were present in the TILES-18 & TILES-19 datasets.
    for day_level_f in ['Cardio_caloriesOut', 'Cardio_minutes', 'Fat Burn_caloriesOut', 'SleepMinutesAsleep', 'SleepMinutesInBed']:
        feature_values = day_summary[day_level_f]
        # Require at least one row: on an empty selection isna().sum() is 0 as
        # well, and reading the first value would raise IndexError.
        if len(feature_values) > 0 and feature_values.isna().sum() == 0:
            str_day_level +=''+ day_level_f +' is '+ str(feature_values.tolist()[0]) +'. '

    aggregated_data_based_text = dict_to_sentence_zero_shot(str_day_level[:250], "", ". This is data of only 1 person.")
    return step_para +". "+ sleep_para +". "+ sleep_duration_para +".  "+ phone_usage_pattern + ". "+ aggregated_data_based_text
    


# Build one narration per (participant, window start/end date) and append it to data_for_LLM.xlsx.
rhythm_sheets = load_workbook('/Users/wyd2hu/Documents/SA39/Hridita/Findings/Rhythms/'+ str(period)+'_' +dataset+ '_HeartRatePPG.xlsx', read_only=True)
data_list_df_llm = []
df_data_llm = pd.read_excel('/Users/wyd2hu/Documents/SA39/Hridita/Data/data_for_LLM.xlsx')
# Keys (participant + date) that are already stored, so finished work is not repeated.
list_already_pid_dates = (df_data_llm[str_pid].astype(str) + df_data_llm[str_date].astype(str)).tolist()
already_pid_dates = set(list_already_pid_dates)  # set for O(1) membership tests

for hr_file in os.listdir("/Users/wyd2hu/Documents/SA39/Hridita/Data/" +dataset+ "/fitbit/heart-rate/"): # Here, the hr_file works like a pid
    print(hr_file)
    bool_dont_include_for_analysis = False # Becomes True when the participant lacks at least one of the required data files
    for data_name in ['sleep', 'step-count', 'daily-summary', 'sleep-data', 'realizd']:

        if 'realizd' in data_name:
            if not os.path.exists('/Users/wyd2hu/Documents/SA39/Hridita/Data/' +dataset+ '/' +data_name+ '/'+hr_file):
                bool_dont_include_for_analysis = True
        else:
            if not os.path.exists('/Users/wyd2hu/Documents/SA39/Hridita/Data/' +dataset+ '/fitbit/' +data_name+ '/'+hr_file):
                bool_dont_include_for_analysis = True

    # The participant also needs a sheet in the rhythm workbook (this check does
    # not depend on data_name, so it is done once per file).
    if hr_file[:25] not in rhythm_sheets:
        bool_dont_include_for_analysis = True

    if bool_dont_include_for_analysis or 'README.md' in hr_file:
        continue

    df_daily_summary = pd.read_csv('/Users/wyd2hu/Documents/SA39/Hridita/Data/' +dataset+ '/fitbit/daily-summary/'+hr_file, on_bad_lines='skip')
    df_daily_summary[str_date] = pd.to_datetime(df_daily_summary['Timestamp']).dt.date

    # NOTE(review): the next three paths are hard-coded to Tiles18 and ignore `dataset`.
    df_sleep = pd.read_csv('/Users/wyd2hu/Documents/SA39/Hridita/Data/Tiles18/fitbit/sleep/'+hr_file)
    df_step = pd.read_csv('/Users/wyd2hu/Documents/SA39/Hridita/Data/Tiles18/fitbit/step-count/'+hr_file)

    df_rhythm = pd.read_excel('/Users/wyd2hu/Documents/SA39/Hridita/Findings/Rhythms/24_Tiles18_HeartRatePPG.xlsx', hr_file[:25])
    df_rhythm = df_rhythm.iloc[1:]  # the first row of the sheet is skipped

    df_sleep['timestamp'] = pd.to_datetime(df_sleep['timestamp'])
    df_step['Timestamp'] = pd.to_datetime(df_step['Timestamp'])
    df_sleep[str_date] = df_sleep['timestamp'].dt.date
    df_step[str_date] = df_step['Timestamp'].dt.date

    df_rhythm[str_start_date]  = pd.to_datetime(df_rhythm[str_start_date]).dt.date
    df_rhythm[str_end_date]  = pd.to_datetime(df_rhythm[str_end_date]).dt.date
    # 1 when the two acrophase columns differ by at least 2, else 0.
    df_rhythm[str_difference] = abs(df_rhythm[str_baseline_acro] - df_rhythm[str_correct_acro])
    df_rhythm[str_difference]  = (df_rhythm[str_difference] >= 2).astype(int)

    df_step = df_step[df_step['StepCount'] != 0]

    df_phone = pd.read_csv('/Users/wyd2hu/Documents/SA39/Hridita/Data/' +dataset+ '/realizd/'+hr_file, on_bad_lines='skip')
    if 'Tiles19' in dataset:
        df_phone[str_date] = pd.to_datetime(df_phone['TimestampStart']).dt.date
        # DataFrame.drop has no `by` argument; columns are removed with `columns=`.
        df_phone.drop(columns=['TimestampEnd'], inplace=True)
    else:
        df_phone['session_start'] = pd.to_datetime(df_phone['session_start'])
        df_phone['session_stop'] = pd.to_datetime(df_phone['session_stop'])
        df_phone['SecondsOnPhone'] = (df_phone['session_stop'] - df_phone['session_start']).dt.total_seconds() # WARNING: without using dt.total_seconds(), I got 0 days 06:16:16 for a case. Why is that? Can it be so large? One reason can be considering seconds as something else when calling str_from_list()
        df_phone[str_date] = df_phone['session_start'].dt.date
        df_phone.drop(columns=['session_stop', 'session_start'], inplace=True)

    for start_date, end_date, cat in zip(df_rhythm[str_start_date].tolist(), df_rhythm[str_end_date].tolist(), df_rhythm[str_difference].tolist()):
        temp_pid = hr_file + str(start_date)
        if temp_pid not in already_pid_dates:
            print(hr_file, start_date)
            new_rows = [[hr_file, start_date, sense2Text(start_date), cat],
                        [hr_file, end_date, sense2Text(end_date), cat]]
            data_list_df_llm.extend(new_rows)

            # Append only the rows created in this iteration. Concatenating the whole
            # accumulated list each time wrote every earlier row again.
            df_data_llm_temp = pd.DataFrame(data=new_rows, columns=[str_pid, str_date, 'Narration', 'class'])
            df_data_llm = pd.concat([df_data_llm, df_data_llm_temp])
            # Saved after every window so an interrupted run keeps its progress.
            df_data_llm.to_excel('/Users/wyd2hu/Documents/SA39/Hridita/Data/data_for_LLM.xlsx', index=False)
            

In [None]:
import pandas as pd

# Toy frame with two string columns
data = {'col1': ['1', '2', '3'], 'col2': ['4', '5', '6']}
df = pd.DataFrame(data)

# Row-wise string concatenation of col1 and col2, collected into a list
result = [left + right for left, right in zip(df['col1'], df['col2'])]

print(result)


In [None]:
# Manual save of the current df_data_llm to the same workbook the narration loop writes to.
df_data_llm.to_excel('/Users/wyd2hu/Documents/SA39/Hridita/Data/data_for_LLM.xlsx', index=False)

In [None]:
data_list_llm = [[0, 'The data shows a set of step counts aligned with specific sleep states and their associated durations, offering a more granular view of human behavior over a measured period. The first sequence presents a dense and highly variable distribution of step counts. These fluctuations could represent periods of high activity interspersed with rest, or they might reflect irregular daily routines influenced by external factors such as work schedules, exercise habits, or lifestyle choices. The second sequence focuses on sleep phases—wake, light, deep, and REM—suggesting the underlying complexity of sleep architecture. Transitioning between these stages does not follow a simple, repetitive pattern, indicating that various physiological and environmental triggers might influence how a person cycles through these states. The third sequence assigns durations to each sleep state, revealing how long an individual spends in each phase. These duration data points vary greatly, hinting at potential sleep disturbances, stress factors, or inconsistencies in sleep hygiene. From a modeling perspective, this kind of multivariate data requires methods that can handle temporal dynamics, non-linear relationships, and potentially missing or noisy data. Techniques like recurrent neural networks, hidden Markov models, or other sequence-aware algorithms can be valuable. Additionally, feature extraction and correlation analysis between step counts and subsequent sleep stage durations could illuminate how daytime activity patterns influence nighttime sleep quality and stability.'],
                 [0, 'The dataset again aligns three main data streams—step counts, sleep stages, and sleep durations—to reveal a complex interplay between daily activity and nighttime rest. The first sequence indicates substantial variability in step counts, reflecting highly irregular patterns of physical activity. Such fluctuations might be influenced by differing daily routines, stressors, work shifts, or inconsistent exercise habits. The second sequence, capturing sleep stages like wake, light, deep, REM, asleep, restless, and awake, shows that the person’s sleep architecture is equally dynamic. The presence of phases like ‘asleep’ and ‘restless,’ alongside standard sleep stages, suggests that this individual may experience significant disturbance or variability in sleep quality. The third sequence provides the duration of each sleep state, presenting an extensive range of values that highlight the non-uniformity of how long the person remains in each stage. This complexity suggests that simple, linear approaches may not be sufficient to model or predict sleep behavior. Advanced, sequence-aware methods—such as machine learning models capable of detecting non-linear patterns or hidden state transitions—could be employed to uncover underlying factors that shape these activity and sleep dynamics. By doing so, we may identify meaningful correlations and causal relationships that, in turn, could inform interventions or lifestyle adjustments aimed at improving sleep quality and overall well-being.'],
                 [0, 'the dataset expands in both complexity and length, further illustrating the intricate interplay of activity levels, sleep states, and temporal durations. The first sequence’s step counts remain highly erratic, with a wide range of values that suggest fragmented, unpredictable patterns of movement throughout the observed period. Compared to previous examples, the second sequence now includes an even richer set of sleep and wake states—such as wake, light, deep, REM, asleep, restless, and awake—plus additional transitions and repetitions. These states appear to shift frequently, reflecting a sleep architecture that is neither strictly cyclical nor easily predictable. The third sequence, representing corresponding durations, shows an extensive variety of time intervals spent in each state. These durations fluctuate markedly and do not settle into stable, recurring patterns over time. For modeling, this scenario reinforces the need for robust analytical techniques. Simple approaches may fail to capture the dynamic, non-linear, and context-dependent nature of these data. Instead, sophisticated sequence-aware models—like deep recurrent neural networks, attention-based transformers, or advanced state-space models—could prove more suitable. These methods can handle the complexity and variability present in the data, identify subtle correlations or causal links, and ultimately support more informed decision-making about interventions to improve sleep quality and overall well-being.'],
                 [1, "the dataset is once again composed of three aligned sequences—step counts, sleep/wake states, and corresponding durations—that paint a comprehensive picture of an individual's daily activity and sleep patterns. The first sequence, a long and highly variable list of steps, suggests that the person’s movement may fluctuate drastically throughout the recorded period, hinting at an irregular or unpredictable lifestyle. The second sequence details a wide spectrum of sleep-related states including wake, light, deep, REM, asleep, restless, and awake, interspersed with transitions that appear neither uniform nor periodic. Instead, the individual’s sleep architecture seems layered with complexity, shifting through states at irregular intervals. The third sequence provides durations for these recorded sleep phases, further emphasizing the non-linear and dynamic nature of the observed behavior. The considerable variability in how long each state persists suggests that simple averages or basic statistical methods may not capture the underlying structure. This complex interplay of step counts, sleep states, and durations once again points toward the necessity of advanced, context-aware modeling methods. Techniques capable of handling large, irregular, and heterogeneous time-series data—such as deep learning models with attention mechanisms or hybrid statistical-machine learning approaches—may be best suited for extracting meaningful patterns, identifying causal factors, and ultimately guiding interventions aimed at improving sleep quality and activity balance."],
                 [1, "This new dataset similarly presents three core sequences—steps, sleep/wake states, and durations—yet it is far more expansive and seemingly even more irregular than prior examples. The first sequence offers a staggering range of step counts, potentially spanning months or even years of intermittent recordings. This variability in steps suggests periods of intense activity contrasted with stretches of near-sedentary behavior. The second sequence describes a dense tapestry of sleep states, including wake, light, deep, REM, asleep, restless, and awake phases, interspersed with transitions that appear chaotic and non-repetitive. Notably, it also introduces more extended sequences of “restless” and “asleep” states that disrupt standard sleep architecture. The third sequence, representing durations, features a vast assortment of time intervals, from very short to extremely long, which makes discerning any stable cycle or rhythm challenging. From a modeling standpoint, this complexity and scale underscore the need for sophisticated, data-driven approaches. Traditional methods might fail to capture the subtle patterns hidden within the chaos. Instead, advanced machine learning models—especially those designed for high-dimensional, irregular time-series data—may be required to identify latent structures, correlate daytime activities with nocturnal rest quality, and ultimately provide actionable insights to improve sleep hygiene and overall health."],
                 [1, " the complexity has escalated to an extreme level, combining an enormous variety of step counts, a complex array of sleep/wake states, and a wide range of associated durations. The step count data is massive, varied, and likely spans a very long timeframe, suggesting activity patterns that may be influenced by numerous external factors, seasonal changes, or shifting lifestyle patterns. The sleep data now incorporates numerous states—wake, light, deep, REM, asleep, restless, and even extended sequences of disturbed sleep—which appear to follow no simple, repetitive architecture. The durations also exhibit astounding variability, from very short intervals to extremely prolonged periods in certain states. Such data may be rife with missing values, anomalies, and non-linear relationships that defy basic statistical or time-series models. To extract value from this data for model development, cutting-edge techniques are required. Methods like deep neural networks with attention mechanisms, hierarchical hidden Markov models, or advanced state-space models may be necessary to capture the intricate temporal dependencies and hidden patterns. Approaches from anomaly detection, transfer learning, and domain adaptation might also prove invaluable. Moreover, careful feature engineering—combining domain knowledge of sleep biology, circadian rhythms, and human behavior—could help reduce the noise and highlight essential signals. Ultimately, these data underscore that modeling human activity and sleep patterns is a profoundly complex task, demanding sophisticated tools that can integrate context, learn from sparse and irregular observations, and provide insights robust to the intrinsic variability of human behavior."]]

# Based on the variance of the pattern over the days (explored after completing what was stated in the proposal)

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os

# Dataset whose files are processed in this cell, and the rhythm period that
# appears as the prefix of the rhythm workbook's file name.
dataset = 'Tiles18'
period = 24

# Column names used by the data frames handled in this cell.
str_date = 'date'
str_start_date, str_end_date = 'start_date', 'end_date'
str_baseline_acro, str_correct_acro = 'baseline_acro', 'correct_acrophase'
str_difference = 'Difference'

# Rows collected by the participant loop below (one list per rhythm window).
data_list_df_llm = []

# Rhythm workbook, opened read-only; the participant loop uses it to test
# whether a sheet named after the participant exists.
# NOTE(review): the workbook is never closed in this cell -- confirm whether
# an explicit close() is wanted once the loop has finished.
rhythm_sheets = load_workbook(
    f'/Users/wyd2hu/Documents/SA39/Hridita/Findings/Rhythms/{period}_{dataset}_HeartRatePPG.xlsx',
    read_only=True,
)

def str_from_list(list_data):
    """Render every item of *list_data* with ``str`` and join them with ' , '."""
    separator = ' , '
    rendered_items = [str(item) for item in list_data]
    return separator.join(rendered_items)

# Build one row per rhythm window of every participant file: the daily
# behaviour values recorded on the three days before the window starts,
# followed by the window's binary label (1 when the absolute difference
# between baseline and corrected acrophase is >= 2, else 0).
data_root = os.path.join('/Users/wyd2hu/Documents/SA39/Hridita/Data', dataset)
fitbit_root = os.path.join(data_root, 'fitbit')
# Built from `period` so that it is the same workbook `rhythm_sheets` was opened from.
rhythm_workbook_path = ('/Users/wyd2hu/Documents/SA39/Hridita/Findings/Rhythms/'
                        + str(period) + '_' + dataset + '_HeartRatePPG.xlsx')

# Daily columns exported for each window, in the order the row layout expects.
window_value_columns = ['StepCount', 'duration', 'Cardio_caloriesOut', 'Cardio_minutes',
                        'Fat Burn_caloriesOut', 'SleepMinutesAsleep', 'SleepMinutesInBed',
                        'SleepPerDay', 'SecondsOnPhone']

for hr_file in os.listdir(os.path.join(fitbit_root, 'heart-rate')):
    # The first 25 characters of the file name are used as the sheet name in the rhythm workbook.
    sheet_name = hr_file[:25]

    # Every participant must have a file in each of these folders. 'sleep-data'
    # is still required although it is not loaded below (see commented-out code).
    required_files = [os.path.join(fitbit_root, data_name, hr_file)
                      for data_name in ['sleep', 'step-count', 'daily-summary', 'sleep-data']]
    required_files.append(os.path.join(data_root, 'realizd', hr_file))

    # True when the entry is the README, has no rhythm sheet, or misses at least one data file.
    bool_dont_include_for_analysis = (
        'README.md' in hr_file
        or sheet_name not in rhythm_sheets
        or not all(os.path.exists(path) for path in required_files)
    )
    if bool_dont_include_for_analysis:
        continue

    print(hr_file)
    # WARNING: manually check the rows of the files that needed on_bad_lines='skip'.
    df_sleep = pd.read_csv(os.path.join(fitbit_root, 'sleep', hr_file), on_bad_lines='skip')
    df_step = pd.read_csv(os.path.join(fitbit_root, 'step-count', hr_file), on_bad_lines='skip')

    df_daily_summary = pd.read_csv(os.path.join(fitbit_root, 'daily-summary', hr_file), on_bad_lines='skip')
    df_daily_summary[str_date] = pd.to_datetime(df_daily_summary['Timestamp']).dt.date

    # df_sleep_meta_data = pd.read_csv(os.path.join(fitbit_root, 'sleep-data', hr_file), on_bad_lines='skip')
    # df_sleep_meta_data[str_date] = pd.to_datetime(df_sleep_meta_data['dateTime']).dt.date

    # The first row of the participant's rhythm sheet is discarded.
    df_rhythm = pd.read_excel(rhythm_workbook_path, sheet_name)
    df_rhythm = df_rhythm.iloc[1:]

    # Collapse the sleep and step records to one summed row per calendar day.
    df_sleep[str_date] = pd.to_datetime(df_sleep['timestamp']).dt.date
    df_step[str_date] = pd.to_datetime(df_step['Timestamp']).dt.date
    df_sleep = df_sleep.groupby(by=[str_date]).sum().reset_index()
    df_step = df_step.groupby(by=[str_date]).sum().reset_index()

    df_phone = pd.read_csv(os.path.join(data_root, 'realizd', hr_file), on_bad_lines='skip')
    if 'Tiles19' in dataset:
        df_phone[str_date] = pd.to_datetime(df_phone['TimestampStart']).dt.date
        # DataFrame.drop has no `by` keyword; the column is selected with `columns=`.
        df_phone.drop(columns=['TimestampEnd'], inplace=True)
    else:
        df_phone['session_start'] = pd.to_datetime(df_phone['session_start'])
        df_phone['session_stop'] = pd.to_datetime(df_phone['session_stop'])
        # WARNING: without dt.total_seconds() the value is a Timedelta (e.g. "0 days 06:16:16"),
        # which str_from_list() would render as text instead of a number of seconds.
        df_phone['SecondsOnPhone'] = (df_phone['session_stop'] - df_phone['session_start']).dt.total_seconds()
        df_phone[str_date] = df_phone['session_start'].dt.date
        df_phone.drop(columns=['session_stop', 'session_start'], inplace=True)
    df_phone = df_phone.groupby(by=[str_date]).sum().reset_index()

    df_rhythm[str_start_date] = pd.to_datetime(df_rhythm[str_start_date]).dt.date
    df_rhythm[str_end_date] = pd.to_datetime(df_rhythm[str_end_date]).dt.date
    # Binary label: 1 when the acrophase differs from the baseline by at least 2.
    df_rhythm[str_difference] = abs(df_rhythm[str_baseline_acro] - df_rhythm[str_correct_acro])
    df_rhythm[str_difference] = (df_rhythm[str_difference] >= 2).astype(int)

    # Days with a summed step count of zero are excluded.
    df_step = df_step[df_step['StepCount'] != 0]

    # Keep only the days that are present in every source (inner joins on the date).
    merged_df = pd.merge(df_sleep, df_step, how='inner', on=str_date)
    merged_df = pd.merge(merged_df, df_daily_summary, how='inner', on=str_date)
    merged_df = pd.merge(merged_df, df_phone, how='inner', on=str_date)
    # merged_df = pd.merge(merged_df, df_sleep_meta_data, how='inner', on=str_date)

    for start_date, end_date, cat in zip(df_rhythm[str_start_date].tolist(),
                                         df_rhythm[str_end_date].tolist(),
                                         df_rhythm[str_difference].tolist()):
        # Earliest day of the look-back window: three days before the rhythm window starts.
        last_date_to_collect_data = start_date - pd.Timedelta(days=3)
        # Select the look-back days once: [start_date - 3 days, start_date).
        in_window = (merged_df[str_date] >= last_date_to_collect_data) & (merged_df[str_date] < start_date)
        window_df = merged_df[in_window]
        window_values = [str_from_list(window_df[column].tolist()) for column in window_value_columns]
        data_list_df_llm.append([start_date, last_date_to_collect_data, hr_file, *window_values, cat])
        
# Turn the collected rows into a table. The column order mirrors the layout of
# each appended row: window dates, participant file, nine value strings, label.
llm_column_names = [
    'start_date', 'end_date', 'pid',
    'steps', 'duration_of_sleep_phases',
    'Cardio_caloriesOut', 'Cardio_minutes', 'Fat Burn_caloriesOut',
    'SleepMinutesAsleep', 'SleepMinutesInBed', 'SleepPerDay',
    'SecondsOnPhone', 'class',
]
df_data_llm = pd.DataFrame(data_list_df_llm, columns=llm_column_names)