In [None]:
import numpy as np
import pandas as pd
import datetime
import dcarte
import pytz
#%load_ext autoreload
import pyarrow as pa
import pyarrow.parquet as pq

from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
# Current seaborn colour palette, stored in `color`.
color = sns.color_palette()
# Disable pandas' chained-assignment (SettingWithCopy) warning; the default is 'warn'.
pd.options.mode.chained_assignment = None
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load the clusters, discard rows with any missing value and divide 'length' by 60.
data_1 = pd.read_parquet('clusters_hour_gap.parquet', engine='pyarrow').dropna()
data_1['length'] = data_1['length'] / 60

# For every (date, participant) pair keep only the nocturnal cluster with the largest 'length'.
is_nocturnal = data_1['classification'] == 'nocturnal'
nocturnal_groups = data_1[is_nocturnal].groupby(['date', 'Participant ID'], group_keys=False)
hourgap = nocturnal_groups.apply(lambda grp: grp.loc[grp['length'].idxmax()])
hourgap.head(3)

In [None]:
# Load the sleep table, drop its 'duplicates' column and expose the
# 'start_date_tz' timestamps as 'datetime' in the Europe/London zone.
sleep = pd.read_parquet('sleep_data.parquet', engine='pyarrow')
sleep = sleep.drop('duplicates', axis=1)
sleep['datetime'] = sleep['start_date_tz']
# Compare the zone *name*: a tzinfo object never equals a plain string, so
# testing `tz != 'Europe/London'` directly is always true.
if str(sleep['datetime'].dt.tz) != 'Europe/London':
    sleep['datetime'] = sleep['datetime'].dt.tz_convert(pytz.timezone('Europe/London'))
sleep.head(3)

In [None]:
def process_sleep_mat(patient_id, df):
    """Build a gap-free, one-row-per-minute timeline for a single participant.

    Parameters
    ----------
    patient_id :
        Value matched against the 'ID' column of ``df``.
    df : pandas.DataFrame
        Must contain 'ID', a timezone-aware 'datetime' column, 'state',
        'snoring', 'heart_rate' and 'respiratory_rate'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID', 'datetime', 'state', 'value', 'snoring', 'heart_rate',
        'respiratory_rate'. Minutes missing from the input are inserted with
        NaN in every original column (including 'ID'). 'value' is 'bed_out'
        where 'state' is missing and 'bed_in' otherwise. 'datetime' is
        expressed in the Europe/London zone.
    """
    local_tz = 'Europe/London'
    timeline = df[df['ID'] == patient_id].reset_index(drop=True)

    timeline.index = pd.DatetimeIndex(timeline['datetime'])
    timeline = timeline.drop(columns=['datetime'])
    # '1min' is the supported spelling of the deprecated '1T' frequency alias.
    timeline = timeline.resample('1min').asfreq()
    timeline = timeline.rename_axis('datetime').reset_index()
    # Compare the zone name: a tzinfo object never equals a plain string, so
    # testing `tz != 'Europe/London'` directly is always true.
    if str(timeline['datetime'].dt.tz) != local_tz:
        timeline['datetime'] = timeline['datetime'].dt.tz_convert(local_tz)
    timeline['value'] = np.where(timeline['state'].isna(), 'bed_out', 'bed_in')
    return timeline[['ID', 'datetime', 'state', 'value', 'snoring', 'heart_rate', 'respiratory_rate']]

def start_sleep_table():
    """Return an empty DataFrame holding the column layout of the sleep metrics table."""
    column_names = [
        # identification of the cluster
        'participant_id',
        'date',
        'cluster_number',
        'cluster_classification',
        # daily totals
        'start_dt_total',
        'total_time_in_bed',
        'total_time_in_sleep',
        # time in bed
        'time_in_bed_duration',
        'tib_onset',
        'tib_offset',
        'time_in_bed_period',
        'in_bed_bouts',
        'out_bed_bouts',
        # sleep period
        'sleep_onset',
        'sleep_offset',
        'sleep_period',
        'midpoint_sleep',
        'sleep_duration',
        'sleep_bouts',
        'wake_bouts',
        # wake / missing states during time in bed
        'withings_wake_state_during_tib',
        'withings_nulls_during_tib',
        'wake_during_tib',
        # sleep stage durations
        'withings_light_state_duration',
        'withings_deep_state_duration',
        'withings_rem_state_duration',
        'withings_nrem_sleep_duration',
        # wake / missing states during the sleep period
        'withings_wake_state_during_sleep_period',
        'withings_nulls_during_sleep_period',
        'withings_wake_after_sleep_onset',
        # physiology while asleep
        'snoring_duration',
        'mean_heart_rate_asleep',
        'mean_respiratory_rate_asleep',
    ]
    return pd.DataFrame(columns=column_names)

def append_row(df, row):
    """Return a new DataFrame made of ``df`` followed by ``row`` (a Series), indexed 0..n-1."""
    single_record = pd.DataFrame([row], columns=row.index)
    stacked = pd.concat([df, single_record])
    return stacked.reset_index(drop=True)

In [None]:
def find_bows(df, col_to_find_mismatch, desired_value):
    """Return the durations (in hours) of the runs where a column equals ``desired_value``.

    A run starts at every row whose value in ``col_to_find_mismatch`` differs
    from the previous row's value (the first row always starts a run). The
    duration of a run is the time from its first row to the first row of the
    next run.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows ordered by time, with a 'datetime' column and the column named by
        ``col_to_find_mismatch``. The frame is left unmodified.
    col_to_find_mismatch : str
        Column whose value changes delimit the runs.
    desired_value :
        Only runs holding this value are reported.

    Returns
    -------
    list of float
        One duration per matching run, in row order. The last run of the frame
        has no successor, so if it matches its duration is NaN.
    """
    if df.empty:
        return []

    values = df[col_to_find_mismatch]
    # Positional mask kept local so the caller's frame does not gain a
    # 'mismatch' column as a side effect.
    starts_run = (values != values.shift(1)).to_numpy()
    transitions = df[starts_run].reset_index(drop=True)
    transitions['duration'] = (
        transitions['datetime'].shift(-1) - transitions['datetime']
    ).dt.total_seconds() / 3600
    is_desired = transitions[col_to_find_mismatch] == desired_value
    return transitions.loc[is_desired, 'duration'].to_list()

def prepare_for_bows(df, start, end, col_to_find_mismatch, undesired_value):
    """Slice ``df`` to [start, end] and pad it with one boundary row on each side.

    The two extra rows sit one minute before ``start`` and one minute after
    ``end`` and carry ``undesired_value`` in ``col_to_find_mismatch`` (other
    columns are left missing). The result is sorted by 'datetime'; row labels
    are not renumbered after the padding rows are added.
    """
    one_minute = pd.Timedelta(minutes=1)

    inside_window = df['datetime'].between(start, end)
    window = df[inside_window].reset_index(drop=True)

    boundary_values = dict.fromkeys([start - one_minute, end + one_minute], undesired_value)
    boundary_rows = pd.DataFrame(boundary_values.items(), columns=['datetime', col_to_find_mismatch])

    padded = pd.concat([window, boundary_rows])
    return padded.sort_values(by='datetime')

In [None]:
def update_sleep_table(old_df, i_df, patient, cluster_df):
    """Append one row of in-bed / sleep metrics to ``old_df`` for each usable cluster.

    For every row of ``cluster_df`` the timeline ``i_df`` is sliced twice:
    once to a 24-hour window that starts 7 hours after midnight of
    ``row['date']`` (localised to Europe/London), and once to the cluster
    interval from ``row['datetime']`` up to one minute before
    ``row['end_datetime']``. Metrics are derived from row counts divided by 60,
    so they are in hours only if ``i_df`` has one row per minute
    (NOTE(review): the visible caller passes the output of
    ``process_sleep_mat``, which resamples to one minute).

    A cluster is skipped silently when it has fewer than two rows whose state
    is DEEP, LIGHT or REM. It is skipped with a printed message when one of the
    consistency checks near the end of the loop body fails.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Table the new rows are appended to (through ``append_row``).
    i_df : pandas.DataFrame
        Timeline with the columns read here: 'datetime', 'state', 'value',
        'snoring', 'heart_rate', 'respiratory_rate'.
    patient :
        Stored unchanged in the 'participant_id' field of every new row.
    cluster_df : pandas.DataFrame
        One row per cluster with 'date', 'datetime', 'end_datetime', 'cluster'
        and 'classification'.

    Returns
    -------
    pandas.DataFrame
        ``old_df`` extended with the rows that passed all checks.
    """

    for index, row in cluster_df.iterrows():
        
        # Daily window: both bounds are inclusive, hence the one-minute
        # subtraction on the upper bound to cover exactly 24 hours of minutes.
        daily_start = pd.to_datetime(row['date']).tz_localize(pytz.timezone('Europe/London'))+pd.Timedelta(hours=7)
        daily_end = pd.to_datetime(row['date']).tz_localize(pytz.timezone('Europe/London'))+pd.Timedelta(hours=31)-pd.Timedelta(minutes=1)
        daily_in_bed = i_df.loc[(i_df['datetime']>=daily_start) &
                                (i_df['datetime']<=daily_end)].reset_index(drop=True)
        # Rows with any of the four states count as in bed; the three non-AWAKE
        # states count as asleep.
        daily_tib = len(daily_in_bed[daily_in_bed['state'].isin(['DEEP', 'LIGHT', 'REM','AWAKE'])])/60
        daily_in_sleep = len(daily_in_bed[daily_in_bed['state'].isin(['DEEP', 'LIGHT', 'REM'])])/60

        start_dt = row['datetime']
        end_dt = row['end_datetime']  # the participant leaves the bed in this minute, which starts the out-of-bed cluster

        # Cluster window: end_dt itself is excluded (one-minute correction) so
        # that only the time in bed is selected.
        nocturnal_in_bed = i_df.loc[(i_df['datetime']>=start_dt) & 
                                    (i_df['datetime']<=(end_dt-pd.Timedelta(minutes=1)))].reset_index(drop=True)
        nocturnal_in_bed['in_sleep'] = np.where(nocturnal_in_bed['state'].isin(['DEEP', 'LIGHT', 'REM']), 'yes', 'no')
        
        # Fewer than two asleep rows: no metrics are produced for this cluster.
        if len(nocturnal_in_bed[nocturnal_in_bed['in_sleep']=='yes'])<2:
            continue
            
        # Sleep period: from the first asleep row to one minute after the last
        # asleep row; `asleep` holds every row in between, whatever its state.
        sleep_onset = nocturnal_in_bed[nocturnal_in_bed['in_sleep']=='yes']['datetime'].iloc[0]
        sleep_offset = nocturnal_in_bed[nocturnal_in_bed['in_sleep']=='yes']['datetime'].iloc[-1]+pd.Timedelta(minutes=1)
        asleep = nocturnal_in_bed.loc[(nocturnal_in_bed['datetime']>=sleep_onset) &
                                      (nocturnal_in_bed['datetime']<=(sleep_offset -pd.Timedelta(minutes=1)))].reset_index(drop=True)  

        tib_period = (end_dt-start_dt).total_seconds()/3600  # in hours
        sleep_period = (sleep_offset-sleep_onset).total_seconds()/3600  # in hours

        # Row counts per state inside the sleep period.
        deeps = len(asleep[asleep['state']=='DEEP'])
        lights = len(asleep[asleep['state']=='LIGHT'])
        rems = len(asleep[asleep['state']=='REM'])
        awakes = len(asleep[asleep['state']=='AWAKE'])
        nulls = len(asleep[asleep['state'].isna()])
        
        # Bout durations: each call pads the window with the opposite value on
        # both sides (prepare_for_bows) and then measures the runs of the
        # requested value (find_bows).
        just_periods = nocturnal_in_bed[['datetime','state','value']]
        for_inbed_bows = prepare_for_bows(just_periods, nocturnal_in_bed['datetime'].iloc[0], nocturnal_in_bed['datetime'].iloc[-1],'value', 'bed_out')
        inbed_bows = find_bows(for_inbed_bows, 'value', 'bed_in')
        
        just_periods = nocturnal_in_bed[['datetime','state','value']]
        for_outbed_bows = prepare_for_bows(just_periods, nocturnal_in_bed['datetime'].iloc[0], nocturnal_in_bed['datetime'].iloc[-1],'value', 'bed_in')
        outbed_bows = find_bows(for_outbed_bows, 'value', 'bed_out')
        
        # Sleep and wake bouts are measured inside the sleep period only.
        just_periods = nocturnal_in_bed[['datetime','state','in_sleep']]
        for_sleep_bows = prepare_for_bows(just_periods, asleep['datetime'].iloc[0], asleep['datetime'].iloc[-1],'in_sleep', 'no')
        sleep_bows = find_bows(for_sleep_bows, 'in_sleep', 'yes')
        
        just_periods = nocturnal_in_bed[['datetime','state','in_sleep']]
        for_wake_bows = prepare_for_bows(just_periods, asleep['datetime'].iloc[0], asleep['datetime'].iloc[-1],'in_sleep', 'yes')
        wake_bows = find_bows(for_wake_bows, 'in_sleep', 'no')
        
        # Rows awake in or out of bed; NOTE(review): not referenced again in this function.
        awake = nocturnal_in_bed[nocturnal_in_bed['in_sleep']=='no'].reset_index(drop=True)

        new_row = pd.Series({'participant_id':patient, 'date':row['date'], 'cluster_number':row['cluster'],
                             'cluster_classification':row['classification'],
                             'start_dt_total':daily_start,'total_time_in_bed':daily_tib,'total_time_in_sleep':daily_in_sleep,
                            'time_in_bed_duration':len(nocturnal_in_bed[nocturnal_in_bed['state'].isin(['DEEP', 'LIGHT', 'REM','AWAKE'])])/60,
                             'tib_onset':start_dt, 'tib_offset':end_dt,
                             'time_in_bed_period':tib_period,
                             'in_bed_bouts':inbed_bows, 'out_bed_bouts':outbed_bows,
                             'sleep_onset':sleep_onset, 'sleep_offset':sleep_offset, 
                             'sleep_period':sleep_period,
                             'midpoint_sleep':sleep_onset + timedelta(hours=sleep_period/2), 
                             'sleep_duration':len(asleep[asleep['in_sleep']=='yes'])/60,          
                             'sleep_bouts':sleep_bows, 'wake_bouts':wake_bows,
                             'withings_wake_state_during_tib':len(nocturnal_in_bed[nocturnal_in_bed['state']=='AWAKE'])/60,
                             'withings_nulls_during_tib':len(nocturnal_in_bed[nocturnal_in_bed['state'].isna()])/60, 
                             'wake_during_tib':(len(nocturnal_in_bed[nocturnal_in_bed['state']=='AWAKE'])/60)+(len(nocturnal_in_bed[nocturnal_in_bed['state'].isna()])/60),
                             'withings_light_state_duration':lights/60, 'withings_deep_state_duration':deeps/60, 'withings_rem_state_duration':rems/60, 
                             'withings_nrem_sleep_duration':(lights+deeps)/60,
                             'withings_wake_state_during_sleep_period':awakes/60, 
                             'withings_nulls_during_sleep_period':nulls/60, 
                             'withings_wake_after_sleep_onset':(len(asleep[asleep['state']=='AWAKE'])/60)+(len(asleep[asleep['state'].isna()])/60),
                             'snoring_duration':len(asleep[asleep['snoring']==True])/60,
                              'mean_heart_rate_asleep':asleep['heart_rate'].mean(),
                              'mean_respiratory_rate_asleep':asleep['respiratory_rate'].mean()
                            })

        # Consistency checks. Any failure prints a message and drops the row.
        # The summed in-bed bouts must match the in-bed duration within 0.008
        # (about half a minute if the values are hours).
        if abs(sum(inbed_bows)-new_row['time_in_bed_duration'])>0.008:
            print('error in bed bouts in day ',index)
            continue 
            
        # Same tolerance for the summed sleep bouts against the sleep duration.
        if abs(sum(sleep_bows)-new_row['sleep_duration'])>0.008:
            print('error in sleep bouts in day ',index)
            continue 
        
        # The per-state counts must add up to the sleep period. This is an
        # exact floating-point comparison, with no tolerance.
        if (awakes+deeps+lights+rems+nulls)/60 != new_row['sleep_period']:
            print('error in sleep period or states ','st',awakes,deeps,lights,rems,nulls,'period',new_row['sleep_period']*60)
            print(asleep[['state','datetime']].groupby('state').count())
            continue 
            
        # Time spent in bed cannot exceed the span of the cluster.
        if new_row['time_in_bed_duration'] > (new_row['time_in_bed_period']):
            print ('error in duration greater than period', new_row['time_in_bed_duration'], new_row['time_in_bed_period'])
            print(new_row)
            continue
            
        # The number of rows in the cluster window must equal its span
        # (again an exact floating-point comparison).
        if len(nocturnal_in_bed)/60 != (new_row['time_in_bed_period']):
            print ('error in bed period', len(nocturnal_in_bed), new_row['time_in_bed_period'])
            print(new_row)
            continue
            
        old_df = append_row(old_df, new_row)
        
    return old_df

In [None]:
# Distinct participant identifiers present in the selected clusters, and how many there are.
participant_column = hourgap['Participant ID']
ids_list = participant_column.unique()
len(ids_list)

In [None]:
# Start from an empty metrics table and extend it one participant at a time.
sleep_stats = start_sleep_table()

for idx, i in enumerate(ids_list):
    # Blank participant identifiers are ignored.
    if i in ['']:
        continue

    rows_so_far = len(sleep_stats)
    print('current length table:', rows_so_far)
    print(idx, i, rows_so_far)

    i_df = process_sleep_mat(i, sleep)
    participant_clusters = hourgap[hourgap['Participant ID'] == i]
    sleep_stats = update_sleep_table(sleep_stats, i_df, i, participant_clusters)

    # The parquet file is rewritten after every participant.
    df2 = pa.Table.from_pandas(sleep_stats)
    pq.write_table(df2, 'in_bed_metrics_using_cluster.parquet', compression='BROTLI')

In [None]:
# Reload the metrics table from the file the processing loop writes with
# pq.write_table. The name read here must match that file; no visible cell
# produces 'in_bed_stats_using_cluster.parquet'.
sleep_stats = pd.read_parquet('in_bed_metrics_using_cluster.parquet', engine='pyarrow')

# Preview of the main timing columns. It is the last expression of the cell so
# that the notebook renders it.
sleep_stats[['time_in_bed_duration', 'tib_onset', 'tib_offset', 'time_in_bed_period',
             'sleep_onset', 'sleep_offset', 'sleep_period', 'midpoint_sleep', 'sleep_duration']].head()

In [None]:
# Wake time (AWAKE plus missing states) as a percentage of the time-in-bed period.
sleep_stats['wake_during_tib_percentage'] = (100 * sleep_stats['wake_during_tib']) / sleep_stats['time_in_bed_period']

# Each component of the sleep period as a percentage of that period.
# Keys are the new columns, values the duration columns they derive from.
sleep_period_shares = {
    'withings_light_state_percentage': 'withings_light_state_duration',
    'withings_deep_state_percentage': 'withings_deep_state_duration',
    'withings_rem_state_percentage': 'withings_rem_state_duration',
    'withings_wake_state_percentage': 'withings_wake_state_during_sleep_period',
    'withings_nulls_during_sleep_period_percentage': 'withings_nulls_during_sleep_period',
}
for pct_col, duration_col in sleep_period_shares.items():
    sleep_stats[pct_col] = (100 * sleep_stats[duration_col]) / sleep_stats['sleep_period']

# Sanity column: the five percentages above, added in order and rounded.
pct_total = sleep_stats['withings_light_state_percentage']
for pct_col in list(sleep_period_shares)[1:]:
    pct_total = pct_total + sleep_stats[pct_col]
sleep_stats['check_percs'] = round(pct_total)

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = sleep_stats.isna().any()
sleep_stats.columns[has_missing.to_numpy()]

In [None]:
# Rows whose rounded percentage total is not 100, shown with the durations that feed it.
check_columns = [
    'check_percs',
    'sleep_period',
    'withings_light_state_duration',
    'withings_deep_state_duration',
    'withings_rem_state_duration',
    'withings_wake_state_during_sleep_period',
    'withings_nulls_during_sleep_period',
]
sleep_stats.loc[sleep_stats['check_percs'] != 100, check_columns]

In [None]:
# Write the current sleep_stats table to the metrics parquet file (Brotli-compressed).
df2 = pa.Table.from_pandas(sleep_stats)
pq.write_table(
    df2,
    'in_bed_metrics_using_cluster.parquet',
    compression='BROTLI',
)