In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os

In [2]:
def concat_slist(slist):
    """Concatenate the string form of every item, each followed by a comma.

    The trailing comma is kept (``[1, 2]`` gives ``'1,2,'``) and an empty
    input gives the empty string.
    """
    return ''.join(str(item) + ',' for item in slist)

def correct_col_type(df, col):
    """Return column ``col`` of ``df`` with a more specific dtype when it is text.

    Columns whose dtype name contains ``'object'`` are parsed with
    ``pd.to_datetime`` if the column name contains ``'date'`` and are
    converted to ``category`` otherwise. Any other column is returned as is.
    """
    series = df[col]
    if 'object' not in str(series.dtype):
        return series
    if 'date' in col:
        return pd.to_datetime(series)
    return series.astype('category')
    
def gen_date_col(df, tcol):
    """Add a ``date`` column with the calendar date of datetime column ``tcol``.

    ``df`` is modified in place and the same object is returned.
    """
    timestamps = df[tcol]
    df['date'] = timestamps.dt.date
    return df

def transform_category_to_counts(df, col, keys):
    """Count the rows of ``df`` per value of ``col`` and spread them into columns.

    Parameters
    ----------
    df : DataFrame containing ``col`` and every column listed in ``keys``.
    col : name of the column whose values become the output columns.
    keys : list of column names identifying one output row.

    Returns
    -------
    DataFrame with the ``keys`` as regular columns followed by one count
    column per value of ``col``. Rows whose counts sum to zero are removed.
    """
    # ``observed=False`` is passed explicitly: it is the behaviour the code
    # relied on implicitly, and leaving it to the default triggers a pandas
    # FutureWarning when any grouping column is categorical.
    counts = (
        df.groupby([col] + keys, observed=False)
        .size()
        .to_frame('size')
        .reset_index()
    )
    tmp = counts.pivot_table(values='size', columns=col, index=keys, observed=False)

    # Drop key combinations for which nothing at all was counted.
    empty_rows = tmp.index[tmp.values.sum(axis=1) == 0]
    return tmp.drop(empty_rows, axis=0).reset_index()

def get_personal_df(df, pid):
    """Return the rows of ``df`` belonging to patient ``pid``.

    If ``patient_id`` is not among the columns the index is reset first, so an
    index level of that name becomes a regular column. The ``patient_id``
    column is removed from the result.
    """
    frame = df if 'patient_id' in df.columns else df.reset_index()
    is_patient = frame.patient_id == pid
    return frame.loc[is_patient].drop('patient_id', axis=1)

def min_max_perpatient(df, skip=None):
    """Min-max scale every numeric column separately for each patient.

    For each patient and each integer/float column not listed in ``skip``:

    * if the patient's values span a range, they are mapped to ``[0, 1]``;
    * if all non-missing values are equal and non-zero, they are set to 0.5;
    * if they are all zero or all missing, they are left untouched.

    Parameters
    ----------
    df : DataFrame with a ``patient_id`` column. It is modified in place.
    skip : optional iterable of column names that must not be scaled.

    Returns
    -------
    The same ``df`` object, for convenience.
    """
    # ``patient_id`` is the grouping key: rescaling it (possible when the ids
    # are numeric) would corrupt the very column used to select the rows.
    excluded = set(skip) if skip is not None else set()
    excluded.add('patient_id')

    def _is_scalable(dtype):
        name = str(dtype)
        return 'int' in name or 'float' in name

    targets = [c for c in df.columns
               if c not in excluded and _is_scalable(df[c].dtype)]

    for pid in df['patient_id'].unique():
        rows = df['patient_id'] == pid
        for c in targets:
            values = df.loc[rows, c]
            if values.notna().sum() == 0:
                continue
            min_v = np.nanmin(values.values)
            max_v = np.nanmax(values.values)
            if max_v > min_v:
                scaled = (values.values - min_v) / (max_v - min_v)
            elif max_v != 0:
                scaled = 0.5  # constant non-zero value, e.g. only one record
            else:
                continue
            # Scaled values are fractional; convert integer columns first so
            # that the assignment does not write floats into an int column.
            if 'int' in str(df[c].dtype):
                df[c] = df[c].astype('float64')
            df.loc[rows, c] = scaled
    return df

def gen_summary(df):
    """Build a one-row-per-column overview of ``df``.

    Returns a DataFrame indexed by the column names of ``df`` with:

    * ``Value Type``   - the dtype name of the column (e.g. ``'float64'``);
    * ``Value Number`` - number of distinct values for text/categorical
      columns, 2 for boolean columns, otherwise left empty;
    * ``Description``  - ``'hash code'`` for text/categorical columns whose
      name contains ``'id'``, up to five example values for other
      text/categorical columns, the value range for datetime and numeric
      columns, and ``'True or False'`` for boolean columns.
    """
    sm = pd.DataFrame(columns=['Value Type','Value Number','Description'])
    for stc in df.columns:
        column = df[stc]
        # Classify by the dtype's own name, as correct_col_type does. The
        # class name of the dtype object is capitalised on recent NumPy
        # (e.g. 'Float64DType'), which lower-case substring checks miss.
        vtype = str(column.dtype)
        kind = vtype.lower()
        sm.loc[stc,'Value Type'] = vtype

        if 'category' in kind or 'object' in kind or 'str' in kind:
            # unique() keeps first-appearance order, so the examples shown
            # are the same on every run.
            distinct = column.unique()
            sm.loc[stc, 'Value Number'] = len(distinct)
            if 'id' in stc:
                sm.loc[stc,'Description'] = 'hash code'
            else:
                sm.loc[stc,'Description'] = concat_slist(list(distinct[:5]))
        elif 'datetime' in kind:
            sm.loc[stc,'Description'] = 'from ' + str(column.min()) + ' to ' + str(column.max())
        elif 'float' in kind or 'int' in kind:
            sm.loc[stc,'Description'] = 'min: ' + str(column.min()) + ', max: ' + str(column.max())
        elif 'bool' in kind:
            sm.loc[stc, 'Value Number'] = 2
            sm.loc[stc,'Description'] = 'True or False'

    return sm

In [3]:
# Import data

# Every raw table is read as a comma-separated, latin-1 encoded CSV file.
_raw_csv_opts = dict(encoding='latin1', sep=",")

df_demog = pd.read_csv("./../data/raw/Demographics.csv", **_raw_csv_opts)

df_activity = pd.read_csv("./../data/raw/Activity.csv", **_raw_csv_opts)
df_physiology = pd.read_csv("./../data/raw/Physiology.csv", **_raw_csv_opts)
df_sleep = pd.read_csv("./../data/raw/Sleep.csv", **_raw_csv_opts)

df_labels = pd.read_csv("./../data/raw/Labels.csv", **_raw_csv_opts)

# Show (rows, columns) of each table as the cell output.
tuple(frame.shape for frame in (df_demog, df_activity, df_physiology, df_sleep, df_labels))

((56, 3), (1030559, 3), (17679, 5), (461423, 6), (608, 3))

In [4]:
DPATH = '../data/raw/'

# List everything found in the raw-data folder, one entry per line.
raw_entries = os.listdir(DPATH)
for fname in raw_entries:
    print(fname)

Labels.csv
Demographics.csv
Activity.csv
Sleep.csv
Physiology.csv


In [5]:
# Read all tables into data_dict

files = os.listdir(DPATH)
data_dict = {}   # table name (file name without its extension) -> DataFrame
summaries = {}   # not filled by this cell
for f in files:
    # Only files with a .csv extension are tables; a plain substring test
    # would also accept names such as 'notes_csv.txt' or 'Labels.csv.bak'.
    if not f.lower().endswith('.csv'):
        continue
    print(f)
    fpth = os.path.join(DPATH, f)
    df = pd.read_csv(fpth)

    # Text columns: parse those named like a date, make the others categorical.
    for col in df.columns:
        df[col] = correct_col_type(df, col)
    if 'date' in df.columns:
        df = df.rename(columns={'date':'timestamp'})

    # splitext removes only the final extension, so 'a.b.csv' keeps the key
    # 'a.b' instead of being cut down to 'a'.
    fname = os.path.splitext(f)[0]
    data_dict[fname] = df

Labels.csv
Demographics.csv
Activity.csv
Sleep.csv
Physiology.csv


In [6]:
## Generate a date column for Labels and Activity table
# gen_date_col adds the column to the frames stored in data_dict and returns them.
lbl_df = gen_date_col(data_dict['Labels'], tcol='timestamp')
act_df1 = gen_date_col(data_dict['Activity'], tcol='timestamp')

## Generate a hour column for Activity table
act_df1['hour'] = act_df1['timestamp'].dt.hour

## Count the activity records per location for every patient, date and hour
hourly_keys = ['patient_id', 'date', 'hour']
act_df = transform_category_to_counts(act_df1, col='location_name', keys=hourly_keys)

  tmp = df.groupby([col]+ keys).size().to_frame('size').reset_index().pivot_table(values = 'size', columns=col, index=keys)
  tmp = df.groupby([col]+ keys).size().to_frame('size').reset_index().pivot_table(values = 'size', columns=col, index=keys)


In [7]:
# Display the hourly per-location activity counts built in the previous cell.
act_df

location_name,patient_id,date,hour,Back Door,Bathroom,Bedroom,Fridge Door,Front Door,Hallway,Kitchen,Lounge
0,0697d,2019-06-28,13,11.0,5.0,10.0,21.0,13.0,9.0,10.0,14.0
1,0697d,2019-06-28,14,0.0,0.0,7.0,2.0,7.0,9.0,20.0,16.0
2,0697d,2019-06-28,15,0.0,0.0,0.0,0.0,0.0,3.0,5.0,6.0
3,0697d,2019-06-28,16,0.0,0.0,0.0,0.0,0.0,0.0,10.0,6.0
4,0697d,2019-06-28,17,1.0,0.0,1.0,0.0,7.0,9.0,12.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...
47526,fd100,2019-06-30,19,0.0,0.0,1.0,0.0,0.0,0.0,4.0,6.0
47527,fd100,2019-06-30,20,0.0,3.0,2.0,0.0,4.0,7.0,4.0,10.0
47528,fd100,2019-06-30,21,0.0,1.0,8.0,0.0,0.0,3.0,9.0,12.0
47529,fd100,2019-06-30,22,0.0,6.0,5.0,0.0,2.0,5.0,3.0,7.0


In [8]:
## Aggregate doors
# Front and back door counts are summed into one 'Door' column, which is
# appended after the existing columns; the two source columns are dropped.
door_cols = ['Front Door', 'Back Door']
act_df = (
    act_df
    .assign(Door=act_df['Front Door'] + act_df['Back Door'])
    .drop(columns=door_cols)
)
act_df.head()

location_name,patient_id,date,hour,Bathroom,Bedroom,Fridge Door,Hallway,Kitchen,Lounge,Door
0,0697d,2019-06-28,13,5.0,10.0,21.0,9.0,10.0,14.0,24.0
1,0697d,2019-06-28,14,0.0,7.0,2.0,9.0,20.0,16.0,7.0
2,0697d,2019-06-28,15,0.0,0.0,0.0,3.0,5.0,6.0,0.0
3,0697d,2019-06-28,16,0.0,0.0,0.0,0.0,10.0,6.0,0.0
4,0697d,2019-06-28,17,0.0,1.0,0.0,9.0,12.0,7.0,8.0


In [9]:
## generate features by statistics of hourly counts for each day, ##
## missing values will not be counted ##

keys = ['patient_id','date']

# The hour is only needed to define the hourly bins; drop it without
# modifying the frame in place.
act_df = act_df.drop(columns='hour')
act_cols = [c for c in act_df.columns if c not in keys]

# One daily statistic per aggregation function, each over all count columns.
agg_fns = {'sum':act_cols,'std':act_cols,'mean':act_cols,'max':act_cols}
methods = {}
for agf,cols in agg_fns.items():
    print(agf)
    # observed=False is stated explicitly: it is what the implicit default did
    # so far, and it avoids the pandas FutureWarning raised for categorical
    # grouping keys.
    methods[agf] = act_df.loc[:,cols+keys].groupby(keys, observed=False).agg(agf)

# Rename every column to '<location>_count_<statistic>' and put the four
# aggregates side by side; they share the same (patient_id, date) index.
act_df = pd.concat(
    [tdf.add_suffix('_count_'+mhd) for mhd,tdf in methods.items()],
    axis=1,
)
act_df.head()

sum
std
mean
max


  methods[agf] = act_df.loc[:,cols+keys].groupby(keys).agg(agf)
  methods[agf] = act_df.loc[:,cols+keys].groupby(keys).agg(agf)
  methods[agf] = act_df.loc[:,cols+keys].groupby(keys).agg(agf)
  methods[agf] = act_df.loc[:,cols+keys].groupby(keys).agg(agf)


Unnamed: 0_level_0,location_name,Bathroom_count_sum,Bedroom_count_sum,Fridge Door_count_sum,Hallway_count_sum,Kitchen_count_sum,Lounge_count_sum,Door_count_sum,Bathroom_count_std,Bedroom_count_std,Fridge Door_count_std,...,Kitchen_count_mean,Lounge_count_mean,Door_count_mean,Bathroom_count_max,Bedroom_count_max,Fridge Door_count_max,Hallway_count_max,Kitchen_count_max,Lounge_count_max,Door_count_max
patient_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0697d,2019-04-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
0697d,2019-04-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
0697d,2019-04-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
0697d,2019-04-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
0697d,2019-04-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,


In [10]:
# Inspect the Physiology table as loaded into data_dict.
data_dict['Physiology']

Unnamed: 0,patient_id,timestamp,device_type,value,unit
0,0697d,2019-06-28 13:42:09,Body Temperature,36.072,Cel
1,0697d,2019-06-28 13:42:44,Body Temperature,35.831,Cel
2,0697d,2019-06-28 13:43:40,Body Temperature,35.831,Cel
3,0697d,2019-06-28 13:45:15,Systolic blood pressure,165.000,mm[Hg]
4,0697d,2019-06-28 13:45:15,Diastolic blood pressure,82.000,mm[Hg]
...,...,...,...,...,...
17674,fd100,2019-06-28 21:09:27,Heart rate,61.000,beats/min
17675,fd100,2019-06-28 21:32:19,Body Temperature,36.879,Cel
17676,fd100,2019-06-28 21:32:46,Body Temperature,36.879,Cel
17677,fd100,2019-06-29 21:05:11,Body Temperature,36.240,Cel


In [11]:
# Add a calendar 'date' column derived from the 'timestamp' column.
# gen_date_col uses the .dt accessor, so 'timestamp' must already be a datetime.
phys_df = gen_date_col(data_dict['Physiology'], tcol='timestamp')

# Make sure the value is numeric; entries that cannot be parsed become NaN
phys_df['value'] = pd.to_numeric(phys_df['value'], errors='coerce')

# Daily aggregation: keep the maximum reading per patient, date and device type.
# observed=False is stated explicitly (it was the implicit default) to avoid
# the pandas FutureWarning raised for categorical grouping keys.
phys_df = (
    phys_df
    .groupby(['patient_id','date','device_type'], observed=False)['value']
    .max()
    .reset_index()
)

# Remove zero values
phys_df = phys_df.loc[phys_df['value'] != 0]

# Pivot to wide format: one row per (patient_id, date), one column per device type
phys_df = phys_df.pivot_table(
    values='value',
    columns='device_type',
    index=['patient_id','date'],
    observed=False,
)

phys_df.head()

  phys_df = phys_df.groupby(['patient_id','date','device_type'])['value'].max().reset_index()
  phys_df = phys_df.pivot_table(values='value', columns='device_type', index=['patient_id','date'])


Unnamed: 0_level_0,device_type,Body Temperature,Body weight,Diastolic blood pressure,Heart rate,O/E - muscle mass,Skin Temperature,Systolic blood pressure,Total body water
patient_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0697d,2019-06-28,36.599,86.3,82.0,42.0,64.5,,165.0,50.6
0697d,2019-06-29,37.257,86.1,80.0,62.0,64.6,,168.0,51.2
0697d,2019-06-30,36.779,,83.0,55.0,,,158.0,
099bc,2019-05-15,40.876,53.3,88.0,99.0,39.8,,141.0,54.0
099bc,2019-05-16,37.045,52.7,78.0,71.0,,,152.0,


In [12]:
## Set parameters for cross-validation

keys = ['patient_id', 'date']
label_type = 'Blood pressure'

window = pd.Timedelta(days=7)   # time window of test set
K = 5                           # 5-fold cross-validation
sd = 0                          # random seed

# Index the labels by (patient_id, date) so they can be matched against the features.
lbl_df = lbl_df.set_index(keys)

# Empty containers, not filled in this cell.
label_scores = {}
shap_values = {}
tX = []

In [13]:
## Merge Activity and Physiology tables

# Turn the (patient_id, date) index levels back into columns for the join.
act_df = act_df.reset_index()
phys_df = phys_df.reset_index()

# The outer join keeps days present in either table; missing entries become 0.
merge_keys = ['patient_id', 'date']
X = act_df.merge(phys_df, how='outer', on=merge_keys)
X = X.fillna(0)

# Keep only the rows whose numeric columns do not sum to zero.
num_cols = X.select_dtypes(include='number').columns
row_totals = X[num_cols].sum(axis=1)
X = X.loc[row_totals != 0]

# (patient_id, date) pairs that have both features and a label of the chosen type.
labelled_ids = set(lbl_df[lbl_df.type == label_type].index)
inter_ids = labelled_ids & set(zip(X.patient_id, X.date))

# Binary target: 1 for the pairs found above, 0 for every other row.
X['label'] = 0
is_labelled = X.set_index(['patient_id', 'date']).index.isin(inter_ids)
X.loc[is_labelled, 'label'] = 1

# Per-patient min-max scaling is not applied here; it is done on each split later.

# Dates of the positive rows in ascending order, and the latest of them.
dates = np.sort(X.loc[X.label == 1, 'date'])
end_date = dates[-1]

X.shape


(2796, 39)

In [14]:
# Display the merged feature table together with its 'label' column.
X

Unnamed: 0,patient_id,date,Bathroom_count_sum,Bedroom_count_sum,Fridge Door_count_sum,Hallway_count_sum,Kitchen_count_sum,Lounge_count_sum,Door_count_sum,Bathroom_count_std,...,Door_count_max,Body Temperature,Body weight,Diastolic blood pressure,Heart rate,O/E - muscle mass,Skin Temperature,Systolic blood pressure,Total body water,label
86,0697d,2019-06-28,7.0,24.0,23.0,40.0,106.0,80.0,42.0,1.636392,...,24.0,36.599,86.3,82.0,42.0,64.5,0.0,165.0,50.6,1
87,0697d,2019-06-29,11.0,26.0,8.0,57.0,120.0,117.0,25.0,1.649916,...,6.0,37.257,86.1,80.0,62.0,64.6,0.0,168.0,51.2,1
88,0697d,2019-06-30,14.0,53.0,0.0,57.0,119.0,103.0,12.0,2.250146,...,4.0,36.779,0.0,83.0,55.0,0.0,0.0,158.0,0.0,0
132,099bc,2019-05-15,17.0,31.0,27.0,32.0,71.0,6.0,25.0,1.567021,...,8.0,40.876,53.3,88.0,99.0,39.8,0.0,141.0,54.0,0
133,099bc,2019-05-16,42.0,85.0,22.0,50.0,104.0,9.0,20.0,2.693071,...,6.0,37.045,52.7,78.0,71.0,0.0,0.0,152.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,f220c,2019-06-30,12.0,61.0,24.0,32.0,0.0,0.0,18.0,1.011628,...,5.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5017,fd100,2019-06-27,24.0,48.0,23.0,47.0,100.0,97.0,27.0,1.601136,...,11.0,36.903,55.8,84.0,55.0,37.5,0.0,162.0,48.7,1
5018,fd100,2019-06-28,32.0,91.0,7.0,58.0,145.0,120.0,21.0,3.314968,...,10.0,37.137,0.0,74.0,61.0,0.0,0.0,138.0,0.0,0
5019,fd100,2019-06-29,33.0,56.0,27.0,61.0,110.0,96.0,15.0,2.704494,...,4.0,36.240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [15]:
# Function to split into train/val/test ('patient' or 'time' mode)

def _split_ids(ids, strata, first_share, random_state):
    """Split ``ids`` in two, the first part holding about ``first_share`` of them.

    Shares of 0 and 1 (and an empty ``ids``) are handled here because
    ``train_test_split`` rejects them.
    """
    if len(ids) == 0 or first_share <= 1e-9:
        return ids[:0], ids
    if first_share >= 1.0 - 1e-9:
        return ids, ids[:0]
    first, second = train_test_split(
        ids, train_size=first_share, random_state=random_state, stratify=strata
    )
    return first, second


def _concat_parts(parts, template):
    """Stack the non-empty frames in ``parts``; empty frame shaped like ``template`` if none."""
    non_empty = [part for part in parts if not part.empty]
    if not non_empty:
        # pd.concat raises on an empty list, so return an empty frame that
        # still carries the expected columns.
        return template.iloc[0:0]
    return pd.concat(non_empty, ignore_index=True)


def split_train_val_test(X, mode='patient', train_frac=0.7, val_frac=0.15, test_frac=0.15, random_state=0):

    """
    Splits DataFrame X into (train, val, test).

    Inputs:
      - X: DataFrame with columns 'patient_id', 'date' (datetime/date), and 'label' (0/1).
      - mode: 'patient' (disjoint patients, stratified by whether a patient has
        at least one positive label) or 'time' (per patient, the earliest rows
        go to train, the following ones to val and the latest ones to test).
      - train_frac, val_frac, test_frac: fractions (must sum to 1.0). A fraction
        of 0 yields an empty split.
      - random_state: seed passed to train_test_split in 'patient' mode.

    Returns: train_df, val_df, test_df (each with a fresh 0..n-1 index; X is not modified)

    Raises: AssertionError if the fractions do not sum to 1.0,
            ValueError if mode is neither 'patient' nor 'time'.
    """

    assert abs(train_frac + val_frac + test_frac - 1.0) < 1e-6, "Fractions must sum to 1.0"

    Xc = X.copy()

    if mode == 'patient':
        pids = np.array(Xc['patient_id'].unique())

        # stratify by whether the patient has at least one positive label
        # (observed=True: only patients actually present are needed, since the
        # result is re-indexed by the observed ids anyway)
        pid_has_label = (
            Xc.groupby('patient_id', observed=True)['label']
            .max()
            .reindex(pids)
            .fillna(0)
            .astype(int)
        )

        p_train, p_temp = _split_ids(pids, pid_has_label, train_frac, random_state)

        # Share of the remaining patients that goes to validation.
        rem = 1.0 - train_frac
        val_rel = val_frac / rem if rem > 0 else 0.0
        pid_has_label_temp = pid_has_label.reindex(p_temp).fillna(0).astype(int)
        p_val, p_test = _split_ids(p_temp, pid_has_label_temp, val_rel, random_state)

        train_df = Xc[Xc['patient_id'].isin(p_train)].reset_index(drop=True)
        val_df = Xc[Xc['patient_id'].isin(p_val)].reset_index(drop=True)
        test_df = Xc[Xc['patient_id'].isin(p_test)].reset_index(drop=True)
        return train_df, val_df, test_df

    elif mode == 'time':
        Xc = Xc.sort_values(['patient_id', 'date'])
        train_parts, val_parts, test_parts = [], [], []
        for _, g in Xc.groupby('patient_id', observed=True):
            n = len(g)
            if n == 0:
                continue
            n_test = int(np.round(n * test_frac))
            n_val = int(np.round(n * val_frac))

            # ensure at least 1 sample in val/test when possible
            if n_test == 0 and test_frac > 0 and n > 1:
                n_test = 1
            if n_val == 0 and val_frac > 0 and n - n_test > 1:
                n_val = 1

            # Whatever is left goes to train; never negative, so the slices
            # below stay within the patient's rows.
            n_train = max(0, n - n_val - n_test)

            train_parts.append(g.iloc[:n_train])
            val_parts.append(g.iloc[n_train:n_train+n_val])
            test_parts.append(g.iloc[n_train+n_val:])

        train_df = _concat_parts(train_parts, Xc)
        val_df = _concat_parts(val_parts, Xc)
        test_df = _concat_parts(test_parts, Xc)
        return train_df, val_df, test_df

    else:
        raise ValueError("mode must be 'patient' or 'time'")

In [16]:
# Patient-level split: every patient ends up in exactly one of the three sets.
train_df, val_df, test_df = split_train_val_test(
    X,
    mode='patient',
    train_frac=0.7,
    val_frac=0.15,
    test_frac=0.15,
    random_state=sd,
)

print(f"Unique patients (train/val/test): {train_df.patient_id.nunique()} / {val_df.patient_id.nunique()} / {test_df.patient_id.nunique()}")
print(f"Rows (train/val/test): {len(train_df)} / {len(val_df)} / {len(test_df)}")

# Class balance of each split.
named_splits = [('train', train_df), ('val', val_df), ('test', test_df)]
for name, df in named_splits:
    total = len(df)
    pos = int(df.label.sum()) if 'label' in df.columns else 0
    rate = df.label.mean() if total > 0 else 0.0
    print(f"{name}: positives={pos}, total={total}, pos_rate={rate:.4f}")

Unique patients (train/val/test): 39 / 8 / 9
Rows (train/val/test): 2005 / 336 / 455
train: positives=161, total=2005, pos_rate=0.0803
val: positives=20, total=336, pos_rate=0.0595
test: positives=70, total=455, pos_rate=0.1538


In [17]:
# Display the training split; per-patient normalisation has not been applied yet.
train_df

Unnamed: 0,patient_id,date,Bathroom_count_sum,Bedroom_count_sum,Fridge Door_count_sum,Hallway_count_sum,Kitchen_count_sum,Lounge_count_sum,Door_count_sum,Bathroom_count_std,...,Door_count_max,Body Temperature,Body weight,Diastolic blood pressure,Heart rate,O/E - muscle mass,Skin Temperature,Systolic blood pressure,Total body water,label
0,0697d,2019-06-28,7.0,24.0,23.0,40.0,106.0,80.0,42.0,1.636392,...,24.0,36.599,86.3,82.0,42.0,64.5,0.000,165.0,50.6,1
1,0697d,2019-06-29,11.0,26.0,8.0,57.0,120.0,117.0,25.0,1.649916,...,6.0,37.257,86.1,80.0,62.0,64.6,0.000,168.0,51.2,1
2,0697d,2019-06-30,14.0,53.0,0.0,57.0,119.0,103.0,12.0,2.250146,...,4.0,36.779,0.0,83.0,55.0,0.0,0.000,158.0,0.0,0
3,0d5ef,2019-05-13,24.0,32.0,36.0,114.0,81.0,138.0,53.0,2.091322,...,25.0,36.876,98.9,87.0,78.0,67.2,0.000,127.0,48.6,0
4,0d5ef,2019-05-14,10.0,34.0,47.0,100.0,86.0,110.0,38.0,0.825420,...,9.0,36.652,98.7,92.0,75.0,67.1,0.000,154.0,48.3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000,f220c,2019-06-26,18.0,84.0,18.0,9.0,0.0,0.0,8.0,1.314751,...,8.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0
2001,f220c,2019-06-27,13.0,28.0,14.0,9.0,0.0,0.0,8.0,1.505545,...,6.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0
2002,f220c,2019-06-28,11.0,34.0,18.0,24.0,0.0,0.0,17.0,1.195478,...,9.0,0.000,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0
2003,f220c,2019-06-29,21.0,74.0,26.0,46.0,0.0,0.0,20.0,1.629408,...,7.0,37.050,0.0,68.0,84.0,0.0,35.399,110.0,0.0,0


In [18]:
# normalize train, val, test with min-max per patient
# Each split is scaled on its own and the 'label' column is left as it is.

train_df, val_df, test_df = [
    min_max_perpatient(split_df, skip=['label'])
    for split_df in (train_df, val_df, test_df)
]

In [19]:
# Write the three splits to disk. The output folder is created first because
# to_csv does not create missing directories.
os.makedirs('../data/processed', exist_ok=True)
train_df.to_csv('../data/processed/train_data.csv', index=False)
val_df.to_csv('../data/processed/val_data.csv', index=False)
test_df.to_csv('../data/processed/test_data.csv', index=False)