In [1]:
import datetime
from datetime import timedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import collections
import scipy.stats
import time

In [2]:
# function: parses date string into DateTime Object
# input: date
# output: DateTime Object
def dateParser(date):
    mainFormat = '%Y-%m-%d %H:%M:%S.%f'
    altFormat = '%Y-%m-%d %H:%M:%S'
    try:
        return datetime.datetime.strptime(date, mainFormat)
    except ValueError:
        return datetime.datetime.strptime(date, altFormat)

# function: return a DataFrame from directory
# input: file directory dexcom
# output: DataFrame
def getGlucoseData(fileDir):
    df = pd.read_csv(fileDir)
    data = pd.DataFrame()
    data['Time'] = df['Timestamp (YYYY-MM-DDThh:mm:ss)'] 
    data['Glucose'] = pd.to_numeric(df['Glucose Value (mg/dL)'])
    data.drop(data.index[:12], inplace=True)
    data['Time'] = np.array([dateParser(dateStr) for dateStr in data['Time']])
    data['Day'] = np.array([date.day for date in data['Time']])
    data = data.reset_index()
    return data

# function: return a DataFrame from directory
# input: file directory hr
# output: DataFrame
def getHRData(fileDir):
    df = pd.read_csv(fileDir)
    data = pd.DataFrame()
    data['Time'] = df['datetime']
    data['Heart Rate'] = pd.to_numeric(df[' hr'])
    data.drop(data.index[:12], inplace=True)
    data['Time'] = np.array([dateParser(dateStr) for dateStr in data['Time']])
    data['Day'] = np.array([date.day for date in data['Time']])
    data = data.reset_index()
    return data

In [3]:
directory = "{0}/{1}"
dexcomFormat = "Dexcom_{0}.csv"
accFormat = "ACC_{0}.csv"
foodLogFormat = "Food_Log_{0}.csv"
ibiFormat = "IBI_{0}.csv"
bvpFormat = "BVP_{0}.csv"
edaFormat = "EDA_{0}.csv"
hrFormat = "HR_{0}.csv"
tempFormat = "TEMP_{0}.csv"

In [4]:
sns.set_theme()
dexcomFiles = [directory.format(str(i).zfill(3), dexcomFormat.format(str(i).zfill(3))) for i in range(1,17)]
hrFiles = [directory.format(str(i).zfill(3), hrFormat.format(str(i).zfill(3))) for i in range(1,17)]
bvpFiles = [directory.format(str(i).zfill(3), bvpFormat.format(str(i).zfill(3))) for i in range(1,17)]
edaFiles = [directory.format(str(i).zfill(3), edaFormat.format(str(i).zfill(3))) for i in range(1,17)]
tempFiles = [directory.format(str(i).zfill(3), tempFormat.format(str(i).zfill(3))) for i in range(1,17)]

# Demographics

In [5]:
# function: get biological sex
# input: Demographic.txt path
# output: DataFrame with ID and Gender
def biologicalSex(demographic):
    df = pd.read_csv(demographic, sep='\t')
    return df[['ID', 'Biological Sex']]

# function: get HbA1c
# input: Demographic.txt path
# output: DataFrame with ID and HbA1c
def HbA1c(demographic):
    df = pd.read_csv(demographic, sep='\t')
    return df[['ID', 'HbA1c']]


In [57]:
class summaryStats:
    def __init__(self, csv, ID, pp5, colName):
        self.csv = csv
        self.ID = ID
        self.pp5 = pp5
        self.colName = colName
    
    def mean(self):
        df = pd.read_csv(self.csv)
        time = []
        avg = []
        idx = []
        for i in range(round(len(df[self.colName])/self.pp5)):
            avg.append(np.mean(df[self.colName][i*self.pp5:self.pp5+i*self.pp5]))
            time.append(df['datetime'][i*self.pp5])
        d = {}
        d['ID'] = self.ID
        d['datetime'] = time
        d[f'{self.colName}_Mean'] = avg
        return pd.DataFrame(d)
    
    def std(self):
        df = pd.read_csv(self.csv)
        time = []
        std = []
        for i in range(round(len(df[self.colName])/self.pp5)):
            std.append(np.std(df[self.colName][i*self.pp5:self.pp5+i*self.pp5]))
            time.append(df['datetime'][i*self.pp5])
        d = {}
        d['ID'] = self.ID
        d['datetime'] = time
        d[f'{self.colName}_Std'] = std
        return pd.DataFrame(d)

    def minimum(self):
        df = pd.read_csv(self.csv)
        time = []
        minimum = []
        for i in range(round(len(df[self.colName])/self.pp5)):
            minimum.append(np.min(df[self.colName][i*self.pp5:self.pp5+i*self.pp5]))
            time.append(df['datetime'][i*self.pp5])
        d = {}
        d['ID'] = self.ID
        d['datetime'] = time
        d[f'{self.colName}_Min'] = minimum
        return pd.DataFrame(d)

    def maximum(self):
        df = pd.read_csv(self.csv)
        time = []
        maximum = []
        for i in range(round(len(df[self.colName])/self.pp5)):
            maximum.append(np.max(df[self.colName][i*self.pp5:self.pp5+i*self.pp5]))
            time.append(df['datetime'][i*self.pp5])
        d = {}
        d['ID'] = self.ID
        d['datetime'] = time
        d[f'{self.colName}_Max'] = maximum
        return pd.DataFrame(d)

    def Q1g(self):
        df = pd.read_csv(self.csv)
        time = []
        q1g = []
        for i in range(round(len(df[self.colName])/self.pp5)):
            q1g.append(np.percentile(df[self.colName][i*self.pp5:self.pp5+i*self.pp5], 25))
            time.append(df['datetime'][i*self.pp5])
        d = {}
        d['ID'] = self.ID
        d['datetime'] = time
        d[f'{self.colName}_Q1G'] = q1g
        return pd.DataFrame(d)

    def Q3g(self):
        df = pd.read_csv(self.csv)
        time = []
        q3g = []
        for i in range(round(len(df[self.colName])/self.pp5)):
            q3g.append(np.percentile(df[self.colName][i*self.pp5:self.pp5+i*self.pp5], 75))
            time.append(df['datetime'][i*self.pp5])
        d = {}
        d['ID'] = self.ID
        d['datetime'] = time
        d[f'{self.colName}_Q3G'] = q3g
        return pd.DataFrame(d)

    def skew(self):
        df = pd.read_csv(self.csv)
        time = []
        skew = []
        for i in range(round(len(df[self.colName])/self.pp5)):
            skew.append(scipy.stats.skew(df[self.colName][i*self.pp5:self.pp5+i*self.pp5]))
            time.append(df['datetime'][i*self.pp5])
        d = {}
        d['ID'] = self.ID
        d['datetime'] = time
        d[f'{self.colName}_Skew'] = skew
        return pd.DataFrame(d)
    
    def mergedDf(self):
        final = self.mean()
        final = final.merge(self.std(), how='outer', on=['ID', 'datetime'])
        final = final.merge(self.minimum(), how='outer', on=['ID', 'datetime'])
        final = final.merge(self.maximum(), how='outer', on=['ID', 'datetime'])
        final = final.merge(self.Q1g(), how='outer', on=['ID', 'datetime'])
        final = final.merge(self.Q3g(), how='outer', on=['ID', 'datetime'])
        final = final.merge(self.skew(), how='outer', on=['ID', 'datetime'])
        return final

In [63]:
eda = summaryStats(edaFiles[0], '001', 1200, ' eda')
eda_df = eda.mergedDf()
eda_df.head()

Unnamed: 0,ID,datetime,eda_Mean,eda_Std,eda_Min,eda_Max,eda_Q1G,eda_Q3G,eda_Skew
0,1,2020-02-13 15:28:50.000,0.067655,0.021658,0.0,0.135805,0.066621,0.080714,-1.461376
1,1,2020-02-13 15:33:50.000,0.118297,0.114228,0.071746,0.654681,0.084558,0.088401,3.642046
2,1,2020-02-13 15:38:50.000,0.344416,0.23137,0.031015,0.88556,0.140929,0.556939,0.390533
3,1,2020-02-13 15:43:50.000,0.113136,0.010795,0.081995,0.133242,0.105056,0.122993,0.3782
4,1,2020-02-13 15:48:50.000,0.102946,0.003486,0.088401,0.119149,0.099932,0.105056,-0.067487


In [64]:
hr = summaryStats(hrFiles[0], '001', 300, ' hr')
hr_df = hr.mergedDf()
hr_df.head()

Unnamed: 0,ID,datetime,hr_Mean,hr_Std,hr_Min,hr_Max,hr_Q1G,hr_Q3G,hr_Skew
0,1,2020-02-13 15:29:00.000000,92.654533,6.398803,78.02,101.33,87.8425,97.63,-0.54031
1,1,2020-02-13 15:34:00.000000,71.751633,8.754781,57.42,87.9,62.96,78.5075,0.072303
2,1,2020-02-13 15:39:00.000000,88.197733,14.614037,66.32,117.25,74.355,97.5425,0.36147
3,1,2020-02-13 15:44:00.000000,75.0138,6.761581,62.53,87.65,69.665,79.2775,0.302427
4,1,2020-02-13 15:49:00.000000,82.4493,10.307789,68.43,104.08,76.715,86.1375,0.763512


In [65]:
temp = summaryStats(tempFiles[0], '001', 300, ' temp')
temp_df = temp.mergedDf()
temp_df.head()

Unnamed: 0,ID,datetime,temp_Mean,temp_Std,temp_Min,temp_Max,temp_Q1G,temp_Q3G,temp_Skew
0,1,2020-02-13 15:28:50.000,29.4372,1.886221,24.89,30.95,28.51,30.83,-1.269811
1,1,2020-02-13 15:30:05.000,31.0068,0.051333,30.87,31.09,30.97,31.05,-0.70224
2,1,2020-02-13 15:31:20.000,31.062533,0.025461,30.99,31.11,31.05,31.07,-0.904557
3,1,2020-02-13 15:32:35.000,31.011867,0.014301,30.97,31.05,31.01,31.03,-0.138516
4,1,2020-02-13 15:33:50.000,30.990533,0.033302,30.93,31.05,30.95,31.01,-0.267539
