In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from glom import glom
from datetime import timedelta


## Read, Load, Extract, Save Physioloigcal Signal (Class)

In [2]:
class readloadsignal:
    
    '''
    This classreads and load physiological signals exported from fitbit. It extracts the heart rate (json file), estimated oxygen variation (csv file)
    and skin temperature (csv file) and saves as seperate folders into specified directory
    '''

    def __init__(self,filename,output_name):
        self.filename = filename
        self.output_name = output_name

    def extractvaluejson(self):

        '''
        This method reads and extracts the heartrate from the input jsonfile for heartrate from fitbit
        '''

        df = pd.read_json(self.filename)
        df_1 = df["value"].apply(lambda row:glom(row,"bpm"))
        df_2 = df["value"].apply(lambda row:glom(row, "confidence"))
        df_3 = df["dateTime"]
        df = pd.concat([df_3, df_2, df_1], axis=1, ignore_index=True)
        df.columns = ["datetime","heartrate","confidence"] #rename column
        df.set_index("datetime",inplace=True)
        df = df.loc["2022-07-10 17:24:00": "2022-07-10 18:08:00"]
        df.reset_index(inplace=True)
        return df

    def extractvaluecsv(self):

        '''
        This method reads and extracts the skintemp and estimated oxygen variation from the input csv files from fitbit
        '''

        column_dict = {"timestamp":"datetime", "recorded_time":"datetime", "dateTime":"datetime"}

        df = pd.read_csv(self.filename)
        df.rename(columns=column_dict, inplace=True)
        df["datetime"] = pd.to_datetime(df.datetime)
        df.set_index("datetime", inplace=True)
        df = df.loc["2022-07-10 17:24:00": "2022-07-10 18:08:00"]
        df.reset_index(inplace=True)
        
        return df

    def savesignal(self):

        '''
        This method saves each extracted physiological signal as a sepreate csv into specified directory
        '''

        try:
            df = self.extractvaluejson()
        except ValueError:
            df = self.extractvaluecsv()
        df.to_csv('Extracted data 2/{}.csv'.format(self.output_name))
        print("signal successfully saved")
  

    

## Instances -  Experiment 1

Heart Rate - Exp 1

In [3]:
df_hr = readloadsignal("MyFitbitData/OlumideOdetunde/Physical Activity/heart_rate-2022-07-10.json","df_hr")
df_hr.extractvaluejson()
df_hr.savesignal()

signal successfully saved


Skin Temperature Exp 1

In [4]:
df_skt = readloadsignal("MyFitbitData/OlumideOdetunde/Sleep/Wrist Temperature - 2022-07-11.csv","df_skt")
df_skt.extractvaluecsv()
df_skt.savesignal()

signal successfully saved


 Estimated O2 Variation Exp 1

In [5]:
df_eo = readloadsignal("MyFitbitData/OlumideOdetunde/Other/estimated_oxygen_variation-2022-07-10.csv","df_eo")
df_eo.extractvaluecsv()
df_eo.savesignal()

signal successfully saved


## Dataset Creation (Class)

In [6]:
class aligndatset:

    '''
    Argument: In positional order takes in extracted csvs of heart rate, estimated oxygen variation, skin temperature from the readloadsignal class.
            Followed by excel file of data label and intended name of combined dataset created 
    '''
    
    def __init__(self,filenamecsv1,filenamecsv2,filenamecsv3, filenameexcel,final_df):
        self.filenamecsv1 = filenamecsv1
        self.filenamecsv2 = filenamecsv2
        self.filenamecsv3 = filenamecsv3
        self.filenameexcel = filenameexcel
        self.final_df = final_df 

    def read_clean_signals (self):

        '''
        This method  reads the first three csv entered and saves created dataframes into a list
        '''

        csv_files = [self.filenamecsv1, self.filenamecsv2, self.filenamecsv3]
        dfs = []

        for csv in csv_files:
            column_droplist = ["Unnamed: 0"]
            df  = pd.read_csv(csv)
            df.drop(column_droplist,axis=1,inplace=True)
            df["datetime"] = pd.to_datetime(df.datetime)
            dfs.append(df)
        return dfs

    def merge_signals(self):

        '''
        This method merges the the three dataframes obatined from the readcleansignal method call on date time using closest key align technique
        '''

        column_renamelist = {"Infrared to Red Signal Ratio":"est_02_variation", "temperature":"skin_temp","confidence":"heart_rate"}
        column_droplist = ["heartrate"]
        df1, df2, df3 = self.read_clean_signals()
        
        # print (df1)
        # print (df2)
        # print (df3)

        df_total = pd.merge_asof(df1, df2, on="datetime",direction="backward",\
            tolerance=pd.Timedelta(seconds = 60),allow_exact_matches=True)
        df_total = pd.merge_asof(df_total,df3, on="datetime",direction="nearest",\
            tolerance=pd.Timedelta(seconds = 60),allow_exact_matches=True)
        df_total.rename(columns=column_renamelist, inplace=True)
        df_total.drop(column_droplist, inplace=True, axis=1)

        return df_total

    def load_datalabel(self):

        '''
        This method reads the data label excel file and divides into three dataframes using specified datetimes peculiar to this project
        '''
        
        #Load and extract data label
        df_dl = pd.read_excel(self.filenameexcel)
        df_dl.drop(["UserID","Soundgroup","Soundgroupselection"], axis=1, inplace=True)
        df_dl[["Starttime", "Endtime"]] = df_dl[["Starttime","Endtime"]].apply(pd.to_datetime)
        df_dl["datetime"] = df_dl["Starttime"] + timedelta(seconds = 6)
        
        #Create 3 dataframe with different time frames
        #Dataframe with Starttime
        df_dl_start = df_dl.drop(["datetime","Endtime"], axis=1)
        df_dl_start.rename(columns={"Starttime":"datetime"}, inplace=True)

        #Dataframe with starttime plus six seconds
        df_dl_after6sec = df_dl.drop(["Starttime","Endtime"], axis=1)

        #Dataframe with endtime
        df_dl_end = df_dl.drop(["datetime","Starttime"], axis=1)
        df_dl_end.rename(columns={"Endtime":"datetime"},inplace=True)

        return df_dl_start, df_dl_after6sec, df_dl_end

    def initial_dataset_merge (self):

        '''
        This method performs an inital merge using the three dataframes (created by the load_datalabel method call) and combined physiological
        signal dataframe (created by the merge_signal method call). Returns three unique dataframe of physiological signals merged to 
        the three data label dataframes
        '''

        df_dl_start, df_dl_after6sec, df_dl_end = self.load_datalabel()
        df_total = self. merge_signals()

        df_dataset_1 = pd.merge_asof(df_total, df_dl_start, on="datetime",direction="nearest", tolerance=pd.Timedelta(seconds=3),allow_exact_matches=True)

        df_dataset_2 = pd.merge_asof(df_total, df_dl_after6sec, on="datetime",direction="nearest", tolerance=pd.Timedelta(seconds=3),allow_exact_matches=True)
       
        df_dataset_3 = pd.merge_asof(df_total, df_dl_end, on="datetime",direction="nearest", tolerance=pd.Timedelta(seconds=3),allow_exact_matches=True)

        #join all three together
        df_dataset = pd.concat([df_dataset_1,df_dataset_2,df_dataset_3])

        return df_dataset


    def final_dataset_merge (self):

        '''
        This method uses the combines the  three dataframes returned by the initial_dataset_merge method call and groups by selected key columns
        and generates required statistical features.

        '''

        df_dataset = self.initial_dataset_merge()
        df_dataset.sort_values(by=["SoundID"], inplace=True)

        #Groupby soundid and datetime
        df_dataset = df_dataset.groupby(["SoundID","datetime","Emotion"]).mean()
        df_dataset.sort_values("datetime",inplace=True)
        df_dataset.reset_index(inplace=True)
        print(df_dataset)

        #derive statistical features
        try:
            df_dataset = df_dataset.groupby(["SoundID"]).agg({"datetime":max, "Emotionrating":"mean","Emotion":max, "heart_rate":['mean', 'std'],
                                    "skin_temp":['mean', "std"],"est_02_variation":["mean","std"]})
        except KeyError:
            df_dataset = df_dataset.groupby(["SoundID"]).agg({"datetime":max, "Emotionrating":"mean","Emotion":max, "heart_rate":['mean', 'std']})
        return df_dataset


    def save_final_dataset (self):

        '''
        This method saves the combined and final dataset created by the final_data_set_merge method call into a specified directory
        '''
        
        df = self.final_dataset_merge()
        df.to_csv('Extracted data 2/{}.csv'.format(self.final_df))
        print ("combined dataset successfully created")
        print(df.head())
        

## Instances

User 001 - First Experiment

In [7]:
df_combined= aligndatset("Extracted data 2/df_hr.csv", "Extracted data 2/df_skt.csv", "Extracted data 2/df_eo.csv", "Excel Database.xlsx", "df_combinedined_userolu_exp001" )
df_combined.read_clean_signals()
df_combined.merge_signals()
df_combined.load_datalabel()
df_combined.initial_dataset_merge()
df_combined.final_dataset_merge()
df_combined.save_final_dataset()

    SoundID            datetime  Emotion  heart_rate  Emotionrating
0    0085_2 2022-07-10 17:24:04  Sadness        87.0            3.0
1    0085_2 2022-07-10 17:24:09  Sadness        89.0            3.0
2    0085_2 2022-07-10 17:24:14  Sadness        88.0            3.0
3    0085_2 2022-07-10 17:24:19  Sadness        86.0            3.0
4    0166_2 2022-07-10 17:24:29  Sadness        91.0            2.0
..      ...                 ...      ...         ...            ...
431  0934_2 2022-07-10 18:06:36     Fear        72.0            4.0
432  1025_2 2022-07-10 18:06:36  Sadness        72.0            5.0
433  1025_2 2022-07-10 18:06:46  Sadness        70.0            5.0
434  1025_2 2022-07-10 18:06:51  Sadness        69.0            5.0
435  1382_2 2022-07-10 18:06:51  Sadness        69.0            2.0

[436 rows x 5 columns]
    SoundID            datetime  Emotion  heart_rate  Emotionrating
0    0085_2 2022-07-10 17:24:04  Sadness        87.0            3.0
1    0085_2 2022-07-10 1

In [8]:
df_combined= aligndatset("Extracted data 2/df_hr.csv", "Extracted data 2/df_skt.csv", "Extracted data 2/df_eo.csv", "Excel Database.xlsx", "df_combinedined_userolu_exp001" )
df_combined.read_clean_signals()
df_combined.merge_signals()
df_combined.load_datalabel()

(    SoundID            datetime  Emotion  Emotionrating
 0    0085_2 2022-10-07 16:24:06  Sadness              3
 1    0166_2 2022-10-07 16:24:24  Sadness              2
 2    0167_2 2022-10-07 16:24:40     Fear              5
 3    0173_2 2022-10-07 16:24:52  Sadness              1
 4    0217_2 2022-10-07 16:25:05     Fear              5
 ..      ...                 ...      ...            ...
 159  0818_2 2022-10-07 17:06:03  Sadness              3
 160  0889_2 2022-10-07 17:06:14     Fear              4
 161  0934_2 2022-10-07 17:06:27     Fear              4
 162  1025_2 2022-10-07 17:06:38  Sadness              5
 163  1382_2 2022-10-07 17:06:50  Sadness              2
 
 [164 rows x 4 columns],
     SoundID  Emotion  Emotionrating            datetime
 0    0085_2  Sadness              3 2022-10-07 16:24:12
 1    0166_2  Sadness              2 2022-10-07 16:24:30
 2    0167_2     Fear              5 2022-10-07 16:24:46
 3    0173_2  Sadness              1 2022-10-07 16:24:58
 4  

## Another Class - Nothing was wrong with the above class, Just keeping what i wrote during htis epxerimentation

In [19]:
class Datacreator:

    def __init__(self,filenamecsv,filenameexcel,final_df):
        self.filenamecsv = filenamecsv
        self.filenameexcel = filenameexcel
        self.final_df = final_df

    def read_signal(self):
        column_droplist = ["Unnamed: 0","heartrate"]
        column_renamelist = {'confidence':'heart_rate'}
        df  = pd.read_csv(self.filenamecsv)
        df.drop(column_droplist,axis=1,inplace=True)
        df.rename(columns=column_renamelist,inplace=True)
        df["datetime"] = pd.to_datetime(df.datetime)
        return df

    def load_datalabel(self):

        df_dl = pd.read_excel(self.filenameexcel)
        df_dl.drop(["UserID","Soundgroupselection"], axis=1, inplace=True)
        df_dl[["Starttime", "Endtime"]] = df_dl[["Starttime","Endtime"]].apply(pd.to_datetime)
        df_dl["datetime"] = df_dl["Starttime"] + timedelta(seconds = 6)
        
        #Create 3 dataframe with different time frames
        #Dataframe with Starttime
        df_dl_start = df_dl.drop(["datetime","Endtime"], axis=1)
        df_dl_start.rename(columns={"Starttime":"datetime"}, inplace=True)

        #Dataframe with starttime plus six seconds
        df_dl_after6sec = df_dl.drop(["Starttime","Endtime"], axis=1)

        #Dataframe with endtime
        df_dl_end = df_dl.drop(["datetime","Starttime"], axis=1)
        df_dl_end.rename(columns={"Endtime":"datetime"},inplace=True)

        return df_dl_start, df_dl_after6sec, df_dl_end

    def merge_initial(self):
        
        df_dl_start, df_dl_after6sec, df_dl_end = self.load_datalabel()
        df = self.read_signal()

        df_dl_start.sort_values(by='datetime', inplace=True)
        df.sort_values(by='datetime', inplace=True)

        # df_dataset_1 = pd.merge_asof(df, df_dl_start, on="datetime",direction="nearest", tolerance=pd.Timedelta(seconds=1),allow_exact_matches=False)

        # df_dataset_2 = pd.merge_asof(df, df_dl_after6sec, on="datetime",direction="nearest", tolerance=pd.Timedelta(seconds=1),allow_exact_matches=False)
    
        # df_dataset_3 = pd.merge_asof(df, df_dl_end, on="datetime",direction="nearest", tolerance=pd.Timedelta(seconds=1),allow_exact_matches=False)

        df_test = pd.merge(df_dl_start, df, on='datetime', how='outer')

        #join all three together
        # df_dataset = pd.concat([df_dataset_1,df_dataset_2,df_dataset_3])
        # df_test.to_csv('Extracted data 2/{}.csv'.format(self.final_df))

        return df_test
        # return df_dataset



In [20]:
df = Datacreator('Extracted data 2/df_hr.csv','Excel Database.xlsx', 'df_test')
df.read_signal()
df.load_datalabel()
df.merge_initial()

Unnamed: 0,SoundID,datetime,Emotion,Emotionrating,Soundgroup,heart_rate
0,0085_2,2022-10-07 17:24:06,Sadness,3.0,Transport,
1,0166_2,2022-10-07 17:24:24,Sadness,2.0,Transport,
2,0167_2,2022-10-07 17:24:40,Fear,5.0,Transport,
3,0173_2,2022-10-07 17:24:52,Sadness,1.0,Transport,
4,0217_2,2022-10-07 17:25:05,Fear,5.0,Transport,
...,...,...,...,...,...,...
550,,2022-07-10 18:07:31,,,,72.0
551,,2022-07-10 18:07:41,,,,74.0
552,,2022-07-10 18:07:46,,,,72.0
553,,2022-07-10 18:07:51,,,,74.0


### New experimentation

In [33]:
## Function to find closest date between the datetime of 2 dataframes

def find_closest_date(timepoint, time_series, add_time_delta_column=True):
     # takes a pd.Timestamp() instance and a pd.Series with dates in it
     # calcs the delta between `timepoint` and each date in `time_series`
     # returns the closest date and optionally the number of days in its time delta
     deltas = np.abs(time_series - timepoint)
     idx_closest_date = np.argmin(deltas)
     res = {"closest_date": time_series.iloc[idx_closest_date]}
     idx = ['closest_date']
     if add_time_delta_column:
         res["closest_delta"] = deltas[idx_closest_date]
         idx.append('closest_delta')
     return pd.Series(res, index=idx)

In [37]:
find_closest_date(df_key.key_date, df_hr.datetime,add_time_delta_column=True )
# print(x)

df_key[['closest', 'days_bt_x_and_y']] = df_key.key_date.apply(
                                           find_closest_date, args=[df_hr.datetime])

df_key2 = pd.merge(df_key, df_hr, left_on=['closest'], right_on=['datetime'])
# colorder = ['timepoint_x','closest','timepoint_y','days_bt_x_and_y','measure_x','measure_y']

#df_key.head()
df_key2.head()

Unnamed: 0,key_date,closest,days_bt_x_and_y,datetime,heartrate,confidence
0,2022-07-10 17:24:00,2022-07-10 17:24:04,0 days 00:00:04,2022-07-10 17:24:04,1,87
1,2022-07-10 17:24:05,2022-07-10 17:24:04,0 days 00:00:01,2022-07-10 17:24:04,1,87
2,2022-07-10 17:24:10,2022-07-10 17:24:09,0 days 00:00:01,2022-07-10 17:24:09,1,89
3,2022-07-10 17:24:15,2022-07-10 17:24:14,0 days 00:00:01,2022-07-10 17:24:14,1,88
4,2022-07-10 17:24:20,2022-07-10 17:24:19,0 days 00:00:01,2022-07-10 17:24:19,1,86


In [38]:
find_closest_date(df_key2.key_date, df_label.datetime,add_time_delta_column=True )
# print(x)

df_key2[['closest2', 'days_bt_x_and_y2']] = df_key2.key_date.apply(
                                           find_closest_date, args=[df_label.datetime])

df_key3 = pd.merge(df_key2, df_label, left_on=['closest2'], right_on=['datetime'])
# colorder = ['timepoint_x','closest','timepoint_y','days_bt_x_and_y','measure_x','measure_y']

#df_key.head()
df_key2.head()

Unnamed: 0,key_date,closest,days_bt_x_and_y,datetime,heartrate,confidence,closest2,days_bt_x_and_y2
0,2022-07-10 17:24:00,2022-07-10 17:24:04,0 days 00:00:04,2022-07-10 17:24:04,1,87,2022-10-07 17:24:06,89 days 00:00:06
1,2022-07-10 17:24:05,2022-07-10 17:24:04,0 days 00:00:01,2022-07-10 17:24:04,1,87,2022-10-07 17:24:06,89 days 00:00:01
2,2022-07-10 17:24:10,2022-07-10 17:24:09,0 days 00:00:01,2022-07-10 17:24:09,1,89,2022-10-07 17:24:06,88 days 23:59:56
3,2022-07-10 17:24:15,2022-07-10 17:24:14,0 days 00:00:01,2022-07-10 17:24:14,1,88,2022-10-07 17:24:06,88 days 23:59:51
4,2022-07-10 17:24:20,2022-07-10 17:24:19,0 days 00:00:01,2022-07-10 17:24:19,1,86,2022-10-07 17:24:06,88 days 23:59:46


In [39]:
#Create a dummy primary key datetime

date  =pd.date_range(start="2022-07-10 17:24:00", end="2022-07-10 18:08:00", freq='5S')
df_key = pd.DataFrame(data=date, columns=['key_date'])
df_key.key_date = pd.to_datetime(df_key.key_date)
df_key.head()



Unnamed: 0,key_date
0,2022-07-10 17:24:00
1,2022-07-10 17:24:05
2,2022-07-10 17:24:10
3,2022-07-10 17:24:15
4,2022-07-10 17:24:20


Unnamed: 0,SoundID,Starttime,Emotion,Emotionrating,Endtime,UserID,Soundgroup,Soundgroupselection,datetime
0,0085_2,2022-10-07 17:24:06,Sadness,3,2022-10-07 17:24:18,1,Transport,Transport Sound,2022-10-07 17:24:06
1,0166_2,2022-10-07 17:24:24,Sadness,2,2022-10-07 17:24:37,1,Transport,Transport Sound,2022-10-07 17:24:24
2,0167_2,2022-10-07 17:24:40,Fear,5,2022-10-07 17:24:50,1,Transport,Transport Sound,2022-10-07 17:24:40
3,0173_2,2022-10-07 17:24:52,Sadness,1,2022-10-07 17:25:03,1,Transport,Transport Sound,2022-10-07 17:24:52
4,0217_2,2022-10-07 17:25:05,Fear,5,2022-10-07 17:25:15,1,Transport,Transport Sound,2022-10-07 17:25:05


In [7]:
df_key.key_date = pd.to_datetime(df_key.key_date)
df_thetwo = pd.merge_asof(df_key,df_hr,left_on='key_date', right_on='datetime', tolerance=pd.Timedelta('2s'))
# df_thethree = pd.merge_asof(df_key,df_label, left_on='key_date',right_on='datetime', tolerance=pd.Timedelta('3s'), direction='forward', 
#                             allow_exact_matches=True)

df_thethree = pd.merge_ordered(df_key,df_label,fill_method='ffill',left_on='key_date', right_on='datetime')

df_thethree.head()

Unnamed: 0,key_date,datetime_x,heartrate,confidence,SoundID,Starttime,Emotion,Emotionrating,Endtime,UserID,Soundgroup,Soundgroupselection,datetime_y
0,2022-07-10 17:24:00,NaT,,,,NaT,,,NaT,,,,NaT
1,2022-07-10 17:24:05,NaT,,,,NaT,,,NaT,,,,NaT
2,2022-07-10 17:24:10,NaT,,,,NaT,,,NaT,,,,NaT
3,2022-07-10 17:24:15,NaT,,,,NaT,,,NaT,,,,NaT
4,2022-07-10 17:24:20,NaT,,,,NaT,,,NaT,,,,NaT
