In [62]:
import pandas as pd
import os
import numpy as np
import datetime
import matplotlib.pyplot as plt
import json

In [8]:
dataset_path='/Users/adrianapsay/dsc106/dsc106-final-project/data/Wearable_Dataset'#replace the folder path 

strees_level_v1_path='/Users/adrianapsay/dsc106/dsc106-final-project/data/Stress_Level_v1.csv'#replace the file path
strees_level_v2_path='/Users/adrianapsay/dsc106/dsc106-final-project/data/Stress_Level_v2.csv'#replace the file path

In [9]:
def moving_average(acc_data):
    # Initialization of variables
    avg = 0
    prevX, prevY, prevZ = 0, 0, 0
    results = []
    # Each second (32 samples) the acceleration data is summarized using the following method:
    for i in range(0, len(acc_data), 32):
        sum_ = 0
        buffX = acc_data[i:i+32, 0]
        buffY = acc_data[i:i+32, 1]
        buffZ = acc_data[i:i+32, 2]
        
        for j in range(len(buffX)):
            sum_ += max(
                abs(buffX[j] - prevX),
                abs(buffY[j] - prevY),
                abs(buffZ[j] - prevZ)
            )
            prevX, prevY, prevZ = buffX[j], buffY[j], buffZ[j]
        #The output is then filtered:
        avg = avg * 0.9 + (sum_ / 32) * 0.1 #
        results.append(avg)
    
    return results

In [10]:
def graph_multiple(signals,timeline,subject_signals,state):

    plt.figure(figsize=(25,15))

    keys = list(signals[subject_signals].keys())
    keys.remove("tags")

    i=1
    
    for key in keys:
        plt.subplot(len(keys),1,i)
        if i==1:
            plt.title(subject_signals + "  -  "+state)
        if key=='ACC':
            acc=moving_average(signals[subject_signals][key])
            plt.plot(acc,label=key)
        else:
            plt.plot(timeline[subject_signals][key],signals[subject_signals][key],label=key)
        
        for tag in signals[subject_signals]["tags"][1:]:
            plt.axvline(x=tag, color='r', linestyle='-')

        if state=='STRESS' and signals[subject_signals]["tags"]:
            if 'S' in subject_signals: #first version
                plt.axvspan(signals[subject_signals]["tags"][3], signals[subject_signals]["tags"][4], color='red', alpha=0.2) #stroop
                plt.axvspan(signals[subject_signals]["tags"][5], signals[subject_signals]["tags"][6], color='red', alpha=0.2)# tmct
                plt.axvspan(signals[subject_signals]["tags"][7], signals[subject_signals]["tags"][8], color='red', alpha=0.2)#real opinion
                plt.axvspan(signals[subject_signals]["tags"][9], signals[subject_signals]["tags"][10], color='red', alpha=0.2)#opposite opinion
                plt.axvspan(signals[subject_signals]["tags"][11], signals[subject_signals]["tags"][12], color='red', alpha=0.2)#subtract test

            else: #second version
                plt.axvspan(signals[subject_signals]["tags"][2], signals[subject_signals]["tags"][3], color='red', alpha=0.2) #tmct
                plt.axvspan(signals[subject_signals]["tags"][4], signals[subject_signals]["tags"][5], color='red', alpha=0.2)#real opinion
                plt.axvspan(signals[subject_signals]["tags"][6], signals[subject_signals]["tags"][7], color='red', alpha=0.2)#opposite opinion
                plt.axvspan(signals[subject_signals]["tags"][8], signals[subject_signals]["tags"][9], color='red', alpha=0.2)#subtract test
        
        plt.legend()
        plt.grid()
        i = i+1  
    plt.show()

In [11]:
# create a vector from the data frame (signal imported by pandas)
def create_df_array(dataframe):
    matrix_df=dataframe.values
    # returns 2-d matrix
    matrix = np.array(matrix_df)
    array_df = matrix.flatten()# Convert matrix into an array
    return array_df

# convert UTC arrays to arrays in seconds relative to 0 (record beginning)
def time_abs_(UTC_array):
    new_array=[]
    for utc in UTC_array:
        time=(datetime.datetime.strptime(utc,'%Y-%m-%d %H:%M:%S')-datetime.datetime.strptime(UTC_array[0], '%Y-%m-%d %H:%M:%S')).total_seconds()
        new_array.append(int(time))
    return new_array

In [12]:
def read_signals(main_folder):
    signal_dict = {}
    time_dict = {}
    fs_dict = {}

    # Get a list of subfolders in the main folder
    subfolders = next(os.walk(main_folder))[1]

    utc_start_dict={}
    for folder_name in subfolders:
            csv_path = f'{main_folder}/{folder_name}/EDA.csv'
            df=pd.read_csv(csv_path)
            utc_start_dict[folder_name]= df.columns.tolist()

    # Iterate over the subfolders
    for folder_name in subfolders:
        folder_path = os.path.join(main_folder, folder_name)
        # Get a list of files in the subfolder
        files = os.listdir(folder_path)

        # Initialize a dictionary for the signals in the current subfolder
        signals = {}
        time_line = {}
        fs_signal= {}
        
        # Define the list of desired file names
        desired_files = ['EDA.csv', 'BVP.csv', 'HR.csv', 'TEMP.csv','tags.csv','ACC.csv']
   
        # Iterate over the files in the subfolder
        for file_name in files:
            file_path = os.path.join(folder_path, file_name)

            # Check if it's a CSV file and if it is in the desired files list
            if file_name.endswith('.csv') and file_name in desired_files:
                # Read the CSV file and store the signal data

                if file_name == 'tags.csv':
                    try:
                        df = pd.read_csv(file_path,header=None)
                        tags_vector = create_df_array(df)
                        tags_UTC_vector =np.insert(tags_vector,0,utc_start_dict[folder_name])
                        signal_array=time_abs_(tags_UTC_vector)
                    except pd.errors.EmptyDataError:
                        signal_array=[]
                
                else:
                    df = pd.read_csv(file_path)
                    fs= df.loc[0]
                    fs=int(fs[0])# Get sampling frequency
                    df.drop([0],axis = 0,inplace=True) 
                    signal_array = df.values
                    time_array = np.linspace(0, len(signal_array)/fs,len(signal_array))

                signal_name = file_name.split('.')[0]
                signals[signal_name] = signal_array
                time_line[signal_name] = time_array
                fs_signal[signal_name] = fs

        # Store the signals of the current subfolder in the main dictionary
        signal_dict[folder_name] = signals
        time_dict[folder_name] = time_line
        fs_dict[folder_name] = fs_signal

    return signal_dict, time_dict, fs_dict


In [None]:
states=os.listdir(dataset_path) #['AEROBIC', 'ANAEROBIC', 'STRESS']

signal_data={}
time_data={}
fs_dict={}
participants={}

for state in states:
    folder_path = f'{dataset_path}/{state}' 
    participants[state]=os.listdir(folder_path)
    signal_data[state], time_data[state], fs_dict[state] = read_signals(folder_path) # Returns three dictionaries with subjects info: raw signals (signal_data), temporal data ready to graph (time_data) and sample frequency for escha signal(fs_dict).

In [22]:
signal_data.keys()

dict_keys(['ANAEROBIC', 'AEROBIC', 'STRESS'])

In [24]:
signal_data['ANAEROBIC'].keys()

dict_keys(['S05', 'S02', 'f11', 'f10', 'S03', 'S04', 'f03', 'f04', 'S17', 'S10', 'S11', 'f05', 'f02', 'S18', 'S01', 'S06', 'S08', 'f12', 'S09', 'f13', 'S07', 'f07', 'S13', 'f09', 'S14', 'S15', 'S12', 'f08', 'f01', 'f06'])

In [26]:
signal_data['ANAEROBIC']['S05'].keys()

dict_keys(['TEMP', 'tags', 'HR', 'ACC', 'EDA', 'BVP'])

In [38]:
signal_data['ANAEROBIC']['S05']['TEMP']

array([[208.95],
       [208.95],
       [208.95],
       ...,
       [ 35.41],
       [ 35.41],
       [ 35.41]])

In [29]:
time_data.keys()

dict_keys(['ANAEROBIC', 'AEROBIC', 'STRESS'])

In [30]:
time_data['ANAEROBIC'].keys()

dict_keys(['S05', 'S02', 'f11', 'f10', 'S03', 'S04', 'f03', 'f04', 'S17', 'S10', 'S11', 'f05', 'f02', 'S18', 'S01', 'S06', 'S08', 'f12', 'S09', 'f13', 'S07', 'f07', 'S13', 'f09', 'S14', 'S15', 'S12', 'f08', 'f01', 'f06'])

In [32]:
time_data['ANAEROBIC']['S05'].keys()

dict_keys(['TEMP', 'tags', 'HR', 'ACC', 'EDA', 'BVP'])

In [39]:
time_data['ANAEROBIC']['S05']['TEMP']

array([0.00000000e+00, 2.50055127e-01, 5.00110254e-01, ...,
       1.13349989e+03, 1.13374994e+03, 1.13400000e+03])

In [34]:
fs_dict.keys()

dict_keys(['ANAEROBIC', 'AEROBIC', 'STRESS'])

In [35]:
fs_dict['ANAEROBIC'].keys()

dict_keys(['S05', 'S02', 'f11', 'f10', 'S03', 'S04', 'f03', 'f04', 'S17', 'S10', 'S11', 'f05', 'f02', 'S18', 'S01', 'S06', 'S08', 'f12', 'S09', 'f13', 'S07', 'f07', 'S13', 'f09', 'S14', 'S15', 'S12', 'f08', 'f01', 'f06'])

In [37]:
fs_dict['ANAEROBIC']['S05'].keys()

dict_keys(['TEMP', 'tags', 'HR', 'ACC', 'EDA', 'BVP'])

In [42]:
fs_dict['ANAEROBIC']['S05']["TEMP"]

4

In [None]:
# RUN WITH CAUTION AS THIS IS TOO BIG LOL
export_data = {}

for state in signal_data.keys():
    export_data[state] = {}

    for subject in signal_data[state].keys():  # Iterate through subjects
        export_data[state][subject] = {}

        for signal_type in signal_data[state][subject].keys():
            signal_values = signal_data[state][subject][signal_type]
            timestamps = time_data[state][subject][signal_type]
            sampling_rate = fs_dict[state][subject][signal_type]

            if isinstance(signal_values, np.ndarray):  
                signal_values = signal_values.flatten().tolist()
            elif isinstance(signal_values, list):  
                signal_values = [float(x) for x in signal_values]

            if isinstance(timestamps, np.ndarray):  
                timestamps = timestamps.tolist()
            elif isinstance(timestamps, list):  
                timestamps = [float(x) for x in timestamps]

            export_data[state][subject][signal_type] = {
                "values": signal_values,
                "timestamps": timestamps,
                "sampling_rate": int(sampling_rate)
            }

# # Save as JSON file
# with open("data/wearable_data.json", "w") as f:
#     json.dump(export_data, f)

# print("Data exported successfully to wearable_data.json!")
# export_data

3

In [56]:
with open("data/wearable_data.json", "w") as json_file:
    json.dump(export_data, json_file, indent=4)

In [None]:
export_data = {}

MAX_SUBJECTS = 10
DOWNSAMPLE_FACTOR = 100

for state in signal_data.keys():
    export_data[state] = {}

    # limit subjects
    subjects = list(signal_data[state].keys())[:MAX_SUBJECTS]

    for subject in subjects:
        export_data[state][subject] = {}

        for signal_type in signal_data[state][subject].keys():
            signal_values = signal_data[state][subject][signal_type]
            timestamps = time_data[state][subject][signal_type]
            sampling_rate = fs_dict[state][subject][signal_type]

            if isinstance(signal_values, np.ndarray):  
                signal_values = signal_values.flatten().tolist()
            if isinstance(timestamps, np.ndarray):  
                timestamps = timestamps.tolist()

            # downsample
            if len(signal_values) > 10000:
                signal_values = signal_values[::DOWNSAMPLE_FACTOR]
                timestamps = timestamps[::DOWNSAMPLE_FACTOR]

            # store data
            export_data[state][subject][signal_type] = {
                "values": signal_values,
                "timestamps": timestamps,
                "sampling_rate": int(sampling_rate)
            }

with open("data/reduced_wearable_data.json", "w") as f:
    json.dump(export_data, f)

print("Data exported successfully (MASSIVE REDUCTION)")

Data exported successfully (MASSIVE REDUCTION)
