## Data Pipeline


This notebook outlines the data processing pipeline developed for the Master’s thesis "Urban stress and Electrodermal Activity: Exploring the Role of Gender and Exposure in different Urban Typologies". It covers the pre-processing of raw Electrodermal Activity (EDA) data and the extraction of relevant features, setting the foundation for further analysis and insights into the dataset.

In [2]:
# Imports

import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import numpy as np
import neurokit2 as nk

from io import StringIO
from scipy.signal import find_peaks

import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
import statsmodels.formula.api as smf


### 1. Data Integration

The first step in this pipeline is to integrate the EDA data into a unified format. Since the EDA signals were recorded from different positions, each participant in the dataset has multiple files that must be merged into a single document for streamlined analysis.


The resulting dataframe will have one row per participant, with columns capturing the corresponding file content, timestamps, and EDA signals for each recorded position. The first column contains participant IDs, which are derived from the folder names where the data was sourced.

To improve performance, the final dataframe is saved as a pickle file, minimizing future loading times.

### 1.1 Load all EDA files including walking files

In [4]:
def read_eda_file(file_path):
    """Load the timestamp and EDA columns of one tab-separated recording.

    The file is parsed without a header row and only its first two columns
    are read.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    tuple of (list, list)
        The values of the first column (timestamps) followed by the values
        of the second column (EDA values), both in file order.
    """
    recording = pd.read_csv(file_path, sep='\t', header=None, usecols=[0, 1])
    # Pull out the two columns by position and hand them back as plain lists
    first_column, second_column = (
        recording.iloc[:, position].tolist() for position in (0, 1)
    )
    return first_column, second_column


def process_participant_files(participant_folder):
    """Read every EDA recording found in one participant's folder.

    A file is considered when its name starts with 'leda' and ends with
    '.txt'. The baseline recording may be stored as 'leda_baseline.txt' or
    'leda_base.txt'; either way it is returned under the key
    'leda_baseline.txt'.

    Parameters
    ----------
    participant_folder : str
        Path of the folder holding the participant's files.

    Returns
    -------
    dict
        Maps each file name to a dict with the keys 'timestamps' and
        'eda_values'. The dict is empty when no matching file exists.
    """
    baseline_names = ('leda_baseline.txt', 'leda_base.txt')
    recordings = {}
    baseline_entry = None

    for entry in os.listdir(participant_folder):
        if not (entry.startswith('leda') and entry.endswith('.txt')):
            continue

        times, values = read_eda_file(os.path.join(participant_folder, entry))
        if entry in baseline_names:
            # Remember the baseline; if both spellings exist, the one listed last wins
            baseline_entry = (times, values)
        else:
            recordings[entry] = {'timestamps': times, 'eda_values': values}

    # The baseline is added last, always under the unified name
    if baseline_entry is not None:
        baseline_times, baseline_values = baseline_entry
        recordings['leda_baseline.txt'] = {
            'timestamps': baseline_times,
            'eda_values': baseline_values
        }

    return recordings

# Define the order of the files
# (baseline first, then the files 1a ... 5i, then the walking files w1 ... w4);
# this list also fixes the column order of the resulting dataframe
file_order = [
    "leda_baseline.txt", "leda_1a.txt", "leda_1i.txt", "leda_2a.txt", "leda_2i.txt",
    "leda_3a.txt", "leda_3i.txt", "leda_4a.txt", "leda_4i.txt", "leda_5a.txt",
    "leda_5i.txt", "leda_w1.txt", "leda_w2.txt", "leda_w3.txt", "leda_w4.txt"
]

# Set the directory containing the participants folders
# NOTE(review): absolute, machine-specific path - this cell only runs where the folder exists
root_directory = '/Users/luciemarme/Desktop/master-thesis/data/phdproject_data'

# Initialize a list to store each participants EDA data as a dictionary
all_participant_data = []

# Iterate through each folder and process all relevant files
# Every sub-directory of root_directory is treated as a participant, whatever its name;
# a folder without any matching EDA file contributes a row that only holds 'Participant'
for participant_folder in os.listdir(root_directory):
    participant_folder_path = os.path.join(root_directory, participant_folder)
    if os.path.isdir(participant_folder_path):
        participant_data = process_participant_files(participant_folder_path)
        participant_data['Participant'] = participant_folder  # Add participants ID to the data
        all_participant_data.append(participant_data)

# Convert the list of dictionaries into a dataframe
# (files that are missing for a participant become NaN in that participant's row)
eda_df = pd.DataFrame(all_participant_data)

# Reorder columns to ensure 'Participant' is first and then follow "file_order"
# Selecting by label raises a KeyError if a file in file_order was found for no participant at all
eda_df = eda_df[['Participant'] + file_order]


In [5]:
# Save as Pickle
eda_df.to_pickle('eda_data.pkl')

In [1]:
# Load from Pickle
eda_df = pd.read_pickle('eda_data.pkl')

In [None]:
# Sanity Check: Display the first rows of the final dataframe
# (only the head is shown; every cell holds a full recording, so the whole frame is large)
eda_df.head()

In [14]:
# Check for NaN values in the DataFrame and print the rows that contain any NaN values
nan_rows = eda_df[eda_df.isna().any(axis=1)]
nan_rows

Unnamed: 0,Participant,leda_baseline.txt,leda_1a.txt,leda_1i.txt,leda_2a.txt,leda_2i.txt,leda_3a.txt,leda_3i.txt,leda_4a.txt,leda_4i.txt,leda_5a.txt,leda_5i.txt,leda_w1.txt,leda_w2.txt,leda_w3.txt,leda_w4.txt
5,pb18 - WJG24,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099986, 0.0019997, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...",,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
8,pb21 - AGO11,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...",,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
16,pb16 - KKN25,"{'timestamps': [0.0, 0.0010001, 0.0020001, 0.0...","{'timestamps': [0.0, 0.00099953, 0.0019991, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099998, 0.0019999, 0....","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...",,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
27,test:back-up,,,,,,,,,,,,,,,
31,pb13 - AWD17,,"{'timestamps': [0.48488, 0.48588, 0.48688, 0.4...","{'timestamps': [0.0, 0.0009999, 0.0019998, 0.0...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.34701, 0.34801, 0.34901, 0.3...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.42984, 0.43084, 0.43184, 0.4...","{'timestamps': [0.21936, 0.22036, 0.22136, 0.2...","{'timestamps': [0.36814, 0.36914, 0.37014, 0.3...","{'timestamps': [0.241, 0.242, 0.243, 0.244, 0....","{'timestamps': [0.55182, 0.55282, 0.55382, 0.5...",,,,
32,pb17 - AEW02,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...",,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
34,unvollständige Daten,,,,,,,,,,,,,,,
36,pb23 - GHR03,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.0010005, 0.0020009, 0.0...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.00099998, 0.002, 0.0029...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...",,"{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,..."


### 1.2 Load only EDA files at specific typologies and positions

In [10]:
def read_eda_file(file_path):
    """Return the timestamps and EDA values stored in a tab-separated file.

    Only the first two columns are loaded and the file is read without a
    header row.

    Parameters
    ----------
    file_path : str
        Location of the text file.

    Returns
    -------
    tuple of (list, list)
        First-column values (timestamps) and second-column values (EDA
        values), each as a plain Python list in file order.
    """
    table = pd.read_csv(file_path, sep='\t', header=None, usecols=[0, 1])
    # Convert both columns, addressed by position, into plain lists
    time_column, eda_column = (
        table.iloc[:, column_position].tolist() for column_position in (0, 1)
    )
    return time_column, eda_column


def process_participant_files(participant_folder):
    """Read the non-walking EDA recordings found in one participant's folder.

    A file is considered when its name starts with 'leda' and ends with
    '.txt'; files whose name contains 'leda_w' (the walking recordings) are
    skipped without being read. The baseline recording may be stored as
    'leda_baseline.txt' or 'leda_base.txt'; either way it is returned under
    the key 'leda_baseline.txt'.

    Parameters
    ----------
    participant_folder : str
        Path of the folder holding the participant's files.

    Returns
    -------
    dict
        Maps each file name to a dict with the keys 'timestamps' and
        'eda_values'. The dict is empty when no matching file exists.
    """
    baseline_names = ('leda_baseline.txt', 'leda_base.txt')
    recordings = {}
    baseline_entry = None

    for entry in os.listdir(participant_folder):
        is_eda_file = entry.startswith('leda') and entry.endswith('.txt')
        # Walking recordings are left out: any EDA file whose name contains 'leda_w'
        if not is_eda_file or 'leda_w' in entry:
            continue

        times, values = read_eda_file(os.path.join(participant_folder, entry))
        if entry in baseline_names:
            # Remember the baseline; if both spellings exist, the one listed last wins
            baseline_entry = (times, values)
        else:
            recordings[entry] = {'timestamps': times, 'eda_values': values}

    # The baseline is added last, always under the unified name
    if baseline_entry is not None:
        baseline_times, baseline_values = baseline_entry
        recordings['leda_baseline.txt'] = {
            'timestamps': baseline_times,
            'eda_values': baseline_values
        }

    return recordings


# Define the order of the files, excluding the walking files
# (baseline first, then the files 1a ... 5i); this list also fixes the column order below
file_order = [
    "leda_baseline.txt", "leda_1a.txt", "leda_1i.txt", "leda_2a.txt", "leda_2i.txt",
    "leda_3a.txt", "leda_3i.txt", "leda_4a.txt", "leda_4i.txt", "leda_5a.txt", "leda_5i.txt"
]


# Set the directory containing the participants folders
# NOTE(review): absolute, machine-specific path, and a different folder than the one used
# in section 1.1 ('participants_data' vs. 'phdproject_data') - confirm this is intended
root_directory = '/Users/luciemarme/Desktop/master-thesis/data/participants_data'

# Initialize a list to store each participants EDA data as a dictionary
all_participant_data = []

# Iterate through each folder and process all relevant files
# Every sub-directory of root_directory is treated as a participant, whatever its name
for participant_folder in os.listdir(root_directory):
    participant_folder_path = os.path.join(root_directory, participant_folder)
    if os.path.isdir(participant_folder_path):
        participant_data = process_participant_files(participant_folder_path)
        participant_data['Participant'] = participant_folder  # Add participants ID to the data
        all_participant_data.append(participant_data)

# Convert the list of dictionaries into a dataframe
# (files that are missing for a participant become NaN in that participant's row)
eda_df_without_walking_files = pd.DataFrame(all_participant_data)

# Reorder columns to ensure 'Participant' is first and then follow "file_order"
# Selecting by label raises a KeyError if a file in file_order was found for no participant at all
eda_df_without_walking_files = eda_df_without_walking_files[['Participant'] + file_order]

#### Check rows with NaN values

In [12]:
# Check for NaN values in the DataFrame and print the rows that contain any NaN values
nan_rows = eda_df_without_walking_files[eda_df_without_walking_files.isna().any(axis=1)]

nan_rows

Unnamed: 0,Participant,leda_baseline.txt,leda_1a.txt,leda_1i.txt,leda_2a.txt,leda_2i.txt,leda_3a.txt,leda_3i.txt,leda_4a.txt,leda_4i.txt,leda_5a.txt,leda_5i.txt
30,pb13 - AWD17,,"{'timestamps': [0.48488, 0.48588, 0.48688, 0.4...","{'timestamps': [0.0, 0.0009999, 0.0019998, 0.0...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.34701, 0.34801, 0.34901, 0.3...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.42984, 0.43084, 0.43184, 0.4...","{'timestamps': [0.21936, 0.22036, 0.22136, 0.2...","{'timestamps': [0.36814, 0.36914, 0.37014, 0.3...","{'timestamps': [0.241, 0.242, 0.243, 0.244, 0....","{'timestamps': [0.55182, 0.55282, 0.55382, 0.5..."


In [13]:
# Save as Pickle
eda_df_without_walking_files.to_pickle('eda_df_without_walking_files.pkl')

In [3]:
# Load from Pickle
eda_df_without_walking_files = pd.read_pickle('eda_df_without_walking_files.pkl')

In [4]:
eda_df_without_walking_files.head(10)

Unnamed: 0,Participant,leda_baseline.txt,leda_1a.txt,leda_1i.txt,leda_2a.txt,leda_2i.txt,leda_3a.txt,leda_3i.txt,leda_4a.txt,leda_4i.txt,leda_5a.txt,leda_5i.txt
0,pb26 - MMO02,"{'timestamps': [0.0, 0.0010001, 0.0020001, 0.0...","{'timestamps': [0.0, 0.00099998, 0.002, 0.0029...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ..."
1,pb30 - CAN61,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099998, 0.002, 0.003,..."
2,pb35 - BRE26,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099995, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
3,pb37 - CCN18,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
4,pb27 - STG27,"{'timestamps': [0.0, 0.0010001, 0.0020002, 0.0...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.0010001, 0.0020001, 0.0...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
5,pb18 - WJG24,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099986, 0.0019997, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
6,pb40 - IMN29,"{'timestamps': [0.0, 0.00099995, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099988, 0.0019998, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099985, 0.0019997, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
7,pb24 - AGT14,"{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099987, 0.0019997, 0....","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,..."
8,pb21 - AGO11,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
9,pb31 - MJN12,"{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099998, 0.002, 0.0029...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."


### 2. Standardize length of signal

To standardize the analysis, the duration of data recordings from each position is trimmed to 180 seconds. Any EDA values beyond this time frame are removed, ensuring uniformity across all recordings and creating a cleaner, more consistent dataset for analysis.

#### 2.1 EDA files including walking files

In [8]:
# Define the order of the files (columns in the DataFrame)
file_order = [
    "leda_baseline.txt", "leda_1a.txt", "leda_1i.txt", "leda_2a.txt", "leda_2i.txt",
    "leda_3a.txt", "leda_3i.txt", "leda_4a.txt", "leda_4i.txt", "leda_5a.txt",
    "leda_5i.txt", "leda_w1.txt", "leda_w2.txt", "leda_w3.txt", "leda_w4.txt"
]


def filter_participant_data(participant_data, max_time=180, files=None):
    """Trim each recording of one participant to timestamps <= ``max_time``.

    The input is left untouched: the recordings are rebuilt as new dicts on a
    copy of ``participant_data``. Mutating the stored dicts in place would also
    trim the dataframe the row was taken from, because the row and the
    dataframe share the same dict objects.

    Parameters
    ----------
    participant_data : pandas.Series or dict
        One participant's row; each entry named in ``files`` is either a dict
        with the keys 'timestamps' and 'eda_values' or a missing value (NaN).
    max_time : float, default 180
        Samples whose timestamp is greater than this value are dropped.
    files : iterable of str, optional
        Names of the entries to trim. Defaults to the module-level
        ``file_order``.

    Returns
    -------
    pandas.Series or dict
        A copy of ``participant_data`` in which every recording only holds the
        samples with ``timestamp <= max_time``. Missing recordings are kept
        as they are.

    Raises
    ------
    KeyError
        If an entry named in ``files`` does not exist in ``participant_data``.
    """
    if files is None:
        files = file_order

    trimmed_row = participant_data.copy()
    for file in files:
        recording = participant_data[file]
        # Missing recordings are stored as NaN instead of a dict; nothing to trim
        if not isinstance(recording, dict):
            continue

        # Single pass over the paired samples, keeping those inside the time window
        kept_samples = [
            (timestamp, eda_value)
            for timestamp, eda_value in zip(recording['timestamps'], recording['eda_values'])
            if timestamp <= max_time
        ]
        trimmed_row[file] = {
            **recording,
            'timestamps': [timestamp for timestamp, _ in kept_samples],
            'eda_values': [eda_value for _, eda_value in kept_samples],
        }

    return trimmed_row

# Trim the recordings of every participant (one dataframe row per participant)
eda_df_filtered = eda_df.apply(filter_participant_data, axis=1)


In [9]:
# Save the filtered DataFrame to a pickle file
eda_df_filtered.to_pickle('filtered_eda_data.pkl')

In [10]:
# Sanity Check: Print first rows to display the filtered dataframe
eda_df_filtered.head()

Unnamed: 0,Participant,leda_baseline.txt,leda_1a.txt,leda_1i.txt,leda_2a.txt,leda_2i.txt,leda_3a.txt,leda_3i.txt,leda_4a.txt,leda_4i.txt,leda_5a.txt,leda_5i.txt,leda_w1.txt,leda_w2.txt,leda_w3.txt,leda_w4.txt
0,pb26 - MMO02,"{'timestamps': [0.0, 0.0010001, 0.0020001, 0.0...","{'timestamps': [0.0, 0.00099998, 0.002, 0.0029...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
1,pb30 - CAN61,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099998, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
2,pb35 - BRE26,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099995, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0...."
3,pb37 - CCN18,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
4,pb27 - STG27,"{'timestamps': [0.0, 0.0010001, 0.0020002, 0.0...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.0010001, 0.0020001, 0.0...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099998, 0.002, 0.0029...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."


#### 2.2 EDA files without walking files

In [15]:
# Define the order of the files (columns in the DataFrame)
file_order = [
    "leda_baseline.txt", "leda_1a.txt", "leda_1i.txt", "leda_2a.txt", "leda_2i.txt",
    "leda_3a.txt", "leda_3i.txt", "leda_4a.txt", "leda_4i.txt", "leda_5a.txt",
    "leda_5i.txt"]


def filter_participant_data(participant_data, max_time=180, files=None):
    """Trim each recording of one participant to timestamps <= ``max_time``.

    The input is left untouched: the recordings are rebuilt as new dicts on a
    copy of ``participant_data``. Mutating the stored dicts in place would also
    trim the dataframe the row was taken from, because the row and the
    dataframe share the same dict objects.

    Parameters
    ----------
    participant_data : pandas.Series or dict
        One participant's row; each entry named in ``files`` is either a dict
        with the keys 'timestamps' and 'eda_values' or a missing value (NaN).
    max_time : float, default 180
        Samples whose timestamp is greater than this value are dropped.
    files : iterable of str, optional
        Names of the entries to trim. Defaults to the module-level
        ``file_order``.

    Returns
    -------
    pandas.Series or dict
        A copy of ``participant_data`` in which every recording only holds the
        samples with ``timestamp <= max_time``. Missing recordings are kept
        as they are.

    Raises
    ------
    KeyError
        If an entry named in ``files`` does not exist in ``participant_data``.
    """
    if files is None:
        files = file_order

    trimmed_row = participant_data.copy()
    for file in files:
        recording = participant_data[file]
        # Missing recordings are stored as NaN instead of a dict; nothing to trim
        if not isinstance(recording, dict):
            continue

        # Single pass over the paired samples, keeping those inside the time window
        kept_samples = [
            (timestamp, eda_value)
            for timestamp, eda_value in zip(recording['timestamps'], recording['eda_values'])
            if timestamp <= max_time
        ]
        trimmed_row[file] = {
            **recording,
            'timestamps': [timestamp for timestamp, _ in kept_samples],
            'eda_values': [eda_value for _, eda_value in kept_samples],
        }

    return trimmed_row

# Trim the recordings of every participant (one dataframe row per participant)
eda_df_filtered_without_walking_files = eda_df_without_walking_files.apply(filter_participant_data, axis=1)

In [16]:
# Save the filtered DataFrame to a pickle file
eda_df_filtered_without_walking_files.to_pickle('eda_df_filtered_without_walking_files.pkl')

In [17]:
eda_df_filtered_without_walking_files

Unnamed: 0,Participant,leda_baseline.txt,leda_1a.txt,leda_1i.txt,leda_2a.txt,leda_2i.txt,leda_3a.txt,leda_3i.txt,leda_4a.txt,leda_4i.txt,leda_5a.txt,leda_5i.txt
0,pb26 - MMO02,"{'timestamps': [0.0, 0.0010001, 0.0020001, 0.0...","{'timestamps': [0.0, 0.00099998, 0.002, 0.0029...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ..."
1,pb30 - CAN61,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099998, 0.002, 0.003,..."
2,pb35 - BRE26,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099995, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
3,pb37 - CCN18,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
4,pb27 - STG27,"{'timestamps': [0.0, 0.0010001, 0.0020002, 0.0...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099996, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.0010001, 0.0020001, 0.0...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
5,pb18 - WJG24,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099986, 0.0019997, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
6,pb40 - IMN29,"{'timestamps': [0.0, 0.00099995, 0.0019999, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099988, 0.0019998, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099985, 0.0019997, 0....","{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
7,pb24 - AGT14,"{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099987, 0.0019997, 0....","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,..."
8,pb21 - AGO11,"{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099999, 0.002, 0.003,...","{'timestamps': [0.0, 0.001, 0.0020001, 0.00300...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."
9,pb31 - MJN12,"{'timestamps': [0.0, 0.001, 0.002, 0.0030001, ...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00...","{'timestamps': [0.0, 0.00099998, 0.002, 0.0029...","{'timestamps': [0.0, 0.001, 0.002, 0.003, 0.00..."


### 3. Decomposition into phasic and tonic components

This step of the pipeline decomposes the raw EDA data in the filtered DataFrame into its tonic and phasic components. For the purposes of this thesis, only the tonic component of the signal is considered relevant. The decomposition is performed using the `eda_process()` function from the NeuroKit2 library.

#### 3.1 EDA files including walking files

In [11]:
def extract_tonic_level(eda_values, sampling_rate=1000):
    """Extract the tonic component of an EDA signal with NeuroKit2.

    Parameters
    ----------
    eda_values : list of float
        Raw EDA samples of one recording.
    sampling_rate : int, default 1000
        Sampling rate of the signal in Hz, passed on to ``nk.eda_process``.
        The default matches the roughly 1 ms timestamp spacing visible in
        the recordings displayed above.

    Returns
    -------
    list
        The 'EDA_Tonic' column of the processed signal as a plain list.
    """
    # eda_process returns (signals dataframe, info dict); only the signals are needed
    eda_signal, _ = nk.eda_process(eda_values, sampling_rate=sampling_rate)
    return eda_signal['EDA_Tonic'].tolist()

# Collect one record (dict) per participant; the DataFrame is built from this list afterwards
tonic_eda_data = []

# Walk over the participants of the filtered DataFrame
for _, participant_row in eda_df_filtered.iterrows():
    record = {'Participant': participant_row['Participant']}

    # One entry per recording file, in the order given by file_order
    for location in file_order:
        cell = participant_row[location]
        tonic_level = None  # stays None for NaN cells and for cells that cannot be processed

        if pd.notna(cell):
            try:
                tonic_level = extract_tonic_level(cell['eda_values'])
            except (KeyError, TypeError):
                # Missing 'eda_values' key or a cell of an unexpected type
                tonic_level = None

        record[location] = tonic_level

    tonic_eda_data.append(record)

# Build the DataFrame and put 'Participant' first, followed by the columns in file_order
tonic_eda_df = pd.DataFrame(tonic_eda_data)[['Participant'] + file_order]

In [12]:
# Save the tonic level dataframe as a pickle file
# (relative path: the file is written to the current working directory)
tonic_eda_df.to_pickle('tonic_eda_data.pkl')

In [1]:
# Sanity Check: show the first rows of the tonic level dataframe.
# A bare last expression lets the notebook render the frame as a table
# instead of the plain-text dump produced by print().
tonic_eda_df.head()

In [14]:
# Load from pickle file
# Reads 'tonic_eda_data.pkl' (the file name used by the to_pickle cell above) from the
# current working directory and rebinds tonic_eda_df to its content.
tonic_eda_df = pd.read_pickle('tonic_eda_data.pkl')

### 4. Clean DataFrame 

Before feature extraction, the DataFrame is checked for any missing values. Any incomplete entries are removed to ensure data quality and consistency, preparing the dataset for the next steps in the analysis.

In [15]:
# List every participant ID currently in tonic_eda_df, one per line
for participant in tonic_eda_df["Participant"]:
    print(participant)

pb26 - MMO02
pb30 - CAN61
pb35 - BRE26
pb37 - CCN18
pb27 - STG27
pb18 - WJG24
pb40 - IMN29
pb24 - AGT14
pb21 - AGO11
pb31 - MJN12
pb38 - MUH08
pb29 - GMG06
pb15 - ONK28
pb14 - MAN14
pb50 - MJN06
pb19 - BLN10
pb16 - KKN25
pb22 - GRR10
pb42 - MCG24
pb44 - AWD10
pb47 - FMT17
pb33 - AHN27
pb28 - EFN19
pb41 - NGA35
pb34 - GSL36
pb45 - CAM29
pb46 - GAR28
test:back-up
pb20 - HWG29
pb49 - MPD06
pb43 - IVZ24
pb13 - AWD17
pb17 - AEW02
pb48 - AWG36
unvollständige Daten
pb39 - IKE31
pb23 - GHR03
pb32 - HNG26
pb36 - DSM18


In [16]:
# Drop rows whose "Participant" value is a known non-participant entry
unwanted_participants = ["test:back-up", "unvollständige Daten"]

# Boolean mask of the rows to discard; keep everything else
is_unwanted = tonic_eda_df['Participant'].isin(unwanted_participants)
tonic_eda_df = tonic_eda_df[~is_unwanted]
# NOTE(review): in the recorded output below, "unvollständige Daten" is still listed after
# this filter while "test:back-up" is gone. Presumably the stored name and the literal above
# differ at the byte level (e.g. NFD vs. NFC encoding of "ä") — TODO confirm.

# Print the remaining participant IDs, one per line
for participant in tonic_eda_df["Participant"]:
    print(participant)

pb26 - MMO02
pb30 - CAN61
pb35 - BRE26
pb37 - CCN18
pb27 - STG27
pb18 - WJG24
pb40 - IMN29
pb24 - AGT14
pb21 - AGO11
pb31 - MJN12
pb38 - MUH08
pb29 - GMG06
pb15 - ONK28
pb14 - MAN14
pb50 - MJN06
pb19 - BLN10
pb16 - KKN25
pb22 - GRR10
pb42 - MCG24
pb44 - AWD10
pb47 - FMT17
pb33 - AHN27
pb28 - EFN19
pb41 - NGA35
pb34 - GSL36
pb45 - CAM29
pb46 - GAR28
pb20 - HWG29
pb49 - MPD06
pb43 - IVZ24
pb13 - AWD17
pb17 - AEW02
pb48 - AWG36
unvollständige Daten
pb39 - IKE31
pb23 - GHR03
pb32 - HNG26
pb36 - DSM18


In [18]:
# Sanity Check for row "unvollständige Daten"
# Index label 34 is the row with that name; the output below shows that every
# recording column of it is None.
print(tonic_eda_df.loc[34])

Participant          unvollständige Daten
leda_baseline.txt                     None
leda_1a.txt                           None
leda_1i.txt                           None
leda_2a.txt                           None
leda_2i.txt                           None
leda_3a.txt                           None
leda_3i.txt                           None
leda_4a.txt                           None
leda_4i.txt                           None
leda_5a.txt                           None
leda_5i.txt                           None
leda_w1.txt                           None
leda_w2.txt                           None
leda_w3.txt                           None
leda_w4.txt                           None
Name: 34, dtype: object


In [19]:
# Remove placeholder rows that contain no recording at all (every column except
# 'Participant' is None), such as the "unvollständige Daten" row inspected above.
# Selecting by content instead of the hard-coded index label 34 keeps this cell
# correct when the row order of the input changes.
recording_columns = [col for col in tonic_eda_df.columns if col != 'Participant']
has_any_recording = tonic_eda_df[recording_columns].notna().any(axis=1)
tonic_eda_clean_df = tonic_eda_df[has_any_recording]

In [20]:
# Sanity Check for row "pb13 - AWD17"
# Look the participant up by ID rather than by a hard-coded index label, so the check
# does not depend on the row order of the frame.
pb13_rows = tonic_eda_clean_df[tonic_eda_clean_df['Participant'] == 'pb13 - AWD17']
print(pb13_rows.iloc[0])

Participant                                               pb13 - AWD17
leda_baseline.txt                                                 None
leda_1a.txt          [24.899861417690037, 24.8998613784221, 24.8998...
leda_1i.txt          [24.89998609594894, 24.899986094542715, 24.899...
leda_2a.txt          [24.900000077785677, 24.900000077787347, 24.90...
leda_2i.txt          [24.899999985621495, 24.89999998562143, 24.899...
leda_3a.txt          [22.535787933489974, 22.53566773097391, 22.535...
leda_3i.txt          [20.800118887139107, 20.800070523776768, 20.80...
leda_4a.txt          [24.490547165832652, 24.49036449899534, 24.490...
leda_4i.txt          [24.92206384559603, 24.92205701837004, 24.9220...
leda_5a.txt          [22.866762624049155, 22.866429714724088, 22.86...
leda_5i.txt          [2.5125038435290215, 2.513112014310026, 2.5137...
leda_w1.txt                                                       None
leda_w2.txt                                                       None
leda_w

In [22]:
# Exclude participant "pb13 - AWD17": the sanity check above shows None for the baseline
# recording and for the walking recordings that are displayed.
# Filter by participant ID instead of the hard-coded index label 31 so the removal
# does not depend on the row order of the frame.
tonic_eda_clean_df = tonic_eda_clean_df[tonic_eda_clean_df['Participant'] != 'pb13 - AWD17']

In [23]:
# Renumber the rows 0..n-1 after the removals so index labels match row positions again;
# drop=True discards the old labels instead of keeping them as a column.
tonic_eda_clean_df = tonic_eda_clean_df.reset_index(drop=True)

In [24]:
# List the participant IDs that remain in tonic_eda_clean_df, one per line
for participant in tonic_eda_clean_df["Participant"]:
    print(participant)

pb26 - MMO02
pb30 - CAN61
pb35 - BRE26
pb37 - CCN18
pb27 - STG27
pb18 - WJG24
pb40 - IMN29
pb24 - AGT14
pb21 - AGO11
pb31 - MJN12
pb38 - MUH08
pb29 - GMG06
pb15 - ONK28
pb14 - MAN14
pb50 - MJN06
pb19 - BLN10
pb16 - KKN25
pb22 - GRR10
pb42 - MCG24
pb44 - AWD10
pb47 - FMT17
pb33 - AHN27
pb28 - EFN19
pb41 - NGA35
pb34 - GSL36
pb45 - CAM29
pb46 - GAR28
pb20 - HWG29
pb49 - MPD06
pb43 - IVZ24
pb17 - AEW02
pb48 - AWG36
pb39 - IKE31
pb23 - GHR03
pb32 - HNG26
pb36 - DSM18


### 5. Dataset Transformation

Before extracting features, the existing DataFrame must be transformed into a suitable format. The layout with one column per original text file is converted into a layout with one row per participant and recording, so that each row can hold meaningful data such as the EDA signal and statistical features. Two additional columns, 'Typology' and 'Position', are added to categorize the data.

To enable this transformation, we first need to map each text file to its corresponding Typology and Position. The respective EDA signals are then stored in the 'EDA_signal' column for further analysis.

In [32]:
# Define the mapping for the new columns 'Typology' and 'Position'.
# Each recording file maps to a (Typology, Position) pair; files without an entry
# (the leda_w*.txt columns) are not carried over into the new DataFrame.
file_mapping = {
    "leda_baseline.txt": ("Baseline", "Baseline"),
    "leda_1a.txt": ("CPS", "Peripheral"),
    "leda_1i.txt": ("CPS", "Central"),
    "leda_2a.txt": ("GS", "Peripheral"),
    "leda_2i.txt": ("GS", "Central"),
    "leda_3a.txt": ("BSS", "Peripheral"),
    "leda_3i.txt": ("BSS", "Central"),
    "leda_4a.txt": ("LPS", "Peripheral"),
    "leda_4i.txt": ("LPS", "Central"),
    "leda_5a.txt": ("TI", "Peripheral"),
    "leda_5i.txt": ("TI", "Central")
}

# Collect the rows of the new DataFrame as a list of dicts
eda_statistics_data = []

# Produce one output row per participant and mapped file that holds a signal
for _, row in tonic_eda_clean_df.iterrows():
    participant = row["Participant"]  # Get the participant ID

    for file_name, (typology, position) in file_mapping.items():
        # Read the cell by column label from the current row. Combining the
        # iterrows() label with .iloc would treat a label as a position, which is
        # only correct while the index is exactly 0..n-1.
        eda_signal = row[file_name]

        # Skip files without a recording (None) or with an empty signal
        if eda_signal is None or len(eda_signal) == 0:
            continue

        eda_statistics_data.append({
            "Participant": participant,
            "Typology": typology,
            "Position": position,
            "EDA_signal": eda_signal
        })

# Convert the list of dictionaries into a new DataFrame
eda_transformed_df = pd.DataFrame(eda_statistics_data)

In [33]:
# Save the new DataFrame as a pickle file
# (relative path: the file is written to the current working directory)
eda_transformed_df.to_pickle('eda_transformed_data.pkl')

In [172]:
# Load from pickle file
# Reads 'eda_transformed_data.pkl' (the file name used by the to_pickle cell above) from
# the current working directory and rebinds eda_transformed_df to its content.
eda_transformed_df = pd.read_pickle('eda_transformed_data.pkl')

In [173]:
# Sanity Check: display the new dataframe
# (the whole frame is passed to the notebook display, which elides the middle rows)
eda_transformed_df

Unnamed: 0,Participant,Typology,Position,EDA_signal
0,pb26 - MMO02,Baseline,Baseline,"[6.311107422600814, 6.311105162507489, 6.31110..."
1,pb26 - MMO02,CPS,Peripheral,"[6.1175884095833934, 6.117593942611662, 6.1175..."
2,pb26 - MMO02,CPS,Central,"[8.368473037545975, 8.368460922002575, 8.36844..."
3,pb26 - MMO02,GS,Peripheral,"[10.546255998683943, 10.546260613876449, 10.54..."
4,pb26 - MMO02,GS,Central,"[10.27966128795543, 10.279677003350086, 10.279..."
...,...,...,...,...
391,pb36 - DSM18,BSS,Central,"[12.64075488596803, 12.640807578763976, 12.640..."
392,pb36 - DSM18,LPS,Peripheral,"[9.596506973862809, 9.596544458826289, 9.59658..."
393,pb36 - DSM18,LPS,Central,"[10.727332287542206, 10.727368443938083, 10.72..."
394,pb36 - DSM18,TI,Peripheral,"[12.831698554881463, 12.831575749899729, 12.83..."


In [174]:
# Check whether any entry of the 'EDA_signal' column is missing
signal_column = eda_transformed_df['EDA_signal']
has_nan = signal_column.isna().any()

# Report the outcome
print(f"Are there any NaN values in 'EDA_signal' column? {has_nan}")

Are there any NaN values in 'EDA_signal' column? False
