In [None]:
%pip install scikit-learn -q
%pip install pandas -q
%pip install numpy -q
%pip install matplotlib -q
%pip install seaborn -q
%pip install keras -q
%pip install os -q

%pip install cvxopt -q

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import numpy as np
import keras
from keras import layers
import tensorflow as tf
from sklearn import preprocessing, model_selection
import random
import seaborn as sns
import os
import cvxEDA

In [None]:

# MAIN_PATH is the parent of the current working directory; the data folder
# is looked up directly underneath it.
MAIN_PATH = os.path.dirname(os.getcwd())
DATA_PATH = f"{MAIN_PATH}/data/"

# Tunable constants. The shuffle buffer is twice the batch size.
QUALITY_THRESHOLD = 128
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 2 * BATCH_SIZE

In [None]:
dataset = pd.read_csv(DATA_PATH + "/merged_data.csv")

In [None]:
dataset.dtypes

In [None]:
dataset

In [None]:
import pandas as pd

# Function to create sequences DataFrame
def create_sequences_df(dataset, max_length=128):
    """Cut a row-per-sample frame into windows of at most ``max_length`` rows.

    Rows are visited in order. A window is closed when the ``ID`` of the next
    row differs from the window's ID, or as soon as the window holds
    ``max_length`` rows. Each window keeps the ``downsampled_labels`` value of
    its first row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``ID``, ``w_eda``, ``w_temp``,
        ``downsampled_labels`` and ``Time``.
    max_length : int, default 128
        Maximum number of rows collected into one window.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``ID``, ``w_eda`` (list),
        ``w_temp`` (list), ``downsampled_label`` and ``Time`` (list).
    """

    def blank_window():
        # State used before the first row and right after a full window was emitted.
        return {
            'ID': None,
            'w_eda': [],
            'w_temp': [],
            'downsampled_label': None,
            'Time': [],
        }

    windows = []
    current = blank_window()

    for _, row in dataset.iterrows():
        if current['ID'] != row['ID']:
            # The ID changed: keep whatever was collected so far, then start
            # a fresh window from this row.
            if current['w_temp']:
                windows.append(current)
            current = {
                'ID': row['ID'],
                'w_eda': [row['w_eda']],
                'w_temp': [row['w_temp']],
                'downsampled_label': row['downsampled_labels'],
                'Time': [row['Time']],
            }
        else:
            # Same ID: extend the running window.
            for column in ('w_temp', 'w_eda', 'Time'):
                current[column].append(row[column])

        # A full window is emitted immediately and the state is cleared.
        if len(current['w_temp']) >= max_length:
            windows.append(current)
            current = blank_window()

    # Emit the trailing, possibly shorter, window.
    if current['w_temp']:
        windows.append(current)

    return pd.DataFrame(windows)

# Create sequences DataFrame
sequences_df = create_sequences_df(dataset)

# Check the resulting DataFrame
sequences_df

In [None]:
print(sequences_df.loc[88, 'w_eda'])


In [None]:
print(sequences_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the EDA trace of every subject in its own panel.
unique_ids = dataset['ID'].unique()

# The grid is sized from the number of subjects. The former fixed 2 x 8 grid
# raised an IndexError as soon as the data held more than 16 IDs.
n_cols = 8
n_rows = max(1, -(-len(unique_ids) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 3 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, unique_id in zip(axes, unique_ids):
    # Rows belonging to this subject only
    subset_data = dataset[dataset['ID'] == unique_id]

    sns.lineplot(x='Time', y='w_eda', data=subset_data, ax=ax, color='blue', label='EDA')
    # sns.lineplot(x='Time', y='w_temp', data=subset_data, ax=ax, color='red', label='Temp')

    ax.set_title(f"Data for {unique_id}")
    ax.set_xlabel('Time')
    ax.set_ylabel('Measurement')
    ax.tick_params(axis='x', rotation=45)

# Hide the panels that have no subject assigned to them.
for ax in axes[len(unique_ids):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# import matplotlib.pyplot as plt

# # Create subplots
# fig, axes = plt.subplots(16, 3, figsize=(60, 40))  # Increased figure size

# # Define colors for each acceleration component
# colors = ['red', 'green', 'blue']

# # Iterate through each unique id
# for i, unique_id in enumerate(unique_ids):
#     # Filter data for each id
#     subset_data = dataset[dataset['ID'] == unique_id]
    
#     # Iterate through X, Y, and Z accelerations
#     for j, accel_component in enumerate(['X', 'Y', 'Z']):
#         ax = axes[i, j]  # Select the appropriate subplot
        
#         # Plot acceleration component with different color
#         ax.plot(subset_data['Time'], subset_data[accel_component], label=f'{accel_component} Acceleration', color=colors[j])
#         ax.set_title(f"Data for {unique_id} - {accel_component} Acceleration")
#         ax.set_xlabel('Time')
#         ax.set_ylabel('Acceleration')
#         ax.legend()
#         ax.tick_params(axis='x', rotation=45)

# # Adjust layout
# plt.tight_layout()
# plt.show()


In [None]:
import cvxEDA.src.cvxEDA

def calculate_eda_levels(y):
    """Run cvxEDA on a z-scored EDA signal and return two of its outputs.

    The signal is standardised (zero mean, unit standard deviation) and passed
    to ``cvxEDA`` together with a sampling interval of 1/4 (EDA rate of 4).

    Parameters
    ----------
    y : array exposing ``mean()`` and ``std()`` (e.g. a NumPy array)

    Returns
    -------
    list
        ``[p, t]``, the second and third outputs of ``cvxEDA``. According to
        the cvxEDA documentation these are the sparse SMNA driver of the
        phasic component and the tonic component, in that order.
        NOTE(review): the phasic component itself is the first output, which
        is discarded here - confirm this is intended.
    """
    eda_sampling_rate = 4
    standardised = (y - y.mean()) / y.std()
    # Unpacking all seven outputs keeps the implicit check on cvxEDA's return arity.
    _r, smna_driver, tonic, _l, _d, _e, _obj = cvxEDA.src.cvxEDA.cvxEDA(
        standardised, 1. / eda_sampling_rate
    )
    return [smna_driver, tonic]


In [None]:
import matplotlib.pyplot as plt

# Plot the cvxEDA outputs together with the raw EDA trace, one figure per subject.
unique_ids = dataset['ID'].unique()

for unique_id in unique_ids:
    # Rows belonging to this subject only
    subset_data = dataset[dataset['ID'] == unique_id]
    eda_values = subset_data['w_eda'].values

    # calculate_eda_levels returns [p, t]. Per the cvxEDA documentation that is
    # the sparse SMNA driver of the phasic component followed by the tonic
    # component. The former unpacking (`tonic, phasic = ...`) therefore
    # labelled the driver as "Tonic" and the tonic component as "Phasic".
    phasic_driver, tonic = calculate_eda_levels(eda_values)

    # NOTE: the decomposition is computed on the z-scored signal, while the
    # 'EDA' line shows the raw values, so the curves are on different scales.
    plt.plot(tonic, label='Tonic')
    plt.plot(phasic_driver, label='Phasic driver (SMNA)')
    plt.plot(eda_values, label='EDA')

    plt.xlabel('Time')
    plt.ylabel('EDA Levels')
    plt.title(f'Phasic and Tonic EDA for ID: {unique_id}')
    plt.legend()
    plt.show()


In [None]:
# Distinct label values present before they are collapsed to two classes.
unique_labels_before = sequences_df['downsampled_label'].unique()
n_labels_before = len(unique_labels_before)

print("Before replacing labels")
print(unique_labels_before, "\n")
print("Number of unique labels before replacement:", n_labels_before, "\n")


In [None]:
# Collapse the labels to two classes: raw label 2.0 -> 1, every other value -> 0.
# The raw labels are preserved in their own column so that this cell can be
# re-run safely: overwriting 'downsampled_label' in place meant that a second
# run (where no value equals 2.0 any more) silently turned every label into 0.
if 'downsampled_label_raw' not in sequences_df.columns:
    sequences_df['downsampled_label_raw'] = sequences_df['downsampled_label']
sequences_df['downsampled_label'] = sequences_df['downsampled_label_raw'].eq(2.0).astype(int)


In [None]:
from sklearn import preprocessing

# Distinct label values after the replacement step.
unique_labels_after = sequences_df['downsampled_label'].unique()
n_labels_after = len(unique_labels_after)

print("After replacing labels")
print(unique_labels_after)
print("Number of unique labels after replacement:", n_labels_after)

# Fit the encoder on the label column and replace the column with the encoded values.
le = preprocessing.LabelEncoder()
sequences_df['downsampled_label'] = le.fit_transform(sequences_df['downsampled_label'])


In [None]:
# Number of distinct values in the encoded label column.
distinct_labels = sequences_df['downsampled_label'].unique()
num_classes = len(distinct_labels)
print(num_classes)


In [None]:
from collections import Counter

def plot_label_distribution(df):
    """Print and plot how many rows of ``df`` carry label 0 and label 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``downsampled_label`` column. Only the values 0 and 1
        are counted; a label that never occurs is reported as 0.
    """
    class_names = {0: "No-stress", 1: "Stress"}
    class_keys = (0, 1)

    # Counter returns 0 for missing keys, so absent classes are handled implicitly.
    tally = Counter(df['downsampled_label'])
    counts = [tally[key] for key in class_keys]
    print("Label distribution:", counts)

    plt.bar([class_names[key] for key in class_keys], counts)
    plt.title("Number of samples per class")
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.show()

In [None]:
plot_label_distribution(sequences_df)


In [None]:
import pandas as pd
from sklearn.utils import resample

# Split the windows by class ('no-stress' = 0, 'stress' = 1).
df_no_stress = sequences_df[sequences_df['downsampled_label'] == 0]
df_stress = sequences_df[sequences_df['downsampled_label'] == 1]

# Undersample whichever class is larger so both classes end up the same size.
# The former code always downsampled 'no-stress'; with replace=False that
# raises a ValueError whenever 'no-stress' is actually the smaller class.
if len(df_no_stress) >= len(df_stress):
    df_no_stress_downsampled = resample(df_no_stress,
                                        replace=False,  # Sample without replacement
                                        n_samples=len(df_stress),
                                        random_state=42)  # Ensure reproducibility
    df_stress_downsampled = df_stress
else:
    df_no_stress_downsampled = df_no_stress
    df_stress_downsampled = resample(df_stress,
                                     replace=False,  # Sample without replacement
                                     n_samples=len(df_no_stress),
                                     random_state=42)  # Ensure reproducibility

# Combine both classes again
sequences_df_balanced = pd.concat([df_no_stress_downsampled, df_stress_downsampled])

# Shuffle the combined dataset to mix the samples
sequences_df_balanced = sequences_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Now sequences_df_balanced contains a balanced dataset with 'no-stress' samples evenly undersampled to match 'stress' samples

In [None]:
plot_label_distribution(sequences_df_balanced)

**Scale and split data**

We apply Min-Max scaling to each sequence to bring its values into the range 0 to 1. We do not use standard scaling because the data does not follow a Gaussian distribution.

In [None]:
# Min-Max scale every 'w_eda' window on its own: the single scaler object is
# re-fitted for each window, so after this cell it holds the fit of the last one.
scaler = preprocessing.MinMaxScaler()
eda_series_list_scaled = [
    scaler.fit_transform(np.asarray(window).reshape(-1, 1))
    for window in sequences_df_balanced["w_eda"]
]

# 1-D copies of the scaled windows
eda_array_list = [np.array(scaled).flatten() for scaled in eda_series_list_scaled]

# Labels, once as a plain list and once as a NumPy array
labels_list = list(sequences_df_balanced['downsampled_label'])
labels_array = np.array(labels_list)

# print(len(combined_series_list))
n_windows = len(eda_series_list_scaled)
n_labels = len(labels_array)
print(f"EDA list Count:", n_windows, "\n" "Labels list Count:", n_labels)



In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Pad (or cut) every scaled window at its end so that all inputs share one length.
max_sequence_length = 128
padded_series_list = pad_sequences(
    eda_series_list_scaled,
    maxlen=max_sequence_length,
    dtype='float32',
    padding='post',
    truncating='post',
)

# First split: 30% of the windows are held out as the test set.
x_temp, x_test, y_temp, y_test = train_test_split(
    padded_series_list, labels_list, test_size=0.30, random_state=42, shuffle=True
)

# Second split: 20% of the remaining 70% becomes the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_temp, y_temp, test_size=0.20, random_state=42, shuffle=True
)

# Features become float32 arrays of shape (samples, max_sequence_length, 1);
# labels become float32 column vectors (kept as 0/1, not one-hot encoded).
feature_shape = (-1, max_sequence_length, 1)
label_shape = (-1, 1)
x_train, x_val, x_test = (
    np.asarray(split).astype(np.float32).reshape(feature_shape)
    for split in (x_train, x_val, x_test)
)
y_train, y_val, y_test = (
    np.asarray(split).astype(np.float32).reshape(label_shape)
    for split in (y_train, y_val, y_test)
)

# Batched tf.data pipelines for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(32)

# Report the size of every split
print(
    f"Length of x_train : {len(x_train)}\nLength of x_val : {len(x_val)}\nLength of x_test : {len(x_test)}\n"
    f"Length of y_train : {len(y_train)}\nLength of y_val : {len(y_val)}\nLength of y_test : {len(y_test)}"
)


In [None]:
def plot_dataset_distribution(x_train, y_train, x_test, y_test, x_val, y_val):
    """Plot the number of feature and label samples in each data split.

    Features and labels are drawn as side-by-side bars per split. They used to
    be stacked on top of each other, which made every bar twice as tall as the
    actual number of samples shown on the y-axis.

    Parameters (note the order: train, test, validation)
    ----------------------------------------------------
    x_train, y_train : training data and labels
    x_test, y_test : test data and labels
    x_val, y_val : validation data and labels
    """
    dataset_names = ['Train', 'Test', 'Validation']
    x_lengths = [len(x_train), len(x_test), len(x_val)]
    y_lengths = [len(y_train), len(y_test), len(y_val)]

    positions = np.arange(len(dataset_names))
    bar_width = 0.4

    plt.figure(figsize=(10, 6))

    # One pair of bars per split: features on the left, labels on the right.
    plt.bar(positions - bar_width / 2, x_lengths, width=bar_width,
            color='b', alpha=0.6, label='X (Features)')
    plt.bar(positions + bar_width / 2, y_lengths, width=bar_width,
            color='r', alpha=0.6, label='Y (Labels)')
    plt.xticks(positions, dataset_names)

    plt.xlabel('Dataset')
    plt.ylabel('Number of Samples')
    plt.title('Dataset Distribution')
    plt.legend()
    plt.show()


# Plot dataset distribution
plot_dataset_distribution(x_train, y_train, x_test, y_test, x_val, y_val)

In [None]:
# Build the tf.data pipelines from the NumPy splits.
# Only the training pipeline is shuffled; all three are batched with BATCH_SIZE.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(BATCH_SIZE)


In [None]:
# Count how often each label occurs in the balanced dataset.
vals_dict = {}
for label_value in sequences_df_balanced['downsampled_label']:
    vals_dict[label_value] = vals_dict.get(label_value, 0) + 1
total = sum(vals_dict.values())

# Naive weighting: weight = 1 - (samples of the class / all samples),
# i.e. the more samples a class has, the lower its weight.
weight_dict = {
    label_value: (1 - (count / total)) for label_value, count in vals_dict.items()
}
print(weight_dict)



In [None]:
# The labels are a single 0/1 value per window, not one-hot vectors, so no
# argmax is needed. np.argmax on this 1-D column returned one scalar index,
# which made the printed shape `()`.
binary_labels = sequences_df_balanced['downsampled_label'].to_numpy()
print("Shape of binary labels:", binary_labels.shape)

In [None]:
def plot_history_metrics(history: keras.callbacks.History):
    """Plot every series recorded in ``history.history`` in its own subplot.

    The subplots are arranged in a grid with ``total // 2`` columns (at least
    one) and as many rows as needed.

    Parameters
    ----------
    history : keras.callbacks.History
        Object whose ``history`` attribute maps a metric name to its list of
        per-epoch values.

    Returns
    -------
    None. Returns without drawing anything when no metric was recorded.
    """
    metrics = history.history
    total_plots = len(metrics)
    if total_plots == 0:
        return

    # max(1, ...) guards the single-metric case, where `total_plots // 2` is 0
    # and the row computation used to raise ZeroDivisionError.
    cols = max(1, total_plots // 2)
    rows = -(-total_plots // cols)  # ceiling division

    plt.figure(figsize=(15, 10))
    for position, (key, value) in enumerate(metrics.items(), start=1):
        plt.subplot(rows, cols, position)
        plt.plot(range(len(value)), value)
        plt.title(str(key))
    plt.show()

In [None]:
from tensorflow.keras import layers

def create_model():
    """Build the (uncompiled) 1-D convolutional binary classifier.

    Architecture, in order:
      * input of shape (128, 1)
      * six Conv1D blocks (stride 2, ReLU, "same" padding), each followed by
        BatchNormalization, with growing filter counts and kernel sizes
      * Dropout(0.2) and Flatten
      * three Dense + Dropout(0.2) blocks (4096, 2048, 1024 units, ReLU)
      * Dense(128, ReLU) and a single sigmoid output unit

    Returns
    -------
    keras.Model
    """
    # (filters, kernel_size) of the successive convolution blocks
    conv_specs = (
        (32, 3),
        (64, 3),
        (128, 5),
        (256, 5),
        (512, 7),
        (1024, 7),
    )
    # Units of the dense layers that are each followed by dropout
    dense_units = (4096, 2048, 1024)

    input_layer = keras.Input(shape=(128, 1))

    x = input_layer
    for filters, kernel_size in conv_specs:
        x = layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            strides=2,
            activation="relu",
            padding="same",
        )(x)
        x = layers.BatchNormalization()(x)

    x = layers.Dropout(0.2)(x)
    x = layers.Flatten()(x)

    for units in dense_units:
        x = layers.Dense(units, activation="relu")(x)
        x = layers.Dropout(0.2)(x)

    x = layers.Dense(128, activation="relu")(x)
    output_layer = layers.Dense(1, activation="sigmoid")(x)

    return keras.Model(inputs=input_layer, outputs=output_layer)



In [None]:
# Build a fresh, uncompiled model and print its layer summary.
conv_model = create_model()
conv_model.summary()

# Save model to JSON
# Done for Kevin Bevers for his Headless CMS Stress platform prototype project
# NOTE(review): the export happens before training; to_json() presumably
# serialises the architecture only, not the weights - confirm.
# The file is written relative to the current working directory.
model_json = conv_model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)


In [None]:
epochs = 100

# Write "best_model.keras" only when the monitored validation loss improves.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    "best_model.keras",
    save_best_only=True,
    monitor="val_loss",
)

# Multiply the learning rate by 0.2 after 2 epochs without a better
# validation loss, never going below 1e-6.
reduce_lr_callback = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=2,
    min_lr=0.000001,
)

callbacks = [checkpoint_callback, reduce_lr_callback]

optimizer = keras.optimizers.Adam(amsgrad=True, learning_rate=0.001)
loss = keras.losses.BinaryCrossentropy()

In [None]:
# The order of these metrics determines the order of the values returned by
# evaluate(): loss first, then accuracy, AUC, precision, recall.
training_metrics = [
    keras.metrics.BinaryAccuracy(),
    keras.metrics.AUC(),
    keras.metrics.Precision(),
    keras.metrics.Recall(),
]
conv_model.compile(optimizer=optimizer, loss=loss, metrics=training_metrics)

# Train with validation after every epoch; per-class weights come from weight_dict.
fit_options = {
    "epochs": epochs,
    "callbacks": callbacks,
    "validation_data": val_dataset,
    "class_weight": weight_dict,
}
conv_model_history = conv_model.fit(train_dataset, **fit_options)


In [None]:
plot_history_metrics(conv_model_history)

# Metrics on the training data (loss followed by the four compiled metrics).
# The results get their own names: unpacking into `loss` replaced the loss
# object that is passed to compile() with a float, so re-running the compile
# cell after this one failed.
train_loss, train_accuracy, train_auc, train_precision, train_recall = conv_model.evaluate(train_dataset)
print(f"Loss : {train_loss}")
print(f"Binary Accuracy : {train_accuracy}")
print(f"Area under the Curve (ROC) : {train_auc}")
print(f"Precision : {train_precision}")
print(f"Recall : {train_recall}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Generate predictions on the test set (sigmoid output thresholded at 0.5)
y_pred_probs = conv_model.predict(x_test, verbose=0)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# y_test is a float column vector; flatten it to 1-D integers like y_pred.
y_true = np.asarray(y_test).astype(int).ravel()

# Passing labels=[0, 1] keeps the matrix 2 x 2 even when one class is absent
# from both the test labels and the predictions, so it always matches the
# two tick labels used below.
conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(conf_matrix)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def view_evaluated_eeg_plots(model, sequences_df, scaler):
    """Plot sampled windows of each class with their true and predicted label.

    For label 0 and label 1 in turn, up to 25 rows are sampled from
    ``sequences_df`` (fixed seed), their ``w_eda`` windows are scaled, fed to
    ``model`` and plotted against ``Time``. Despite the function name the
    plotted signal is the ``w_eda`` column.

    Parameters
    ----------
    model : object with ``predict(array, verbose=0)`` returning one
        probability per window.
    sequences_df : pandas.DataFrame
        Needs the columns ``ID``, ``w_eda``, ``Time`` and ``downsampled_label``.
    scaler : object with ``fit_transform``; it is re-fitted on every window,
        so the caller's scaler is modified.
    """
    window_length = 128  # input length expected by reshape/model

    def to_model_input(windows):
        # Scale each window, then zero-pad (or cut) it at the end to
        # window_length. Reshaping directly failed for windows shorter than
        # 128 samples.
        batch = np.zeros((len(windows), window_length, 1), dtype=np.float32)
        for row, window in enumerate(windows):
            scaled = scaler.fit_transform(np.asarray(window).reshape(-1, 1))[:window_length]
            batch[row, :len(scaled)] = scaled
        return batch

    def plot_signals(data, labels, predictions, ids, times):
        total_plots = len(data)
        if total_plots == 0:
            return
        # max(1, ...) avoids a zero column count (and ZeroDivisionError)
        # when fewer than five windows are available.
        cols = max(1, total_plots // 5)
        rows = -(-total_plots // cols)  # ceiling division
        fig = plt.figure(figsize=(40, 30))
        signals = zip(data, labels, predictions, ids, times)
        for position, (plot_data, og_label, pred_label, id_, time) in enumerate(signals, start=1):
            plt.subplot(rows, cols, position)
            plt.plot(time, plot_data)
            plt.title(f"ID: {id_}\nActual Label: {og_label}\nPredicted Label: {pred_label}")
        fig.subplots_adjust(hspace=0.5)
        plt.show()

    def generate_signals_for_label(label, num_signals=25):
        filtered_df = sequences_df[sequences_df['downsampled_label'] == label]
        # Never ask for more rows than exist; sample() raises otherwise.
        sampled_df = filtered_df.sample(n=min(num_signals, len(filtered_df)), random_state=42)
        data = sampled_df['w_eda']
        times = sampled_df['Time']
        labels = sampled_df['downsampled_label'].tolist()
        ids = sampled_df['ID'].tolist()  # Extract IDs
        if len(sampled_df) == 0:
            predictions = np.array([], dtype=int)
        else:
            data_array = to_model_input(list(data))
            predictions = (model.predict(data_array, verbose=0) > 0.5).astype(int).flatten()
        return data, labels, predictions, ids, times

    data_0, labels_0, predictions_0, ids_0, times_0 = generate_signals_for_label(0)
    data_1, labels_1, predictions_1, ids_1, times_1 = generate_signals_for_label(1)

    print("Plotting signals with label 0:")
    plot_signals(data_0, labels_0, predictions_0, ids_0, times_0)

    print("Plotting signals with label 1:")
    plot_signals(data_1, labels_1, predictions_1, ids_1, times_1)

# Call the function with the required arguments
view_evaluated_eeg_plots(conv_model, sequences_df, scaler)


In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Assuming sequences_df, scaler, and conv_model are already defined

# Evaluate the model on the test set and print the metrics.
# The values come back in compile() order: loss, BinaryAccuracy, AUC,
# Precision, Recall. The second value was previously printed as
# "Top 3 Categorical Accuracy", which is not a metric this model tracks.
# Distinct variable names keep the `loss` object used by compile() intact.
test_loss, test_accuracy, test_auc, test_precision, test_recall = conv_model.evaluate(test_dataset)
print(f"Loss : {test_loss}")
print(f"Binary Accuracy : {test_accuracy}")
print(f"Area under the Curve (ROC) : {test_auc}")
print(f"Precision : {test_precision}")
print(f"Recall : {test_recall}")

def view_evaluated_eeg_plots(model):
    """Plot 12 consecutive windows with their true and predicted labels.

    A random start row is drawn from the module-level ``sequences_df``; the
    ``w_eda`` windows of that row and the 11 following rows are scaled with
    the module-level ``scaler`` (which is re-fitted on every window), fed to
    ``model`` and plotted against ``Time``.

    NOTE(review): this definition rebinds the name of the three-argument
    ``view_evaluated_eeg_plots(model, sequences_df, scaler)`` defined earlier
    in the file.

    Parameters
    ----------
    model : object with ``predict(array, verbose=0)`` returning one sigmoid
        probability per window.

    Raises
    ------
    ValueError
        From ``random.randint`` when ``sequences_df`` has fewer than 22 rows.
    """
    window_length = 128  # input length expected by the model

    start_index = random.randint(10, len(sequences_df) - 12)
    end_index = start_index + 11
    # .loc slicing includes both end points, i.e. 12 rows are selected.
    data = sequences_df.loc[start_index:end_index, 'w_eda']
    time = sequences_df.loc[start_index:end_index, 'Time']
    subjects = sequences_df.loc[start_index:end_index, 'ID']
    original_labels = sequences_df.loc[start_index:end_index, 'downsampled_label']

    total_plots = len(data)
    if total_plots == 0:
        return

    # Scale each window, then zero-pad (or cut) it at the end to window_length.
    # Reshaping directly failed for windows shorter than 128 samples.
    data_array = np.zeros((total_plots, window_length, 1), dtype=np.float32)
    for row, window in enumerate(data):
        scaled = scaler.fit_transform(np.asarray(window).reshape(-1, 1))[:window_length]
        data_array[row, :len(scaled)] = scaled

    # The model has a single sigmoid output, so the class is obtained by
    # thresholding at 0.5. np.argmax over the one-column output always
    # returned 0, whatever the model predicted.
    probabilities = model.predict(data_array, verbose=0)
    predicted_labels = (probabilities > 0.5).astype(int).flatten()

    cols = max(1, total_plots // 3)
    rows = -(-total_plots // cols)  # ceiling division

    fig = plt.figure(figsize=(20, 10))
    windows = zip(data, time, original_labels, predicted_labels, subjects)
    for position, (plot_data, plot_time, og_label, pred_label, subject) in enumerate(windows, start=1):
        plt.subplot(rows, cols, position)
        plt.plot(plot_time, plot_data)
        plt.title(f"Subject: {subject}\nActual Label: {og_label}\nPredicted Label: {pred_label}")
    fig.subplots_adjust(hspace=0.5)

    plt.show()

view_evaluated_eeg_plots(conv_model)
