In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.signal import resample
from scipy.fft import fft
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

#Dataset Path
dataset_dir = "/kaggle/input/wesad-full-dataset/WESAD/"  # Adjust path
sampling_rates = {'ACC': 32, 'BVP': 64, 'EDA': 4, 'TEMP': 4}
window_size = 30  # seconds

# Load and Preprocess Data
def load_data():
    all_subjects_data = []
    stress_count = 0
    non_stress_count = 0

    for subject in os.listdir(dataset_dir):
        subject_path = os.path.join(dataset_dir, subject, f"{subject}.pkl")
        if os.path.exists(subject_path):
            with open(subject_path, 'rb') as file:
                data = pickle.load(file, encoding='latin1')

            signal_data = data['signal']['wrist']
            labels = data['label']

            # Resample signals to align with EDA length
            bvp_resampled = resample(signal_data['BVP'], len(signal_data['EDA']))
            acc_resampled = resample(signal_data['ACC'], len(signal_data['EDA']), axis=0)

            window_stride = sampling_rates['EDA'] * window_size  # 4 Hz * 30s = 120 samples
            for i in range(0, len(signal_data['EDA']) - window_stride, window_stride):
                label = labels[i]
                if label == 1:
                    stress_count += 1
                else:
                    non_stress_count += 1

                all_subjects_data.append({
                    'EDA': signal_data['EDA'][i : i + window_stride],
                    'BVP': bvp_resampled[i : i + window_stride],
                    'TEMP': signal_data['TEMP'][i : i + window_stride],
                    'ACC_X': acc_resampled[i : i + window_stride, 0],
                    'ACC_Y': acc_resampled[i : i + window_stride, 1],
                    'ACC_Z': acc_resampled[i : i + window_stride, 2],
                    'Label': label
                })

    print("Stress Samples After Windowing:", stress_count)
    print("Non-Stress Samples After Windowing:", non_stress_count)
    return all_subjects_data

#Feature Extraction
def extract_features(data_list):
    feature_dict = {col: [] for col in ['EDA', 'BVP', 'TEMP', 'ACC_X', 'ACC_Y', 'ACC_Z']}
    labels = []

    for entry in data_list:
        labels.append(entry['Label'])
        for col in feature_dict.keys():
            feature_dict[col].append(entry[col])

    print("Stress Samples After Feature Extraction:", labels.count(1))
    print("Non-Stress Samples After Feature Extraction:", labels.count(0))

    for col in feature_dict.keys():
        feature_dict[col] = np.stack(feature_dict[col])

    feature_arrays = []
    for col in feature_dict.keys():
        feature_arrays.append(stat_features(feature_dict[col]))
        feature_arrays.append(freq_features(feature_dict[col], sampling_rates[col.split('_')[0]]))

    feature_array = np.hstack(feature_arrays)
    feature_df = pd.DataFrame(feature_array)
    feature_df['Label'] = labels
    return feature_df

#Helper Functions for Feature Extraction
def stat_features(arr):
    return np.column_stack([
        np.mean(arr, axis=1), np.std(arr, axis=1), np.var(arr, axis=1),
        np.min(arr, axis=1), np.max(arr, axis=1), np.median(arr, axis=1),
        skew(arr, axis=1), kurtosis(arr, axis=1),
        np.sqrt(np.mean(arr**2, axis=1))
    ])

def freq_features(arr, sampling_rate):
    fft_vals = np.abs(fft(arr, axis=1))
    fft_freqs = np.fft.fftfreq(arr.shape[1], d=1/sampling_rate)
    dominant_freq = fft_freqs[np.argmax(fft_vals, axis=1)]
    spectral_energy = np.sum(fft_vals**2, axis=1)
    return np.column_stack([dominant_freq, spectral_energy])

#Load and Process Data
data_list = load_data()
df = extract_features(data_list)
df.dropna(inplace=True)

# Train-Test Split
stress_samples = df[df['Label'] == 1]
non_stress_samples = df[df['Label'] == 0]

X_stress_train, X_stress_test, y_stress_train, y_stress_test = train_test_split(
    stress_samples.drop(columns=['Label']), stress_samples['Label'], test_size=0.3, random_state=42
)

X_no_stress_train, X_no_stress_test, y_no_stress_train, y_no_stress_test = train_test_split(
    non_stress_samples.drop(columns=['Label']), non_stress_samples['Label'], test_size=0.3, random_state=42
)

print("Stress Samples in Training Set:", len(X_stress_train))
print("Stress Samples in Test Set:", len(X_stress_test))
print("Non-Stress Samples in Training Set:", len(X_no_stress_train))
print("Non-Stress Samples in Test Set:", len(X_no_stress_test))

#Combine stress and no-stress sets to form final train and test sets
X_train = pd.concat([X_stress_train, X_no_stress_train])
y_train = pd.concat([y_stress_train, y_no_stress_train])
X_test = pd.concat([X_stress_test, X_no_stress_test])
y_test = pd.concat([y_stress_test, y_no_stress_test])

print("Total Samples in Training Set:", len(X_train))
print("Stress Samples in Training Set:", y_train.value_counts()[1])
print("Non-Stress Samples in Training Set:", y_train.value_counts()[0])
print("Total Samples in Test Set:", len(X_test))
print("Stress Samples in Test Set:", y_test.value_counts()[1])
print("Non-Stress Samples in Test Set:", y_test.value_counts()[0])

#Apply SMOTE if needed
if len(y_train.unique()) > 1:
    smote = SMOTE(sampling_strategy=1.0, random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print("Samples in Training Set After SMOTE:", len(X_train))
    print("Stress Samples in Training Set After SMOTE:", y_train.value_counts()[1])
    print("Non-Stress Samples in Training Set After SMOTE:", y_train.value_counts()[0])

#Train Model
rf = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)

#Evaluate Model
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Stress Samples After Windowing: 31
Non-Stress Samples After Windowing: 2858
Stress Samples After Feature Extraction: 31
Non-Stress Samples After Feature Extraction: 2858
Stress Samples in Training Set: 21
Stress Samples in Test Set: 10
Non-Stress Samples in Training Set: 2000
Non-Stress Samples in Test Set: 858
Total Samples in Training Set: 2021
Stress Samples in Training Set: 21
Non-Stress Samples in Training Set: 2000
Total Samples in Test Set: 868
Stress Samples in Test Set: 10
Non-Stress Samples in Test Set: 858
Samples in Training Set After SMOTE: 4000
Stress Samples in Training Set After SMOTE: 2000
Non-Stress Samples in Training Set After SMOTE: 2000
Accuracy: 0.9988479262672811
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       858
           1       1.00      0.90      0.95        10

    accuracy                           1.00       868
   macro avg       1.00      0.95      0.97       868
weighted avg       1.00      1.0

In [2]:
# Reload the dataset and check label distribution without modifying the main code

import pickle
import os
import numpy as np
import pandas as pd
from scipy.signal import resample

# ✅ Dataset Path (Make sure this matches your dataset location)
dataset_dir = "/kaggle/input/wesad-full-dataset/WESAD/"  
sampling_rates = {'ACC': 32, 'BVP': 64, 'EDA': 4, 'TEMP': 4}
window_size = 30  # seconds

# ✅ Load Data Function
def load_data():
    all_subjects_data = []
    for subject in os.listdir(dataset_dir):
        subject_path = os.path.join(dataset_dir, subject, f"{subject}.pkl")
        if os.path.exists(subject_path):
            with open(subject_path, 'rb') as file:
                data = pickle.load(file, encoding='latin1')
            
            labels = data['label']
            all_subjects_data.append(pd.DataFrame({'Label': labels}))

    return pd.concat(all_subjects_data, ignore_index=True)

# ✅ Load and Print Label Distribution
df = load_data()
print("Label Distribution in Full Dataset:\n", df['Label'].value_counts())


Label Distribution in Full Dataset:
 Label
0    27654897
1    12327702
4     8264199
2     6976201
3     3902501
7      576802
6      552998
5      552300
Name: count, dtype: int64
