In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dreamer/DREAMER.mat
/kaggle/input/dreamer/dreamer/DREAMER.mat


In [2]:
# Kaggle: run once at top of notebook
!pip install --upgrade pip
!pip install neurokit2 tqdm joblib shap


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting neurokit2
  Downloading neurokit2-0.2.12-py2.py3-none-any.whl.metadata (37 kB)
Downloading neurokit2-0.2.12-py2.py3-none-any.whl (708 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.4/708.4 kB[0m [31m19.1 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: neurokit2
Successfully installed neurokit2-0.2.12


In [3]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from tqdm import tqdm
import os

mat_path = '/kaggle/input/dreamer/DREAMER.mat'  # adjust if different
mat = loadmat(mat_path)
# The main structure is usually at mat['DREAMER'][0,0]
dreamer = mat['DREAMER'][0,0]
print("Fields in DREAMER:", dreamer.dtype.names)
# Often fields: Data, Labels, Subject, etc


Fields in DREAMER: ('Data', 'EEG_SamplingRate', 'ECG_SamplingRate', 'EEG_Electrodes', 'noOfSubjects', 'noOfVideoSequences', 'Disclaimer', 'Provider', 'Version', 'Acknowledgement')


In [4]:
import scipy.io
import numpy as np

# Load the DREAMER dataset
dreamer = scipy.io.loadmat("/kaggle/input/dreamer/DREAMER.mat", struct_as_record=False, squeeze_me=True)

# Show top-level keys
print("Top-level keys in .mat file:\n", dreamer.keys())

# Explore Data field
data = dreamer['DREAMER']
print("\nType of DREAMER:", type(data))

# If it has attributes, list them
if hasattr(data, '_fieldnames'):
    print("\nFields in DREAMER:", data._fieldnames)

# Example: Check the first subject
subject1 = data.Data[0]
print("\nType of subject[0]:", type(subject1))
try:
    print("Fields in subject[0]:", subject1._fieldnames)
except:
    print("Subject[0] is not a struct, type:", type(subject1))

# Peek into one trial
trial1 = subject1[0] if isinstance(subject1, (list, np.ndarray)) else None
print("\nTrial example type:", type(trial1))
if trial1 is not None:
    try:
        print("Trial fields:", trial1._fieldnames)
    except:
        print("Trial is not a struct, content:", trial1)


Top-level keys in .mat file:
 dict_keys(['__header__', '__version__', '__globals__', 'DREAMER'])

Type of DREAMER: <class 'scipy.io.matlab._mio5_params.mat_struct'>

Fields in DREAMER: ['Data', 'EEG_SamplingRate', 'ECG_SamplingRate', 'EEG_Electrodes', 'noOfSubjects', 'noOfVideoSequences', 'Disclaimer', 'Provider', 'Version', 'Acknowledgement']

Type of subject[0]: <class 'scipy.io.matlab._mio5_params.mat_struct'>
Fields in subject[0]: ['Age', 'Gender', 'EEG', 'ECG', 'ScoreValence', 'ScoreArousal', 'ScoreDominance']

Trial example type: <class 'NoneType'>


# ============================
# UPACS — DREAMER (ECG) End-to-End (LOSO fix applied)
# Paste into a Kaggle notebook cell and run.
# (If NeuroKit2 not installed, uncomment pip install line)
# ============================

# !pip install --quiet neurokit2==0.2.7 joblib==1.3.2 tqdm

import os, warnings, random
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from scipy.io import loadmat
from scipy.signal import butter, filtfilt
from tqdm import tqdm
import joblib
import neurokit2 as nk

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, roc_auc_score

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ---------- CONFIG ----------
MAT_PATH = "/kaggle/input/dreamer/DREAMER.mat"   # adjust if needed
OUT_DIR = "/kaggle/working/upacs_output"
os.makedirs(OUT_DIR, exist_ok=True)

AROUSAL_THRESHOLD = 3.0
MIN_RPEAKS = 5

# ---------- helpers ----------
def bandpass_filter(sig, fs, low=0.5, high=40.0, order=4):
    nyq = 0.5 * fs
    lowcut, highcut = low/nyq, high/nyq
    b, a = butter(order, [lowcut, highcut], btype="band")
    return filtfilt(b, a, sig)

def find_numeric_arrays(obj, _visited=None):
    """Recursively find numeric numpy arrays inside nested mat_struct/object arrays."""
    if _visited is None:
        _visited = set()
    results = []
    try:
        oid = id(obj)
        if oid in _visited:
            return results
        _visited.add(oid)
    except Exception:
        pass

    if isinstance(obj, np.ndarray):
        if obj.dtype == object:
            for el in obj.flat:
                results.extend(find_numeric_arrays(el, _visited))
            return results
        if obj.dtype.names is not None:
            for name in obj.dtype.names:
                try:
                    val = obj[name]
                    results.extend(find_numeric_arrays(val, _visited))
                except Exception:
                    pass
            return results
        if np.issubdtype(obj.dtype, np.number):
            results.append(obj)
            return results
        return results

    # scipy mat_struct
    if hasattr(obj, "_fieldnames"):
        for name in obj._fieldnames:
            try:
                val = getattr(obj, name)
                results.extend(find_numeric_arrays(val, _visited))
            except Exception:
                pass
        return results

    if isinstance(obj, (list, tuple)):
        for el in obj:
            results.extend(find_numeric_arrays(el, _visited))
        return results

    if isinstance(obj, dict):
        for k,v in obj.items():
            results.extend(find_numeric_arrays(v, _visited))
        return results

    try:
        if isinstance(obj, (int, float, np.floating, np.integer)):
            results.append(np.array([obj], dtype=float))
            return results
    except Exception:
        pass

    return results

def choose_best_array(arr_list):
    """Return the longest numeric array (flattened) from list or None."""
    if not arr_list:
        return None
    arr_list = [np.asarray(a).flatten() for a in arr_list if np.asarray(a).size>0]
    if not arr_list:
        return None
    arr_list_sorted = sorted(arr_list, key=lambda x: x.size, reverse=True)
    return arr_list_sorted[0]

def get_label_list(obj, expected_len=None):
    """Extract a python list of label values from various possible encodings."""
    if obj is None:
        return None
    try:
        arr = np.asarray(obj, dtype=float).flatten()
        if arr.size >= 1:
            return arr.tolist()
    except Exception:
        pass
    arrs = find_numeric_arrays(obj)
    scalar_lists = [a.flatten() for a in arrs if np.asarray(a).ndim==1]
    if expected_len is not None:
        for a in scalar_lists:
            if len(a) == expected_len:
                return a.tolist()
    if scalar_lists:
        return max(scalar_lists, key=lambda x: x.size).tolist()
    return None

# ---------- Load MAT ----------
print("Loading MAT:", MAT_PATH)
mat = loadmat(MAT_PATH, squeeze_me=True, struct_as_record=False)
if 'DREAMER' not in mat:
    raise RuntimeError("DREAMER key not found in MAT.")
dreamer = mat['DREAMER']
print("DREAMER fields:", dreamer._fieldnames)

# sampling rate & counts
try:
    FS = int(dreamer.ECG_SamplingRate)
except Exception:
    FS = 256
n_subjects = int(getattr(dreamer, "noOfSubjects", np.nan))
n_videos = int(getattr(dreamer, "noOfVideoSequences", np.nan))
print("FS:", FS, "Subjects:", n_subjects, "Videos:", n_videos)

# subjects list
subjects = list(np.atleast_1d(dreamer.Data))
print("Found subjects:", len(subjects))

# ---------- Extract trials robustly ----------
rows = []
print("\nExtracting ECG trials (robustly handling mat_structs)...")
for subj_idx, subj in enumerate(tqdm(subjects, desc="Subjects")):
    ecg_field = getattr(subj, "ECG", None)
    sv_field = getattr(subj, "ScoreValence", None)
    sa_field = getattr(subj, "ScoreArousal", None)
    sd_field = getattr(subj, "ScoreDominance", None)

    # label lists
    expected_len = None
    try:
        expected_len = int(n_videos) if not np.isnan(n_videos) else None
    except:
        expected_len = None
    valence_list = get_label_list(sv_field, expected_len=expected_len)
    arousal_list = get_label_list(sa_field, expected_len=expected_len)
    dom_list = get_label_list(sd_field, expected_len=expected_len)

    if ecg_field is None:
        continue

    trial_candidates = []
    if isinstance(ecg_field, np.ndarray) and ecg_field.dtype == object:
        for el in np.atleast_1d(ecg_field):
            trial_candidates.append(el)
    else:
        arr = ecg_field
        try:
            np_arr = np.asarray(arr)
            if np_arr.ndim == 2 and (not np.isnan(n_videos)):
                r,c = np_arr.shape
                if r == n_videos:
                    for i in range(r):
                        trial_candidates.append(np_arr[i,:])
                elif c == n_videos:
                    for i in range(c):
                        trial_candidates.append(np_arr[:,i])
                else:
                    for i in range(r):
                        trial_candidates.append(np_arr[i,:])
            elif np_arr.ndim == 1 and (not np.isnan(n_videos)) and np_arr.size % n_videos == 0:
                chunk = np_arr.size // n_videos
                for i in range(n_videos):
                    trial_candidates.append(np_arr[i*chunk:(i+1)*chunk])
            else:
                trial_candidates.append(arr)
        except Exception:
            trial_candidates.append(arr)

    for t_idx, cand in enumerate(trial_candidates):
        numeric_arrays = find_numeric_arrays(cand)
        best = choose_best_array(numeric_arrays)
        if best is None:
            continue
        if best.size < 100:
            continue

        def get_label_from_list(lst, idx):
            try:
                if lst is None:
                    return np.nan
                if idx < len(lst):
                    return float(lst[idx])
                else:
                    return np.nan
            except Exception:
                return np.nan

        val = get_label_from_list(valence_list, t_idx)
        aro = get_label_from_list(arousal_list, t_idx)
        dom = get_label_from_list(dom_list, t_idx)

        rows.append({
            "subject": subj_idx+1,
            "trial": t_idx+1,
            "ecg": best.astype(float).flatten(),
            "valence": val,
            "arousal": aro,
            "dominance": dom
        })

df_trials = pd.DataFrame(rows)
print("Total trials collected:", len(df_trials))
if len(df_trials) == 0:
    raise RuntimeError("No ECG trials extracted. Inspect MAT file structure or path.")

# basic length stats
lengths = df_trials['ecg'].apply(len)
print("ECG length stats (min, median, max):", lengths.min(), int(lengths.median()), lengths.max())

# ---------- Feature extraction ----------
def extract_features_from_ecg(ecg_signal, fs=FS, min_rpeaks=MIN_RPEAKS):
    try:
        x = np.asarray(ecg_signal, dtype=float)
        if x.size < fs*2:
            return None
        try:
            x_f = bandpass_filter(x, fs)
        except Exception:
            x_f = x
        try:
            signals, info = nk.ecg_process(x_f, sampling_rate=fs)
            rpeaks = info.get("ECG_R_Peaks", [])
        except Exception:
            try:
                rpeaks_dict = nk.ecg_peaks(x_f, sampling_rate=fs)
                rpeaks = rpeaks_dict.get("ECG_R_Peaks", [])
            except Exception:
                rpeaks = []

        if len(rpeaks) < min_rpeaks:
            return None

        times = np.array(rpeaks) / fs
        rr = np.diff(times) * 1000.0
        if len(rr) < 2:
            return None

        feats = {
            "mean_rr": float(np.nanmean(rr)),
            "sdnn": float(np.nanstd(rr, ddof=1)),
            "rmssd": float(np.sqrt(np.nanmean(np.diff(rr)**2))),
            "pnn50": float(np.sum(np.abs(np.diff(rr)) > 50.0) / max(1, (len(rr)-1)) * 100.0),
            "hr_mean": float(60000.0 / np.nanmean(rr)) if np.nanmean(rr) > 0 else np.nan,
            "n_beats": int(len(rpeaks))
        }

        try:
            hrv_t = nk.hrv_time(rpeaks, sampling_rate=fs)
            hrv_f = nk.hrv_frequency(rpeaks, sampling_rate=fs)
            hrv_n = nk.hrv_nonlinear(rpeaks, sampling_rate=fs)
            for df in (hrv_t, hrv_f, hrv_n):
                if df is None or df.shape[0] == 0:
                    continue
                for col in df.columns:
                    val = df.iloc[0].get(col, np.nan)
                    feats[str(col)] = float(val) if not pd.isna(val) else np.nan
        except Exception:
            pass

        return feats
    except Exception:
        return None

print("\nExtracting HRV features for each trial (may take a few minutes)...")
feature_rows = []
for _, row in tqdm(df_trials.iterrows(), total=len(df_trials), desc="Trials"):
    feats = extract_features_from_ecg(row.ecg, fs=FS)
    if feats is None:
        continue
    feats.update({
        "subject": int(row.subject),
        "trial": int(row.trial),
        "valence": float(row.valence) if not np.isnan(row.valence) else np.nan,
        "arousal": float(row.arousal) if not np.isnan(row.arousal) else np.nan,
        "dominance": float(row.dominance) if not np.isnan(row.dominance) else np.nan
    })
    feature_rows.append(feats)

features_df = pd.DataFrame(feature_rows)
print("Features extracted (rows x cols):", features_df.shape)
if features_df.shape[0] == 0:
    raise RuntimeError("No feature rows extracted — check ECG length and R-peak detection.")

features_csv = os.path.join(OUT_DIR, "dreamer_ecg_hrv.csv")
features_df.to_csv(features_csv, index=False)
print("Saved features CSV:", features_csv)

# drop columns with many NaNs, fill rest
nan_frac = features_df.isna().mean()
drop_cols = nan_frac[nan_frac > 0.4].index.tolist()
if drop_cols:
    print("Dropping columns with >40% NaN:", drop_cols)
    features_df.drop(columns=drop_cols, inplace=True)
features_df.fillna(features_df.median(numeric_only=True), inplace=True)

# ---------- Prepare data for modeling (Arousal binary) ----------
features_df = features_df.dropna(subset=['arousal'])
features_df['arousal_bin'] = (features_df['arousal'] > AROUSAL_THRESHOLD).astype(int)

non_feat_cols = ['subject','trial','valence','arousal','dominance','arousal_bin']
feature_cols = [c for c in features_df.columns if c not in non_feat_cols]

X = features_df[feature_cols].values
y = features_df['arousal_bin'].values
groups = features_df['subject'].values

print("Training samples:", X.shape[0], "Features:", X.shape[1], "Subjects:", len(np.unique(groups)))

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
joblib.dump(scaler, os.path.join(OUT_DIR, "hrv_scaler.pkl"))
joblib.dump(feature_cols, os.path.join(OUT_DIR, "feature_columns.pkl"))

# ---------- LOSO evaluation (ROBUST) ----------
logo = LeaveOneGroupOut()
accs, f1s, bals, rocs = [], [], [], []

print("\nRunning LOSO evaluation (robust predict_proba handling)...")
for fold_idx, (train_idx, test_idx) in enumerate(logo.split(X_scaled, y, groups), start=1):
    Xtr, Xte = X_scaled[train_idx], X_scaled[test_idx]
    ytr, yte = y[train_idx], y[test_idx]

    clf = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=SEED, n_jobs=-1)
    clf.fit(Xtr, ytr)

    preds = clf.predict(Xte)

    # Robust computation of probability for positive class (1)
    prob = None
    if hasattr(clf, "predict_proba"):
        proba_arr = clf.predict_proba(Xte)  # shape (n_samples, n_classes_trained)
        # If classifier trained only on one class, proba_arr has shape (n_samples,1)
        if proba_arr.shape[1] == 1:
            trained_class = clf.classes_[0]
            if trained_class == 1:
                prob = np.ones(proba_arr.shape[0])
            else:
                prob = np.zeros(proba_arr.shape[0])
        else:
            # find column corresponding to class 1 (positive)
            if 1 in clf.classes_:
                idx = list(clf.classes_).index(1)
                prob = proba_arr[:, idx]
            else:
                # classifier has multiple classes but not 1 (unlikely here) -> zeros
                prob = np.zeros(proba_arr.shape[0])
    else:
        prob = None

    acc = accuracy_score(yte, preds)
    f1 = f1_score(yte, preds, zero_division=0)
    bal = balanced_accuracy_score(yte, preds)
    roc = roc_auc_score(yte, prob) if (prob is not None and len(np.unique(yte))>1) else float('nan')

    accs.append(acc); f1s.append(f1); bals.append(bal); rocs.append(roc)
    subj_id = np.unique(groups[test_idx])[0] if len(np.unique(groups[test_idx]))>0 else np.nan
    print(f"Fold {fold_idx} (subject {subj_id}) -> Acc: {acc:.3f}, F1: {f1:.3f}, BalAcc: {bal:.3f}, ROC: {roc:.3f}")

import numpy as _np
print("\nLOSO Summary (mean ± std):")
print("Accuracy: {:.3f} ± {:.3f}".format(_np.nanmean(accs), _np.nanstd(accs)))
print("F1:       {:.3f} ± {:.3f}".format(_np.nanmean(f1s), _np.nanstd(f1s)))
print("BalAcc:   {:.3f} ± {:.3f}".format(_np.nanmean(bals), _np.nanstd(bals)))
print("ROC-AUC:  {:.3f} ± {:.3f}".format(_np.nanmean(rocs), _np.nanstd(rocs)))

# ---------- Train final model & save ----------
final_clf = RandomForestClassifier(n_estimators=400, class_weight='balanced', random_state=SEED, n_jobs=-1)
final_clf.fit(X_scaled, y)
joblib.dump(final_clf, os.path.join(OUT_DIR, "rf_arousal_model.pkl"))
features_df.to_csv(os.path.join(OUT_DIR, "dreamer_ecg_hrv_cleaned.csv"), index=False)
print("\nFinal model + cleaned CSV saved in:", OUT_DIR)

# ---------- Inference helper ----------
def predict_from_raw_ecg(ecg_signal, model=final_clf, scaler=scaler, feature_cols=feature_cols, fs=FS):
    feats = extract_features_from_ecg(ecg_signal, fs=fs)
    if feats is None:
        return {"error": "Could not compute features (signal too short or no R-peaks)", "prob": None, "pred": None}
    x = np.array([feats.get(c, 0.0) for c in feature_cols], dtype=float).reshape(1, -1)
    x_scaled = scaler.transform(x)
    # robust prob extraction
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(x_scaled)
        if proba.shape[1] == 1:
            trained_class = model.classes_[0]
            prob_pos = float(1.0 if trained_class==1 else 0.0)
        else:
            if 1 in model.classes_:
                idx = list(model.classes_).index(1)
                prob_pos = float(proba[:, idx][0])
            else:
                prob_pos = 0.0
    else:
        prob_pos = float(model.predict(x_scaled)[0])
    pred = int(prob_pos > 0.5)
    return {"prob": prob_pos, "pred": pred, "features": feats}

# quick sanity test
if len(df_trials) > 0:
    first_ecg = df_trials.iloc[0]['ecg']
    print("\nSanity test predict on first extracted ECG trial:")
    print(predict_from_raw_ecg(first_ecg))


# Inspect ECG fields inside subject 0
ecg_struct = subj0.ECG
print("Type of subj0.ECG:", type(ecg_struct))
print("ECG struct fields:", ecg_struct._fieldnames)

# If it has fields, check one by one
for field in ecg_struct._fieldnames:
    val = getattr(ecg_struct, field)
    print(f"\nField: {field}")
    print("  Type:", type(val))
    try:
        print("  Shape:", np.shape(val))
    except:
        print("  Length:", len(val) if hasattr(val, "__len__") else "scalar")
    
    # Peek first 10 values if it's array-like
    if hasattr(val, "__getitem__"):
        try:
            print("  First 10 values:", np.array(val).flatten()[:10])
        except:
            print("  Could not preview")


In [5]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm

# =======================
# 1. Load dataset
# =======================
print("Loading DREAMER dataset...")
mat = loadmat("/kaggle/input/dreamer/DREAMER.mat", squeeze_me=True, struct_as_record=False)
dreamer = mat["DREAMER"]

n_subjects = int(dreamer.noOfSubjects)
n_videos = int(dreamer.noOfVideoSequences)
fs = int(dreamer.ECG_SamplingRate)

print(f"Subjects: {n_subjects}, Videos per subject: {n_videos}, ECG Sampling Rate: {fs} Hz")

# =======================
# 2. Extract ECG + Labels
# =======================
data = []

for s in tqdm(range(n_subjects), desc="Extracting ECG trials"):
    subj = dreamer.Data[s]

    ecg_baseline = subj.ECG.baseline   # shape (18,)
    ecg_stimuli  = subj.ECG.stimuli    # shape (18,)

    valence = subj.ScoreValence
    arousal = subj.ScoreArousal
    dominance = subj.ScoreDominance

    for t in range(n_videos):
        try:
            ecg_trial = np.array(ecg_stimuli[t]).flatten()

            # Skip if trial empty
            if ecg_trial.size < 100:
                continue

            # Store record
            data.append({
                "subject": s+1,
                "trial": t+1,
                "ecg": ecg_trial,
                "valence": float(valence[t]),
                "arousal": float(arousal[t]),
                "dominance": float(dominance[t])
            })

        except Exception as e:
            print(f"⚠️ Error at subject {s}, trial {t}: {e}")

df = pd.DataFrame(data)
print(f"\nFinal dataset shape: {df.shape}")
print(df.head())

# =======================
# 3. Feature Extraction
# (simple statistics per trial)
# =======================
def extract_features(signal):
    return [
        np.mean(signal),
        np.std(signal),
        np.min(signal),
        np.max(signal),
        np.median(signal),
        np.percentile(signal, 25),
        np.percentile(signal, 75)
    ]

features = []
labels = []

for _, row in df.iterrows():
    feat = extract_features(row["ecg"])
    features.append(feat)
    # Example: classify high/low valence
    labels.append(1 if row["valence"] >= 3 else 0)

X = np.array(features)
y = np.array(labels)

print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)

# =======================
# 4. Preprocessing
# =======================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =======================
# 5. Train/Test Split
# =======================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# =======================
# 6. Train Classifier
# =======================
print("\nTraining XGBoost model...")
model = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.05, use_label_encoder=False, eval_metric="logloss")
model.fit(X_train, y_train)

# =======================
# 7. Evaluation
# =======================
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# =======================
# 8. Save Model + Scaler
# =======================
joblib.dump(model, "xgb_final_model.pkl")
joblib.dump(scaler, "hrv_scaler.pkl")

print("\n✅ Pipeline complete: model and scaler saved.")


Loading DREAMER dataset...
Subjects: 23, Videos per subject: 18, ECG Sampling Rate: 256 Hz


Extracting ECG trials: 100%|██████████| 23/23 [00:00<00:00, 93.71it/s]



Final dataset shape: (414, 6)
   subject  trial                                                ecg  valence  \
0        1      1  [2046, 2056, 2042, 2063, 2039, 2059, 2039, 205...      4.0   
1        1      2  [2054, 2061, 2036, 2041, 2036, 2041, 2035, 203...      3.0   
2        1      3  [2018, 2026, 2022, 2026, 2025, 2024, 2027, 202...      5.0   
3        1      4  [2055, 2051, 2052, 2051, 2053, 2054, 2054, 205...      4.0   
4        1      5  [2080, 2080, 2038, 2052, 2043, 2062, 2044, 206...      4.0   

   arousal  dominance  
0      3.0        2.0  
1      3.0        1.0  
2      4.0        4.0  
3      3.0        2.0  
4      4.0        4.0  
Feature matrix shape: (414, 7)
Labels shape: (414,)

Training XGBoost model...

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.31      0.38        32
           1       0.65      0.80      0.72        51

    accuracy                           0.61        83
   macro avg     