In [1]:
import os
import numpy as np
from scipy.stats import f_oneway, skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.signal import welch
from tqdm import tqdm
from scipy.stats import entropy as shannon_entropy

# Constants
INPUT_DIR = "C:\\Users\\fathi\\Desktop\\Rest_eeg_ds004504-download\\derivatives\\500hz_bands\\alpha\\numpy"
SAMPLE_RATE = 500
WINDOW_SIZE = 4 * SAMPLE_RATE  # 4 seconds

# Label mapping
def get_label(s):
    if 1 <= s <= 36:
        return 2  # Alzheimer's
    elif 37 <= s <= 65:
        return 0  # Healthy Control
    elif 66 <= s <= 88:
        return 1  # Dementia
    else:
        return None

# Hjorth parameters
def hjorth_parameters(signal):
    first_deriv = np.diff(signal)
    second_deriv = np.diff(first_deriv)
    activity = np.var(signal)
    mobility = np.sqrt(np.var(first_deriv) / activity) if activity != 0 else 0
    complexity = (np.sqrt(np.var(second_deriv) / np.var(first_deriv))
                  if np.var(first_deriv) != 0 else 0)
    return activity, mobility, complexity

# Feature extractor per window/channel
def extract_features(window):
    features = []
    for ch in range(window.shape[1]):
        x = window[:, ch]

        # Welch PSD
        f, psd = welch(x, fs=SAMPLE_RATE, nperseg=window.shape[0])
        total_power = np.sum(psd)
        mean_psd = np.mean(psd)
        rel_power = mean_psd / total_power if total_power > 0 else 0
        psd_norm = psd / np.sum(psd) if np.sum(psd) > 0 else np.ones_like(psd) / len(psd)
        spec_entropy = shannon_entropy(psd_norm)

        # Hjorth
        act, mob, comp = hjorth_parameters(x)

        # Additional features
        mean_val = np.mean(x)
        std_val = np.std(x)
        skew_val = skew(x)
        kurt_val = kurtosis(x)
        rms = np.sqrt(np.mean(x**2))
        zero_crossings = ((x[:-1] * x[1:]) < 0).sum()

        features.extend([
            mean_psd, rel_power, spec_entropy, act, mob, comp,
            mean_val, std_val, skew_val, kurt_val, rms, zero_crossings
        ])
    return features

# Load EEG data
subject_data = {}
all_files = [
    f for f in os.listdir(INPUT_DIR)
    if f.endswith(".npy") and f.startswith("sub-")
]

for fn in tqdm(all_files, desc="Loading data"):
    stem, _ = os.path.splitext(fn)
    subj_prefix = stem.split("_")[0]
    try:
        subj_id = int(subj_prefix.replace("sub-", ""))
    except ValueError:
        continue

    label = get_label(subj_id)
    if label is None:
        continue

    eeg_path = os.path.join(INPUT_DIR, fn)
    time_series = np.load(eeg_path)
    if time_series.shape[0] < time_series.shape[1]:
        time_series = time_series.T

    subject_data[subj_id] = {
        "data": time_series,
        "label": label
    }

# Get number of EEG channels from first subject
num_channels = next(iter(subject_data.values()))["data"].shape[1]

# Feature name generation
channel_feature_names = [
    "mean_psd", "rel_power", "spectral_entropy",
    "hjorth_activity", "hjorth_mobility", "hjorth_complexity",
    "mean", "std", "skew", "kurtosis", "rms", "zero_crossings"
]

all_feature_names = []
for ch in range(num_channels):
    for fname in channel_feature_names:
        all_feature_names.append(f"ch{ch+1}_{fname}")

# Train-test split
subject_ids = list(subject_data.keys())
labels = [subject_data[s]["label"] for s in subject_ids]
train_ids, test_ids = train_test_split(subject_ids, test_size=0.3, stratify=labels, random_state=42)

# Subject-level feature extraction (aggregate mean per subject)
def process_subjects(subject_ids):
    all_feats, all_labels = [], []
    for sid in subject_ids:
        ts = subject_data[sid]["data"]
        label = subject_data[sid]["label"]
        subj_feats = []
        for start in range(0, ts.shape[0] - WINDOW_SIZE + 1, WINDOW_SIZE):
            window = ts[start:start + WINDOW_SIZE, :]
            feats = extract_features(window)
            subj_feats.append(feats)
        # Aggregate across all windows for this subject (mean)
        subj_feats = np.mean(subj_feats, axis=0)
        all_feats.append(subj_feats)
        all_labels.append(label)
    return np.array(all_feats), np.array(all_labels)

train_X, train_y = process_subjects(train_ids)
test_X, test_y = process_subjects(test_ids)

# ANOVA
f_scores = []
p_values = []
for i in range(train_X.shape[1]):
    group_feats = [train_X[train_y == c, i] for c in np.unique(train_y)]
    f, p = f_oneway(*group_feats)
    f_scores.append(f)
    p_values.append(p)

# Report significant features
print("\nSignificant Features (p < 0.05):")
for i, (f, p) in enumerate(zip(f_scores, p_values)):
    if p < 0.05:
        print(f"{all_feature_names[i]}: F = {f:.2f}, p = {p:.4f}")

# Filter features by ANOVA p-values
significant_indices = [i for i, p in enumerate(p_values) if p < 0.05]
train_X_filtered = train_X[:, significant_indices]
test_X_filtered = test_X[:, significant_indices]
filtered_feature_names = [all_feature_names[i] for i in significant_indices]

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_X_filtered, train_y)

# Evaluate
pred_y = rf.predict(test_X_filtered)
acc = accuracy_score(test_y, pred_y)
print(f"\nRandom Forest Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(test_y, pred_y))

# Feature importances
importances = rf.feature_importances_
feature_weights = sorted(zip(filtered_feature_names, importances), key=lambda x: x[1], reverse=True)
print("\nTop 10 Important Features:")
for feat, weight in feature_weights[:10]:
    print(f"{feat}: {weight:.4f}")

Loading data: 100%|██████████| 88/88 [00:08<00:00, 10.74it/s]



Significant Features (p < 0.05):
ch1_spectral_entropy: F = 3.68, p = 0.0312
ch2_spectral_entropy: F = 3.55, p = 0.0350
ch3_spectral_entropy: F = 4.31, p = 0.0179
ch4_spectral_entropy: F = 5.37, p = 0.0073
ch4_hjorth_complexity: F = 3.76, p = 0.0293
ch5_spectral_entropy: F = 3.49, p = 0.0370
ch6_spectral_entropy: F = 3.53, p = 0.0357
ch7_mean_psd: F = 5.84, p = 0.0049
ch7_spectral_entropy: F = 5.03, p = 0.0097
ch7_hjorth_activity: F = 5.95, p = 0.0044
ch7_std: F = 6.35, p = 0.0032
ch7_rms: F = 6.35, p = 0.0032
ch8_spectral_entropy: F = 3.71, p = 0.0305
ch9_mean_psd: F = 9.28, p = 0.0003
ch9_spectral_entropy: F = 7.91, p = 0.0009
ch9_hjorth_activity: F = 9.33, p = 0.0003
ch9_hjorth_complexity: F = 4.71, p = 0.0127
ch9_std: F = 8.91, p = 0.0004
ch9_rms: F = 8.91, p = 0.0004
ch10_mean_psd: F = 10.88, p = 0.0001
ch10_spectral_entropy: F = 10.19, p = 0.0002
ch10_hjorth_activity: F = 10.83, p = 0.0001
ch10_hjorth_complexity: F = 4.41, p = 0.0164
ch10_std: F = 12.82, p = 0.0000
ch10_rms: F = 

In [7]:
import os
import numpy as np
from scipy.stats import f_oneway, skew, kurtosis
from sklearn.model_selection import train_test_split
from scipy.signal import welch
from tqdm import tqdm
from scipy.stats import entropy as shannon_entropy
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Constants
INPUT_DIR = "C:\\Users\\fathi\\Desktop\\Rest_eeg_ds004504-download\\derivatives\\500hz_bands\\alpha\\numpy"
SAMPLE_RATE = 500
WINDOW_SIZE = 3 * SAMPLE_RATE  # 4 seconds

# Label mapping
def get_label(s):
    if 1 <= s <= 36:
        return 2  # Alzheimer's
    elif 37 <= s <= 65:
        return 0  # Healthy Control
    elif 66 <= s <= 88:
        return 1  # Dementia
    else:
        return None

# Hjorth parameters
def hjorth_parameters(signal):
    first_deriv = np.diff(signal)
    second_deriv = np.diff(first_deriv)
    activity = np.var(signal)
    mobility = np.sqrt(np.var(first_deriv) / activity) if activity != 0 else 0
    complexity = (np.sqrt(np.var(second_deriv) / np.var(first_deriv))
                  if np.var(first_deriv) != 0 else 0)
    return activity, mobility, complexity

# Feature extractor per window/channel
def extract_features(window):
    features = []
    for ch in range(window.shape[1]):
        x = window[:, ch]

        # Welch PSD
        f, psd = welch(x, fs=SAMPLE_RATE, nperseg=window.shape[0])
        total_power = np.sum(psd)
        mean_psd = np.mean(psd)
        rel_power = mean_psd / total_power if total_power > 0 else 0
        psd_norm = psd / np.sum(psd) if np.sum(psd) > 0 else np.ones_like(psd) / len(psd)
        spec_entropy = shannon_entropy(psd_norm)

        # Hjorth
        act, mob, comp = hjorth_parameters(x)

        # Additional features
        mean_val = np.mean(x)
        std_val = np.std(x)
        skew_val = skew(x)
        kurt_val = kurtosis(x)
        rms = np.sqrt(np.mean(x**2))
        zero_crossings = ((x[:-1] * x[1:]) < 0).sum()

        features.extend([
            mean_psd, rel_power, spec_entropy, act, mob, comp,
            mean_val, std_val, skew_val, kurt_val, rms, zero_crossings
        ])
    return features

# Load EEG data
subject_data = {}
all_files = [
    f for f in os.listdir(INPUT_DIR)
    if f.endswith(".npy") and f.startswith("sub-")
]

for fn in tqdm(all_files, desc="Loading data"):
    stem, _ = os.path.splitext(fn)
    subj_prefix = stem.split("_")[0]
    try:
        subj_id = int(subj_prefix.replace("sub-", ""))
    except ValueError:
        continue

    label = get_label(subj_id)
    if label is None:
        continue

    eeg_path = os.path.join(INPUT_DIR, fn)
    time_series = np.load(eeg_path)
    if time_series.shape[0] < time_series.shape[1]:
        time_series = time_series.T

    subject_data[subj_id] = {
        "data": time_series,
        "label": label
    }

# Get number of EEG channels from first subject
num_channels = next(iter(subject_data.values()))["data"].shape[1]

# Feature name generation
channel_feature_names = [
    "mean_psd", "rel_power", "spectral_entropy",
    "hjorth_activity", "hjorth_mobility", "hjorth_complexity",
    "mean", "std", "skew", "kurtosis", "rms", "zero_crossings"
]

all_feature_names = []
for ch in range(num_channels):
    for fname in channel_feature_names:
        all_feature_names.append(f"ch{ch+1}_{fname}")

# Train-test split
subject_ids = list(subject_data.keys())
labels = [subject_data[s]["label"] for s in subject_ids]
train_ids, test_ids = train_test_split(subject_ids, test_size=0.3, stratify=labels, random_state=42)

# Subject-level feature extraction (aggregate mean per subject)
def process_subjects(subject_ids):
    all_feats, all_labels = [], []
    for sid in subject_ids:
        ts = subject_data[sid]["data"]
        label = subject_data[sid]["label"]
        subj_feats = []
        for start in range(0, ts.shape[0] - WINDOW_SIZE + 1, WINDOW_SIZE):
            window = ts[start:start + WINDOW_SIZE, :]
            feats = extract_features(window)
            subj_feats.append(feats)
        # Aggregate across all windows for this subject (mean)
        subj_feats = np.mean(subj_feats, axis=0)
        all_feats.append(subj_feats)
        all_labels.append(label)
    return np.array(all_feats), np.array(all_labels)

train_X, train_y = process_subjects(train_ids)
test_X, test_y = process_subjects(test_ids)

# ANOVA
f_scores = []
p_values = []
for i in range(train_X.shape[1]):
    group_feats = [train_X[train_y == c, i] for c in np.unique(train_y)]
    f, p = f_oneway(*group_feats)
    f_scores.append(f)
    p_values.append(p)

# Report significant features
print("\nSignificant Features (p < 0.05):")
significant_features = []
significant_indices = []
for i, (f, p) in enumerate(zip(f_scores, p_values)):
    if p < 0.05:
        print(f"{all_feature_names[i]}: F = {f:.2f}, p = {p:.4f}")
        significant_features.append(all_feature_names[i])
        significant_indices.append(i)

# Filter train and test data based on significant features
train_X_sig = train_X[:, significant_indices]
test_X_sig = test_X[:, significant_indices]

# Random Forest Classification
print("\n--- Random Forest Classifier ---")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_X_sig, train_y)
preds_rf = rf.predict(test_X_sig)
print("Accuracy:", accuracy_score(test_y, preds_rf))
print("Feature Importances:")
for name, importance in zip(significant_features, rf.feature_importances_):
    print(f"{name}: {importance:.4f}")
print("\nClassification Report:\n", classification_report(test_y, preds_rf))

# XGBoost Classification
print("\n--- XGBoost Classifier ---")
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(train_X_sig, train_y)
preds_xgb = xgb.predict(test_X_sig)
print("Accuracy:", accuracy_score(test_y, preds_xgb))
print("Feature Importances:")
for name, importance in zip(significant_features, xgb.feature_importances_):
    print(f"{name}: {importance:.4f}")
print("\nClassification Report:\n", classification_report(test_y, preds_xgb))


Loading data: 100%|██████████| 88/88 [00:06<00:00, 12.94it/s]



Significant Features (p < 0.05):
ch1_rel_power: F = 4.43, p = 0.0162
ch1_spectral_entropy: F = 3.69, p = 0.0309
ch1_skew: F = 4.09, p = 0.0218
ch2_rel_power: F = 4.43, p = 0.0162
ch2_spectral_entropy: F = 3.68, p = 0.0312
ch3_rel_power: F = 4.43, p = 0.0162
ch3_spectral_entropy: F = 4.18, p = 0.0201
ch4_rel_power: F = 4.43, p = 0.0162
ch4_spectral_entropy: F = 5.43, p = 0.0069
ch4_hjorth_complexity: F = 3.61, p = 0.0333
ch5_rel_power: F = 4.43, p = 0.0162
ch5_spectral_entropy: F = 3.48, p = 0.0374
ch6_rel_power: F = 4.43, p = 0.0162
ch7_mean_psd: F = 6.02, p = 0.0042
ch7_rel_power: F = 4.43, p = 0.0162
ch7_spectral_entropy: F = 5.85, p = 0.0048
ch7_hjorth_activity: F = 5.95, p = 0.0045
ch7_std: F = 6.33, p = 0.0033
ch7_rms: F = 6.33, p = 0.0033
ch8_rel_power: F = 4.43, p = 0.0162
ch8_spectral_entropy: F = 3.69, p = 0.0310
ch9_mean_psd: F = 9.42, p = 0.0003
ch9_rel_power: F = 4.43, p = 0.0162
ch9_spectral_entropy: F = 8.41, p = 0.0006
ch9_hjorth_activity: F = 9.33, p = 0.0003
ch9_hjort

Parameters: { "use_label_encoder" } are not used.

