In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Location of the EEG training table, then read it into a DataFrame
file_path = "data/train.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Relevant EEG features, listed per disorder group for readability.
# Some groups intentionally mention the same channel/band (e.g. the FP1/FP2
# alpha and theta power columns), so the raw list contains repeated names.
relevant_columns = [
    # Schizophrenia (alpha PSD)
    "AB.C.alpha.a.FP1", "AB.C.alpha.b.FP2", "AB.C.alpha.c.F7", "AB.C.alpha.d.F3",
    "AB.C.alpha.e.Fz", "AB.C.alpha.f.F4", "AB.C.alpha.g.F8", "AB.C.alpha.h.T3",
    "AB.C.alpha.i.C3", "AB.C.alpha.j.Cz", "AB.C.alpha.k.C4", "AB.C.alpha.l.T4",
    "AB.C.alpha.m.T5", "AB.C.alpha.n.P3", "AB.C.alpha.o.Pz", "AB.C.alpha.p.P4",
    "AB.C.alpha.q.T6", "AB.C.alpha.r.O1", "AB.C.alpha.s.O2",

    # Trauma and stress-related disorders (beta FC)
    "COH.D.beta.a.FP1.b.FP2", "COH.D.beta.c.F7.d.F3", "COH.D.beta.e.Fz.f.F4",
    "COH.D.beta.g.F8.h.T3", "COH.D.beta.i.C3.j.Cz", "COH.D.beta.k.C4.l.T4",
    "COH.D.beta.m.T5.n.P3", "COH.D.beta.o.Pz.p.P4", "COH.D.beta.q.T6.r.O1",

    # Anxiety disorders (whole band PSD)
    "AB.A.delta.a.FP1", "AB.A.delta.b.FP2", "AB.B.theta.a.FP1", "AB.B.theta.b.FP2",
    "AB.C.alpha.a.FP1", "AB.C.alpha.b.FP2", "AB.D.beta.a.FP1", "AB.D.beta.b.FP2",
    "AB.E.highbeta.a.FP1", "AB.E.highbeta.b.FP2", "AB.F.gamma.a.FP1", "AB.F.gamma.b.FP2",

    # Mood disorders (theta FC)
    "COH.B.theta.a.FP1.b.FP2", "COH.B.theta.c.F7.d.F3", "COH.B.theta.e.Fz.f.F4",
    "COH.B.theta.g.F8.h.T3", "COH.B.theta.i.C3.j.Cz", "COH.B.theta.k.C4.l.T4",
    "COH.B.theta.m.T5.n.P3", "COH.B.theta.o.Pz.p.P4", "COH.B.theta.q.T6.r.O1",

    # Addictive disorders (theta PSD)
    "AB.B.theta.a.FP1", "AB.B.theta.b.FP2", "AB.B.theta.c.F7", "AB.B.theta.d.F3",
    "AB.B.theta.e.Fz", "AB.B.theta.f.F4", "AB.B.theta.g.F8", "AB.B.theta.h.T3",
    "AB.B.theta.i.C3", "AB.B.theta.j.Cz", "AB.B.theta.k.C4", "AB.B.theta.l.T4",
    "AB.B.theta.m.T5", "AB.B.theta.n.P3", "AB.B.theta.o.Pz", "AB.B.theta.p.P4",
    "AB.B.theta.q.T6", "AB.B.theta.r.O1", "AB.B.theta.s.O2",

    # Obsessive-compulsive disorder (gamma FC)
    "COH.F.gamma.a.FP1.b.FP2", "COH.F.gamma.c.F7.d.F3", "COH.F.gamma.e.Fz.f.F4",
    "COH.F.gamma.g.F8.h.T3", "COH.F.gamma.i.C3.j.Cz", "COH.F.gamma.k.C4.l.T4",
    "COH.F.gamma.m.T5.n.P3", "COH.F.gamma.o.Pz.p.P4", "COH.F.gamma.q.T6.r.O1"
]

# Drop the repeated names while keeping first-appearance order. Selecting a
# DataFrame with a repeated label yields that column twice, which would feed
# the model redundant, perfectly correlated inputs.
relevant_columns = list(dict.fromkeys(relevant_columns))

# Column that holds the class label
target_col = "main.disorder"

# Keep only the label and the selected features, then discard every row
# that has a missing value in any of those columns
df_eeg = df[[target_col, *relevant_columns]].dropna(axis=0, how="any")


In [2]:
def build_autoencoder(input_dim, encoding_dim=32):
    """Build a symmetric dense autoencoder and its encoder half.

    Layer stack: input -> Dense(64, relu) -> Dense(encoding_dim, relu)
    -> Dense(64, relu) -> Dense(input_dim, linear).

    Args:
        input_dim: Number of input features; also the width of the output layer.
        encoding_dim: Width of the bottleneck (latent) layer. Defaults to 32.

    Returns:
        A tuple ``(autoencoder, encoder)``. ``autoencoder`` maps the input to
        its reconstruction and is compiled with the Adam optimizer and MSE
        loss. ``encoder`` maps the same input tensor to the bottleneck output,
        so it shares the encoder layers; it is returned uncompiled.
    """
    inputs = Input(shape=(input_dim,))

    # Encoder: compress down to the latent representation
    hidden_enc = Dense(units=64, activation="relu")(inputs)
    bottleneck = Dense(units=encoding_dim, activation="relu")(hidden_enc)

    # Decoder: expand back; linear output so the width and range match the input
    hidden_dec = Dense(units=64, activation="relu")(bottleneck)
    reconstruction = Dense(units=input_dim, activation="linear")(hidden_dec)

    # Full model, trained to reproduce its own input
    autoencoder = Model(inputs=inputs, outputs=reconstruction)
    autoencoder.compile(optimizer="adam", loss="mse")

    # Encoder-only view used for feature extraction
    encoder = Model(inputs=inputs, outputs=bottleneck)

    return autoencoder, encoder


In [3]:
# Unique disorder classes, in order of first appearance in the cleaned frame
disorders = df_eeg[target_col].unique()

# Autoencoder parameters
encoding_dim = 10  # Latent space size
epochs = 100
batch_size = 32
min_samples = 10   # Classes with fewer rows than this are skipped
random_state = 42  # Shared seed for the split, weight init and batch shuffling

# Seed Python, NumPy and TensorFlow so a fresh "Restart & Run All" repeats the
# same training run (the split alone was already seeded).
tf.keras.utils.set_random_seed(random_state)

# Train an autoencoder for each disorder class
for disorder in disorders:
    print(f"\n🔹 Training Autoencoder for: {disorder}")

    # Rows of the current class. df_eeg already has rows with missing values
    # removed, so no further dropna is needed; dropping the label column
    # leaves exactly the feature columns.
    df_class = df_eeg[df_eeg[target_col] == disorder].drop(columns=[target_col])

    if df_class.shape[0] < min_samples:
        print(f"⚠️ Skipping {disorder} (too few samples)")
        continue  # Skip classes with too few samples

    # Split BEFORE scaling so the validation rows do not influence the
    # mean/variance the scaler learns (avoids train/validation leakage).
    train_rows, val_rows = train_test_split(
        df_class, test_size=0.2, random_state=random_state
    )

    # Normalize the features using statistics from the training rows only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows)
    X_val = scaler.transform(val_rows)
    X_scaled = scaler.transform(df_class)  # all rows of the class, for encoding

    # Build the model with the shared helper. Its output layer is linear, which
    # is required here: standardized features are unbounded and often negative,
    # so a sigmoid output (range 0..1) could never reconstruct them.
    input_dim = X_train.shape[1]
    autoencoder, encoder = build_autoencoder(input_dim, encoding_dim=encoding_dim)

    # Train the autoencoder to reproduce its own input
    autoencoder.fit(X_train, X_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(X_val, X_val),
                    verbose=1)

    # Encode every row of the class and attach the disorder label
    encoded_data = encoder.predict(X_scaled)
    encoded_df = pd.DataFrame(encoded_data)
    encoded_df[target_col] = disorder

    # Save to CSV (spaces in the class name become underscores)
    save_path = f"data/encoded_{disorder.replace(' ', '_')}.csv"
    encoded_df.to_csv(save_path, index=False)
    print(f"Encoded representations saved: {save_path}")

print("\n Autoencoding completed for all disorder classes!")


🔹 Training Autoencoder for: Mood disorder


NameError: name 'train_test_split' is not defined

In [None]:
# Concatenate synthetic data from all classes
# NOTE(review): `synthetic_data` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. It presumably
# should hold the per-class frames to stack row-wise — confirm where it is built.
df_synthetic_all = pd.concat(synthetic_data, axis=0)

# Save synthetic dataset alongside the per-class encoded files. The path needs
# a base name: a bare ".csv" would create a hidden, nameless file in the
# current working directory.
synthetic_path = "data/synthetic_eeg.csv"
df_synthetic_all.to_csv(synthetic_path, index=False)

print("Synthetic EEG dataset saved!")
