In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010030rest 20160324 1054..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020025rest 20150713 1519..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010013rest 20150703 1333..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020016rest 20150701 1040..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020015_rest 20150630 1527.csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010022restnew 20150724 14.csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020027rest 20150713 1049..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010008_rest 20150619 1653.csv
/kaggle/input/preprocessed-raw-m

# **Setup, load & preprocessing, save splits**

In [2]:
# CELL 1: Setup + Load + Preprocess + Save splits
import os, re, math, json, warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import IncrementalPCA

# CONFIG
DATA_DIR    = '/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2'
OUTPUT_DIR  = '/kaggle/working/dl_results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

SAMPLE_FRAC      = 1.0        # set 0.1 for quick tests
USE_IPCA         = True
IPCA_COMPONENTS  = 128
IPCA_BATCH       = 5000
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# GPU memory growth (optional)
gpus = tf.config.experimental.list_physical_devices('GPU')
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception:
        pass

# helpers
def detect_eeg_columns(columns):
    regex = re.compile(r'^(?:EEG[_\-\s]?|E[_\-\s]?)(0*?)(\d{1,3})$', flags=re.I)
    found = {}
    for c in columns:
        m = regex.match(c.strip())
        if m:
            num = int(m.group(2))
            if 1 <= num <= 128:
                found[num] = c
    if found:
        return [found[i] for i in sorted(found.keys())]
    # fallback
    return [c for c in columns if re.match(r'^(E|EEG)\d+', c, flags=re.I)]

def to_binary_label_series(s):
    s = s.dropna()
    if s.empty: return None
    s_num = pd.to_numeric(s, errors='coerce')
    if s_num.notna().all():
        uniq = set(np.unique(s_num))
        if uniq.issubset({0,1}): return s_num.astype(int)
        if uniq.issubset({1,2}): return s_num.map({1:0,2:1}).astype(int)
        med = float(s_num.median()); return (s_num > med).astype(int)
    s_str = s.astype(str)
    unique_vals = s_str.unique()
    if len(unique_vals) == 1: return s_str.map({unique_vals[0]:0}).astype(int)
    if len(unique_vals) == 2:
        le = LabelEncoder().fit(unique_vals)
        return pd.Series(le.transform(s_str), index=s_str.index).astype(int)
    mode_val = s_str.mode().iat[0]; return (s_str != mode_val).astype(int)

# 1) Read CSVs
csvs = sorted([f for f in os.listdir(DATA_DIR) if f.endswith('.csv')])
if len(csvs)==0:
    raise RuntimeError("No CSV files in DATA_DIR")
print("Found", len(csvs), "CSV files.")

parts = []
for fn in csvs:
    path = os.path.join(DATA_DIR, fn)
    df = pd.read_csv(path, engine='python')
    if SAMPLE_FRAC is not None and 0 < SAMPLE_FRAC < 1.0:
        df = df.sample(frac=SAMPLE_FRAC, random_state=SEED)
    df['__source_file'] = os.path.splitext(fn)[0]
    parts.append(df)
combined = pd.concat(parts, ignore_index=True)
print("Combined shape:", combined.shape)

# 2) label detection (prefer epoch, label, condition)
label_cols_try = ['epoch','label','condition','cond','target']
label_series = None
for c in label_cols_try:
    if c in combined.columns:
        s = to_binary_label_series(combined[c])
        if s is not None:
            label_series = pd.Series(index=combined.index, dtype=int)
            label_series.loc[combined[c].dropna().index] = s
            label_series = label_series.fillna(0).astype(int)
            print("Using", c, "as labels.")
            break
if label_series is None:
    # fallback search
    for c in combined.columns:
        if c.startswith('__'): continue
        s = to_binary_label_series(combined[c])
        if s is not None:
            label_series = pd.Series(index=combined.index, dtype=int)
            label_series.loc[combined[c].dropna().index] = s
            label_series = label_series.fillna(0).astype(int)
            print("Fallback using", c, "as labels.")
            break
if label_series is None:
    raise RuntimeError("No suitable label column found. Ensure 'epoch'/'label' exists.")

print("Label distribution:", label_series.value_counts().to_dict())
if label_series.nunique() <= 1:
    print("Detected single class after mapping — abort and inspect label columns.")
    raise RuntimeError("Single-class dataset. Fix labels.")

combined['__label'] = label_series.astype(int)

# 3) Detect EEG columns & form feature matrix
eeg_cols = detect_eeg_columns(combined.columns)
if not eeg_cols:
    raise RuntimeError("No EEG columns detected; check column names.")
print("Detected EEG columns:", len(eeg_cols))
# drop known metadata columns
drop_cols = {'time','condition','label','epoch','__source_file','__label'}
feature_cols = [c for c in eeg_cols if c not in drop_cols]
if len(feature_cols) == 0:
    raise RuntimeError("No feature columns after filtering.")
X_full = combined[feature_cols].to_numpy(dtype=np.float32)
y = combined['__label'].to_numpy(dtype=np.int32)
print("X_full shape:", X_full.shape, "y shape:", y.shape)

# impute NaNs
if np.isnan(X_full).any():
    col_means = np.nanmean(X_full, axis=0)
    inds = np.where(np.isnan(X_full)); X_full[inds] = np.take(col_means, inds[1])
    print("Imputed NaNs.")

# 4) Optional IncrementalPCA
if USE_IPCA and IPCA_COMPONENTS is not None and 0 < IPCA_COMPONENTS < X_full.shape[1]:
    print("Running IncrementalPCA...")
    ipca = IncrementalPCA(n_components=IPCA_COMPONENTS)
    n = X_full.shape[0]; bs = IPCA_BATCH
    for i in range(0, n, bs):
        ipca.partial_fit(X_full[i:i+bs])
    X_reduced = np.empty((n, IPCA_COMPONENTS), dtype=np.float32)
    for i in range(0, n, bs):
        X_reduced[i:i+bs] = ipca.transform(X_full[i:i+bs]).astype(np.float32)
    X = X_reduced
else:
    X = X_full
print("Post-PCA shape:", X.shape)

# 5) scale and split (save splits for model cells)
scaler = StandardScaler()
X = scaler.fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)
# persist splits so model cells can load them
np.savez_compressed(os.path.join(OUTPUT_DIR, 'data_split.npz'),
                    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
print("Saved data_split.npz to", OUTPUT_DIR)
# create empty models_results.json if not exists
res_path = os.path.join(OUTPUT_DIR, 'models_results.json')
if not os.path.exists(res_path):
    with open(res_path,'w') as f: json.dump([], f)
print("Cell 1 done.")


2025-11-20 09:15:46.698153: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763630146.895604      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763630146.959594      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Found 51 CSV files.
Combined shape: (3862388, 133)
Using epoch as labels.
Label distribution: {0: 1932951, 1: 1929437}
Detected EEG columns: 128
X_full shape: (3862388, 128) y shape: (3862388,)
Post-PCA shape: (3862388, 128)
Saved data_split.npz to /kaggle/working/dl_results
Cell 1 done.


# **Utility functions**

In [3]:
# CELL 2: Utility functions for model cells (run once)
import os, json, numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score

OUTPUT_DIR = '/kaggle/working/dl_results'
def load_data_splits():
    p = os.path.join(OUTPUT_DIR, 'data_split.npz')
    d = np.load(p)
    return d['X_train'], d['X_test'], d['y_train'], d['y_test']

def save_model_result(res):
    """Append JSON-serializable result dict to models_results.json"""
    p = os.path.join(OUTPUT_DIR, 'models_results.json')
    lst = []
    if os.path.exists(p):
        with open(p,'r') as f:
            try:
                lst = json.load(f)
            except Exception:
                lst = []
    lst.append(res)
    with open(p,'w') as f:
        json.dump(lst, f)

def make_result_dict(name, model, X_test, y_test, history=None):
    # predict probabilities where possible
    try:
        probs = model.predict(X_test, verbose=0).ravel()
    except Exception:
        # if model expects 3D or 4D, let caller reshape X_test appropriately before calling make_result_dict
        probs = model.predict(X_test, verbose=0).ravel()
    preds = (probs >= 0.5).astype(int)
    acc = float(accuracy_score(y_test, preds))
    try:
        roc_auc = float(roc_auc_score(y_test, probs))
    except Exception:
        roc_auc = None
    rep = classification_report(y_test, preds)
    cm = confusion_matrix(y_test, preds).tolist()
    try:
        fpr,tpr,_ = roc_curve(y_test, probs)
        fpr = fpr.tolist(); tpr = tpr.tolist()
    except Exception:
        fpr,tpr = [], []
    hist_dict = history.history if history is not None else {}
    # convert numpy types in hist to lists
    clean_hist = {k: (list(np.array(v).astype(float)) if hasattr(v,'__iter__') else v) for k,v in hist_dict.items()}
    res = {
        'name': name,
        'accuracy': acc,
        'roc_auc': roc_auc,
        'class_report': rep,
        'conf_mat': cm,
        'fpr': fpr,
        'tpr': tpr,
        'history': clean_hist
    }
    return res

print("Cell 2 loaded utilities.")

Cell 2 loaded utilities.


# **GAN**

In [6]:
# ============================================================
# CELL 3 — Lightweight Stabilized GAN (1000 epochs)
# Saves ONLY tiny .h5 weights + plots + JSON result
# ============================================================

import os, time, gc
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

OUTPUT_DIR = '/kaggle/working/dl_results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

LATENT_DIM = 100
EPOCHS_GAN = 1000
BATCH = 64
SEED = 42
np.random.seed(SEED); tf.random.set_seed(SEED)

# ---------------------------
# Load data splits
# ---------------------------
X_train, X_test, y_train, y_test = load_data_splits()
X = X_train.astype(np.float32)
FEATURES = X.shape[1]
N_train = X.shape[0]
print(f"[GAN] features={FEATURES} | n_train={N_train}")

# ---------------------------
# Generator & Discriminator
# ---------------------------
def build_generator():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_dim=LATENT_DIM),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(512),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(1024),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(FEATURES, activation='tanh')
    ], name="generator")

def build_discriminator():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(512, input_shape=(FEATURES,)),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(256),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ], name="discriminator")

generator = build_generator()
discriminator = build_discriminator()

opt_d = tf.keras.optimizers.Adam(0.0002, 0.5)
opt_g = tf.keras.optimizers.Adam(0.0002, 0.5)

@tf.function
def d_loss(real_s, fake_s): return tf.reduce_mean(fake_s) - tf.reduce_mean(real_s)

@tf.function
def g_loss(fake_s): return -tf.reduce_mean(fake_s)

WEIGHT_CLIP = 0.01
DISC_STEPS = 3

@tf.function
def train_step(real_batch):
    batch = tf.shape(real_batch)[0]
    
    for _ in tf.range(DISC_STEPS):
        noise = tf.random.normal((batch, LATENT_DIM))
        with tf.GradientTape() as tape_d:
            fake = generator(noise, training=True)
            real_out = discriminator(real_batch, training=True)
            fake_out = discriminator(fake, training=True)
            ld = d_loss(real_out, fake_out)
        grads_d = tape_d.gradient(ld, discriminator.trainable_variables)
        opt_d.apply_gradients(zip(grads_d, discriminator.trainable_variables))
        for v in discriminator.trainable_variables:
            v.assign(tf.clip_by_value(v, -WEIGHT_CLIP, WEIGHT_CLIP))

    noise = tf.random.normal((batch, LATENT_DIM))
    with tf.GradientTape() as tape_g:
        fake = generator(noise, training=True)
        fake_out = discriminator(fake, training=True)
        lg = g_loss(fake_out)
    grads_g = tape_g.gradient(lg, generator.trainable_variables)
    opt_g.apply_gradients(zip(grads_g, generator.trainable_variables))

    return ld, lg

# ---------------------------
# GAN Training
# ---------------------------
d_losses, g_losses = [], []
print("[GAN] Training...")

for epoch in range(EPOCHS_GAN):
    if epoch > 0 and epoch % 200 == 0:
        opt_d.learning_rate.assign(opt_d.learning_rate * 0.5)
        opt_g.learning_rate.assign(opt_g.learning_rate * 0.5)

    idx = np.random.randint(0, N_train, BATCH)
    real_batch = X[idx]
    ld, lg = train_step(real_batch)
    d_losses.append(float(ld)); g_losses.append(float(lg))

    if epoch % 50 == 0:
        print(f"Epoch {epoch}/{EPOCHS_GAN} | D={d_losses[-1]:.4f} G={g_losses[-1]:.4f}")

print("[GAN] Training Complete")

# ---------------------------
# Generate synthetic (memory safe)
# ---------------------------
def generate_in_batches(model, n_samples, batch=2048):
    out = []
    for i in range(0, n_samples, batch):
        b = min(batch, n_samples - i)
        noise = np.random.normal(0,1,(b, LATENT_DIM)).astype(np.float32)
        out.append(model.predict(noise, verbose=0))
    return np.vstack(out)

synthetic = generate_in_batches(generator, N_train)

# ---------------------------
# Train classifier version of discriminator
# ---------------------------
combined_X = np.vstack([X, synthetic]).astype(np.float32)
combined_y = np.hstack([y_train, np.zeros(len(synthetic), dtype=np.int32)])

# *** FIX: filename must end with 'weights.h5' when save_weights_only=True
disc_best_path = os.path.join(OUTPUT_DIR, "disc_classifier_best.weights.h5")

checkpoint = ModelCheckpoint(disc_best_path, save_best_only=True,
                             save_weights_only=True, monitor='val_loss', verbose=0)
# early = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

discriminator.compile(optimizer=tf.keras.optimizers.Adam(0.0002,0.5),
                      loss="binary_crossentropy", metrics=["accuracy"])

history = discriminator.fit(
    combined_X, combined_y,
    epochs=100, batch_size=64,
    validation_split=0.2,
    callbacks=checkpoint,
    verbose=1
)

# Save final generator weights (use .weights.h5 to be consistent)
generator.save_weights(os.path.join(OUTPUT_DIR,"gan_generator_last.weights.h5"))

# ---------------------------
# Evaluate
# ---------------------------
y_prob = discriminator.predict(X_test, verbose=0).ravel()
y_pred = (y_prob >= 0.5).astype(int)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# ---------------------------
# Save results to JSON (was missing)
# ---------------------------
gan_result = make_result_dict(
    name="GAN_Augmented_Classifier",
    model=discriminator,
    X_test=X_test,
    y_test=y_test,
    history=history
)
save_model_result(gan_result)
print("[JSON] GAN result appended.")

# ---------------------------
# Save essential PNG plots
# ---------------------------
fpr, tpr, _ = roc_curve(y_test, y_prob)
auc_val = roc_auc_score(y_test, y_prob)
plt.figure(figsize=(5,5))
plt.plot(fpr, tpr, label=f"AUC={auc_val:.3f}")
plt.plot([0,1],[0,1],'k--')
plt.title("GAN ROC")
plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR,"gan_roc_2.png"), dpi=100)
plt.close()

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4,4))
plt.imshow(cm, cmap="Blues")
plt.colorbar()
plt.title("GAN Confusion Matrix")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR,"gan_cm_2.png"), dpi=100)
plt.close()

plt.figure(figsize=(6,3))
plt.plot(history.history.get("accuracy",[]), label="train")
plt.plot(history.history.get("val_accuracy",[]), label="val")
plt.title("GAN Train/Val Accuracy")
plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR,"gan_acc_2.png"), dpi=100)
plt.close()

plt.figure(figsize=(6,3))
plt.plot(history.history.get("loss",[]), label="train")
plt.plot(history.history.get("val_loss",[]), label="val")
plt.title("GAN Train/Val Loss")
plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR,"gan_loss_2.png"), dpi=100)
plt.close()

# Cleanup
K.clear_session()
gc.collect()
print("✅ GAN Cell Completed (Tiny + JSON saved + Weights saved)")

[GAN] features=128 | n_train=3089910
[GAN] Training...
Epoch 0/1000 | D=-0.0001 G=-0.4997
Epoch 50/1000 | D=-0.0390 G=-0.4883
Epoch 100/1000 | D=0.0038 G=-0.5284
Epoch 150/1000 | D=-0.0498 G=-0.4673
Epoch 200/1000 | D=-0.0445 G=-0.4572
Epoch 250/1000 | D=-0.0235 G=-0.5066
Epoch 300/1000 | D=-0.0066 G=-0.4862
Epoch 350/1000 | D=-0.0446 G=-0.5076
Epoch 400/1000 | D=-0.0427 G=-0.4669
Epoch 450/1000 | D=-0.0253 G=-0.5079
Epoch 500/1000 | D=0.0310 G=-0.5597
Epoch 550/1000 | D=0.0152 G=-0.4661
Epoch 600/1000 | D=-0.0165 G=-0.5054
Epoch 650/1000 | D=-0.0108 G=-0.5085
Epoch 700/1000 | D=-0.0008 G=-0.5419
Epoch 750/1000 | D=-0.0393 G=-0.4602
Epoch 800/1000 | D=-0.0027 G=-0.4969
Epoch 850/1000 | D=-0.0128 G=-0.5023
Epoch 900/1000 | D=-0.0094 G=-0.5064
Epoch 950/1000 | D=-0.0111 G=-0.5135
[GAN] Training Complete
Epoch 1/100
[1m77248/77248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 3ms/step - accuracy: 0.7268 - loss: 0.4353 - val_accuracy: 1.0000 - val_loss: 0.0064
Epoch 2/100
[1m77