In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010030rest 20160324 1054..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020025rest 20150713 1519..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010013rest 20150703 1333..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020016rest 20150701 1040..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020015_rest 20150630 1527.csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010022restnew 20150724 14.csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020027rest 20150713 1049..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010008_rest 20150619 1653.csv
/kaggle/input/preprocessed-raw-m

# **Setup, load & preprocessing, save splits**

In [2]:
# CELL 1: Setup + Load + Preprocess + Save splits
import os, re, math, json, warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import IncrementalPCA

# CONFIG
DATA_DIR    = '/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2'
OUTPUT_DIR  = '/kaggle/working/dl_results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

SAMPLE_FRAC      = 1.0        # set 0.1 for quick tests
USE_IPCA         = True
IPCA_COMPONENTS  = 128
IPCA_BATCH       = 5000
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# GPU memory growth (optional)
gpus = tf.config.experimental.list_physical_devices('GPU')
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception:
        pass

# helpers
def detect_eeg_columns(columns):
    regex = re.compile(r'^(?:EEG[_\-\s]?|E[_\-\s]?)(0*?)(\d{1,3})$', flags=re.I)
    found = {}
    for c in columns:
        m = regex.match(c.strip())
        if m:
            num = int(m.group(2))
            if 1 <= num <= 128:
                found[num] = c
    if found:
        return [found[i] for i in sorted(found.keys())]
    # fallback
    return [c for c in columns if re.match(r'^(E|EEG)\d+', c, flags=re.I)]

def to_binary_label_series(s):
    s = s.dropna()
    if s.empty: return None
    s_num = pd.to_numeric(s, errors='coerce')
    if s_num.notna().all():
        uniq = set(np.unique(s_num))
        if uniq.issubset({0,1}): return s_num.astype(int)
        if uniq.issubset({1,2}): return s_num.map({1:0,2:1}).astype(int)
        med = float(s_num.median()); return (s_num > med).astype(int)
    s_str = s.astype(str)
    unique_vals = s_str.unique()
    if len(unique_vals) == 1: return s_str.map({unique_vals[0]:0}).astype(int)
    if len(unique_vals) == 2:
        le = LabelEncoder().fit(unique_vals)
        return pd.Series(le.transform(s_str), index=s_str.index).astype(int)
    mode_val = s_str.mode().iat[0]; return (s_str != mode_val).astype(int)

# 1) Read CSVs
csvs = sorted([f for f in os.listdir(DATA_DIR) if f.endswith('.csv')])
if len(csvs)==0:
    raise RuntimeError("No CSV files in DATA_DIR")
print("Found", len(csvs), "CSV files.")

parts = []
for fn in csvs:
    path = os.path.join(DATA_DIR, fn)
    df = pd.read_csv(path, engine='python')
    if SAMPLE_FRAC is not None and 0 < SAMPLE_FRAC < 1.0:
        df = df.sample(frac=SAMPLE_FRAC, random_state=SEED)
    df['__source_file'] = os.path.splitext(fn)[0]
    parts.append(df)
combined = pd.concat(parts, ignore_index=True)
print("Combined shape:", combined.shape)

# 2) label detection (prefer epoch, label, condition)
label_cols_try = ['epoch','label','condition','cond','target']
label_series = None
for c in label_cols_try:
    if c in combined.columns:
        s = to_binary_label_series(combined[c])
        if s is not None:
            label_series = pd.Series(index=combined.index, dtype=int)
            label_series.loc[combined[c].dropna().index] = s
            label_series = label_series.fillna(0).astype(int)
            print("Using", c, "as labels.")
            break
if label_series is None:
    # fallback search
    for c in combined.columns:
        if c.startswith('__'): continue
        s = to_binary_label_series(combined[c])
        if s is not None:
            label_series = pd.Series(index=combined.index, dtype=int)
            label_series.loc[combined[c].dropna().index] = s
            label_series = label_series.fillna(0).astype(int)
            print("Fallback using", c, "as labels.")
            break
if label_series is None:
    raise RuntimeError("No suitable label column found. Ensure 'epoch'/'label' exists.")

print("Label distribution:", label_series.value_counts().to_dict())
if label_series.nunique() <= 1:
    print("Detected single class after mapping — abort and inspect label columns.")
    raise RuntimeError("Single-class dataset. Fix labels.")

combined['__label'] = label_series.astype(int)

# 3) Detect EEG columns & form feature matrix
eeg_cols = detect_eeg_columns(combined.columns)
if not eeg_cols:
    raise RuntimeError("No EEG columns detected; check column names.")
print("Detected EEG columns:", len(eeg_cols))
# drop known metadata columns
drop_cols = {'time','condition','label','epoch','__source_file','__label'}
feature_cols = [c for c in eeg_cols if c not in drop_cols]
if len(feature_cols) == 0:
    raise RuntimeError("No feature columns after filtering.")
X_full = combined[feature_cols].to_numpy(dtype=np.float32)
y = combined['__label'].to_numpy(dtype=np.int32)
print("X_full shape:", X_full.shape, "y shape:", y.shape)

# impute NaNs
if np.isnan(X_full).any():
    col_means = np.nanmean(X_full, axis=0)
    inds = np.where(np.isnan(X_full)); X_full[inds] = np.take(col_means, inds[1])
    print("Imputed NaNs.")

# 4) Optional IncrementalPCA
if USE_IPCA and IPCA_COMPONENTS is not None and 0 < IPCA_COMPONENTS < X_full.shape[1]:
    print("Running IncrementalPCA...")
    ipca = IncrementalPCA(n_components=IPCA_COMPONENTS)
    n = X_full.shape[0]; bs = IPCA_BATCH
    for i in range(0, n, bs):
        ipca.partial_fit(X_full[i:i+bs])
    X_reduced = np.empty((n, IPCA_COMPONENTS), dtype=np.float32)
    for i in range(0, n, bs):
        X_reduced[i:i+bs] = ipca.transform(X_full[i:i+bs]).astype(np.float32)
    X = X_reduced
else:
    X = X_full
print("Post-PCA shape:", X.shape)

# 5) scale and split (save splits for model cells)
scaler = StandardScaler()
X = scaler.fit_transform(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)
# persist splits so model cells can load them
np.savez_compressed(os.path.join(OUTPUT_DIR, 'data_split.npz'),
                    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
print("Saved data_split.npz to", OUTPUT_DIR)
# create empty models_results.json if not exists
res_path = os.path.join(OUTPUT_DIR, 'models_results.json')
if not os.path.exists(res_path):
    with open(res_path,'w') as f: json.dump([], f)
print("Cell 1 done.")


2025-11-23 19:38:04.292186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763926684.772100      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763926684.934232      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Found 51 CSV files.
Combined shape: (3862388, 133)
Using epoch as labels.
Label distribution: {0: 1932951, 1: 1929437}
Detected EEG columns: 128
X_full shape: (3862388, 128) y shape: (3862388,)
Post-PCA shape: (3862388, 128)
Saved data_split.npz to /kaggle/working/dl_results
Cell 1 done.


# **Utility functions**

In [1]:
# CELL 2: Utility functions for model cells (run once)
import os, json, numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score

OUTPUT_DIR = '/kaggle/working/dl_results'
def load_data_splits():
    p = os.path.join(OUTPUT_DIR, 'data_split.npz')
    d = np.load(p)
    return d['X_train'], d['X_test'], d['y_train'], d['y_test']

def save_model_result(res):
    """Append JSON-serializable result dict to models_results.json"""
    p = os.path.join(OUTPUT_DIR, 'models_results.json')
    lst = []
    if os.path.exists(p):
        with open(p,'r') as f:
            try:
                lst = json.load(f)
            except Exception:
                lst = []
    lst.append(res)
    with open(p,'w') as f:
        json.dump(lst, f)

def make_result_dict(name, model, X_test, y_test, history=None):
    # predict probabilities where possible
    try:
        probs = model.predict(X_test, verbose=0).ravel()
    except Exception:
        # if model expects 3D or 4D, let caller reshape X_test appropriately before calling make_result_dict
        probs = model.predict(X_test, verbose=0).ravel()
    preds = (probs >= 0.5).astype(int)
    acc = float(accuracy_score(y_test, preds))
    try:
        roc_auc = float(roc_auc_score(y_test, probs))
    except Exception:
        roc_auc = None
    rep = classification_report(y_test, preds)
    cm = confusion_matrix(y_test, preds).tolist()
    try:
        fpr,tpr,_ = roc_curve(y_test, probs)
        fpr = fpr.tolist(); tpr = tpr.tolist()
    except Exception:
        fpr,tpr = [], []
    hist_dict = history.history if history is not None else {}
    # convert numpy types in hist to lists
    clean_hist = {k: (list(np.array(v).astype(float)) if hasattr(v,'__iter__') else v) for k,v in hist_dict.items()}
    res = {
        'name': name,
        'accuracy': acc,
        'roc_auc': roc_auc,
        'class_report': rep,
        'conf_mat': cm,
        'fpr': fpr,
        'tpr': tpr,
        'history': clean_hist
    }
    return res

print("Cell 2 loaded utilities.")

Cell 2 loaded utilities.


# **Statistical Analysis**

In [4]:
# CELL: Per-subject graphs -> group-level node betweenness & clustering comparisons
# - Loads each subject CSV separately (memory-safe)
# - Builds per-subject correlation graphs (proportional threshold by density)
# - Computes node betweenness & clustering coef per subject
# - Aggregates across groups, runs tests (t-test or Mann-Whitney), computes Cohen's d
# - Saves CSVs + histograms + boxplots to OUTPUT_DIR
import os, math, time, json, warnings
from pathlib import Path
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import networkx as nx
import scipy.stats as st

# --------- CONFIG ---------
DATA_DIR   = r"/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2"  # edit if needed
OUTPUT_DIR = r"/kaggle/working/dl_results/per_subject_graphs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Graph thresholding: 'proportional' keeps top fraction of edges; set density in (0,1)
THRESH_METHOD = "proportional"
DENSITY_KEEP  = 0.20   # keep top 20% absolute correlations as edges
MIN_TIMESERIES_LEN = 5  # skip files with fewer samples
LABEL_COL_CANDIDATES = ['epoch','label','condition','cond','target','y']  # try in order

print("OUTPUT_DIR:", OUTPUT_DIR)
start_all = time.time()

# ------- helpers -------
def detect_eeg_columns(cols):
    import re
    regex = re.compile(r'^(?:EEG[_\-\s]?|E[_\-\s]?)(0*?)(\d{1,3})$', flags=re.I)
    found = {}
    for c in cols:
        m = regex.match(str(c).strip())
        if m:
            num = int(m.group(2))
            if 1 <= num <= 128:
                found[num] = c
    if found:
        return [found[i] for i in sorted(found.keys())]
    # fallback: pick numeric columns except obvious metadata
    cand = [c for c in cols if isinstance(c, str) and c.lower() not in ('time','condition')]
    return cand

def to_binary_label_series(s):
    s = s.dropna()
    if s.empty: return None
    s_num = pd.to_numeric(s, errors='coerce')
    if s_num.notna().all():
        uniq = set(np.unique(s_num))
        if uniq.issubset({0,1}): return s_num.astype(int)
        if uniq.issubset({1,2}): return s_num.map({1:0,2:1}).astype(int)
        med = float(s_num.median()); return (s_num > med).astype(int)
    s_str = s.astype(str)
    unique_vals = s_str.unique()
    if len(unique_vals) == 1: return s_str.map({unique_vals[0]:0}).astype(int)
    if len(unique_vals) == 2:
        # deterministic mapping by sort
        mapping = {unique_vals[0]:0, unique_vals[1]:1}
        return s_str.map(mapping).astype(int)
    mode_val = s_str.mode().iat[0]; return (s_str != mode_val).astype(int)

def threshold_proportional(abs_corr, keep_density):
    # abs_corr: square matrix, diag = 1
    n = abs_corr.shape[0]
    iu = np.triu_indices(n, k=1)
    vals = abs_corr[iu]
    if vals.size == 0:
        return np.zeros_like(abs_corr, dtype=int)
    # determine cutoff so that proportion of edges kept approximates keep_density
    k = int(np.floor(keep_density * (n*(n-1)/2)))
    if k <= 0:
        thr = 1.01  # keep none
    else:
        thr = np.sort(vals)[-k] if k < len(vals) else vals.min()
    A = (abs_corr >= thr).astype(int)
    np.fill_diagonal(A, 0)
    return A

def cohen_d(x, y):
    x = np.asarray(x); y = np.asarray(y)
    nx_, ny_ = len(x), len(y)
    if nx_ < 2 or ny_ < 2:
        return np.nan
    sx = x.std(ddof=1); sy = y.std(ddof=1)
    if np.isnan(sx) or np.isnan(sy): return np.nan
    pooled = np.sqrt(((nx_-1)*sx*sx + (ny_-1)*sy*sy) / (nx_+ny_-2))
    if pooled == 0:
        return np.nan
    return (x.mean() - y.mean()) / pooled

# ------- iterate over files (per-subject) -------
files = sorted([f for f in os.listdir(DATA_DIR) if f.lower().endswith('.csv')])
if len(files) == 0:
    raise RuntimeError("No CSVs found in DATA_DIR")

rows = []   # per-subject summary rows
node_rows = []  # per-subject per-node metrics

for fn in files:
    fp = os.path.join(DATA_DIR, fn)
    try:
        df = pd.read_csv(fp, engine='python')
    except Exception as e:
        print("skip (read fail):", fn, e); continue

    # detect label for this file
    label_series = None; label_found = None
    for c in LABEL_COL_CANDIDATES:
        if c in df.columns:
            s = to_binary_label_series(df[c])
            if s is not None:
                label_series = s
                label_found = c
                break
    # fallback: if no label in-file, try filename mapping (source-level); assume filename contains class 0/1?
    if label_series is None:
        # default: no label -> skip (we need group membership)
        print("skip (no label):", fn); continue

    # detect EEG/feature columns
    eeg_cols = detect_eeg_columns(df.columns)
    # remove metadata columns often present
    eeg_cols = [c for c in eeg_cols if c not in ('time','condition','epoch','label')]
    if len(eeg_cols) < 2:
        # try numeric columns except label
        cand = [c for c in df.columns if c not in (label_found, 'time','condition') and pd.api.types.is_numeric_dtype(df[c])]
        eeg_cols = cand
    if len(eeg_cols) < 2:
        print("skip (not enough features):", fn); continue

    # prepare timeseries: shape (T, n_features)
    data = df[eeg_cols].apply(pd.to_numeric, errors='coerce').values
    # drop rows with NaNs
    mask = ~np.isnan(data).any(axis=1)
    data = data[mask]
    if data.shape[0] < MIN_TIMESERIES_LEN:
        print("skip (short timeseries):", fn); continue

    # compute feature x feature correlation across time (pearson)
    # shape (n_features, n_features)
    try:
        C = np.corrcoef(data, rowvar=False)
        C = np.nan_to_num(C, nan=0.0)
    except Exception as e:
        print("corr fail:", fn, e); continue

    absC = np.abs(C)

    # threshold -> adjacency
    if THRESH_METHOD == "proportional":
        A = threshold_proportional(absC, DENSITY_KEEP)
    else:
        # fixed threshold fallback
        thr = float(THRESH_METHOD)
        A = (absC >= thr).astype(int); np.fill_diagonal(A,0)

    # build graph
    G = nx.from_numpy_array(A)
    # compute node-level metrics
    if G.number_of_nodes() == 0:
        print("empty graph:", fn); continue

    # betweenness centrality
    try:
        bc = nx.betweenness_centrality(G, normalized=True)
    except Exception:
        # fallback approximate for large graphs
        bc = nx.betweenness_centrality(G, normalized=True)
    clust = nx.clustering(G)   # local clustering coefficient (unweighted)
    degree = dict(G.degree())

    # aggregate metrics (per-node)
    for node in range(len(eeg_cols)):
        node_rows.append({
            'subject': fn,
            'label': int(label_series.mode().iat[0]) if hasattr(label_series, 'mode') else int(label_series.iloc[0]),
            'node_idx': int(node),
            'feature_name': eeg_cols[node],
            'betweenness': float(bc.get(node, 0.0)),
            'clustering': float(clust.get(node, 0.0)),
            'degree': int(degree.get(node, 0))
        })

    # subject-level summary (mean metrics across nodes)
    mean_bc = np.mean(list(bc.values())) if len(bc)>0 else 0.0
    mean_cl = np.mean(list(clust.values())) if len(clust)>0 else 0.0
    rows.append({
        'subject': fn,
        'label': int(label_series.mode().iat[0]) if hasattr(label_series, 'mode') else int(label_series.iloc[0]),
        'n_nodes': len(eeg_cols),
        'n_time': data.shape[0],
        'mean_betweenness': float(mean_bc),
        'mean_clustering': float(mean_cl),
        'density_used': float(A.sum() / (len(eeg_cols)*(len(eeg_cols)-1)))
    })

# ----- save raw per-node + per-subject tables -----
node_df = pd.DataFrame(node_rows)
subj_df = pd.DataFrame(rows)
node_df.to_csv(os.path.join(OUTPUT_DIR, "per_subject_node_metrics.csv"), index=False)
subj_df.to_csv(os.path.join(OUTPUT_DIR, "per_subject_summary.csv"), index=False)
print("Saved per-subject CSVs:", os.path.join(OUTPUT_DIR, "per_subject_node_metrics.csv"))

# -------- group-level comparisons --------
if subj_df.shape[0] == 0:
    raise RuntimeError("No subjects processed. Check DATA_DIR and label presence in files.")

# hist / box: subject-level mean metrics by group
groups = subj_df.groupby('label')
labels_present = sorted(subj_df['label'].unique())

# boxplot: mean betweenness per subject
plt.figure(figsize=(6,4))
sns.boxplot(x='label', y='mean_betweenness', data=subj_df)
plt.title("Subject mean node betweenness by group")
plt.xlabel("Label"); plt.ylabel("Mean betweenness")
plt.tight_layout(); plt.savefig(os.path.join(OUTPUT_DIR, "box_mean_betweenness_by_group.png"), dpi=150); plt.close()

# histogram overlay
plt.figure(figsize=(6,4))
for lab in labels_present:
    sns.histplot(subj_df.loc[subj_df['label']==lab, 'mean_betweenness'], label=f"group {lab}", kde=True, stat='density', bins=30)
plt.legend(); plt.title("Histogram: subject mean betweenness"); plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "hist_mean_betweenness_by_group.png"), dpi=150); plt.close()

# clustering box + hist
plt.figure(figsize=(6,4))
sns.boxplot(x='label', y='mean_clustering', data=subj_df)
plt.title("Subject mean clustering by group")
plt.xlabel("Label"); plt.ylabel("Mean clustering")
plt.tight_layout(); plt.savefig(os.path.join(OUTPUT_DIR, "box_mean_clustering_by_group.png"), dpi=150); plt.close()

plt.figure(figsize=(6,4))
for lab in labels_present:
    sns.histplot(subj_df.loc[subj_df['label']==lab, 'mean_clustering'], label=f"group {lab}", kde=True, stat='density', bins=30)
plt.legend(); plt.title("Histogram: subject mean clustering"); plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "hist_mean_clustering_by_group.png"), dpi=150); plt.close()

# --------- statistical tests: per-subject means ----------
stats_rows = []
if len(labels_present) == 2:
    a = subj_df.loc[subj_df['label']==labels_present[0]]
    b = subj_df.loc[subj_df['label']==labels_present[1]]
    for metric in ['mean_betweenness','mean_clustering']:
        xa = a[metric].values
        xb = b[metric].values
        # choose test depending on normality (Shapiro if n<=5000; else assume approx normal)
        use_t = True
        try:
            if len(xa) >= 3 and len(xb) >= 3:
                pa = st.shapiro(xa).pvalue
                pb = st.shapiro(xb).pvalue
                use_t = (pa>0.05 and pb>0.05)
        except Exception:
            use_t = False
        if use_t:
            stat, p = st.ttest_ind(xa, xb, equal_var=False)
            test = 'ttest'
        else:
            stat, p = st.mannwhitneyu(xa, xb, alternative='two-sided')
            test = 'mannwhitneyu'
        d = cohen_d(xa, xb)
        stats_rows.append({'metric': metric, 'test': test, 'stat': float(stat), 'pval': float(p), 'cohen_d': float(d)})
else:
    # multi-group: Kruskal-Wallis on subject means
    for metric in ['mean_betweenness','mean_clustering']:
        groups_vals = [g[metric].values for _, g in subj_df.groupby('label')]
        try:
            stat, p = st.kruskal(*groups_vals)
            test = 'kruskal'
        except Exception:
            stat, p, test = np.nan, np.nan, 'na'
        stats_rows.append({'metric': metric, 'test': test, 'stat': float(stat) if not np.isnan(stat) else None, 'pval': float(p) if not np.isnan(p) else None, 'cohen_d': None})

stats_df = pd.DataFrame(stats_rows)
stats_df.to_csv(os.path.join(OUTPUT_DIR, "subject_level_stats.csv"), index=False)
print("Saved subject-level stats:", os.path.join(OUTPUT_DIR, "subject_level_stats.csv"))

# -------- node-level testing (per-node across subjects) --------
# pivot node_df so each subject provides a value per node; then test node-wise between groups
if node_df.shape[0] > 0 and len(labels_present) == 2:
    node_stats = []
    nodes = sorted(node_df['node_idx'].unique())
    for node in nodes:
        subn = node_df[node_df['node_idx'] == node]
        ga = subn[subn['label']==labels_present[0]]['betweenness'].values
        gb = subn[subn['label']==labels_present[1]]['betweenness'].values
        # require >=2 per group
        if len(ga) < 2 or len(gb) < 2:
            continue
        # test
        use_t = True
        try:
            pa = st.shapiro(ga).pvalue if len(ga)>=3 else 1.0
            pb = st.shapiro(gb).pvalue if len(gb)>=3 else 1.0
            use_t = (pa>0.05 and pb>0.05)
        except Exception:
            use_t = False
        if use_t:
            stat, p = st.ttest_ind(ga, gb, equal_var=False)
            test = 'ttest'
        else:
            stat, p = st.mannwhitneyu(ga, gb, alternative='two-sided')
            test = 'mannwhitneyu'
        d = cohen_d(ga, gb)
        node_stats.append({'node_idx': int(node), 'feature_name': subn['feature_name'].iloc[0], 'test': test, 'stat': float(stat), 'pval': float(p), 'cohen_d': float(d)})
    node_stats_df = pd.DataFrame(node_stats).sort_values('pval')
    node_stats_df.to_csv(os.path.join(OUTPUT_DIR, "node_level_stats_betweenness.csv"), index=False)
    print("Saved node-level stats (betweenness):", os.path.join(OUTPUT_DIR, "node_level_stats_betweenness.csv"))

    # top nodes by effect (abs cohen d)
    if not node_stats_df.empty:
        top_nodes = node_stats_df.sort_values('cohen_d', key=lambda s: np.abs(s), ascending=False).head(12)
        # plot a small heatmap of cohen_d for top nodes
        plt.figure(figsize=(6, max(2, len(top_nodes)*0.25)))
        sns.barplot(x=np.abs(top_nodes['cohen_d']), y=top_nodes['feature_name'])
        plt.title("Top nodes by |Cohen's d| (betweenness)")
        plt.xlabel("|Cohen's d|"); plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, "top_nodes_cohen_d_betweenness.png"), dpi=150); plt.close()

# --------- final small summary JSON ----------
summary = {
    'n_subjects': int(subj_df.shape[0]),
    'n_files_scanned': len(files),
    'labels_present': labels_present,
    'density_keep': float(DENSITY_KEEP),
    'thresh_method': THRESH_METHOD,
    'generated': time.time()
}
with open(os.path.join(OUTPUT_DIR, "per_subject_graphs_summary.json"), "w") as f:
    json.dump(summary, f, indent=2)

print("Done. All outputs saved under:", OUTPUT_DIR)
print("Elapsed (s):", time.time() - start_all)


OUTPUT_DIR: /kaggle/working/dl_results/per_subject_graphs
Saved per-subject CSVs: /kaggle/working/dl_results/per_subject_graphs/per_subject_node_metrics.csv
Saved subject-level stats: /kaggle/working/dl_results/per_subject_graphs/subject_level_stats.csv


TypeError: Object of type int64 is not JSON serializable

In [5]:
# ==============================  
# FIXED CELL: Per-subject Graphs  
# JSON-SAFE + int64-SAFE  
# ==============================
import os, math, time, json, warnings
from pathlib import Path
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import networkx as nx
import scipy.stats as st

# --------- CONFIG ---------
DATA_DIR   = r"/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2"
OUTPUT_DIR = r"/kaggle/working/dl_results/per_subject_graphs_main"
os.makedirs(OUTPUT_DIR, exist_ok=True)

THRESH_METHOD = "proportional"
DENSITY_KEEP  = 0.20
MIN_TIMESERIES_LEN = 5
LABEL_COL_CANDIDATES = ['epoch','label','condition','cond','target','y']

start_all = time.time()

# ---- Helpers (same as before, unchanged) ----
def detect_eeg_columns(cols):
    import re
    regex = re.compile(r'^(?:EEG[_\-\s]?|E[_\-\s]?)(0*?)(\d{1,3})$', flags=re.I)
    found = {}
    for c in cols:
        m = regex.match(str(c).strip())
        if m:
            num = int(m.group(2))
            if 1 <= num <= 128:
                found[num] = c
    if found:
        return [found[i] for i in sorted(found.keys())]
    cand = [c for c in cols if isinstance(c, str) and c.lower() not in ('time','condition')]
    return cand

def to_binary_label_series(s):
    s = s.dropna()
    if s.empty: return None
    s_num = pd.to_numeric(s, errors='coerce')
    if s_num.notna().all():
        uniq = set(np.unique(s_num))
        if uniq.issubset({0,1}): return s_num.astype(int)
        if uniq.issubset({1,2}): return s_num.map({1:0,2:1}).astype(int)
        med = float(s_num.median()); return (s_num > med).astype(int)
    s_str = s.astype(str)
    uniq = s_str.unique()
    if len(uniq)==1:
        return s_str.map({uniq[0]:0}).astype(int)
    if len(uniq)==2:
        return s_str.map({uniq[0]:0, uniq[1]:1}).astype(int)
    mode = s_str.mode().iat[0]; return (s_str != mode).astype(int)

def threshold_proportional(abs_corr, keep_density):
    n = abs_corr.shape[0]
    iu = np.triu_indices(n, 1)
    vals = abs_corr[iu]
    if vals.size == 0:
        return np.zeros_like(abs_corr, int)
    k = int(np.floor(keep_density * (n*(n-1)/2)))
    if k <= 0:
        thr = 1.01
    else:
        thr = np.sort(vals)[-k] if k < len(vals) else vals.min()
    A = (abs_corr >= thr).astype(int)
    np.fill_diagonal(A, 0)
    return A

def cohen_d(a, b):
    a = np.asarray(a); b = np.asarray(b)
    if len(a)<2 or len(b)<2: return np.nan
    s1 = a.std(ddof=1); s2 = b.std(ddof=1)
    if s1==0 and s2==0: return np.nan
    pooled = np.sqrt(((len(a)-1)*s1*s1 + (len(b)-1)*s2*s2) / (len(a)+len(b)-2))
    if pooled == 0: return np.nan
    return (a.mean() - b.mean()) / pooled

# ---------- MAIN LOOP ----------
files = sorted([f for f in os.listdir(DATA_DIR) if f.lower().endswith(".csv")])

rows = []
node_rows = []

for fn in files:
    fp = os.path.join(DATA_DIR, fn)
    try:
        df = pd.read_csv(fp, engine="python")
    except:
        continue

    # label
    label_series = None
    label_used = None
    for c in LABEL_COL_CANDIDATES:
        if c in df.columns:
            tmp = to_binary_label_series(df[c])
            if tmp is not None:
                label_series = tmp
                label_used = c
                break
    if label_series is None:
        continue

    # EEG columns
    eeg_cols = detect_eeg_columns(df.columns)
    eeg_cols = [c for c in eeg_cols if c not in ('time','condition','epoch','label')]
    if len(eeg_cols) < 2:
        continue

    # data matrix
    data = df[eeg_cols].apply(pd.to_numeric, errors='coerce').values
    mask = ~np.isnan(data).any(axis=1)
    data = data[mask]
    if data.shape[0] < MIN_TIMESERIES_LEN:
        continue

    try:
        C = np.corrcoef(data, rowvar=False)
        C = np.nan_to_num(C)
    except:
        continue

    absC = np.abs(C)

    # adjacency
    A = threshold_proportional(absC, DENSITY_KEEP)

    G = nx.from_numpy_array(A)
    if G.number_of_nodes() == 0:
        continue

    bc = nx.betweenness_centrality(G)
    cl = nx.clustering(G)
    deg = dict(G.degree())

    subject_label = int(label_series.mode().iat[0])

    # node rows
    for i, col in enumerate(eeg_cols):
        node_rows.append({
            "subject": fn,
            "label": subject_label,
            "node_idx": int(i),
            "feature_name": str(col),
            "betweenness": float(bc[i]),
            "clustering": float(cl[i]),
            "degree": int(deg[i])
        })

    rows.append({
        "subject": fn,
        "label": subject_label,
        "n_nodes": int(len(eeg_cols)),
        "n_time": int(data.shape[0]),
        "mean_betweenness": float(np.mean(list(bc.values()))),
        "mean_clustering": float(np.mean(list(cl.values()))),
        "density_used": float(A.sum() / (len(eeg_cols)*(len(eeg_cols)-1)))
    })

node_df = pd.DataFrame(node_rows)
subj_df = pd.DataFrame(rows)

node_df.to_csv(os.path.join(OUTPUT_DIR, "per_subject_node_metrics.csv"), index=False)
subj_df.to_csv(os.path.join(OUTPUT_DIR, "per_subject_summary.csv"), index=False)

print("Saved per-subject tables.")

# ---------- GROUP COMPARE ----------
labels_present = sorted([int(x) for x in subj_df["label"].unique()])

# boxplots
plt.figure(figsize=(6,4))
sns.boxplot(x="label", y="mean_betweenness", data=subj_df)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "box_mean_betw.png"))
plt.close()

plt.figure(figsize=(6,4))
sns.boxplot(x="label", y="mean_clustering", data=subj_df)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "box_mean_clust.png"))
plt.close()

# histograms
plt.figure(figsize=(6,4))
for lab in labels_present:
    sns.histplot(
        subj_df[subj_df["label"]==lab]["mean_betweenness"],
        kde=True, stat="density", bins=30, label=f"group {lab}"
    )
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "hist_mean_betw.png"))
plt.close()

plt.figure(figsize=(6,4))
for lab in labels_present:
    sns.histplot(
        subj_df[subj_df["label"]==lab]["mean_clustering"],
        kde=True, stat="density", bins=30, label=f"group {lab}"
    )
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "hist_mean_clust.png"))
plt.close()

# -------- stats: subject-level --------
stats_rows = []
if len(labels_present)==2:
    g0 = subj_df[subj_df["label"]==labels_present[0]]
    g1 = subj_df[subj_df["label"]==labels_present[1]]

    for metric in ["mean_betweenness","mean_clustering"]:
        a = g0[metric].values
        b = g1[metric].values
        try:
            p0 = st.shapiro(a).pvalue if len(a)>=3 else 1
            p1 = st.shapiro(b).pvalue if len(b)>=3 else 1
            use_t = (p0>0.05 and p1>0.05)
        except:
            use_t = False

        if use_t:
            stat, p = st.ttest_ind(a, b, equal_var=False)
            test = "ttest"
        else:
            stat, p = st.mannwhitneyu(a, b)
            test = "mannwhitney"

        stats_rows.append({
            "metric": metric,
            "test": test,
            "stat": float(stat),
            "pval": float(p),
            "cohen_d": float(cohen_d(a,b))
        })

stats_df = pd.DataFrame(stats_rows)
stats_df.to_csv(os.path.join(OUTPUT_DIR, "subject_level_stats.csv"), index=False)

# --------- JSON SUMMARY (FIXED) ----------
summary = {
    "n_subjects": int(subj_df.shape[0]),
    "n_files_scanned": int(len(files)),
    "labels_present": [int(x) for x in labels_present],
    "density_keep": float(DENSITY_KEEP),
    "thresh_method": str(THRESH_METHOD),
    "generated_time": float(time.time())
}

with open(os.path.join(OUTPUT_DIR, "per_subject_graphs_summary.json"), "w") as f:
    json.dump(summary, f, indent=2)

print("Done. JSON saved safely.")


Saved per-subject tables.
Done. JSON saved safely.


In [6]:
# ================================
# FIXED CELL — Per-Subject Graphs
# Robust Labels + JSON-safe + Group Stats
# ================================

import os, json, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import networkx as nx
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# ------------ CONFIG ------------
DATA_DIR   = r"/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2"
OUTPUT_DIR = r"/kaggle/working/dl_results/per_subject_graphs_main_2"
os.makedirs(OUTPUT_DIR, exist_ok=True)

THRESH_METHOD = "proportional"
DENSITY_KEEP  = 0.20
MIN_TIMESERIES_LEN = 5
LABEL_CANDIDATES = ["label","epoch","condition","cond","y","target"]

start_all = time.time()

# ===================== Helpers =====================

def detect_eeg_columns(cols):
    import re
    regex = re.compile(r'^(?:EEG|E)[ _-]*0*(\d{1,3})$', flags=re.I)
    idx = {}
    for c in cols:
        m = regex.match(str(c).strip())
        if m:
            n = int(m.group(1))
            if 1 <= n <= 128:
                idx[n] = c
    if idx:
        return [idx[i] for i in sorted(idx.keys())]

    # fallback: pick numeric columns
    return [c for c in cols if pd.api.types.is_numeric_dtype(cols[c])]


def to_binary_label_series(s):
    s = s.dropna()
    if s.empty: return None
    s_num = pd.to_numeric(s, errors="coerce")
    if s_num.notna().all():
        uniq = set(s_num.unique())
        if uniq.issubset({0,1}): return s_num.astype(int)
        if uniq.issubset({1,2}): return s_num.map({1:0,2:1}).astype(int)
        med = float(s_num.median())
        return (s_num > med).astype(int)

    s_str = s.astype(str)
    uniq = s_str.unique()
    if len(uniq)==1: return s_str.map({uniq[0]:0}).astype(int)
    if len(uniq)==2: return s_str.map({uniq[0]:0, uniq[1]:1}).astype(int)
    mode = s_str.mode().iat[0]
    return (s_str != mode).astype(int)


def threshold_proportional(abs_corr, keep_density):
    n = abs_corr.shape[0]
    iu = np.triu_indices(n,1)
    vals = abs_corr[iu]
    if vals.size == 0:
        return np.zeros_like(abs_corr,int)
    k = int(np.floor(keep_density*(n*(n-1)/2)))
    if k <= 0:
        thr = 1.01
    else:
        thr = np.sort(vals)[-k] if k < len(vals) else vals.min()
    A = (abs_corr >= thr).astype(int)
    np.fill_diagonal(A,0)
    return A


def cohen_d(a, b):
    a = np.asarray(a); b = np.asarray(b)
    if len(a)<2 or len(b)<2: return np.nan
    sa = a.std(ddof=1); sb = b.std(ddof=1)
    pooled = np.sqrt(((len(a)-1)*sa*sa + (len(b)-1)*sb*sb)/(len(a)+len(b)-2))
    if pooled == 0: return np.nan
    return (a.mean() - b.mean()) / pooled


# ===================== MAIN LOOP =====================

files = sorted([f for f in os.listdir(DATA_DIR) if f.lower().endswith(".csv")])
rows = []
node_rows = []

print(f"Found {len(files)} CSV files.")

for fn in files:
    fp = os.path.join(DATA_DIR, fn)
    try:
        df = pd.read_csv(fp)
    except:
        print(f"skip (read error): {fn}")
        continue

    # ----- robust label detection -----
    label_series = None
    for col in LABEL_CANDIDATES:
        if col in df.columns:
            tmp = to_binary_label_series(df[col])
            if tmp is not None:
                label_series = tmp
                break
    if label_series is None:
        print(f"skip (no label found): {fn}")
        continue

    uniq_vals = pd.Series(label_series.unique()).dropna().astype(int).tolist()
    majority_thresh = 0.70

    # Decide subject-level label
    if len(uniq_vals) == 1:
        subject_label = int(uniq_vals[0])

    else:
        mode_val = int(label_series.mode().iat[0])
        mode_prop = float((label_series == mode_val).mean())

        if mode_prop >= majority_thresh:
            subject_label = mode_val
            print(f"warning: mixed labels in {fn}, using mode={mode_val}, prop={mode_prop:.2f}")

        else:
            # ambiguous, fall back to mode but warn
            subject_label = mode_val
            print(f"warning: {fn} very mixed labels (prop={mode_prop:.2f}) — using mode={mode_val} (may want subject-level mapping)")

    # ----- detect EEG columns -----
    eeg_cols = detect_eeg_columns(df)
    if len(eeg_cols) < 2:
        print(f"skip (no EEG cols): {fn}")
        continue

    # ----- prepare data -----
    data = df[eeg_cols].apply(pd.to_numeric, errors='coerce').values
    mask = ~np.isnan(data).any(axis=1)
    data = data[mask]
    if data.shape[0] < MIN_TIMESERIES_LEN:
        print(f"skip (short timeseries): {fn}")
        continue

    # ----- correlation -----
    try:
        C = np.corrcoef(data, rowvar=False)
        C = np.nan_to_num(C)
    except:
        print(f"corr fail: {fn}")
        continue

    absC = np.abs(C)
    A = threshold_proportional(absC, DENSITY_KEEP)

    G = nx.from_numpy_array(A)
    if G.number_of_nodes() == 0:
        print(f"empty graph: {fn}")
        continue

    bc = nx.betweenness_centrality(G)
    cl = nx.clustering(G)
    dg = dict(G.degree())

    # ===== per-node rows =====
    for idx, col in enumerate(eeg_cols):
        node_rows.append({
            "subject": fn,
            "label": int(subject_label),
            "node_idx": int(idx),
            "feature": str(col),
            "betweenness": float(bc[idx]),
            "clustering": float(cl[idx]),
            "degree": int(dg[idx])
        })

    # ===== per-subject summary =====
    rows.append({
        "subject": fn,
        "label": int(subject_label),
        "n_nodes": int(len(eeg_cols)),
        "n_time": int(data.shape[0]),
        "mean_betweenness": float(np.mean(list(bc.values()))),
        "mean_clustering": float(np.mean(list(cl.values()))),
        "density_used": float(A.sum() / (len(eeg_cols)*(len(eeg_cols)-1)))
    })


# ===================== SAVE TABLES =====================

node_df = pd.DataFrame(node_rows)
subj_df = pd.DataFrame(rows)

node_df.to_csv(os.path.join(OUTPUT_DIR, "per_subject_node_metrics.csv"), index=False)
subj_df.to_csv(os.path.join(OUTPUT_DIR, "per_subject_summary.csv"), index=False)

print(f"Saved: {OUTPUT_DIR}")


# ===================== GROUP COMPARISONS =====================

subj_df["label"] = subj_df["label"].astype(int)
labels_present = sorted(subj_df["label"].unique().tolist())

print("Subject label counts:", subj_df["label"].value_counts().to_dict())

# ----- Boxplots -----
plt.figure(figsize=(6,4))
sns.boxplot(x="label", y="mean_betweenness", data=subj_df)
plt.title("Mean Betweenness by Group")
plt.savefig(os.path.join(OUTPUT_DIR, "box_mean_betweenness.png"))
plt.close()

plt.figure(figsize=(6,4))
sns.boxplot(x="label", y="mean_clustering", data=subj_df)
plt.title("Mean Clustering by Group")
plt.savefig(os.path.join(OUTPUT_DIR, "box_mean_clustering.png"))
plt.close()

# ----- Histograms -----
plt.figure(figsize=(6,4))
for lab in labels_present:
    sns.histplot(subj_df[subj_df["label"]==lab]["mean_betweenness"],
                 kde=True, stat="density", bins=30, label=f"group {lab}")
plt.legend()
plt.title("Histogram: Mean Betweenness")
plt.savefig(os.path.join(OUTPUT_DIR, "hist_mean_betweenness.png"))
plt.close()

plt.figure(figsize=(6,4))
for lab in labels_present:
    sns.histplot(subj_df[subj_df["label"]==lab]["mean_clustering"],
                 kde=True, stat="density", bins=30, label=f"group {lab}")
plt.legend()
plt.title("Histogram: Mean Clustering")
plt.savefig(os.path.join(OUTPUT_DIR, "hist_mean_clustering.png"))
plt.close()

# ----- Stats -----
stats_rows = []
if len(labels_present)==2:
    g0 = subj_df[subj_df['label']==labels_present[0]]
    g1 = subj_df[subj_df['label']==labels_present[1]]

    for metric in ["mean_betweenness","mean_clustering"]:
        a = g0[metric].values
        b = g1[metric].values

        try:
            p0 = st.shapiro(a).pvalue if len(a)>=3 else 1
            p1 = st.shapiro(b).pvalue if len(b)>=3 else 1
            use_t = (p0>0.05 and p1>0.05)
        except:
            use_t = False

        if use_t:
            stat, p = st.ttest_ind(a, b, equal_var=False)
            test = "ttest"
        else:
            stat, p = st.mannwhitneyu(a, b)
            test = "mannwhitney"

        stats_rows.append({
            "metric": metric,
            "test": test,
            "stat": float(stat),
            "pval": float(p),
            "cohen_d": float(cohen_d(a,b))
        })

stats_df = pd.DataFrame(stats_rows)
stats_df.to_csv(os.path.join(OUTPUT_DIR, "subject_level_stats.csv"), index=False)

# ----- JSON SUMMARY -----
summary = {
    "n_subjects": int(subj_df.shape[0]),
    "n_files_scanned": int(len(files)),
    "labels_present": [int(x) for x in labels_present],
    "thresh_method": THRESH_METHOD,
    "density_keep": float(DENSITY_KEEP),
    "generated": float(time.time())
}

with open(os.path.join(OUTPUT_DIR, "summary.json"), "w") as f:
    json.dump(summary, f, indent=2)

print("DONE. Outputs saved in:", OUTPUT_DIR)
print("Elapsed:", time.time() - start_all, "sec")


Found 51 CSV files.
Saved: /kaggle/working/dl_results/per_subject_graphs_main_2
Subject label counts: {0: 51}
DONE. Outputs saved in: /kaggle/working/dl_results/per_subject_graphs_main_2
Elapsed: 122.2440288066864 sec


# **GAN-Fixed**

In [None]:
# ================================
# CELL 3 — Vanilla GAN (Kaggle-optimized, faster)
# - saves ONLY best generator + best discriminator weights (.weights.h5)
# - adaptive early-stopping for adversarial loop to speed up on T4
# - no synthetic datasets saved, minimal PNGs + JSON
# ================================
import os, time, gc
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

# -------- CONFIG --------
OUTPUT_DIR = "/kaggle/working/dl_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

LATENT_DIM = 100
EPOCHS_GAN = 1000          # you requested 1000 — we keep it, but EARLY_STOP will cut if it stalls
BATCH = 64
CLASSIFIER_EPOCHS = 50
SEED = 42
PATIENCE_GAN = 120         # stop adversarial loop if gen loss doesn't improve for this many epochs
SAVE_SIZE_CAP_BYTES = 2 * 1024**3   # 2 GiB safety cap (we'll warn if exceeded)
LOG_EVERY = 50

np.random.seed(SEED); tf.random.set_seed(SEED)

# -------- load data splits (Cell 1 saved .npz) --------
X_train, X_test, y_train, y_test = load_data_splits()
X = X_train.astype(np.float32)
FEATURES = X.shape[1]
N_train = X.shape[0]
print(f"[GAN] features={FEATURES} | n_train={N_train}")

# -------- build generator & discriminator (unchanged arch) --------
def build_generator():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_dim=LATENT_DIM),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(512),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(1024),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(FEATURES, activation='tanh')
    ], name="generator")

def build_discriminator():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(512, input_shape=(FEATURES,)),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(256),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ], name="discriminator")

generator = build_generator()
discriminator = build_discriminator()

opt_d = tf.keras.optimizers.Adam(0.0002, 0.5)
opt_g = tf.keras.optimizers.Adam(0.0002, 0.5)

# -------- prepare GAN model (compile with disc non-trainable for GAN-object) --------
discriminator.trainable = False
z = tf.keras.Input(shape=(LATENT_DIM,))
gan_out = discriminator(generator(z))
gan = tf.keras.Model(z, gan_out, name="gan_model")
gan.compile(optimizer=opt_g, loss="binary_crossentropy")

# Now compile discriminator separately for manual training loops
discriminator.trainable = True
discriminator.compile(optimizer=opt_d, loss="binary_crossentropy")

# -------- tf.data pipeline (fast + memory-safe) --------
AUTOTUNE = tf.data.AUTOTUNE
shuffle_buffer = min(10000, N_train)
ds = tf.data.Dataset.from_tensor_slices(X).shuffle(shuffle_buffer, seed=SEED).repeat().batch(BATCH).prefetch(AUTOTUNE)
ds_iter = iter(ds)

real_label = tf.ones((BATCH,1), tf.float32)
fake_label = tf.zeros((BATCH,1), tf.float32)

# -------- training step (tf.function) --------
@tf.function
def train_step(real_batch):
    # discriminator on real
    with tf.GradientTape() as tape_d_real:
        pred_real = discriminator(real_batch, training=True)
        loss_real = tf.reduce_mean(tf.keras.losses.binary_crossentropy(real_label, pred_real))
    grads_real = tape_d_real.gradient(loss_real, discriminator.trainable_variables)
    opt_d.apply_gradients(zip(grads_real, discriminator.trainable_variables))

    # discriminator on fake
    noise = tf.random.normal((BATCH, LATENT_DIM))
    fake_batch = generator(noise, training=True)
    with tf.GradientTape() as tape_d_fake:
        pred_fake = discriminator(fake_batch, training=True)
        loss_fake = tf.reduce_mean(tf.keras.losses.binary_crossentropy(fake_label, pred_fake))
    grads_fake = tape_d_fake.gradient(loss_fake, discriminator.trainable_variables)
    opt_d.apply_gradients(zip(grads_fake, discriminator.trainable_variables))

    # generator step (try to fool discriminator) — compute grads manually
    noise2 = tf.random.normal((BATCH, LATENT_DIM))
    with tf.GradientTape() as tape_g:
        gen_out = generator(noise2, training=True)
        disc_out_for_g = discriminator(gen_out, training=False)   # freeze disc for generator update
        loss_g = tf.reduce_mean(tf.keras.losses.binary_crossentropy(real_label, disc_out_for_g))
    grads_g = tape_g.gradient(loss_g, generator.trainable_variables)
    opt_g.apply_gradients(zip(grads_g, generator.trainable_variables))

    return (loss_real + loss_fake) * 0.5, loss_g

# -------- adversarial training with early stop on generator loss --------
print("[GAN] Adversarial training started...")
d_losses, g_losses = [], []
best_g_loss = np.inf
no_improve = 0
gen_best_path = os.path.join(OUTPUT_DIR, "generator_best.weights.h5")

start_time = time.time()
for epoch in range(EPOCHS_GAN):
    real_batch = next(ds_iter)
    d_val, g_val = train_step(real_batch)

    d_losses.append(float(d_val))
    g_losses.append(float(g_val))

    # check improvement & save best generator weights only
    cur_g = float(g_val)
    if cur_g < best_g_loss - 1e-8:
        best_g_loss = cur_g
        no_improve = 0
        generator.save_weights(gen_best_path)   # tiny file
    else:
        no_improve += 1

    # log sparse
    if (epoch % LOG_EVERY) == 0 or epoch == EPOCHS_GAN-1:
        elapsed = time.time() - start_time
        print(f"[Epoch {epoch}/{EPOCHS_GAN}] D_loss={d_losses[-1]:.4f} | G_loss={g_losses[-1]:.4f} | best_g={best_g_loss:.4f} | no_imp={no_improve} | elapsed={elapsed:.1f}s")

    # early stopping for GAN loop
    if no_improve >= PATIENCE_GAN:
        print(f"[GAN] Early stopping: generator loss didn't improve for {PATIENCE_GAN} epochs (best_g={best_g_loss:.5f})")
        break

print(f"[GAN] Adversarial loop done in {time.time()-start_time:.1f}s. Best G loss: {best_g_loss:.5f}")
print("Generator best weights:", gen_best_path)

# -------- classifier fine-tune: discriminator as classifier (use streaming synthetic again) --------
def mixed_batch_generator(X_real, y_real, batch):
    n = X_real.shape[0]
    half = batch // 2
    while True:
        idx = np.random.randint(0, n, half)
        real_x = X_real[idx]
        real_y = y_real[idx].reshape(-1,1).astype(np.float32)

        noise = np.random.normal(0,1,(half,LATENT_DIM)).astype(np.float32)
        gen_x = generator.predict(noise, verbose=0)
        gen_y = np.zeros((half,1), dtype=np.float32)

        Xb = np.vstack([real_x, gen_x])
        yb = np.vstack([real_y, gen_y])
        perm = np.random.permutation(len(Xb))
        yield Xb[perm], yb[perm]

train_gen = mixed_batch_generator(X, y_train, BATCH)
steps_per_epoch = max(10, N_train // BATCH)

disc_best_path = os.path.join(OUTPUT_DIR, "discriminator_best.weights.h5")
# ModelCheckpoint requires '.weights.h5' when save_weights_only=True
checkpoint = ModelCheckpoint(disc_best_path, monitor="val_loss",
                             save_best_only=True, save_weights_only=True, verbose=0)
early = EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True)

discriminator.trainable = True
discriminator.compile(optimizer=tf.keras.optimizers.Adam(0.0002,0.5),
                      loss="binary_crossentropy", metrics=["accuracy"])

print("[Classifier] Fine-tuning discriminator using streamed synthetic batches...")
history = discriminator.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=CLASSIFIER_EPOCHS,
    validation_data=(X_test.astype(np.float32), y_test.reshape(-1,1).astype(np.float32)),
    callbacks=[checkpoint, early],
    verbose=1
)

print("Discriminator best weights:", disc_best_path)

# -------- evaluation & small PNGs (ROC + train curves) --------
y_prob = discriminator.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)

print("\n=== Classification Report (Discriminator) ===")
print(classification_report(y_test, y_pred))

# ROC
try:
    auc_val = roc_auc_score(y_test, y_prob)
except Exception:
    auc_val = None
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(5,5)); plt.plot(fpr, tpr, label=f"AUC={auc_val:.3f}" if auc_val else "ROC"); plt.plot([0,1],[0,1],'k--', alpha=0.3)
plt.title("Discriminator ROC"); plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "gan_discriminator_roc.png"), dpi=100); plt.close()

# Train/val curves
if history is not None:
    plt.figure(figsize=(6,3))
    plt.plot(history.history.get("accuracy", []), label="train_acc")
    plt.plot(history.history.get("val_accuracy", []), label="val_acc")
    plt.legend(); plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "gan_disc_acc.png"), dpi=100); plt.close()

    plt.figure(figsize=(6,3))
    plt.plot(history.history.get("loss", []), label="train_loss")
    plt.plot(history.history.get("val_loss", []), label="val_loss")
    plt.legend(); plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "gan_disc_loss.png"), dpi=100); plt.close()

# -------- save JSON result entry (models_results.json via helper) --------
res = make_result_dict("Vanilla_GAN_best_only", discriminator, X_test, y_test, history)
res.update({
    "best_generator_weights": os.path.basename(gen_best_path),
    "best_discriminator_weights": os.path.basename(disc_best_path),
    "gan_epochs_ran": int(len(g_losses)),
    "timestamp": time.time()
})
save_model_result(res)
print("[JSON] result appended.")

# -------- storage guard: ensure saved files stay small (warn if > 2 GiB) --------
def human_mb(n): return f"{n/1024**2:.2f} MB"
total_bytes = 0
for root,_,files in os.walk(OUTPUT_DIR):
    for f in files:
        total_bytes += os.path.getsize(os.path.join(root,f))
if total_bytes > SAVE_SIZE_CAP_BYTES:
    print("⚠️ WARNING: OUTPUT_DIR size > 2 GiB — consider removing large artifacts.")
print(f"Output directory size: {total_bytes/1024**2:.2f} MB")

# -------- cleanup --------
K.clear_session()
gc.collect()

print("✅ CELL 3 finished — only BEST weights saved (generator + discriminator) + PNGs + JSON.")


[GAN] features=128 | n_train=3089910


I0000 00:00:1763930037.804977      48 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1763930037.807547      48 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


[GAN] Adversarial training started...
[Epoch 0/1000] D_loss=0.8708 | G_loss=0.7148 | best_g=0.7148 | no_imp=0 | elapsed=4.4s
[Epoch 50/1000] D_loss=0.5992 | G_loss=1.0553 | best_g=0.4685 | no_imp=45 | elapsed=4.9s
[Epoch 100/1000] D_loss=0.6025 | G_loss=0.8002 | best_g=0.4685 | no_imp=95 | elapsed=5.2s
[GAN] Early stopping: generator loss didn't improve for 120 epochs (best_g=0.46851)
[GAN] Adversarial loop done in 5.4s. Best G loss: 0.46851
Generator best weights: /kaggle/working/dl_results/generator_best.weights.h5
[Classifier] Fine-tuning discriminator using streamed synthetic batches...


I0000 00:00:1763930052.552337     127 service.cc:148] XLA service 0x799630009fd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763930052.553887     127 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1763930052.553910     127 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1763930052.761734     127 cuda_dnn.cc:529] Loaded cuDNN version 90300


Epoch 1/50


I0000 00:00:1763930053.206399     127 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3272s[0m 68ms/step - accuracy: 0.7801 - loss: 0.3408 - val_accuracy: 0.6512 - val_loss: 0.5936
Epoch 2/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3264s[0m 68ms/step - accuracy: 0.8188 - loss: 0.3033 - val_accuracy: 0.6786 - val_loss: 0.5615
Epoch 3/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2993s[0m 62ms/step - accuracy: 0.8300 - loss: 0.2912 - val_accuracy: 0.7043 - val_loss: 0.5339
Epoch 4/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2932s[0m 61ms/step - accuracy: 0.8359 - loss: 0.2841 - val_accuracy: 0.7099 - val_loss: 0.5244
Epoch 5/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2927s[0m 61ms/step - accuracy: 0.8412 - loss: 0.2785 - val_accuracy: 0.7246 - val_loss: 0.5083
Epoch 6/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2931s[0m 61ms/step - accuracy: 0.8444 - loss: 0.2743 - val_accuracy: 0.7330 - val_loss

In [None]:
# ================================
# CELL 3 — Vanilla GAN (Kaggle-optimized, faster)
# - Keeps architecture identical
# - Mixed precision, steps-based loop, faster IO
# - Saves ONLY best generator + best discriminator weights (.weights.h5)
# ================================
import os, time, gc
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report, roc_curve, roc_auc_score

# ---------- CONFIG ----------
OUTPUT_DIR = "/kaggle/working/dl_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# keep your architecture / hyperparams unchanged (per request)
LATENT_DIM = 100
EPOCHS_GAN = 1000          # kept (we control runtime via patience & steps)
BATCH = 64
CLASSIFIER_EPOCHS = 50
SEED = 42

# speed-tuners
PATIENCE_GAN = 30          # stop adversarial loop if gen loss doesn't improve for this many *steps*
LOG_EVERY_STEPS = 50
SAVE_SIZE_CAP_BYTES = 2 * 1024**3   # safety check
USE_MIXED_PRECISION = True  # ENABLE mixed-precision for T4

np.random.seed(SEED); tf.random.set_seed(SEED)

# ---------- mixed precision (fast on T4) ----------
if USE_MIXED_PRECISION:
    try:
        from tensorflow.keras import mixed_precision
        mixed_precision.set_global_policy('mixed_float16')
        print("[info] Mixed precision enabled (float16 compute).")
    except Exception as e:
        print("[warn] cannot enable mixed precision:", e)

# ---------- load splits ----------
X_train, X_test, y_train, y_test = load_data_splits()
X = X_train.astype(np.float32)
FEATURES = X.shape[1]
N_train = X.shape[0]
print(f"[GAN] features={FEATURES} | n_train={N_train}")

# ---------- model builders (unchanged) ----------
def build_generator():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_dim=LATENT_DIM),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(512),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(1024),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dense(FEATURES, activation='tanh')
    ], name="generator")

def build_discriminator():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(512, input_shape=(FEATURES,)),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(256),
        tf.keras.layers.LeakyReLU(0.2),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')  # ensure final logits in float32 for numeric stability
    ], name="discriminator")

generator = build_generator()
discriminator = build_discriminator()

opt_d = tf.keras.optimizers.Adam(0.0002, 0.5)
opt_g = tf.keras.optimizers.Adam(0.0002, 0.5)

# ---------- compile GAN object (discriminator frozen inside gan) ----------
discriminator.trainable = False
z = tf.keras.Input(shape=(LATENT_DIM,))
gan_out = discriminator(generator(z))
gan = tf.keras.Model(z, gan_out, name="gan")
gan.compile(optimizer=opt_g, loss="binary_crossentropy")

# compile discriminator separately (trainable) for manual gradient steps
discriminator.trainable = True
discriminator.compile(optimizer=opt_d, loss="binary_crossentropy")

# ---------- tf.data pipeline ----------
AUTOTUNE = tf.data.AUTOTUNE
shuffle_buffer = min(10000, N_train)
ds = tf.data.Dataset.from_tensor_slices(X).shuffle(shuffle_buffer, seed=SEED).repeat().batch(BATCH).prefetch(AUTOTUNE)
ds_iter = iter(ds)

real_label = tf.ones((BATCH,1), tf.float32)
fake_label = tf.zeros((BATCH,1), tf.float32)

# ---------- tf.function train step (kept simple & fast) ----------
@tf.function
def train_step(real_batch):
    # discriminator on real
    with tf.GradientTape() as tape_d_real:
        pred_real = discriminator(real_batch, training=True)
        loss_real = tf.reduce_mean(tf.keras.losses.binary_crossentropy(real_label, pred_real))
    grads_real = tape_d_real.gradient(loss_real, discriminator.trainable_variables)
    opt_d.apply_gradients(zip(grads_real, discriminator.trainable_variables))

    # discriminator on fake
    noise = tf.random.normal((BATCH, LATENT_DIM))
    fake_batch = generator(noise, training=True)
    with tf.GradientTape() as tape_d_fake:
        pred_fake = discriminator(fake_batch, training=True)
        loss_fake = tf.reduce_mean(tf.keras.losses.binary_crossentropy(fake_label, pred_fake))
    grads_fake = tape_d_fake.gradient(loss_fake, discriminator.trainable_variables)
    opt_d.apply_gradients(zip(grads_fake, discriminator.trainable_variables))

    # generator step
    noise2 = tf.random.normal((BATCH, LATENT_DIM))
    with tf.GradientTape() as tape_g:
        gen_out = generator(noise2, training=True)
        # ensure discriminator used in eval mode for generator update (no dropout effect)
        disc_out_for_g = discriminator(gen_out, training=False)
        loss_g = tf.reduce_mean(tf.keras.losses.binary_crossentropy(real_label, disc_out_for_g))
    grads_g = tape_g.gradient(loss_g, generator.trainable_variables)
    opt_g.apply_gradients(zip(grads_g, generator.trainable_variables))

    # convert losses to float32 for Python side
    return tf.cast((loss_real + loss_fake) * 0.5, tf.float32), tf.cast(loss_g, tf.float32)

# ---------- steps control (faster monitoring) ----------
print("[GAN] Starting adversarial training (steps-based loop)...")
d_losses, g_losses = [], []
best_g_loss = np.inf
no_improve = 0
gen_best_path = os.path.join(OUTPUT_DIR, "generator_best_main-3.weights.h5")

start_time = time.time()
max_steps = EPOCHS_GAN * max(1, N_train // BATCH)  # upper bound steps; early stop will exit earlier
# We WILL break early once PATIENCE_GAN is reached (no improvement on gen loss)

for step in range(int(max_steps)):
    real_batch = next(ds_iter)
    d_val, g_val = train_step(real_batch)

    d_losses.append(float(d_val.numpy()))
    g_losses.append(float(g_val.numpy()))

    cur_g = g_losses[-1]
    # only write weights when we have improvement to minimize IO
    if cur_g < best_g_loss - 1e-8:
        best_g_loss = cur_g
        no_improve = 0
        generator.save_weights(gen_best_path)
    else:
        no_improve += 1

    # log occasionally
    if (step % LOG_EVERY_STEPS) == 0:
        elapsed = time.time() - start_time
        steps_done = step + 1
        # estimate remaining (very rough)
        avg_step_time = elapsed / steps_done
        est_remaining_s = avg_step_time * (max_steps - steps_done)
        print(f"[step {step}/{int(max_steps)}] D={d_losses[-1]:.4f} G={g_losses[-1]:.4f} bestG={best_g_loss:.4f} noImp={no_improve} elapsed={elapsed:.1f}s est_remain={est_remaining_s/60:.1f}m")

    # early-stop based on PATIENCE_GAN (counts steps)
    if no_improve >= PATIENCE_GAN:
        print(f"[GAN] Early stop after {step+1} steps (no gen improvement for {PATIENCE_GAN} steps).")
        break

# record number of GAN steps / equivalent epochs run
gan_steps_ran = len(g_losses)
gan_epochs_equiv = gan_steps_ran / max(1, N_train // BATCH)
print(f"[GAN] Done. Steps ran: {gan_steps_ran} (~{gan_epochs_equiv:.2f} epochs). Best G loss: {best_g_loss:.5f}")
print("Generator best weights saved to:", gen_best_path)

# ---------- classifier fine-tune (streamed synthetic batches) ----------
def mixed_batch_generator(X_real, y_real, batch):
    n = X_real.shape[0]
    half = batch // 2
    while True:
        idx = np.random.randint(0, n, half)
        real_x = X_real[idx]
        real_y = y_real[idx].reshape(-1,1).astype(np.float32)

        noise = np.random.normal(0,1,(half, LATENT_DIM)).astype(np.float32)
        gen_x = generator.predict(noise, verbose=0)
        gen_y = np.zeros((half,1), dtype=np.float32)

        Xb = np.vstack([real_x, gen_x])
        yb = np.vstack([real_y, gen_y])
        perm = np.random.permutation(len(Xb))
        yield Xb[perm], yb[perm]

train_gen = mixed_batch_generator(X, y_train, BATCH)
steps_per_epoch = max(10, N_train // BATCH)

disc_best_path = os.path.join(OUTPUT_DIR, "discriminator_best_main-3.weights.h5")
checkpoint = ModelCheckpoint(disc_best_path, monitor="val_loss", save_best_only=True, save_weights_only=True, verbose=0)
early = EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True)

discriminator.trainable = True
discriminator.compile(optimizer=tf.keras.optimizers.Adam(0.0002,0.5),
                      loss="binary_crossentropy", metrics=["accuracy"])

print("[Classifier] Fine-tuning discriminator using streamed synthetic batches...")
history = discriminator.fit(
    train_gen,
    steps_per_epoch = steps_per_epoch,
    epochs = CLASSIFIER_EPOCHS,
    validation_data = (X_test.astype(np.float32), y_test.reshape(-1,1).astype(np.float32)),
    callbacks = [checkpoint, early],
    verbose = 1
)
print("Discriminator best weights saved to:", disc_best_path)

# ---------- evaluation & plots ----------
y_prob = discriminator.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# ROC + small plots
try:
    auc_val = roc_auc_score(y_test, y_prob)
except Exception:
    auc_val = None
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(5,5)); plt.plot(fpr, tpr, label=f"AUC={auc_val:.3f}" if auc_val else "ROC"); plt.plot([0,1],[0,1],'k--', alpha=0.3)
plt.title("Discriminator ROC"); plt.legend(); plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "gan_discriminator_roc_main-2.png"), dpi=100); plt.close()

if history is not None:
    plt.figure(figsize=(6,3)); plt.plot(history.history.get("accuracy",[])); plt.plot(history.history.get("val_accuracy",[]))
    plt.title("Train/Val Acc"); plt.savefig(os.path.join(OUTPUT_DIR, "gan_disc_acc_main-2.png"), dpi=100); plt.close()

    plt.figure(figsize=(6,3)); plt.plot(history.history.get("loss",[])); plt.plot(history.history.get("val_loss",[]))
    plt.title("Train/Val Loss"); plt.savefig(os.path.join(OUTPUT_DIR, "gan_disc_loss_main-2.png"), dpi=100); plt.close()

# ---------- save JSON result ----------
res = make_result_dict("Vanilla_GAN_fast", discriminator, X_test, y_test, history)
res.update({
    "best_generator_weights": os.path.basename(gen_best_path),
    "best_discriminator_weights": os.path.basename(disc_best_path),
    "gan_steps_ran": int(gan_steps_ran),
    "gan_epochs_equiv": float(gan_epochs_equiv),
    "timestamp": time.time()
})
save_model_result(res)
print("[JSON] result appended.")

# ---------- storage guard ----------
def bytes_to_mb(n): return n/1024**2
total_bytes = 0
for root,_,files in os.walk(OUTPUT_DIR):
    for f in files:
        total_bytes += os.path.getsize(os.path.join(root,f))
print(f"Output dir size: {bytes_to_mb(total_bytes):.2f} MB")
if total_bytes > SAVE_SIZE_CAP_BYTES:
    print("⚠️ OUTPUT_DIR > 2 GiB - remove artifacts or reduce checkpoints.")

# ---------- cleanup ----------
K.clear_session()
gc.collect()
print("✅ CELL 3 complete. Only best weights + PNGs + JSON saved.")

[info] Mixed precision enabled (float16 compute).
[GAN] features=128 | n_train=3089910
[GAN] Starting adversarial training (steps-based loop)...
[step 0/48279000] D=0.8566 G=0.7086 bestG=0.7086 noImp=0 elapsed=2.5s est_remain=1980658.3m
[GAN] Early stop after 35 steps (no gen improvement for 30 steps).
[GAN] Done. Steps ran: 35 (~0.00 epochs). Best G loss: 0.47509
Generator best weights saved to: /kaggle/working/dl_results/generator_best_main-3.weights.h5
[Classifier] Fine-tuning discriminator using streamed synthetic batches...
Epoch 1/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2807s[0m 58ms/step - accuracy: 0.7814 - loss: 0.3374 - val_accuracy: 0.6534 - val_loss: 0.5921
Epoch 2/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2795s[0m 58ms/step - accuracy: 0.8190 - loss: 0.3031 - val_accuracy: 0.6785 - val_loss: 0.5602
Epoch 3/50
[1m48279/48279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2797s[0m 58ms/step - accuracy: 0.8301 - loss: 0.29