In [None]:
# Mount Google Drive inside the Colab runtime so files under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Switch the working directory to the ABIDE project folder on the mounted drive.
os.chdir("/content/drive/MyDrive/ABIDE")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Add the ABIDE project folder to the module search path.
import sys
sys.path.append('/content/drive/MyDrive/ABIDE')

In [None]:
# === ИМПОРТЫ И НАСТРОЙКИ ===
import numpy as np
import pandas as pd
import os
import scipy.io as sio
import warnings
warnings.filterwarnings('ignore')

# Машинное обучение
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedGroupKFold, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_auc_score, precision_score, recall_score,
                             f1_score, balanced_accuracy_score,
                             average_precision_score, matthews_corrcoef,
                             accuracy_score, confusion_matrix)
from sklearn.calibration import CalibratedClassifierCV
import lightgbm as lgb

print("=== ПАЙПЛАЙН ПЕРВОГО ЭКСПЕРИМЕНТА (С DATA LEAKAGE) ===")

=== ПАЙПЛАЙН ПЕРВОГО ЭКСПЕРИМЕНТА (С DATA LEAKAGE) ===


In [None]:
# === ВСПОМОГАТЕЛЬНЫЕ ФУНКЦИИ МЕТРИК ===
def safe_auc(y_true, y_score):
    """Return ROC AUC, or NaN when ``y_true`` holds fewer than two distinct labels."""
    has_both_classes = np.unique(y_true).size >= 2
    return roc_auc_score(y_true, y_score) if has_both_classes else np.nan

def safe_ap(y_true, y_score):
    """Return average precision, or NaN when ``y_true`` holds fewer than two distinct labels."""
    has_both_classes = np.unique(y_true).size >= 2
    return average_precision_score(y_true, y_score) if has_both_classes else np.nan

def compute_metrics(y_true, y_proba, threshold=0.5):
    """
    Threshold positive-class scores and compute binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; cast to int (1 is treated as the positive class).
    y_proba : array-like
        Scores for the positive class; cast to float.
    threshold : float, default 0.5
        A sample is predicted positive when its score is >= ``threshold``.

    Returns
    -------
    dict
        "AUC" and "AP" (NaN when fewer than two classes are present),
        "Acc", "F1", "Rec", "Prec" (NaN for empty input), "Spec" (0.0 when
        there are no negative samples), "BalAcc" (NaN when fewer than two
        classes are present) and the confusion-matrix counts "TN", "FP",
        "FN", "TP".
    """
    y_true = np.asarray(y_true).astype(int)
    y_proba = np.asarray(y_proba).astype(float)
    y_pred = (y_proba >= threshold).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even when a class is absent, so the
    # four cells can be unpacked directly (no shape check is needed).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    negatives = tn + fp
    spec = tn / negatives if negatives > 0 else 0.0

    has_samples = len(y_true) > 0
    has_both_classes = len(np.unique(y_true)) > 1

    return {
        "AUC": safe_auc(y_true, y_proba),
        "AP": safe_ap(y_true, y_proba),
        "Acc": accuracy_score(y_true, y_pred) if has_samples else np.nan,
        "F1": f1_score(y_true, y_pred, zero_division=0) if has_samples else np.nan,
        "Rec": recall_score(y_true, y_pred, zero_division=0) if has_samples else np.nan,
        "Prec": precision_score(y_true, y_pred, zero_division=0) if has_samples else np.nan,
        "Spec": spec,
        "BalAcc": balanced_accuracy_score(y_true, y_pred) if has_both_classes else np.nan,
        "TN": tn, "FP": fp, "FN": fn, "TP": tp
    }

def print_split_metrics(split_name, m):
    """
    Print one line summarising the metrics dict ``m`` for the split ``split_name``.

    ``m`` must provide the keys AUC, AP, Acc, F1, Rec, Prec, Spec, BalAcc,
    TN, FP, FN and TP. Missing values (None or float NaN) are shown as "nan",
    integers verbatim, everything else with three decimals.
    """
    def render(value):
        is_missing = value is None or (isinstance(value, float) and np.isnan(value))
        if is_missing:
            return "nan"
        if isinstance(value, (int, np.integer)):
            return str(int(value))
        return f"{value:.3f}"

    score_keys = ("AUC", "AP", "Acc", "F1", "Rec", "Prec", "Spec", "BalAcc")
    count_keys = ("TN", "FP", "FN", "TP")

    scores = "  ".join(f"{key}={render(m[key])}" for key in score_keys)
    counts = ",".join(render(m[key]) for key in count_keys)

    print(f"   [{split_name}] {scores}   CM(TN,FP,FN,TP)=({counts})")

In [None]:
# === ФУНКЦИЯ ЗАГРУЗКИ ДАННЫХ ===
def load_and_align_data():
    """
    Load AAL connectivity features and phenotypic labels, keeping them row-aligned.

    Reads the subject-ID list, the phenotypic CSV and one ``<subject>.mat``
    file per subject from hard-coded Google Drive paths. When the ID list or
    the CSV is missing, synthetic placeholders are generated instead (a
    warning is printed; the synthetic phenotypes are random and unseeded).

    A subject is kept only if its .mat file exists, holds a square
    ``'connectivity'`` matrix whose strict upper triangle is free of NaN/Inf,
    and has a row in the phenotypic table. Features and metadata are appended
    together, only after both were built successfully, so row ``i`` of ``X``
    always belongs to ``subject_ids[i]``. Skipped subjects are counted per
    reason and reported instead of being dropped silently.

    Returns
    -------
    tuple of np.ndarray
        ``X`` (one row of upper-triangle values per subject),
        ``y`` (1 when DX_GROUP == 1, else 0), ``sites``, ``ages``,
        ``sexes`` (1 = male, 0 = otherwise) and ``subject_ids``.

    Raises
    ------
    ValueError
        If not a single subject could be loaded.
    """
    print("1. Загрузка и выравнивание данных...")

    # Data locations
    subject_ids_path = '/content/drive/MyDrive/ABIDE/phenotypic_image_quality/subject_IDs.txt'
    pheno_path = '/content/drive/MyDrive/ABIDE/phenotypic.csv'
    aal_path = '/content/drive/MyDrive/ABIDE/AAL/original/'

    # Subject IDs
    if os.path.exists(subject_ids_path):
        # atleast_1d: genfromtxt returns a 0-d array for a single-line file,
        # which could not be iterated below.
        subject_IDs = np.atleast_1d(np.genfromtxt(subject_ids_path, dtype=str))
    else:
        print("ВНИМАНИЕ: Файл с ID не найден, создаем тестовые данные...")
        subject_IDs = [f"{i:05d}" for i in range(50001, 50101)]

    # Phenotypic table
    if os.path.exists(pheno_path):
        pheno_df = pd.read_csv(pheno_path)
    else:
        print("ВНИМАНИЕ: Фенотипические данные не найдены, создаем тестовые...")
        pheno_df = pd.DataFrame({
            'SUB_ID': [int(sid) for sid in subject_IDs],
            'DX_GROUP': np.random.choice([1, 2], size=len(subject_IDs), p=[0.5, 0.5]),
            'SITE_ID': np.random.choice(['SITE_01', 'SITE_02', 'SITE_03'], size=len(subject_IDs)),
            'AGE_AT_SCAN': np.random.uniform(6, 18, size=len(subject_IDs)),
            'SEX': np.random.choice(['M', 'F'], size=len(subject_IDs), p=[0.7, 0.3])
        })

    matrices = []
    meta_data = []
    skip_counts = {}  # reason -> number of subjects dropped for that reason

    def note_skip(reason):
        skip_counts[reason] = skip_counts.get(reason, 0) + 1

    for sid in subject_IDs:
        mat_path = os.path.join(aal_path, f"{sid}.mat")
        if not os.path.exists(mat_path):
            note_skip("нет .mat файла")
            continue

        try:
            mat = sio.loadmat(mat_path)
            if 'connectivity' not in mat:
                note_skip("нет ключа 'connectivity'")
                continue

            conn = mat['connectivity']
            if conn.shape[0] != conn.shape[1]:
                note_skip("матрица не квадратная")
                continue

            # Vectorise the strict upper triangle (diagonal excluded)
            triu_idx = np.triu_indices_from(conn, k=1)
            features = conn[triu_idx]
            if np.any(np.isnan(features)) or np.any(np.isinf(features)):
                note_skip("NaN/Inf в признаках")
                continue

            sub_info = pheno_df[pheno_df['SUB_ID'] == int(sid)]
            if len(sub_info) == 0:
                note_skip("нет фенотипических данных")
                continue

            # ABIDE phenotypic files code SEX numerically (1 = male,
            # 2 = female) while the synthetic fallback above uses 'M'/'F';
            # accept both so real data is not mapped to all zeros.
            sex_raw = str(sub_info['SEX'].values[0]).strip().upper()
            record = {
                'subject_id': sid,
                'site': sub_info['SITE_ID'].values[0],
                'age': sub_info['AGE_AT_SCAN'].values[0],
                'sex': 1 if sex_raw in ('M', '1', '1.0') else 0,
                'diagnosis': 1 if sub_info['DX_GROUP'].values[0] == 1 else 0
            }
        except Exception as e:
            # Best-effort loading: one unreadable subject must not abort the
            # run, but the failure is counted so it stays visible.
            note_skip(f"ошибка: {type(e).__name__}")
            continue

        # Append both parts together, only after everything succeeded, so the
        # two lists can never drift out of alignment.
        matrices.append(features)
        meta_data.append(record)

    if skip_counts:
        details = ", ".join(f"{reason}: {count}" for reason, count in skip_counts.items())
        print(f"   Пропущено субъектов: {sum(skip_counts.values())} ({details})")

    if not matrices:
        raise ValueError(
            "Не загружено ни одного субъекта: проверьте пути к данным и формат файлов"
        )

    # Build the arrays
    X = np.array(matrices)
    y = np.array([m['diagnosis'] for m in meta_data])
    sites = np.array([m['site'] for m in meta_data])
    ages = np.array([m['age'] for m in meta_data])
    sexes = np.array([m['sex'] for m in meta_data])
    subject_ids = np.array([m['subject_id'] for m in meta_data])

    print(f"   Загружено: X={X.shape}, y={y.shape}, sites={sites.shape}")
    print(f"   Классы: ASD={sum(y)} ({sum(y)/len(y)*100:.1f}%), "
          f"Control={len(y)-sum(y)} ({(1-sum(y)/len(y))*100:.1f}%)")

    return X, y, sites, ages, sexes, subject_ids

In [None]:
# === ОТБОР ПРИЗНАКОВ (С DATA LEAKAGE) ===
def stable_feature_selection_cv(X, y, sites, n_features=60, n_splits=10, random_state=42):
    """
    Select a stable, decorrelated feature subset from regularised logistic-regression weights.

    The original author marks this procedure as methodologically incorrect
    (data leakage); that property is intentionally kept, only local defects
    are fixed.

    Procedure
    ---------
    1. Outer ``StratifiedGroupKFold`` (groups = ``sites``) with ``n_splits``
       folds; only the training part of each fold is used.
    2. Inside each outer fold, a 3-fold grouped CV fits three logistic
       regressions (L1, L2, elastic-net) on standardised data. Every model
       adds ``|coef|`` plus ``AUC * |coef|`` (AUC on the inner validation
       part) to the per-feature score of that outer fold.
    3. Stability = mean score / std of scores across outer folds; features in
       the top 10% by stability are candidates.
    4. The ``n_features`` candidates with the highest mean score are kept.
    5. A feature is dropped when its absolute correlation with any
       higher-ranked kept candidate exceeds 0.85.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features_total)
    y : np.ndarray of binary labels
    sites : np.ndarray of group labels used for the grouped splits
    n_features : int, maximum number of candidates before the correlation filter
    n_splits : int, number of outer folds
    random_state : int, base seed for the splitters and the models

    Returns
    -------
    (final_indices, X_selected)
        Column indices ordered from most to least important, and
        ``X[:, final_indices]``.
    """
    print(f"\n3. СУПЕР-СТАБИЛЬНЫЙ отбор признаков (цель: {n_features})...")

    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One column of accumulated scores per outer fold
    n_total = X.shape[1]
    feature_selection_matrix = np.zeros((n_total, n_splits))

    for fold, (train_idx, _) in enumerate(sgkf.split(X, y, groups=sites)):
        X_train = X[train_idx]
        y_train = y[train_idx]
        sites_train = sites[train_idx]

        inner_sgkf = StratifiedGroupKFold(n_splits=3, shuffle=True,
                                         random_state=random_state + fold)

        fold_feature_scores = np.zeros(n_total)

        for inner_fold, (inner_train_idx, inner_val_idx) in enumerate(
            inner_sgkf.split(X_train, y_train, groups=sites_train)):

            X_inner_train = X_train[inner_train_idx]
            y_inner_train = y_train[inner_train_idx]
            X_inner_val = X_train[inner_val_idx]
            y_inner_val = y_train[inner_val_idx]

            # Standardise with statistics of the inner training part only
            scaler = StandardScaler()
            X_inner_scaled = scaler.fit_transform(X_inner_train)
            X_val_scaled = scaler.transform(X_inner_val)

            # Three differently regularised linear models
            models = [
                LogisticRegression(penalty='l1', C=0.005, solver='saga',
                                 max_iter=2000, random_state=random_state + inner_fold),
                LogisticRegression(penalty='l2', C=0.01, solver='saga',
                                 max_iter=2000, random_state=random_state + inner_fold + 100),
                LogisticRegression(penalty='elasticnet', l1_ratio=0.5, C=0.01,
                                 solver='saga', max_iter=2000,
                                 random_state=random_state + inner_fold + 200)
            ]

            # AUC is undefined when the inner validation part holds a single
            # class (roc_auc_score would raise); the AUC-weighted bonus is
            # then skipped for this inner fold.
            val_has_both_classes = len(np.unique(y_inner_val)) > 1

            for model in models:
                model.fit(X_inner_scaled, y_inner_train)
                abs_coef = np.abs(model.coef_[0])
                fold_feature_scores += abs_coef

                if val_has_both_classes:
                    y_pred = model.predict_proba(X_val_scaled)[:, 1]
                    auc = roc_auc_score(y_inner_val, y_pred)
                    fold_feature_scores += (auc * abs_coef)

        feature_selection_matrix[:, fold] = fold_feature_scores

    # Stability: high mean score with low variation across outer folds
    mean_scores = np.mean(feature_selection_matrix, axis=1)
    std_scores = np.std(feature_selection_matrix, axis=1)
    stability_scores = mean_scores / (std_scores + 1e-10)

    # Keep only the top 10% most stable features as candidates
    stability_threshold = np.percentile(stability_scores, 90)
    stable_mask = stability_scores >= stability_threshold

    print(f"   Найдено {np.sum(stable_mask)} стабильных признаков")

    # Rank candidates from most to least important. The descending order
    # matters: the correlation filter below drops the later member of each
    # correlated pair, which must be the less important one.
    stable_idx = np.where(stable_mask)[0]
    ranked = stable_idx[np.argsort(mean_scores[stable_idx])[::-1]]
    selected_indices = ranked[:n_features]

    # Drop highly correlated features (|r| > 0.85). With fewer than two
    # candidates np.corrcoef returns a scalar, so the filter is skipped.
    if len(selected_indices) > 1:
        X_selected_temp = X[:, selected_indices]
        correlation_matrix = np.abs(np.corrcoef(X_selected_temp, rowvar=False))
        # Column j is flagged when it correlates with any earlier column i < j
        high_corr_mask = np.any(np.triu(correlation_matrix, 1) > 0.85, axis=0)
        final_indices = selected_indices[~high_corr_mask]
    else:
        final_indices = selected_indices

    print(f"   После удаления коррелированных: {len(final_indices)} признаков")

    return final_indices, X[:, final_indices]

In [None]:
# === СОЗДАНИЕ МОДЕЛЕЙ ===
def create_ultra_regularized_logistic_regression(X_train, y_train, random_state=42):
    """
    Grid-search a strongly regularised logistic regression and calibrate it.

    Parameters
    ----------
    X_train : array-like, training features (already scaled by the caller)
    y_train : array-like, binary training labels
    random_state : int, seed passed to ``LogisticRegression``

    Returns
    -------
    CalibratedClassifierCV
        Isotonic-calibrated (3-fold) wrapper around the estimator with the
        best cross-validated ROC AUC, fitted on ``X_train`` / ``y_train``.
    """
    print("   Обучение СИЛЬНО регуляризованной Logistic Regression...")

    shared = {
        'C': [0.0001, 0.001, 0.005, 0.01],
        'class_weight': ['balanced'],
        'solver': ['saga']
    }
    # l1_ratio only has an effect with the elastic-net penalty. Crossing it
    # with 'l1'/'l2' (as a single dict grid would) fits every such model three
    # times with identical results, so the grid is split in two.
    param_grid = [
        {**shared, 'penalty': ['l1', 'l2']},
        {**shared, 'penalty': ['elasticnet'], 'l1_ratio': [0.1, 0.5, 0.9]},
    ]

    grid_search = GridSearchCV(
        LogisticRegression(max_iter=3000, random_state=random_state),
        param_grid,
        cv=3,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_train, y_train)

    # Calibrate the winning configuration; CalibratedClassifierCV refits it
    # inside its own 3-fold CV.
    calibrated_model = CalibratedClassifierCV(
        grid_search.best_estimator_,
        method='isotonic',
        cv=3
    )

    calibrated_model.fit(X_train, y_train)

    return calibrated_model


def create_ultra_regularized_lightgbm(X_train, y_train, X_val, y_val, random_state=42):
    """
    Train a heavily regularised LightGBM binary classifier with early stopping.

    The validation pair (``X_val``, ``y_val``) is the only evaluation set; training
    stops after 20 rounds without AUC improvement, capped at 200 rounds.
    Returns the trained ``lgb.Booster``.
    """
    print("   Обучение СИЛЬНО регуляризованного LightGBM...")

    # Re-weight positives by the negative/positive ratio; fall back to 1 when
    # the training labels contain no positives.
    if sum(y_train) > 0:
        scale_pos_weight = len(y_train[y_train==0]) / len(y_train[y_train==1])
    else:
        scale_pos_weight = 1

    # Task definition
    params = dict(objective='binary', metric='auc', boosting_type='gbdt')
    # Tree shape and learning rate
    params.update(num_leaves=8, max_depth=3, learning_rate=0.01, max_bin=64)
    # Row / column subsampling
    params.update(feature_fraction=0.5, bagging_fraction=0.5, bagging_freq=1)
    # Explicit regularisation
    params.update(reg_alpha=1.0, reg_lambda=2.0, min_child_samples=30,
                  min_child_weight=0.01, min_split_gain=0.01)
    # Class balance, reproducibility, runtime
    params.update(scale_pos_weight=scale_pos_weight, random_state=random_state,
                  n_jobs=-1, verbosity=-1)

    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    quiet_callbacks = [
        lgb.early_stopping(stopping_rounds=20, verbose=False),
        lgb.log_evaluation(period=0)
    ]

    return lgb.train(
        params,
        train_set,
        valid_sets=[valid_set],
        num_boost_round=200,
        callbacks=quiet_callbacks
    )

In [None]:
# === ОЦЕНКА МОДЕЛЕЙ ===
def _pick_f1_threshold(y_val, y_proba_val):
    """Return the (threshold, F1) pair that maximises validation F1 over 50 cut-offs in [0.3, 0.7].

    Falls back to (0.5, 0) when no cut-off yields a positive F1.
    """
    best_threshold = 0.5
    best_f1 = 0

    for threshold in np.linspace(0.3, 0.7, 50):
        y_val_pred = (y_proba_val >= threshold).astype(int)
        f1 = f1_score(y_val, y_val_pred, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    return best_threshold, best_f1


def _evaluate_with_site_cv(title, X, y, sites, random_state, fit_predictor, n_splits=5):
    """
    Shared site-grouped cross-validation loop used by all ``evaluate_*`` functions.

    Per fold: split by site groups, carve a stratified 15% validation set out
    of the training part, fit a RobustScaler on the remaining training data,
    train via ``fit_predictor``, choose the decision threshold on the
    validation set and report TRAIN / VAL / TEST metrics.

    Parameters
    ----------
    title : str, header printed before the folds
    X, y, sites : np.ndarray, features, binary labels and group labels
    random_state : int, seed for the fold splitter and the validation split
    fit_predictor : callable ``(X_train, y_train, X_val, y_val, fold)`` that
        trains the model(s) on scaled data and returns a function mapping a
        scaled feature matrix to positive-class probabilities
    n_splits : int, number of outer folds

    Returns
    -------
    dict of lists with one entry per fold: test_auc, test_accuracy, test_f1,
    test_recall, test_precision, test_specificity, test_balanced_acc,
    train_auc, auc_diff. AUC-based entries are NaN for a fold whose labels
    contain a single class.
    """
    print("\n" + "_"*70)
    print(title)
    print("_"*70)

    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    results = {
        'test_auc': [], 'test_accuracy': [], 'test_f1': [], 'test_recall': [],
        'test_precision': [], 'test_specificity': [],
        'test_balanced_acc': [], 'train_auc': [], 'auc_diff': []
    }

    for fold, (train_idx, test_idx) in enumerate(sgkf.split(X, y, groups=sites), 1):
        print(f"\nФолд {fold}/{n_splits}:")
        print("-" * 40)

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Validation set for early stopping / threshold selection.
        # NOTE(review): this split is stratified by label only, not grouped
        # by site, so validation subjects share sites with training subjects.
        X_train_main, X_val, y_train_main, y_val = train_test_split(
            X_train, y_train, test_size=0.15, stratify=y_train, random_state=random_state
        )

        # Scaling statistics come from the training part only
        scaler = RobustScaler(quantile_range=(10, 90))
        X_train_scaled = scaler.fit_transform(X_train_main)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)

        print(f"   Train: {len(y_train_main)}, Val: {len(y_val)}, Test: {len(y_test)}")

        predict_proba = fit_predictor(X_train_scaled, y_train_main, X_val_scaled, y_val, fold)

        y_proba_val = predict_proba(X_val_scaled)
        y_proba_train = predict_proba(X_train_scaled)
        y_proba_test = predict_proba(X_test_scaled)

        best_threshold, best_f1 = _pick_f1_threshold(y_val, y_proba_val)
        print(f"   Оптимальный порог: {best_threshold:.3f} (Val F1={best_f1:.3f})")

        m_train = compute_metrics(y_train_main, y_proba_train, threshold=best_threshold)
        m_val   = compute_metrics(y_val, y_proba_val, threshold=best_threshold)
        m_test  = compute_metrics(y_test, y_proba_test, threshold=best_threshold)

        print_split_metrics("TRAIN", m_train)
        print_split_metrics("VAL ", m_val)
        print_split_metrics("TEST", m_test)

        # Reuse the metrics that were just computed instead of recomputing
        # every score a second time with separate sklearn calls.
        results['test_auc'].append(m_test['AUC'])
        results['test_accuracy'].append(m_test['Acc'])
        results['test_f1'].append(m_test['F1'])
        results['test_recall'].append(m_test['Rec'])
        results['test_precision'].append(m_test['Prec'])
        results['test_specificity'].append(m_test['Spec'])
        results['test_balanced_acc'].append(m_test['BalAcc'])
        results['train_auc'].append(m_train['AUC'])
        results['auc_diff'].append(m_train['AUC'] - m_test['AUC'])

    return results


def evaluate_logistic_regression(X, y, sites, random_state=42):
    """
    Evaluate the calibrated logistic regression alone with 5-fold site-grouped CV.

    Returns the per-fold results dict described in ``_evaluate_with_site_cv``.
    """
    def fit_predictor(X_train, y_train, X_val, y_val, fold):
        model = create_ultra_regularized_logistic_regression(X_train, y_train, random_state + fold)
        return lambda X_part: model.predict_proba(X_part)[:, 1]

    return _evaluate_with_site_cv("LOGISTIC REGRESSION", X, y, sites, random_state, fit_predictor)


def evaluate_lightgbm(X, y, sites, random_state=42):
    """
    Evaluate LightGBM alone with 5-fold site-grouped CV.

    Returns the per-fold results dict described in ``_evaluate_with_site_cv``.
    """
    def fit_predictor(X_train, y_train, X_val, y_val, fold):
        model = create_ultra_regularized_lightgbm(X_train, y_train, X_val, y_val, random_state + fold)
        return model.predict

    return _evaluate_with_site_cv("LIGHTGBM", X, y, sites, random_state, fit_predictor)


def evaluate_ensemble(X, y, sites, random_state=42):
    """
    Evaluate the equal-weight average of logistic regression and LightGBM probabilities.

    Returns the per-fold results dict described in ``_evaluate_with_site_cv``.
    """
    def fit_predictor(X_train, y_train, X_val, y_val, fold):
        lr_model = create_ultra_regularized_logistic_regression(X_train, y_train, random_state + fold)
        lgb_model = create_ultra_regularized_lightgbm(X_train, y_train, X_val, y_val, random_state + fold + 100)
        return lambda X_part: (lr_model.predict_proba(X_part)[:, 1] + lgb_model.predict(X_part)) / 2

    return _evaluate_with_site_cv(
        "LOGISTIC REGRESSION + LIGHTGBM (АНСАМБЛЬ)", X, y, sites, random_state, fit_predictor
    )

In [None]:
# === ВЫВОД РЕЗУЛЬТАТОВ ===
def print_model_results(model_name, results, feature_count, sample_count):
    """
    Print the cross-validation report for one model.

    ``results`` maps metric names (test_auc, test_accuracy, test_f1,
    test_recall, test_precision, test_specificity, test_balanced_acc,
    auc_diff) to per-fold lists. The report shows fold means (and the std for
    AUC, accuracy and F1), the feature/sample ratio, a verdict on the
    train-test AUC gap, a verdict on the AUC spread and the per-fold AUCs.
    """
    banner = "="*80
    rule = "-" * 70

    print("\n" + banner)
    print(f"РЕЗУЛЬТАТЫ: {model_name}")
    print(banner)

    # (row label, results key) in display order
    table_rows = [
        ('AUC', 'test_auc'),
        ('Accuracy', 'test_accuracy'),
        ('F1-Score', 'test_f1'),
        ('Recall/Sensitivity', 'test_recall'),
        ('Precision', 'test_precision'),
        ('Specificity', 'test_specificity'),
        ('Balanced Accuracy', 'test_balanced_acc'),
        ('Разница AUC (Train-Test)', 'auc_diff'),
    ]
    averages = {key: np.mean(results[key]) for _, key in table_rows}
    # A standard deviation is reported for the first three metrics only
    spreads = {key: np.std(results[key]) for key in ('test_auc', 'test_accuracy', 'test_f1')}

    print(f"\nСРЕДНИЕ МЕТРИКИ (по 5 фолдам):")
    print(rule)
    print(f"{'Метрика':<25} {'Среднее':<10} {'Стандартное отклонение':<25}")
    print(rule)
    for label, key in table_rows:
        spread_cell = f"{spreads[key]:<25.3f}" if key in spreads else f"{'':<25}"
        print(f"{label:<25} {averages[key]:<10.3f} {spread_cell}")

    print(f"\nСТАТИСТИКА ДАННЫХ:")
    print(f"   Количество признаков: {feature_count}")
    print(f"   Количество образцов:  {sample_count}")
    print(f"   Отношение признаков к образцам: {feature_count/sample_count:.3f}")

    # First matching upper bound wins; the fallback covers everything else
    auc_gap = abs(averages['auc_diff'])
    overfit_levels = [
        (0.03, "   Практически нет переобучения (разница AUC < 0.03)"),
        (0.05, "   Минимальное переобучение (разница AUC < 0.05)"),
        (0.07, "   Умеренное переобучение (разница AUC < 0.07)"),
    ]
    print(f"\nАНАЛИЗ ПЕРЕОБУЧЕНИЯ:")
    print(next((text for limit, text in overfit_levels if auc_gap < limit),
               "   Значительное переобучение (разница AUC >= 0.07)"))

    auc_spread = spreads['test_auc']
    stability_levels = [
        (0.03, "   Отличная стабильность между фолдами (std AUC < 0.03)"),
        (0.05, "   Хорошая стабильность между фолдами (std AUC < 0.05)"),
    ]
    print(f"\nСТАБИЛЬНОСТЬ МОДЕЛИ:")
    print(next((text for limit, text in stability_levels if auc_spread < limit),
               "   Нестабильные результаты между фолдами (std AUC >= 0.05)"))

    print(f"\nAUC ПО ФОЛДАМ:")
    for fold_no, fold_auc in enumerate(results['test_auc'], 1):
        print(f"   Fold {fold_no}: {fold_auc:.3f}")

    print("\n" + banner)

In [None]:
# === ГЛАВНЫЙ ПАЙПЛАЙН ===
def run_all_models_pipeline():
    """
    Run the first experiment end to end (the variant labelled as having data leakage).

    Steps: load the data, drop IsolationForest outliers, robust-scale the
    whole data set, select features on the whole data set, then evaluate
    logistic regression, LightGBM and their ensemble and print each report.

    Returns a dict with ``'success': True``, per-model mean metrics under
    'logistic_regression' / 'lightgbm' / 'ensemble', plus 'feature_count' and
    'sample_count'. On any exception the traceback is printed and
    ``{'success': False, 'error': <message>}`` is returned.
    """
    banner = "="*80

    print(banner)
    print("ПАЙПЛАЙН ПЕРВОГО ЭКСПЕРИМЕНТА (С DATA LEAKAGE)")
    print(banner)

    try:
        # 1. Load data
        X, y, sites, ages, sexes, subject_ids = load_and_align_data()

        # 2. Outlier removal (fitted on every sample)
        print("\n2. АГРЕССИВНАЯ предобработка данных...")
        from sklearn.ensemble import IsolationForest
        detector = IsolationForest(contamination=0.05, random_state=42)
        inliers = detector.fit_predict(X) == 1

        X, y, sites = X[inliers], y[inliers], sites[inliers]

        print(f"   Удалено {np.sum(~inliers)} выбросов")
        print(f"   Осталось: X={X.shape}, y={y.shape}")

        X_scaled = RobustScaler(quantile_range=(10, 90)).fit_transform(X)

        # 3. Feature selection on the full data set (this is the leakage)
        feature_indices, X_selected = stable_feature_selection_cv(
            X_scaled, y, sites, n_features=60
        )
        n_selected = X_selected.shape[1]
        n_samples = X_selected.shape[0]

        print(f"\nФИНАЛЬНАЯ РАЗМЕРНОСТЬ ДАННЫХ:")
        print(f"   Признаков: {n_selected}")
        print(f"   Образцов:  {n_samples}")
        print(f"   Отношение признаков к образцам: {n_selected / n_samples:.3f}")

        # 4. Evaluate every model
        print("\n" + banner)
        print("НАЧАЛО ОЦЕНКИ МОДЕЛЕЙ")
        print(banner)

        # (result key, report title, evaluation function) in run order
        evaluations = (
            ('logistic_regression', "LOGISTIC REGRESSION", evaluate_logistic_regression),
            ('lightgbm', "LIGHTGBM", evaluate_lightgbm),
            ('ensemble', "LOGISTIC REGRESSION + LIGHTGBM (АНСАМБЛЬ)", evaluate_ensemble),
        )
        fold_results = {}
        for key, title, evaluate in evaluations:
            fold_results[key] = evaluate(X_selected, y, sites, random_state=42)
            print_model_results(title, fold_results[key], n_selected, n_samples)

        # Summary
        print("\n" + banner)
        print("СВОДНАЯ ИНФОРМАЦИЯ")
        print(banner)

        print(f"\nОБЩАЯ СТАТИСТИКА:")
        print(f"   Количество образцов: {n_samples}")
        print(f"   Количество признаков: {n_selected}")
        print(f"   Баланс классов: ASD={sum(y)} ({sum(y)/len(y)*100:.1f}%), Control={len(y)-sum(y)} ({(1-sum(y)/len(y))*100:.1f}%)")

        summary_keys = ('test_auc', 'test_accuracy', 'test_f1',
                        'test_recall', 'test_precision', 'auc_diff')

        def summarize(per_fold):
            # Mean over folds for each reported metric
            return {metric: np.mean(per_fold[metric]) for metric in summary_keys}

        outcome = {'success': True}
        for key, _, _ in evaluations:
            outcome[key] = summarize(fold_results[key])
        outcome['feature_count'] = n_selected
        outcome['sample_count'] = n_samples
        return outcome

    except Exception as e:
        print(f"\nОШИБКА: {e}")
        import traceback
        traceback.print_exc()
        return {'success': False, 'error': str(e)}

In [None]:
# === ЗАПУСК ПАЙПЛАЙНА ===
# Entry point: run the pipeline and print either the comparison table or the error.
if __name__ == "__main__":
    print("ЗАПУСК ПЕРВОГО ЭКСПЕРИМЕНТА (С DATA LEAKAGE)...")
    print("ВНИМАНИЕ: Этот пайплайн содержит методологические ошибки!\n")

    result = run_all_models_pipeline()
    banner = "="*80
    rule = "-" * 80

    if not result['success']:
        print("\n" + banner)
        print("ПАЙПЛАЙН ЗАВЕРШИЛСЯ С ОШИБКОЙ")
        print(banner)
        print(f"Ошибка: {result.get('error', 'Неизвестная ошибка')}")
    else:
        print("\n" + banner)
        print("ПАЙПЛАЙН УСПЕШНО ЗАВЕРШЕН!")
        print("ВНИМАНИЕ: Результаты завышены из-за data leakage")
        print(banner)

        print(f"\nИТОГОВЫЕ РЕЗУЛЬТАТЫ (ЗАВЫШЕННЫЕ):")
        print(rule)
        print(f"{'Модель':<35} {'AUC':<8} {'Accuracy':<10} {'F1':<8} {'Recall':<8} {'Precision':<10}")
        print(rule)

        # (display label, key in the result dict), one table row per model
        table_rows = (
            ('Logistic Regression', 'logistic_regression'),
            ('LightGBM', 'lightgbm'),
            ('Logistic Regression + LightGBM', 'ensemble'),
        )
        for label, key in table_rows:
            scores = result[key]
            print(f"{label:<35} {scores['test_auc']:<8.3f} "
                  f"{scores['test_accuracy']:<10.3f} "
                  f"{scores['test_f1']:<8.3f} "
                  f"{scores['test_recall']:<8.3f} "
                  f"{scores['test_precision']:<10.3f}")
        print(rule)

        print(f"\n ВАЖНО: Эти результаты содержат data leakage!")
        print("Отбор признаков выполнялся на всех данных, включая тестовые.")
        print("Для корректных результатов используйте второй пайплайн.")

ЗАПУСК ПЕРВОГО ЭКСПЕРИМЕНТА (С DATA LEAKAGE)...
ВНИМАНИЕ: Этот пайплайн содержит методологические ошибки!

ПАЙПЛАЙН ПЕРВОГО ЭКСПЕРИМЕНТА (С DATA LEAKAGE)
1. Загрузка и выравнивание данных...
   Загружено: X=(871, 6670), y=(871,), sites=(871,)
   Классы: ASD=403 (46.3%), Control=468 (53.7%)

2. АГРЕССИВНАЯ предобработка данных...
   Удалено 44 выбросов
   Осталось: X=(827, 6670), y=(827,)

3. СУПЕР-СТАБИЛЬНЫЙ отбор признаков (цель: 60)...
   Найдено 667 стабильных признаков
   После удаления коррелированных: 59 признаков

ФИНАЛЬНАЯ РАЗМЕРНОСТЬ ДАННЫХ:
   Признаков: 59
   Образцов:  827
   Отношение признаков к образцам: 0.071

НАЧАЛО ОЦЕНКИ МОДЕЛЕЙ

______________________________________________________________________
LOGISTIC REGRESSION
______________________________________________________________________

Фолд 1/5:
----------------------------------------
   Train: 668, Val: 119, Test: 40
   Обучение СИЛЬНО регуляризованной Logistic Regression...
   Оптимальный порог: 0.463 (Val F1=