In [1]:
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder

# Load dataset (replace with actual path or CSV string if available)
# csv_content = """[full CSV string]"""
# df = pd.read_csv(StringIO(csv_content))
df = pd.read_csv('/content/ucec_tcga_clinical_data.csv')

# Step 1: Basic statistics
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Missing Percent:\n", df.isnull().mean() * 100)
print("Unique Patients:", df['Patient ID'].nunique())
print("Disease Free Status Counts:\n", df['Disease Free Status'].value_counts())
print("Diagnosis Age Stats:\n", df['Diagnosis Age'].describe())
print("Neoplasm Disease Stage American Joint Committee on Cancer Code Counts:\n",
      df['Neoplasm Disease Stage American Joint Committee on Cancer Code'].value_counts())
print("Neoplasm Histologic Grade Counts:\n", df['Neoplasm Histologic Grade'].value_counts())
print("Cancer Type Detailed Counts:\n", df['Cancer Type Detailed'].value_counts())
print("Mutation Count Stats:\n", df['Mutation Count'].describe())
print("TMB (nonsynonymous) Stats:\n", df['TMB (nonsynonymous)'].describe())
print("Fraction Genome Altered Stats:\n", df['Fraction Genome Altered'].describe())

# Step 2: Cleaning and Imputation
# Replace 'NA' with np.nan
df = df.replace('NA', np.nan)

# Drop columns with >80% missing
missing_percent = df.isnull().mean() * 100
high_missing_cols = missing_percent[missing_percent > 80].index.tolist()
print(f"\nDropping {len(high_missing_cols)} columns with >80% missing: {high_missing_cols}")
df = df.drop(columns=high_missing_cols)

# Drop rows with missing target
print("\nRows before dropping missing target:", df.shape[0])
print("Disease Free Status counts before dropna:\n", df['Disease Free Status'].value_counts())
df = df.dropna(subset=['Disease Free Status'])
print("Rows after dropping missing target:", df.shape[0])
print("Disease Free Status counts after dropna:\n", df['Disease Free Status'].value_counts())

# Define column groups
genomic_cols = ['Mutation Count', 'TMB (nonsynonymous)', 'Fraction Genome Altered']
continuous_cols = ['Diagnosis Age', 'Tumor invasion percent', 'Patient Weight',
                  'Days to Sample Collection.', 'Overall Survival (Months)']
categorical_cols = ['Person Neoplasm Status', 'Menopause Status', 'Ethnicity Category',
                   'Surgical Margin Resection Status', 'Neoplasm American Joint Committee on Cancer Clinical Group Stage']

# Verify categorical columns exist
missing_cats = [col for col in categorical_cols if col not in df.columns]
if missing_cats:
    print(f"Warning: These categorical columns are missing: {missing_cats}")
    # Adjust categorical_cols to only include existing columns
    categorical_cols = [col for col in categorical_cols if col in df.columns]

# Ordinal encode 'Neoplasm Histologic Grade'
ordinal_encoder = OrdinalEncoder(categories=[['G1', 'G2', 'G3', 'High Grade']], handle_unknown='use_encoded_value', unknown_value=-1)
df['Neoplasm Histologic Grade'] = ordinal_encoder.fit_transform(df[['Neoplasm Histologic Grade']].fillna('G1'))

# One-hot encode 'Cancer Type Detailed'
df = pd.get_dummies(df, columns=['Cancer Type Detailed'], prefix='Cancer Type Detailed', drop_first=True, dtype=np.uint8)

# Features for KNN imputation
feature_cols_for_knn = [
    'Diagnosis Age', 'Neoplasm Histologic Grade', 'Patient Weight', 'Tumor invasion percent',
    'Cancer Type Detailed_Uterine Endometrioid Carcinoma',
    'Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma'
] + genomic_cols

# Subtype-specific KNN imputation
subtypes = [
    'Cancer Type Detailed_Endometrial Carcinoma',  # Inferred reference
    'Cancer Type Detailed_Uterine Endometrioid Carcinoma',
    'Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma'
]
for subtype in subtypes:
    if subtype == 'Cancer Type Detailed_Endometrial Carcinoma':
        mask = (df['Cancer Type Detailed_Uterine Endometrioid Carcinoma'] == 0) & \
               (df['Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma'] == 0)
    else:
        mask = df[subtype] == 1
    if mask.sum() > 0:
        imputer = KNNImputer(n_neighbors=5, weights='distance')
        df.loc[mask, feature_cols_for_knn] = imputer.fit_transform(df.loc[mask, feature_cols_for_knn])

# Fallback global KNN
if df[genomic_cols].isnull().any().any():
    print("\nApplying global KNN imputation...")
    imputer = KNNImputer(n_neighbors=5, weights='distance')
    df[feature_cols_for_knn] = imputer.fit_transform(df[feature_cols_for_knn])

# Mean imputation for continuous
for col in continuous_cols:
    df[col] = df[col].fillna(df[col].mean())

# Mode imputation for categorical
for col in categorical_cols:
    mode_value = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
    df[col] = df[col].fillna(mode_value)

# Reconstruct 'Cancer Type Detailed'
def reconstruct_subtype(row):
    if row['Cancer Type Detailed_Uterine Endometrioid Carcinoma']:
        return 'Uterine Endometrioid Carcinoma'
    elif row['Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma']:
        return 'Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma'
    else:
        return 'Endometrial Carcinoma'
df['Cancer Type Detailed'] = df.apply(reconstruct_subtype, axis=1)

# Log-transform skewed columns
skewed_cols = ['Mutation Count', 'TMB (nonsynonymous)']
for col in skewed_cols:
    if col in df.columns:
        df[col] = np.log1p(df[col])

# Standardize numeric columns
numeric_cols = [
    'Diagnosis Age', 'Mutation Count', 'TMB (nonsynonymous)', 'Fraction Genome Altered',
    'Patient Weight', 'Tumor invasion percent', 'Days to Sample Collection.',
    'Overall Survival (Months)'
]
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Encode categorical columns
nominal_cols = [col for col in ['Person Neoplasm Status', 'Ethnicity Category', 'Surgical Margin Resection Status'] if col in df.columns]
ordinal_cols = [col for col in ['Menopause Status', 'Neoplasm American Joint Committee on Cancer Clinical Group Stage'] if col in df.columns]

# One-hot encode nominal columns
if nominal_cols:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ohe_nominal = pd.DataFrame(ohe.fit_transform(df[nominal_cols]),
                               columns=ohe.get_feature_names_out(nominal_cols),
                               index=df.index)
    df = pd.concat([df, ohe_nominal], axis=1)
    df.drop(columns=nominal_cols, inplace=True)
else:
    print("No nominal columns to encode.")

# Ordinal encode ordinal columns
if ordinal_cols:
    ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df[ordinal_cols] = ord_encoder.fit_transform(df[ordinal_cols])
else:
    print("No ordinal columns to encode.")

# Define feature columns
feature_cols = numeric_cols + ['Neoplasm Histologic Grade'] + \
               ['Cancer Type Detailed_Uterine Endometrioid Carcinoma',
                'Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma'] + \
               (list(ohe.get_feature_names_out(nominal_cols)) if nominal_cols else []) + ordinal_cols

# Verify dtypes
print("\nFeature dtypes:\n", df[feature_cols].dtypes)
assert all(df[feature_cols].dtypes.apply(lambda x: x in ['float64', 'int64', 'uint8', 'bool'])), "Non-numeric dtypes in feature_cols"

# Convert any remaining object columns to numeric
for col in feature_cols:
    if df[col].dtype == 'object':
        df[col] = df[col].astype(np.uint8)
print("\nFeature dtypes after conversion:\n", df[feature_cols].dtypes)

# Prepare X and y
print("\nDisease Free Status unique values:\n", df['Disease Free Status'].unique())
y = df['Disease Free Status'].str.split(':').str[0].map({'DiseaseFree': 0, 'Recurred/Progressed': 1})
if y.isna().any():
    print("Warning: NaNs in target, filling with mode")
    # Check if mode() is empty before accessing [0]
    if not y.mode().empty:
        y = y.fillna(y.mode()[0])
    else:
        print("Warning: Target variable 'y' contains only NaNs and has no mode. Cannot fill NaNs with mode.")

le = LabelEncoder()
y = le.fit_transform(y)
print("y unique values:", np.unique(y))
assert set(np.unique(y)).issubset({0, 1}), "y is not binary"

# Print final DataFrame columns
print("\nFinal DataFrame columns:\n", df.columns.tolist())
print("\nProcessed DataFrame:\n", df)

Shape: (549, 98)
Columns: ['Study ID', 'Patient ID', 'Sample ID', 'Diagnosis Age', 'Neoplasm Disease Stage American Joint Committee on Cancer Code', 'American Joint Committee on Cancer Publication Version Type', 'Cancer Type', 'Cancer Type Detailed', 'Neoplasm American Joint Committee on Cancer Clinical Distant Metastasis M Stage', 'Neoplasm American Joint Committee on Cancer Clinical Regional Lymph Node N Stage', 'Neoplasm American Joint Committee on Cancer Clinical Primary Tumor T Stage', 'Neoplasm American Joint Committee on Cancer Clinical Group Stage', 'Days to Sample Collection.', 'Last Alive Less Initial Pathologic Diagnosis Date Calculated Day Value', 'Days to Sample Procurement', 'Disease Free (Months)', 'Disease Free Status', 'Participant Personal Medical History Diabetes Mellitus Ind-3', 'Disease code', 'Ethnicity Category', 'Lymphomatous Extranodal Site Involvement Indicator', 'Form completion date', 'Fraction Genome Altered', 'Neoplasm Histologic Grade', 'Patient Height', 

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Debug 'Disease Free Status' values
print("Disease Free Status unique values:\n", df['Disease Free Status'].unique())
print("Disease Free Status counts:\n", df['Disease Free Status'].value_counts())

# Map target values
y = df['Disease Free Status'].str.strip().map({'0:DiseaseFree': 0, '1:Recurred/Progressed': 1})
print("y values after mapping:\n", y.unique())

# Check for NaNs
if y.isna().any():
    print(f"Warning: {y.isna().sum()} NaNs in target, filling with mode")
    mode_value = y.mode()[0]
    y = y.fillna(mode_value)
    print("y values after filling NaNs:\n", y.unique())

# Convert to numeric
y = y.astype(int)
print("y unique values after conversion:\n", np.unique(y))
assert set(np.unique(y)).issubset({0, 1}), "y is not binary"

# Verify X
X = df[feature_cols].values
print("X dtype:", X.dtype)
assert X.dtype in [np.float64, np.float32, np.int64, np.int32], "X contains non-numeric values"

Disease Free Status unique values:
 ['1:Recurred/Progressed' '0:DiseaseFree']
Disease Free Status counts:
 Disease Free Status
0:DiseaseFree            398
1:Recurred/Progressed    110
Name: count, dtype: int64
y values after mapping:
 [1 0]
y unique values after conversion:
 [0 1]
X dtype: float64


In [3]:
!pip install catboost
!pip install xgboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Verify inputs
print("Feature columns:", feature_cols)
print("X shape:", X.shape)
print("y unique values:", np.unique(y))

# Convert y to numpy array
y_np = y.values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_np, test_size=0.2, random_state=42, stratify=y_np)

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.8, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define models and hyperparameter grids
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [5, 7, 9],
    'learning_rate': [0.05, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [1, 2, 3]
}

catboost = CatBoostClassifier(random_state=42, verbose=0)
catboost_grid = {
    'iterations': [200, 300, 400],
    'depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'l2_leaf_reg': [1, 3]
}

# Tune models
xgb_search = GridSearchCV(xgb, xgb_grid, cv=5, scoring='f1', n_jobs=-1)
xgb_search.fit(X_train_smote, y_train_smote)
best_xgb = xgb_search.best_estimator_
print("Best XGBoost Hyperparameters:", xgb_search.best_params_)

catboost_search = GridSearchCV(catboost, catboost_grid, cv=5, scoring='f1', n_jobs=-1)
catboost_search.fit(X_train_smote, y_train_smote)
best_catboost = catboost_search.best_estimator_
print("Best CatBoost Hyperparameters:", catboost_search.best_params_)

# Generate out-of-fold predictions for base models
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
base_predictions = {'XGBoost': np.zeros(len(y_np)), 'CatBoost': np.zeros(len(y_np))}
meta_features = np.zeros((len(X), 2))  # For two base models

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_np)):
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y_np[train_idx], y_np[val_idx]
    X_train_fold_smote, y_train_fold_smote = smote.fit_resample(X_train_fold, y_train_fold)

    # XGBoost
    best_xgb.fit(X_train_fold_smote, y_train_fold_smote)
    base_predictions['XGBoost'][val_idx] = best_xgb.predict(X_val_fold)
    meta_features[val_idx, 0] = best_xgb.predict_proba(X_val_fold)[:, 1]

    # CatBoost
    best_catboost.fit(X_train_fold_smote, y_train_fold_smote)
    base_predictions['CatBoost'][val_idx] = best_catboost.predict(X_val_fold)
    meta_features[val_idx, 1] = best_catboost.predict_proba(X_val_fold)[:, 1]

# Classification reports for base models
for name in ['XGBoost', 'CatBoost']:
    print(f"\nClassification Report for {name} (Out-of-Fold):")
    print(classification_report(y_np, base_predictions[name], target_names=['DiseaseFree', 'Recurred/Progressed']))

# Define and train stacking ensemble
estimators = [
    ('xgb', best_xgb),
    ('catboost', best_catboost)
]
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42),
    cv=5
)
stacking_clf.fit(X_train_smote, y_train_smote)

# Evaluate stacking ensemble
y_pred_stack = stacking_clf.predict(X_test)
y_pred_stack_proba = stacking_clf.predict_proba(X_test)[:, 1]
print("\nClassification Report for Stacking Ensemble:")
print(classification_report(y_test, y_pred_stack, target_names=['DiseaseFree', 'Recurred/Progressed']))
print("Stacking Ensemble AUC:", roc_auc_score(y_test, y_pred_stack_proba))

# Threshold tuning for Stacking Ensemble
fpr, tpr, thresholds = roc_curve(y_test, y_pred_stack_proba)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("\nOptimal Threshold for Stacking Ensemble:", optimal_threshold)

y_pred_stack_optimal = (y_pred_stack_proba >= optimal_threshold).astype(int)
print("\nClassification Report for Stacking Ensemble (Optimal Threshold):")
print(classification_report(y_test, y_pred_stack_optimal, target_names=['DiseaseFree', 'Recurred/Progressed']))
print("Stacking Ensemble (Optimal Threshold) AUC:", roc_auc_score(y_test, y_pred_stack_proba))

Feature columns: ['Diagnosis Age', 'Mutation Count', 'TMB (nonsynonymous)', 'Fraction Genome Altered', 'Patient Weight', 'Tumor invasion percent', 'Days to Sample Collection.', 'Overall Survival (Months)', 'Neoplasm Histologic Grade', 'Cancer Type Detailed_Uterine Endometrioid Carcinoma', 'Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma', 'Person Neoplasm Status_TUMOR FREE', 'Person Neoplasm Status_WITH TUMOR', 'Ethnicity Category_HISPANIC OR LATINO', 'Ethnicity Category_NOT HISPANIC OR LATINO', 'Surgical Margin Resection Status_R0', 'Surgical Margin Resection Status_R1', 'Surgical Margin Resection Status_R2', 'Surgical Margin Resection Status_RX', 'Menopause Status', 'Neoplasm American Joint Committee on Cancer Clinical Group Stage']
X shape: (508, 21)
y unique values: [0 1]
Best XGBoost Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 9, 'n_estimators': 200, 'scale_pos_weight': 3, 'subsample': 0.8}
Best CatBoost Hyperparam

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
import numpy as np

# Verify inputs
print("Feature columns:", feature_cols)
print("X shape:", X.shape)
print("y unique values:", np.unique(y))

# Convert y to numpy array
y_np = y.values

# Define base models
base_models = [
    xgb.XGBClassifier(random_state=42, n_estimators=100, base_score=0.5),
    CatBoostClassifier(random_state=42, iterations=100, verbose=False),
    lgb.LGBMClassifier(random_state=42, n_estimators=100),
    MLPClassifier(random_state=42, hidden_layer_sizes=(50, 50), max_iter=500)
]

# Generate out-of-fold meta-features and base model predictions
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
meta_features = np.zeros((len(X), len(base_models)))
base_predictions = {f"base_model_{i}": np.zeros(len(X)) for i in range(len(base_models))}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_np)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_np[train_idx], y_np[val_idx]
    for model_idx, base_model in enumerate(base_models):
        base_model.fit(X_train, y_train)
        meta_features[val_idx, model_idx] = base_model.predict_proba(X_val)[:, 1]
        base_predictions[f"base_model_{model_idx}"][val_idx] = base_model.predict(X_val)

# Train and evaluate meta-models
meta_models = [
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("XGBoost", xgb.XGBClassifier(random_state=42, n_estimators=100, base_score=0.5)),
    ("SVM", SVC(probability=True, random_state=42))
]

# Generate classification reports for base and meta-models
for model_idx, base_model in enumerate(base_models):
    print(f"\nClassification Report for Base Model {model_idx} ({type(base_model).__name__}):")
    print(classification_report(y_np, base_predictions[f"base_model_{model_idx}"], target_names=['DiseaseFree', 'Recurred/Progressed']))

for name, meta_model in meta_models:
    meta_model.fit(meta_features, y_np)
    y_pred = meta_model.predict(meta_features)
    y_pred_proba = meta_model.predict_proba(meta_features)[:, 1]
    print(f"\nClassification Report for Meta-Model {name}:")
    print(classification_report(y_np, y_pred, target_names=['DiseaseFree', 'Recurred/Progressed']))
    print(f"{name} Stacking AUC:", roc_auc_score(y_np, y_pred_proba))

Feature columns: ['Diagnosis Age', 'Mutation Count', 'TMB (nonsynonymous)', 'Fraction Genome Altered', 'Patient Weight', 'Tumor invasion percent', 'Days to Sample Collection.', 'Overall Survival (Months)', 'Neoplasm Histologic Grade', 'Cancer Type Detailed_Uterine Endometrioid Carcinoma', 'Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma', 'Person Neoplasm Status_TUMOR FREE', 'Person Neoplasm Status_WITH TUMOR', 'Ethnicity Category_HISPANIC OR LATINO', 'Ethnicity Category_NOT HISPANIC OR LATINO', 'Surgical Margin Resection Status_R0', 'Surgical Margin Resection Status_R1', 'Surgical Margin Resection Status_R2', 'Surgical Margin Resection Status_RX', 'Menopause Status', 'Neoplasm American Joint Committee on Cancer Clinical Group Stage']
X shape: (508, 21)
y unique values: [0 1]
[LightGBM] [Info] Number of positive: 88, number of negative: 318
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000483 seconds.
You can se

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Verify inputs
print("Feature columns:", feature_cols)
print("X shape:", X.shape)
print("y unique values:", np.unique(y))

# Convert y to numpy array
y_np = y.values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_np, test_size=0.2, random_state=42, stratify=y_np)

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.8, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define simplified base models with regularization
base_models = [
    ('xgb', XGBClassifier(
        random_state=42,
        n_estimators=50,  # Reduced from 100
        max_depth=3,      # Shallower trees
        learning_rate=0.1,
        reg_lambda=1.0,   # L2 regularization
        eval_metric='logloss'
    )),
    ('catboost', CatBoostClassifier(
        random_state=42,
        iterations=50,    # Reduced from 100
        depth=4,          # Shallower trees
        l2_leaf_reg=5,    # Increased regularization
        verbose=0
    )),
    ('lgbm', LGBMClassifier(
        random_state=42,
        n_estimators=50,  # Reduced from 100
        max_depth=3,      # Shallower trees
        reg_lambda=1.0    # L2 regularization
    )),
    ('mlp', MLPClassifier(
        random_state=42,
        hidden_layer_sizes=(20,),  # Simpler architecture
        max_iter=200       # Reduced iterations
    ))
]

# Generate out-of-fold meta-features and base model predictions
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
meta_features = np.zeros((len(X), len(base_models)))
base_predictions = {f"{name}": np.zeros(len(X)) for name, _ in base_models}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_np)):
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y_np[train_idx], y_np[val_idx]
    X_train_fold_smote, y_train_fold_smote = smote.fit_resample(X_train_fold, y_train_fold)

    for model_idx, (name, model) in enumerate(base_models):
        model.fit(X_train_fold_smote, y_train_fold_smote)
        meta_features[val_idx, model_idx] = model.predict_proba(X_val_fold)[:, 1]
        base_predictions[name][val_idx] = model.predict(X_val_fold)

# Classification reports for base models
for name, _ in base_models:
    print(f"\nClassification Report for Base Model {name}:")
    print(classification_report(y_np, base_predictions[name], target_names=['DiseaseFree', 'Recurred/Progressed']))

# Define and train meta-models
meta_models = [
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("XGBoost", XGBClassifier(
        random_state=42,
        n_estimators=50,  # Simplified
        max_depth=3,      # Shallower
        reg_lambda=1.0,   # Regularization
        eval_metric='logloss'
    )),
    ("SVM", SVC(probability=True, random_state=42))
]

# Generate classification reports for meta-models
for name, meta_model in meta_models:
    meta_model.fit(meta_features, y_np)
    y_pred = meta_model.predict(meta_features)
    y_pred_proba = meta_model.predict_proba(meta_features)[:, 1]
    print(f"\nClassification Report for Meta-Model {name}:")
    print(classification_report(y_np, y_pred, target_names=['DiseaseFree', 'Recurred/Progressed']))
    print(f"{name} Stacking AUC:", roc_auc_score(y_np, y_pred_proba))

# Evaluate stacking ensemble on test set
estimators = base_models
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42),
    cv=5
)
stacking_clf.fit(X_train_smote, y_train_smote)
y_pred_stack = stacking_clf.predict(X_test)
y_pred_stack_proba = stacking_clf.predict_proba(X_test)[:, 1]
print("\nClassification Report for Stacking Ensemble:")
print(classification_report(y_test, y_pred_stack, target_names=['DiseaseFree', 'Recurred/Progressed']))
print("Stacking Ensemble AUC:", roc_auc_score(y_test, y_pred_stack_proba))

# Threshold tuning for Stacking Ensemble
fpr, tpr, thresholds = roc_curve(y_test, y_pred_stack_proba)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("\nOptimal Threshold for Stacking Ensemble:", optimal_threshold)

y_pred_stack_optimal = (y_pred_stack_proba >= optimal_threshold).astype(int)
print("\nClassification Report for Stacking Ensemble (Optimal Threshold):")
print(classification_report(y_test, y_pred_stack_optimal, target_names=['DiseaseFree', 'Recurred/Progressed']))
print("Stacking Ensemble (Optimal Threshold) AUC:", roc_auc_score(y_test, y_pred_stack_proba))

Feature columns: ['Diagnosis Age', 'Mutation Count', 'TMB (nonsynonymous)', 'Fraction Genome Altered', 'Patient Weight', 'Tumor invasion percent', 'Days to Sample Collection.', 'Overall Survival (Months)', 'Neoplasm Histologic Grade', 'Cancer Type Detailed_Uterine Endometrioid Carcinoma', 'Cancer Type Detailed_Uterine Serous Carcinoma/Uterine Papillary Serous Carcinoma', 'Person Neoplasm Status_TUMOR FREE', 'Person Neoplasm Status_WITH TUMOR', 'Ethnicity Category_HISPANIC OR LATINO', 'Ethnicity Category_NOT HISPANIC OR LATINO', 'Surgical Margin Resection Status_R0', 'Surgical Margin Resection Status_R1', 'Surgical Margin Resection Status_R2', 'Surgical Margin Resection Status_RX', 'Menopause Status', 'Neoplasm American Joint Committee on Cancer Clinical Group Stage']
X shape: (508, 21)
y unique values: [0 1]
[LightGBM] [Info] Number of positive: 254, number of negative: 318
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can s