In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import logging
import datetime
import os
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import mutual_info_classif

# Number of target classes; the LDA step of every pipeline below keeps
# num_class - 1 discriminant components.
num_class = 2

# === Create timestamped logging folder ===
# One directory per run (second resolution) so artifacts of different runs do not collide.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
log_dir = f"logs/{timestamp}"
os.makedirs(log_dir, exist_ok=True)

# === Logging setup (robust) ===
log_file = os.path.join(log_dir, "model_evaluation.log")

# Remove pre-existing root handlers (e.g. installed by Jupyter or by a previous run);
# logging.basicConfig() does nothing while the root logger still has handlers.
# Each removed handler is also closed so a log file opened by an earlier run is
# released instead of being leaked.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()

# Route log records to the run's log file
logging.basicConfig(
    filename=log_file,
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Immediate test record to confirm that the file is actually being written
logging.info("Logger inizializzato con successo.")


print(f"Log file salvato in: {log_file}")

def save_plot(fig, name):
    """Save *fig* into the current run's log directory, then close it.

    Args:
        fig: Figure object; it is written with ``fig.savefig`` and closed
            through ``plt.close``.
        name: File name (including extension) joined onto ``log_dir``.

    Raises:
        Whatever ``fig.savefig`` raises; the figure is closed regardless.
    """
    fig_path = os.path.join(log_dir, name)
    try:
        fig.savefig(fig_path)
    finally:
        # Close even when saving fails so the figure is not left open.
        plt.close(fig)

def recode_labels_for_first_classifier(y):
    """Collapse the original labels into the binary stage-1 target.

    Entries of *y* equal to 1 or 7 map to class 1; every other entry maps
    to class 0.

    Args:
        y: Array-like of original labels supporting element-wise ``==``.

    Returns:
        A NumPy array of 0/1 values, one per entry of *y*.
    """
    is_label_one = y == 1
    is_label_seven = y == 7
    return np.where(is_label_one | is_label_seven, 1, 0)

def filter_data_for_second_classifier(X, y):
    """Keep only the samples whose label is 1 or 7.

    Args:
        X: Feature container indexable by a boolean mask built from *y*.
        y: Labels aligned with *X*, supporting element-wise ``==``.

    Returns:
        Tuple ``(X_subset, y_subset)`` restricted to the rows labelled 1 or 7.
    """
    is_label_one = y == 1
    is_label_seven = y == 7
    keep = is_label_one | is_label_seven
    X_subset = X[keep]
    y_subset = y[keep]
    return X_subset, y_subset

def evaluate_model(model, X_test, y_test, model_name=""):
    """Score *model* on a held-out set and emit the results.

    The classification report, confusion matrix, balanced accuracy and
    macro-averaged F1 are formatted into one message that is both logged
    at INFO level and printed.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        model_name: Label shown in the message header.
    """
    predictions = model.predict(X_test)

    report_text = classification_report(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average='macro')

    # Joined with newlines; the final element is the 80-dash separator line.
    sections = [
        f"\n--- Evaluation: {model_name} ---",
        f"Classification Report:\n{report_text}",
        f"Confusion Matrix:\n{conf_matrix}",
        f"Balanced Accuracy: {balanced_accuracy}",
        f"F1 Macro: {macro_f1}",
        "-" * 80,
    ]
    summary = "\n".join(sections)

    logging.info(summary)
    print(summary)

# Pipeline 1: XGBoost 
def create_xgb_pipeline():
    """Build the XGBoost pipeline and its hyper-parameter search space.

    Steps, in order: BorderlineSMOTE oversampling, variance threshold,
    standard scaling, univariate selection (``f_classif``), LDA projection
    to ``num_class - 1`` components, XGBoost classifier.

    Returns:
        Tuple ``(pipeline, param_dist)``. ``param_dist`` pins
        ``selector__k`` to 700, which replaces the ``k=100`` set here
        whenever the dict is used in a search.
    """
    # NOTE(review): 'mlogloss' is the multiclass log-loss metric; with
    # num_class == 2 'logloss' may be what is intended — confirm.
    classifier = XGBClassifier(eval_metric='mlogloss', random_state=42)

    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', classifier),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 6, 10],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
    }

    return Pipeline(steps), search_space

# Pipeline 2: Random Forest
def create_rf_pipeline():
    """Build the Random Forest pipeline and its hyper-parameter search space.

    Steps, in order: BorderlineSMOTE oversampling, variance threshold,
    standard scaling, univariate selection (``f_classif``), LDA projection
    to ``num_class - 1`` components, class-weighted Random Forest.

    Returns:
        Tuple ``(pipeline, param_dist)``. ``param_dist`` pins
        ``selector__k`` to 700, which replaces the ``k=100`` set here
        whenever the dict is used in a search.
    """
    pipeline = Pipeline([
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        # k is set explicitly to match the other pipelines in this file;
        # leaving it out silently falls back to the library default.
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))
    ])

    param_dist = {
        'selector__k': [700],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]
    }

    return pipeline, param_dist

# Pipeline 3: SVM 
def create_svm_pipeline():
    """Build the SVM pipeline and its hyper-parameter search space.

    Steps, in order: BorderlineSMOTE oversampling, variance threshold,
    standard scaling, univariate selection (``f_classif``), LDA projection
    to ``num_class - 1`` components, class-weighted SVC.

    Returns:
        Tuple ``(pipeline, param_dist)``. ``param_dist`` pins
        ``selector__k`` to 700 and also searches over the kernel, so the
        ``k=100`` and ``kernel='linear'`` set here are only starting values.
    """
    classifier = SVC(random_state=42, class_weight='balanced', kernel='linear')

    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', classifier),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf', 'poly'],
        # gamma only matters for the rbf and poly kernels
        'classifier__gamma': ['scale', 'auto'],
    }
    return Pipeline(steps), search_space

# Pipeline 4: Logistic Regression 
def create_lr_pipeline():
    """Build the Logistic Regression pipeline and its search space.

    Steps, in order: BorderlineSMOTE oversampling, variance threshold,
    standard scaling, univariate selection (``f_classif``), LDA projection
    to ``num_class - 1`` components, class-weighted logistic regression.

    Returns:
        Tuple ``(pipeline, param_dist)``. ``param_dist`` pins
        ``selector__k`` to 700, which replaces the ``k=100`` set here
        whenever the dict is used in a search.
    """
    classifier = LogisticRegression(random_state=42, class_weight='balanced')

    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', classifier),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    }

    return Pipeline(steps), search_space

# Pipeline 5: KNN
def create_knn_pipeline():
    """Build the k-nearest-neighbours pipeline and its search space.

    Steps, in order: BorderlineSMOTE oversampling, variance threshold,
    standard scaling, univariate selection (``f_classif``), LDA projection
    to ``num_class - 1`` components, KNN classifier.

    Returns:
        Tuple ``(pipeline, param_dist)``. ``param_dist`` pins
        ``selector__k`` to 700, which replaces the ``k=100`` set here
        whenever the dict is used in a search.
    """
    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', KNeighborsClassifier()),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__n_neighbors': [3, 5, 7, 9, 11],
        'classifier__weights': ['uniform', 'distance'],
        # Minkowski power: 1 = manhattan, 2 = euclidean
        'classifier__p': [1, 2],
    }

    return Pipeline(steps), search_space


def _fit_randomized_search(pipeline, param_dist, X_train, y_train):
    """Fit and return the randomized search configuration shared by both stages."""
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_dist,
        cv=3,
        n_iter=3,
        scoring='f1_macro',
        random_state=42
    )
    search.fit(X_train, y_train)
    return search


def main():
    """Train and evaluate the two-stage classifier on the ROI feature table.

    Stage 1 fits the Random Forest pipeline on a binary target in which
    original labels 1 and 7 form class 1 and every other label forms class 0.
    Stage 2 fits the SVM pipeline on the training rows labelled 1 or 7 to
    tell those two labels apart. Stage 2 is evaluated only on the test rows
    that stage 1 predicted as class 1 and whose true label was seen while
    training stage 2, so its metrics are conditional on stage 1's output.

    NOTE(review): the printed titles call stage 1 class 1 "Infected", while
    the section comment groups RBC+WBC as "Healthy"; which side labels 1 and
    7 belong to cannot be confirmed from this file — verify against the
    dataset's label map.
    """
    df = pd.read_csv("../roi_features_train.csv")
    X = df.drop(columns=["image_id", "score", "x1", "y1", "x2", "y2", "label"])
    y_original = df["label"]

    # ========== Stage 1: Healthy (RBC+WBC) vs Infected ==========
    y_stage1 = recode_labels_for_first_classifier(y_original)

    # Encode y_stage1 to 0/1
    le_stage1 = LabelEncoder()
    y_stage1 = le_stage1.fit_transform(y_stage1)

    # Train-test split for Stage 1
    X_train_stage1, X_test_stage1, y_train_stage1, y_test_stage1 = train_test_split(
        X, y_stage1, test_size=0.2, stratify=y_stage1, random_state=42
    )

    print(f"\n{'='*40}\nTraining Stage 1: Healthy vs Infected\n{'='*40}")
    pipeline_stage1, param_dist_stage1 = create_rf_pipeline()
    search_stage1 = _fit_randomized_search(
        pipeline_stage1, param_dist_stage1, X_train_stage1, y_train_stage1
    )
    best_model_stage1 = search_stage1.best_estimator_

    logging.info("[Stage 1] Best Parameters: %s", search_stage1.best_params_)
    evaluate_model(best_model_stage1, X_test_stage1, y_test_stage1, model_name="Stage 1 - Healthy vs Infected")

    # ========== Stage 2: Infected Subtypes Only ==========
    # Predict on X_test_stage1 to find the samples stage 1 assigns to class 1
    y_pred_stage1 = best_model_stage1.predict(X_test_stage1)
    infected_indices = np.where(y_pred_stage1 == 1)[0]

    if infected_indices.size == 0:
        print("No infected cells predicted by Stage 1 on the test set.")
        return

    # Step 1: Filter training samples by their original label.
    # The split keeps the DataFrame's index labels, so the matching original
    # labels must be looked up by label (.loc); positional lookup (.iloc) is
    # only correct when the index happens to be the default 0..n-1 range.
    X_train_stage2, y_train_stage2_raw = filter_data_for_second_classifier(
        X_train_stage1, y_original.loc[X_train_stage1.index]
    )

    # Step 2: Fit LabelEncoder on the stage-2 training labels
    le_stage2 = LabelEncoder()
    y_train_stage2 = le_stage2.fit_transform(y_train_stage2_raw)

    # Step 3: Select the test rows predicted as class 1 (positions from np.where)
    X_test_stage2_all = X_test_stage1.iloc[infected_indices]
    y_test_stage2_raw_all = y_original.loc[X_test_stage2_all.index]

    # Step 4: Keep only labels seen during training; LabelEncoder.transform
    # raises on unseen labels.
    valid_mask = y_test_stage2_raw_all.isin(le_stage2.classes_)
    X_test_stage2 = X_test_stage2_all.loc[valid_mask]
    y_test_stage2_raw = y_test_stage2_raw_all.loc[valid_mask]
    y_test_stage2 = le_stage2.transform(y_test_stage2_raw)

    if len(X_test_stage2) == 0:
        print("No valid infected subtype labels in test set after filtering.")
        return

    print(f"\n{'='*40}\nTraining Stage 2: Infected Subtype Classification\n{'='*40}")
    pipeline_stage2, param_dist_stage2 = create_svm_pipeline()
    search_stage2 = _fit_randomized_search(
        pipeline_stage2, param_dist_stage2, X_train_stage2, y_train_stage2
    )
    best_model_stage2 = search_stage2.best_estimator_

    logging.info("[Stage 2] Best Parameters: %s", search_stage2.best_params_)
    evaluate_model(best_model_stage2, X_test_stage2, y_test_stage2, model_name="Stage 2 - Infected Subtypes")

    # Final Log Summary
    logging.info("\nPipeline Completed Successfully.")
    print("✅ Pipeline completed successfully.")



# Run the two-stage training and evaluation only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Log file salvato in: logs/20250701_214005/model_evaluation.log

Training Stage 1: Healthy vs Infected

--- Evaluation: Stage 1 - Healthy vs Infected ---
Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.77      0.43       488
           1       0.99      0.94      0.96     14214

    accuracy                           0.93     14702
   macro avg       0.65      0.86      0.70     14702
weighted avg       0.97      0.93      0.95     14702

Confusion Matrix:
[[  378   110]
 [  874 13340]]
Balanced Accuracy: 0.8565507453976338
F1 Macro: 0.6994565325781297
--------------------------------------------------------------------------------

Training Stage 2: Infected Subtype Classification

--- Evaluation: Stage 2 - Infected Subtypes ---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13333
           1       0.07      0.57      0.12         7

    accuracy 