In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import logging
import datetime
import os
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import mutual_info_classif

# Number of target classes. The LDA steps in this file project onto
# num_class - 1 components, so this value also fixes the LDA output width.
num_class = 2

# === Create timestamped logging folder ===
# One folder per run, so logs and figures from different runs never overwrite
# each other.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
log_dir = f"logs/{timestamp}"
os.makedirs(log_dir, exist_ok=True)

# === Logging setup ===
log_file = os.path.join(log_dir, "model_evaluation.log")

# basicConfig() does nothing when the root logger already has handlers (for
# example when this cell is executed a second time), so the old handlers are
# dropped first. Each one is also closed: a handler that is only removed keeps
# its log file open for the lifetime of the process.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()


logging.basicConfig(
    filename=log_file,
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

logging.info("Logger inizializzato con successo.")


print(f"Log file salvato in: {log_file}")

def save_plot(fig, name):
    """Save ``fig`` inside the current run's log directory and close it.

    Args:
        fig: Matplotlib figure to write to disk.
        name: File name (including extension) joined onto ``log_dir``.
    """
    destination = os.path.join(log_dir, name)
    fig.savefig(destination)
    # The figure is closed right after saving, so callers cannot display or
    # reuse it afterwards.
    plt.close(fig)

def recode_labels_for_first_classifier(y):
    """Binarise labels: 1 where the label is 1 or 7, 0 everywhere else.

    Args:
        y: Array-like of labels supporting element-wise ``==``.

    Returns:
        NumPy array of 0/1 values, one per entry of ``y``.
    """
    is_label_1 = y == 1
    is_label_7 = y == 7
    positive = is_label_1 | is_label_7
    return np.where(positive, 1, 0)

def filter_data_for_second_classifier(X, y):
    """Keep only the samples whose label is 1 or 7.

    Args:
        X: Feature container that accepts a boolean mask as its index.
        y: Labels aligned with ``X``, supporting element-wise ``==``.

    Returns:
        Tuple ``(X_kept, y_kept)`` restricted to the matching rows.
    """
    is_label_1 = y == 1
    is_label_7 = y == 7
    keep = is_label_1 | is_label_7
    X_kept = X[keep]
    y_kept = y[keep]
    return X_kept, y_kept

def evaluate_model(model, X_test, y_test, model_name=""):
    """Score a fitted model on held-out data; log and print the summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features passed to ``model.predict``.
        y_test: True labels for ``X_test``.
        model_name: Label used in the report header.

    The summary contains the classification report, the confusion matrix,
    balanced accuracy and macro-averaged F1. Nothing is returned.
    """
    predictions = model.predict(X_test)

    # Scalar metrics first, then the tabular ones; all derive from the same
    # predictions.
    f1 = f1_score(y_test, predictions, average='macro')
    bal_acc = balanced_accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)

    separator = "-"*80
    sections = [
        f"\n--- Evaluation: {model_name} ---\n",
        f"Classification Report:\n{report}\n",
        f"Confusion Matrix:\n{matrix}\n",
        f"Balanced Accuracy: {bal_acc}\n",
        f"F1 Macro: {f1}\n",
        separator,
    ]
    log_msg = "".join(sections)

    # The same text goes to the log file and to stdout.
    logging.info(log_msg)
    print(log_msg)

# Pipeline 1: XGBoost
def create_xgb_pipeline():
    """Build the XGBoost pipeline and its hyper-parameter search space.

    Steps, in order: BorderlineSMOTE oversampling, near-constant feature
    removal, standardisation, univariate feature selection (ANOVA F-test),
    LDA projection onto ``num_class - 1`` components, XGBoost classifier.

    Returns:
        Tuple ``(pipeline, param_dist)``, where ``param_dist`` maps
        ``<step>__<param>`` names to lists of candidate values.
    """
    # 'mlogloss' is the multi-class log loss. With two classes XGBClassifier
    # trains a binary objective, whose matching metric is 'logloss'; a
    # multi-class metric there would break as soon as an eval_set is supplied.
    eval_metric = 'logloss' if num_class <= 2 else 'mlogloss'

    pipeline = Pipeline([
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', XGBClassifier(eval_metric=eval_metric, random_state=42))
    ])

    param_dist = {
        # Single candidate: overrides the selector's construction-time k
        # whenever these parameters are applied.
        'selector__k': [700],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 6, 10],
        'classifier__learning_rate': [0.01, 0.1, 0.2]
    }

    return pipeline, param_dist

# Pipeline 2: Random Forest
def create_rf_pipeline():
    """Build the Random Forest pipeline and its hyper-parameter search space.

    Steps, in order: BorderlineSMOTE oversampling, near-constant feature
    removal, standardisation, univariate feature selection (ANOVA F-test),
    LDA projection onto ``num_class - 1`` components, class-weighted
    Random Forest.

    Returns:
        Tuple ``(pipeline, param_dist)``.
    """
    forest = RandomForestClassifier(random_state=42, class_weight='balanced')

    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        # No explicit k here: the selector keeps its library default unless
        # 'selector__k' from the search space is applied.
        ('selector', SelectKBest(score_func=f_classif)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', forest),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
    }

    return Pipeline(steps), search_space

# Pipeline 3: SVM
def create_svm_pipeline():
    """Build the SVM pipeline and its hyper-parameter search space.

    Steps, in order: BorderlineSMOTE oversampling, near-constant feature
    removal, standardisation, univariate feature selection (ANOVA F-test),
    LDA projection onto ``num_class - 1`` components, class-weighted SVC
    (linear kernel unless the search space overrides it).

    Returns:
        Tuple ``(pipeline, param_dist)``.
    """
    svc = SVC(random_state=42, class_weight='balanced', kernel='linear')

    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', svc),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf', 'poly'],
        # Only relevant for rbf and poly kernels
        'classifier__gamma': ['scale', 'auto'],
    }

    return Pipeline(steps), search_space

# Pipeline 4: Logistic Regression
def create_lr_pipeline():
    """Build the Logistic Regression pipeline and its search space.

    Steps, in order: BorderlineSMOTE oversampling, near-constant feature
    removal, standardisation, univariate feature selection (ANOVA F-test),
    LDA projection onto ``num_class - 1`` components, class-weighted
    logistic regression.

    Returns:
        Tuple ``(pipeline, param_dist)``.
    """
    logreg = LogisticRegression(random_state=42, class_weight='balanced')

    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', logreg),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        # Both listed solvers are paired with both penalties in the search.
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    }

    return Pipeline(steps), search_space

# Pipeline 5: KNN
def create_knn_pipeline():
    """Build the k-nearest-neighbours pipeline and its search space.

    Steps, in order: BorderlineSMOTE oversampling, near-constant feature
    removal, standardisation, univariate feature selection (ANOVA F-test),
    LDA projection onto ``num_class - 1`` components, KNN classifier.

    Returns:
        Tuple ``(pipeline, param_dist)``.
    """
    knn = KNeighborsClassifier()

    steps = [
        ('smote', BorderlineSMOTE(random_state=42)),
        ('varthresh', VarianceThreshold(threshold=1e-5)),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_classif, k=100)),
        ('lda', LinearDiscriminantAnalysis(solver='svd', n_components=num_class-1)),
        ('classifier', knn),
    ]

    search_space = {
        'selector__k': [700],
        'classifier__n_neighbors': [3, 5, 7, 9, 11],
        'classifier__weights': ['uniform', 'distance'],
        # 1: manhattan, 2: euclidean
        'classifier__p': [1, 2],
    }

    return Pipeline(steps), search_space


def _load_binary_dataset(csv_path):
    """Load the ROI feature table and collapse its labels into two classes.

    Args:
        csv_path: Path of the CSV file holding features and a ``label`` column.

    Returns:
        Tuple ``(X, y, labels)``: the feature frame without metadata columns,
        the encoded target as a NumPy array, and the display names of the
        two resulting classes.
    """
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["image_id", "score", "x1", "y1", "x2", "y2", "label"])

    # Raw labels 1 and 7 are merged into class 1, raw labels 2-6 into class 2.
    # NOTE(review): presumably 1/7 are red/white blood cells and 2-6 the
    # parasite stages - confirm against the dataset documentation.
    # replace() is used without inplace so the column inside df is not
    # mutated through a derived Series.
    y = df["label"].replace({2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 1})
    labels = ['Healthy', 'Infected']

    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # Re-encode to consecutive integers starting at 0.
    y = LabelEncoder().fit_transform(y)
    return X, y, labels


def _plot_lda_projection(X, y, labels):
    """Save a plot of the class distribution after SMOTE, selection and LDA.

    The transformers fitted here serve the figure only; the model pipelines
    fit their own copies during the search.

    Args:
        X: Features to visualise.
        y: Encoded labels aligned with ``X``.
        labels: Class display names, used for the legend of the 2-D plot.
    """
    smote = BorderlineSMOTE(random_state=42, k_neighbors=5)
    X_res, y_res = smote.fit_resample(X, y)

    selector = SelectKBest(score_func=f_classif, k=700)
    X_selected = selector.fit_transform(X_res, y_res)

    lda = LinearDiscriminantAnalysis(n_components=num_class-1, solver='svd')
    X_lda = lda.fit_transform(X_selected, y_res)

    if num_class <= 2:
        # With a single LDA component the projection is one column; flatten
        # it so it can be indexed with the boolean class masks.
        X_lda_1d = X_lda.ravel()

        fig = plt.figure(figsize=(10, 5))
        sns.histplot(X_lda_1d[y_res == 0], color='skyblue', label='Cellule Sane', kde=True, stat="density", bins=30)
        sns.histplot(X_lda_1d[y_res == 1], color='#ff7f0e', label='Cellule Infette', kde=True, stat="density", bins=30)
        plt.title("Distribuzione della proiezione LDA (1D)")
        plt.xlabel("LDA Component 1")
        plt.legend()
        # save_plot() closes the figure, so there is nothing left to show
        # afterwards.
        save_plot(fig, "lda_distribution_1d.png")
    else:
        fig = plt.figure(figsize=(10, 6))
        palette = sns.color_palette('viridis', num_class)
        scatter = sns.scatterplot(
            x=X_lda[:, 0], y=X_lda[:, 1],
            hue=y_res,
            palette=palette,
            s=50, alpha=0.7,
            legend='full'
        )
        plt.title("Distribuzione delle classi dopo LDA")
        plt.xlabel("LDA Component 1")
        plt.ylabel("LDA Component 2")
        handles, _ = scatter.get_legend_handles_labels()
        plt.legend(handles=handles, labels=labels, title='Classi')
        save_plot(fig, "lda_distribution.png")


def main():
    """Run the full experiment: load data, plot, tune and compare models.

    The dataset is split into train and test parts before any resampling.
    Each candidate pipeline is tuned with a randomized search (macro F1,
    3-fold stratified CV) on the training part and scored on the untouched
    test part. Results are logged and a comparison bar chart is saved in the
    run's log directory.
    """
    try:
        X, y, labels = _load_binary_dataset("../roi_features_train.csv")

        # Split BEFORE any oversampling. Resampling the whole dataset first
        # would put synthetic samples into the test set and let test
        # neighbours shape the training data, inflating every score. The
        # pipelines carry their own SMOTE step, which is fitted on training
        # folds only.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Exploratory figure, built from the training split only.
        _plot_lda_projection(X_train, y_train, labels)

        pipelines = [
            ("XGBoost", create_xgb_pipeline()),
            ("Random Forest", create_rf_pipeline()),
            ("SVM", create_svm_pipeline()),
            ("Logistic Regression", create_lr_pipeline()),
            ("KNN", create_knn_pipeline())
        ]

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        results = []

        for name, (pipeline, param_dist) in pipelines:
            print(f"\n{'='*40}\nTraining {name} pipeline\n{'='*40}")
            search = RandomizedSearchCV(
                pipeline,
                param_distributions=param_dist,
                cv=cv,
                scoring='f1_macro',
                n_iter=3,
                random_state=42,
                n_jobs=1
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_

            logging.info(f"Best parameters for {name}: {search.best_params_}")
            logging.info(f"Best score for {name}: {search.best_score_:.4f}")

            y_pred = best_model.predict(X_test)
            bal_acc = balanced_accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='macro')

            results.append({
                "Model": name,
                "Best Params": search.best_params_,
                "Balanced Accuracy": bal_acc,
                "F1 Macro": f1
            })

            evaluate_model(best_model, X_test, y_test, model_name=name)

        results_df = pd.DataFrame(results)
        ranked = results_df.sort_values(by="F1 Macro", ascending=False)
        logging.info("\nConfronto Finale:\n" + str(ranked))

        fig = plt.figure(figsize=(10, 6))
        sns.barplot(x='F1 Macro', y='Model', data=ranked, palette='viridis')
        plt.title("Confronto modelli - F1 Macro")
        plt.xlim(0, 1)
        save_plot(fig, "confronto_modelli.png")
    finally:
        # Flush and close the log handlers even when a step above fails, so
        # the log file of a crashed run is complete and released.
        for handler in logging.root.handlers[:]:
            handler.flush()
            handler.close()

# Run the experiment only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Log file salvato in: logs/20250704_101952/model_evaluation.log
