In [None]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    f1_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import matplotlib.pyplot as plt
import seaborn as sns
import optuna  # Import Optuna

# === Load preprocessed data ===
# The CSV path is relative to the current working directory.
file_path = 'modified_tedsa_data_clean.csv'
teds_a_data = pd.read_csv(file_path)

# === Prepare features and target ===
# 'SUB1' is the prediction target; every other column is a feature.
y = teds_a_data['SUB1']
X = teds_a_data.drop(columns=['SUB1'])

# Every feature column is handed to LightGBM as a categorical feature.
categorical_features = X.columns.tolist()

# === Train/Val/Test split ===
# Two stratified splits: 20% of all rows are held out as the test set, then
# 25% of the remaining 80% becomes the validation set (60/20/20 overall).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

# Report the absolute and relative size of each split.
total_samples = len(X)
print("\n--- Data Split ---")
print(f"Total samples:      {total_samples}")
for heading, subset in (
    ("Training set size:  ", X_train),
    ("Validation set size:", X_val),
    ("Test set size:      ", X_test),
):
    print(f"{heading}{len(subset)} ({len(subset)/total_samples:.0%})")
print("--------------------\n")


# === OPTUNA OBJECTIVE ===
def objective(trial):
    """Optuna objective: train LightGBM on the training split, score on validation.

    Samples LightGBM hyperparameters from ``trial``, fits an
    ``LGBMClassifier`` on the module-level ``X_train``/``y_train`` with early
    stopping (patience of 50 rounds on multi-class log loss) against
    ``X_val``/``y_val``, and scores the fitted model's validation predictions.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Balanced accuracy of the predictions on the validation split
        (higher is better).
    """
    params = {
        'objective': 'multiclass',
        'num_class': len(np.unique(y)),
        'boosting_type': 'gbdt',
        'random_state': 42,
        # Upper bound only; early stopping decides the effective round count.
        'n_estimators': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM only performs row subsampling (bagging) when
        # subsample_freq > 0; with the default of 0 the sampled 'subsample'
        # value would be silently ignored. Sampling it through the trial also
        # records it in trial.params alongside the other tuned values.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    model = LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        categorical_feature=categorical_features,
        callbacks=[early_stopping(stopping_rounds=50, verbose=False)],
    )

    # Early stopping monitors log loss, but the trial is ranked by balanced
    # accuracy on the same validation split.
    preds = model.predict(X_val)
    return balanced_accuracy_score(y_val, preds)


# === OPTUNA STUDY ===
# The objective returns balanced accuracy, so the study maximises it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Summarise the search: trial count, best score, and the winning parameters.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
best_trial = study.best_trial
print("  Value: ", best_trial.value)
print("  Params: ")
for param_name, param_value in best_trial.params.items():
    print(f"    {param_name}: {param_value}")

# === FINAL MODEL TRAINING ===
print("\n--- Training final model with best parameters on combined train+validation data ---")
# Fixed settings plus whatever hyperparameters the best Optuna trial sampled.
final_params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y)),
    'boosting_type': 'gbdt',
    'random_state': 42,
    **best_trial.params,
}

# Pick the number of boosting rounds WITHOUT looking at the test set.
# Early stopping against the test split would tune the model on the very data
# used for the final report and bias those metrics optimistically. Instead,
# find the best round count on the train/validation split first.
round_finder = LGBMClassifier(**final_params, n_estimators=2000)
round_finder.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss',
    categorical_feature=categorical_features,
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)],
)
best_n_estimators = round_finder.best_iteration_
if not best_n_estimators or best_n_estimators < 1:
    # No usable best iteration was recorded; fall back to the upper bound.
    best_n_estimators = 2000
print(f"Boosting rounds selected on the validation set: {best_n_estimators}")

# Refit on train+validation with the round count fixed; the test set is only
# used afterwards, for evaluation.
final_model = LGBMClassifier(**final_params, n_estimators=best_n_estimators)
final_model.fit(
    X_train_val,
    y_train_val,
    categorical_feature=categorical_features,
)

# === EVALUATION ON TEST SET ===
print("\n--- Final Evaluation on the Test Set ---")
# Predicted class labels for the held-out test split.
y_pred_classes = final_model.predict(X_test)

# Both summary metrics weight every class equally, regardless of its size.
balanced_acc = balanced_accuracy_score(y_test, y_pred_classes)
macro_f1 = f1_score(y_test, y_pred_classes, average='macro')

print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print("Classification Report:")
# Pass target_names=... here to show readable class names in the report.
print(classification_report(y_test, y_pred_classes))

# === CONFUSION MATRIX ===
# Spell out the label order (the sorted union of true and predicted labels,
# which is what confusion_matrix uses by default) so the heatmap ticks show
# the actual SUB1 class codes instead of positional indices 0..n-1.
class_labels = np.unique(
    np.concatenate([np.asarray(y_test), np.asarray(y_pred_classes)])
)
conf_matrix = confusion_matrix(y_test, y_pred_classes, labels=class_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix on Test Set (After Tuning)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# === FEATURE IMPORTANCE ===
# Gain-based importance from the underlying booster, normalised to sum to 1.
importances = final_model.booster_.feature_importance(importance_type='gain')
importance_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': importances / np.sum(importances),
    }
)
# Largest contributors first, so they appear at the top of the bar chart.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance (Final Model)')
plt.xlabel('Importance (Gain)')
plt.ylabel('Feature')
plt.show()