# 🎯 Train ML Baseline Classifier
This Colab-style notebook walks through training a baseline XGBoost classifier on k-mer features, covering each of the subtasks listed below.

## ✅ Features:
### + XGBoost classifier
### + Feature importance plots
### + Optuna hyperparameter tuning

## 📦 Install Dependencies + 🔧 Import

In [None]:
# 📦 Install the notebook's dependencies (XGBoost, Optuna, scikit-learn, Matplotlib, Seaborn) if not yet installed
!pip install xgboost optuna scikit-learn matplotlib seaborn


# ✅ Imports
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

## 📥 Load k-mer Data + Labels

In [None]:
# Read the 6-mer feature table from the processed-data folder.
df = pd.read_csv("data/processed/fasta_kmer_6mer.csv")

# ⚠️ TEMP: placeholder targets. Classes 0, 1, 2 are assigned round-robin by
# row position so the multi-class pipeline can be exercised end to end; any
# existing "label" column is overwritten.
df = df.assign(label=[row_idx % 3 for row_idx in range(len(df))])
print(df["label"].value_counts())

# Target column on one side, every remaining column as features on the other.
y = df["label"]
X = df.drop("label", axis=1)

# Label encode for safety: the fitted encoder is kept in `le` so the integer
# class ids can be mapped back to the original label values later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

## 🔍 Optuna Tuner

In [None]:
def objective(trial):
    """Score one Optuna trial of XGBoost hyperparameters.

    Draws a configuration from ``trial``, fits an ``XGBClassifier`` on the
    training part of a stratified 80/20 split of the notebook-level ``X`` and
    ``y_encoded``, and scores it on the remaining 20%.

    Args:
        trial: Optuna trial object used to sample the hyperparameter values.

    Returns:
        Weighted F1 score of the fitted model on the validation part.
    """
    # Tunable settings, sampled in a fixed order under fixed names so the
    # study records them consistently across trials.
    sampled = dict(
        n_estimators=trial.suggest_int("n_estimators", 100, 300),
        max_depth=trial.suggest_int("max_depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
    )
    # Settings that are identical for every trial.
    fixed = dict(
        tree_method="hist",
        eval_metric="mlogloss",
        objective="multi:softprob",
        num_class=len(np.unique(y_encoded)),
    )

    # random_state is pinned, so every trial is scored on the same rows.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    clf = xgb.XGBClassifier(**sampled, **fixed)
    clf.fit(X_fit, y_fit)

    return f1_score(y_holdout, clf.predict(X_holdout), average="weighted")


In [None]:
# Search for the configuration that maximizes the value returned by
# `objective`, using a budget of 20 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Header on one line, winning hyperparameters on the next.
print("🎯 Best trial params:", study.best_params, sep="\n")

## 🧠 Final Model Training with Best Params

In [None]:
# The study only stores the sampled values, so the fixed XGBoost settings
# are merged back in before the final model is built.
best_params = {
    **study.best_params,
    "tree_method": "hist",
    "eval_metric": "mlogloss",
    "objective": "multi:softprob",
    "num_class": len(np.unique(y_encoded)),
}

# NOTE(review): these split arguments (test_size=0.2, random_state=42,
# stratified on y_encoded) match the split inside `objective`, so the test
# rows here are the rows the trials were scored on. The metrics computed on
# this split are therefore likely optimistic — consider a separate hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Fit on the training part with the tuned configuration, then predict
# class ids for the held-out part.
model = xgb.XGBClassifier(**best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## 📊 Report Metrics

In [None]:
# Header, a blank line, then the per-class precision / recall / F1 table.
print("📈 Classification Report:\n", classification_report(y_test, y_pred), sep="\n")

# Single summary figure: F1 averaged across classes, weighted by support.
f1 = f1_score(y_test, y_pred, average="weighted")
print(f"✅ Weighted F1 Score: {f1:.4f}")

## 🌟 Feature Importance

In [None]:
# Plot the 20 features with the highest gain-based importance.
#
# The axes are created up front and passed to XGBoost explicitly. When
# `ax` is omitted, plot_importance opens a figure of its own, so a figure
# created beforehand with plt.figure(figsize=...) stays blank and its size
# never reaches the chart.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(model, ax=ax, max_num_features=20, importance_type="gain")
ax.set_title("Top 20 Most Important k-mers")
fig.tight_layout()
plt.show()

## 💾 Save Model + Artifacts

In [None]:
# Persist the trained model and the fitted label encoder.
#
# The directory that gets created is derived from the output paths
# themselves. Previously a different folder ("models") was created, so
# joblib.dump failed with FileNotFoundError whenever "src/ml" did not
# already exist. The output file paths are unchanged.
model_path = "src/ml/xgboost_dna_classifier.pkl"
encoder_path = "src/ml/label_encoder.pkl"

for artifact_path in (model_path, encoder_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(model, model_path)
joblib.dump(le, encoder_path)

print("✅ Model + label encoder saved for RAG or LangChain input!")