<a href="https://colab.research.google.com/github/nmach22/Promoter-Classification/blob/main/notebooks/train_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Set Env**

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import sys
import shutil

# --- CONFIGURATION ---
REPO_PATH = '/content/drive/MyDrive/Promoter-Classification'
REPO_URL = "https://github.com/nmach22/Promoter-Classification.git"

# 1. Delete repo if it already exists (Ensure fresh code)
if os.path.exists(REPO_PATH):
    print(f"Deleting existing repository at {REPO_PATH}...")
    shutil.rmtree(REPO_PATH)

# 2. Clone repository
os.chdir('/content/drive/MyDrive')
print(f"Cloning repository to {REPO_PATH}...")
!git clone {REPO_URL}

# 3. Enter the repository
os.chdir(REPO_PATH)
print(f"Current working directory: {os.getcwd()}")

# 4. Add source code to Python path
sys.path.append(REPO_PATH)

# 5. Install requirements
!pip install -r requirements.txt
!pip install xgboost

# **Imports**

In [None]:
import sys
import os
import yaml
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, classification_report
from Bio import SeqIO
import importlib

# Set ROOT_DIR to the current working directory (which is REPO_PATH set in previous cell)
ROOT_DIR = os.getcwd()
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

import utils.fasta_dataset as dataset_module
import utils.encoding_functions as encoding_module
import utils.data_split as splitter_module
import models.xgboost_model as xgboost_module

# Reload every aliased project module, not just two of them, so that code
# changed on disk is picked up when this cell is re-run in a live kernel.
# The encoding module goes first in case the others import from it
# (assumption - their sources are not visible here).
importlib.reload(encoding_module)
importlib.reload(dataset_module)
importlib.reload(splitter_module)
importlib.reload(xgboost_module)

# Import the names after reloading so they refer to the reloaded definitions.
from utils.fasta_dataset import FastaDataset
from utils.encoding_functions import one_hot_encode, flatten_one_hot_encode, kmer_encode
from utils.data_split import dataset_split
from models.xgboost_model import XGBoostPromoterModel

# Read the project configuration; the encoding is explicit so the result does
# not depend on the platform's default locale.
with open(f"{ROOT_DIR}/config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

print("Config loaded.")
print("Device: CPU (XGBoost)")

# **Feature Engineering**

XGBoost requires numerical input. Two types of features are available:
1. **Flattened One-Hot Encoding**: Preserves positional information (critical for promoter motifs such as the TATA box, which occur at specific locations). This is the feature set used by default.
2. **K-mer Counts**: Capture the frequency of short subsequences (e.g., 3-mers). These are optional and currently disabled in `combined_features` below.

In [None]:
# We use functions from utils.encoding_functions
# flatten_one_hot_encode(seq, max_seq_len)
# kmer_encode(seq, seq_len, k=3)

def combined_features(seq, max_seq_len, k=3):
    """
    Build the feature vector for a single sequence.

    At present only the flattened one-hot encoding of `seq` (for length
    `max_seq_len`) is returned; `k` is accepted but unused until k-mer
    counts are enabled.
    """
    # To add k-mer counts, concatenate the one-hot vector with
    # kmer_encode(seq, max_seq_len, k=k), e.g. via np.concatenate.
    return flatten_one_hot_encode(seq, max_seq_len)


# **Read & Prepare Data**

In [None]:
# Dataset selection: the 'ecoli' entry of the config provides both FASTA
# paths (joined onto ROOT_DIR) and the sequence length.
data_config = config['data']['ecoli']
prom_path = f"{ROOT_DIR}/{data_config['promoter_fasta']}"
non_prom_path = f"{ROOT_DIR}/{data_config['non_promoter_fasta']}"
seq_length = data_config['seq_len']

# Build the dataset; encoding_func forwards its two arguments to
# combined_features (presumably the sequence and its target length -
# confirm against FastaDataset).
dataset = FastaDataset(
    prom_path,
    non_prom_path,
    seq_len=seq_length,
    encoding_func=lambda sequence, length: combined_features(sequence, length),
)

print(f"Total samples: {len(dataset)}")
# Inspect the first sample's features to report the feature vector shape.
first_features = dataset[0][0]
print(f"Feature vector shape: {first_features.shape}")

# **Split Data**

In [None]:
# Partition the dataset into train / validation / test subsets with the
# project's dataset_split helper (how the split is made is defined in
# utils/data_split.py, not in this notebook).
train_subset, val_subset, test_subset = dataset_split(dataset)

def subset_to_numpy(subset):
    """
    Turn an indexable collection of (features, label) samples into arrays.

    Each sample is read as `subset[i]`; the features are converted with
    `.numpy()` and the label with `.item()`.

    Returns:
        (X, y): a numpy array of the per-sample feature arrays and a numpy
        array of the scalar labels, both in index order.
    """
    # Index access is used on purpose: only len() and [] are assumed of `subset`.
    converted = [
        (features.numpy(), label.item())
        for features, label in (subset[idx] for idx in range(len(subset)))
    ]
    feature_rows = [row for row, _ in converted]
    label_values = [value for _, value in converted]
    return np.array(feature_rows), np.array(label_values)

print("Converting datasets to numpy arrays for XGBoost...")
# Convert the three splits in order: train, validation, test.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    subset_to_numpy(split) for split in (train_subset, val_subset, test_subset)
)

# Report the array shapes of every split.
print(f"Train shape: {X_train.shape}, {y_train.shape}")
print(f"Val shape: {X_val.shape}, {y_val.shape}")
print(f"Test shape: {X_test.shape}, {y_test.shape}")

# **Train XGBoost Model**

In [None]:
# Hyperparameters for the wrapper class defined in models/xgboost_model.py
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
}
model = XGBoostPromoterModel(**xgb_params)

# Fit on the training arrays; the validation arrays are passed to fit() as well.
# NOTE(review): verbose=50 presumably sets the logging interval - confirm in the wrapper.
print("Training...")
model.fit(X_train, y_train, X_val=X_val, y_val=y_val, verbose=50)

# **Evaluation**

Metrics from the paper:
- **Sn (Sensitivity)**: Recall, TP / (TP + FN)
- **Sp (Specificity)**: True Negative Rate, TN / (TN + FP)
- **CC (Correlation Coefficient)**: Matthews Correlation Coefficient (MCC)


In [None]:
def calculate_metrics(y_true, y_pred):
    """
    Compute binary-classification metrics from true and predicted labels.

    Labels are assumed to be 0 (negative class) and 1 (positive class).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as `y_true`.

    Returns:
        dict with keys "Accuracy", "Sensitivity (Sn)", "Specificity (Sp)",
        "CC (MCC)" and "Confusion Matrix" (the 2x2 matrix laid out as
        [[tn, fp], [fn, tp]]).
    """
    # Pin the label set: without it the matrix collapses to 1x1 when only one
    # class occurs in y_true and y_pred, and the 4-way unpack below raises.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Guard the ratios against a split that lacks one of the classes.
    actual_positives = tp + fn
    actual_negatives = tn + fp
    sensitivity = tp / actual_positives if actual_positives > 0 else 0.0
    specificity = tn / actual_negatives if actual_negatives > 0 else 0.0
    accuracy = accuracy_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    return {
        "Accuracy": accuracy,
        "Sensitivity (Sn)": sensitivity,
        "Specificity (Sp)": specificity,
        "CC (MCC)": mcc,
        "Confusion Matrix": cm
    }

# Score the test arrays with the trained model
y_pred = model.predict(X_test)
metrics = calculate_metrics(y_test, y_pred)

print("Test Results:")
print("-------------")
for metric_name, metric_value in metrics.items():
    if metric_name == "Confusion Matrix":
        # The matrix is printed on its own lines instead of as a 4-decimal number.
        print(f"{metric_name}:\n{metric_value}")
    else:
        print(f"{metric_name}: {metric_value:.4f}")

# Per-class precision / recall / F1 summary
print("\nClassification Report:")
class_names = ['Non-Promoter', 'Promoter']
print(classification_report(y_test, y_pred, target_names=class_names))

# **Save Model**

In [None]:
import os

# The model is stored under <ROOT_DIR>/models/; create the directory if needed.
save_dir = f"{ROOT_DIR}/models/"
os.makedirs(save_dir, exist_ok=True)

# save_dir already ends with a slash, so join the file name with os.path.join
# instead of adding another "/" (which produced ".../models//xgboost_model.json").
save_path = os.path.join(save_dir, "xgboost_model.json")
model.save_model(save_path)
print(f"Model saved to {save_path}")
