# LightGBM with ESM-2 embeddings

Idea:
- Model: one-vs-rest LightGBM --> train one binary LightGBM classifier per class (GO term)
- Features: PCA(n_components=100) --> PCA.fit_transform(ESM-2 embeddings)
- Labels: Three sets for three ontologies (P, C, F)
    - P has 16858 classes
    - C has 2651 classes
    - F has 6616 classes
- Only the top 1000 most frequent classes in each ontology are used for training

References:
- (EDA + OneVsRestClassifier) https://www.kaggle.com/code/analyticaobscura/cafa-6-decoding-protein-mysteries
- (ESM-2 320-D embeddings) https://www.kaggle.com/code/dalloliogm/compute-protein-embeddings-with-esm2-esm-c/notebook
- (Optional ProtT5 1024-D embeddings) https://www.kaggle.com/code/ahsuna123/t5-embedding-calculation-cafa-6/output?select=train_ids.npy

---

In [None]:
# Install Biopython (provides the Bio.SeqIO FASTA parser used below); pip's stdout is discarded.
!pip install biopython > /dev/null

## Step 1: Load CAFA6 files

---

In [None]:
# Locations of the CAFA6 competition inputs, all under one dataset root.
CAFA6_ROOT = "/kaggle/input/cafa-6-protein-function-prediction"

# Tab-separated annotation table (read later with EntryID / term / aspect columns).
TRAIN_TERMS = f"{CAFA6_ROOT}/Train/train_terms.tsv"
# FASTA files holding the training and test protein sequences.
TRAIN_SEQ = f"{CAFA6_ROOT}/Train/train_sequences.fasta"
TEST_SEQ = f"{CAFA6_ROOT}/Test/testsuperset.fasta"

In [None]:
from Bio import SeqIO 

def _read_fasta(path):
    """Parse the FASTA file at *path* into a ``{record id: sequence string}`` dict."""
    return {record.id: str(record.seq) for record in SeqIO.parse(path, 'fasta')}


# Both splits are loaded the same way: record id -> amino-acid sequence as a str.
train_sequences = _read_fasta(TRAIN_SEQ)
test_sequences = _read_fasta(TEST_SEQ)

n_train, n_test = len(train_sequences), len(test_sequences)
print(f'Loaded {n_train} train and {n_test} test sequences')

In [None]:
# Peek at one (record id, sequence) pair from each dict to inspect the key format.
# next(iter(...)) reads only the first item instead of copying every item into a
# list first; the None default keeps this cell from raising if a dict is empty.
print("Train dict:", next(iter(train_sequences.items()), None))
print("Test dict:", next(iter(test_sequences.items()), None))

In [None]:
def _entry_id(record_id):
    """Return the second '|'-separated field of *record_id*, or *record_id* itself if it has no '|'.

    Train record ids are split on '|' and the second field is used as the
    protein id (presumably UniProt-style ``db|ACCESSION|name`` headers — confirm
    against the FASTA file). A header without a pipe is returned unchanged
    instead of raising IndexError.
    """
    fields = record_id.split('|')
    return fields[1] if len(fields) > 1 else record_id


# Ids in file order; these are later matched against the EntryID column of the
# annotation table (train) and written to the submission (test).
train_ids = [_entry_id(record_id) for record_id in train_sequences]
test_ids = list(test_sequences)

In [None]:
# Show the first ten ids of each split as a quick format check.
for caption, id_list in (("train_ids[0:10]:", train_ids), ("test_ids[0:10]:", test_ids)):
    print(caption, id_list[:10])

## Step 2: Feature extraction

---

In [None]:
# Directory holding the precomputed embedding arrays, and the two .npy files in it.
ESM_EMBEDDINGS = "/kaggle/input/cafa6-esm2-650m-embedding/esm2_650M"
TRAIN_EMBEDDINGS = f"{ESM_EMBEDDINGS}/train_sequences_emb.npy"
TEST_EMBEDDINGS = f"{ESM_EMBEDDINGS}/testsuperset_emb.npy"

In [None]:
import numpy as np

# Load the precomputed embedding matrices from disk.
X_train = np.load(TRAIN_EMBEDDINGS)
X_test = np.load(TEST_EMBEDDINGS)

# Embedding rows are paired with train_ids / test_ids purely by position further
# down (labels for training, entry ids at prediction time), so a row-count
# mismatch would otherwise only surface late or silently misattribute rows.
# NOTE(review): this checks counts only; it assumes the .npy files were written
# in the same order as the FASTA files — confirm against the embedding notebook.
for split_name, emb, ids in (("train", X_train, train_ids), ("test", X_test, test_ids)):
    if emb.shape[0] != len(ids):
        raise ValueError(
            f"{split_name} embeddings have {emb.shape[0]} rows "
            f"but there are {len(ids)} {split_name} ids"
        )

In [None]:
# Report the shape of each raw embedding matrix.
for caption, matrix in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(caption, matrix.shape)

In [None]:
from sklearn.decomposition import PCA

# Dimensionality reduction settings.
N_PCA_COMPONENTS = 100
PCA_SEED = 42

# Fit the projection on the training embeddings only, then apply that same
# fitted projection to the test embeddings.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_SEED)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

In [None]:
# Report the shape of each PCA-reduced feature matrix.
for caption, matrix in (
    ("X_train_reduced shape:", X_train_reduced),
    ("X_test_reduced shape:", X_test_reduced),
):
    print(caption, matrix.shape)

## Step 3: Label encoding and Training

---

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from tqdm import tqdm
import lightgbm as lgb

# Fitted label binarizers and trained boosters, keyed by ontology aspect.
mlb_dict = {}
lgbm_models = {}   # {aspect: {go_term: model}}

# Number of most frequent GO terms per aspect that get their own binary model.
TOP_K = 1000

# LightGBM settings shared by every per-term model; identical for all terms,
# so the dict is built once instead of once per trained model.
params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "min_data_in_leaf": 20,
    "verbosity": -1,
}

train_terms_df = pd.read_csv(TRAIN_TERMS, sep="\t")

for aspect in ['P', 'C', 'F']:
    print(f"\n========== Training LightGBM for aspect {aspect} ==========")

    # Annotations belonging to this aspect only.
    ont_terms_df = train_terms_df[train_terms_df['aspect'] == aspect]

    # {EntryID: [GO terms]} for every protein annotated in this aspect.
    protein_terms = (
        ont_terms_df
        .groupby('EntryID')['term']
        .apply(list)
        .to_dict()
    )

    # One label list per training protein, in train_ids order; proteins with no
    # annotation in this aspect get an empty list (all-negative row).
    labels = [protein_terms.get(eid, []) for eid in train_ids]

    mlb = MultiLabelBinarizer(sparse_output=True)
    y_train = mlb.fit_transform(labels)

    mlb_dict[aspect] = mlb

    print(f"y_train shape: {y_train.shape}")

    # ===== select the TOP_K most frequent GO terms =====
    # Column sums are taken directly on the sparse matrix: densifying the full
    # (n_proteins, n_terms) matrix just to count positives would allocate one
    # entry per protein/term pair.
    term_freq = np.asarray(y_train.sum(axis=0)).ravel()   # shape: (n_terms,)

    # Sort by descending frequency and keep the first TOP_K column indices.
    sorted_indices = np.argsort(term_freq)[::-1]
    selected_indices = sorted_indices[:TOP_K]

    print(f"Selected {len(selected_indices)}/{len(term_freq)} GO terms (top-{TOP_K})")

    # Densify only the selected columns; column k of y_selected corresponds to
    # GO term mlb.classes_[selected_indices[k]].
    y_selected = y_train.tocsc()[:, selected_indices].toarray()

    # ===== train one binary LightGBM model per selected GO term =====
    models_aspect = {}

    for col, idx in enumerate(tqdm(selected_indices, desc=f"LGBM-{aspect}")):
        y_i = y_selected[:, col]
        go_term = mlb.classes_[idx]

        train_data = lgb.Dataset(X_train_reduced, label=y_i)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=200
        )

        models_aspect[go_term] = model

    lgbm_models[aspect] = models_aspect

    print(f"Finished training {len(models_aspect)} models for aspect {aspect}")


## Step 4: Inference and Submission

In [None]:
BATCH_SIZE = 5000
# Predictions with a score at or below this value are not written out.
MIN_SCORE = 0.02
# Rows of (entry id, GO term, score rounded to 3 decimals).
submission_list = []

for i in tqdm(range(0, len(test_ids), BATCH_SIZE), desc="Predicting (LightGBM)"):
    batch_entry_ids = test_ids[i : i + BATCH_SIZE]
    X_batch = X_test_reduced[i : i + BATCH_SIZE]

    # Ids and feature rows are paired by position; fail loudly if they diverge
    # rather than attributing scores to the wrong protein or dropping proteins.
    if X_batch.shape[0] != len(batch_entry_ids):
        raise ValueError(
            f"Batch at offset {i}: {X_batch.shape[0]} feature rows "
            f"for {len(batch_entry_ids)} test ids"
        )

    for models_aspect in lgbm_models.values():
        for go_term, model in models_aspect.items():
            probs = model.predict(X_batch)

            # Threshold the whole batch at once and visit only the surviving
            # positions, instead of testing every protein in a Python loop.
            kept = np.flatnonzero(probs > MIN_SCORE).tolist()
            submission_list.extend(
                (batch_entry_ids[j], go_term, round(float(probs[j]), 3))
                for j in kept
            )


In [None]:
# Column labels used for sorting/grouping only; the files are written without a header row.
SUBMISSION_COLUMNS = ['Protein Id', 'GO Term Id', 'Prediction']
# Maximum number of rows kept per protein in the final file.
# NOTE(review): confirm this cap against the competition's submission rules.
MAX_TERMS_PER_PROTEIN = 2000

# Full, uncapped dump of every prediction that passed the score threshold.
submission_df = pd.DataFrame(submission_list, columns=SUBMISSION_COLUMNS)
submission_df.to_csv('submission_no_limit.tsv', sep='\t', index=False, header=False)

print("Applying 2000 prediction limit per protein...")
# Order each protein's rows by descending score so head() keeps the best ones.
submission_df = submission_df.sort_values(
    by=['Protein Id', 'Prediction'],
    ascending=[True, False],
)
final_submission_df = (
    submission_df
    .groupby('Protein Id')
    .head(MAX_TERMS_PER_PROTEIN)
    .reset_index(drop=True)
)
final_submission_df.to_csv('submission.tsv', sep='\t', index=False, header=False)

print("\nSubmission file 'submission.tsv' created successfully.")
print(f"Total predictions in final submission: {len(final_submission_df):,}")
print("Submission DataFrame Head:")
display(final_submission_df.head())