In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# !pip install -q transformers biopython kaggle
# print("✅ Đã cài đặt xong thư viện!")

In [2]:
# import os
# from google.colab import files

# # Upload file kaggle.json
# print("Vui lòng upload file kaggle.json của bạn:")
# files.upload()

# # Cấu hình Kaggle API
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

# # Tải dữ liệu cuộc thi (Sẽ mất khoảng 1-2 phút)
# print("⏳ Đang tải dữ liệu CAFA 6...")
# !kaggle competitions download -c cafa-6-protein-function-prediction
# !unzip -q cafa-6-protein-function-prediction.zip -d /content/cafa6_data
# print("✅ Đã tải và giải nén dữ liệu tại /content/cafa6_data")

In [3]:
import yaml
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import gc
import os
import sys
from tqdm.auto import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [4]:
# Read the experiment settings from the YAML file into the notebook-wide CONFIG dict.
with open("configs/base_model.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)

# A DEVICE of "auto" (or no DEVICE key at all) resolves to the GPU when CUDA is
# available and to the CPU otherwise; any other configured value is left alone.
if CONFIG.get("DEVICE", "auto") == "auto":
    if torch.cuda.is_available():
        CONFIG["DEVICE"] = "cuda"
    else:
        CONFIG["DEVICE"] = "cpu"

# This notebook run is pinned to a single ontology and its list of target terms;
# both entries override whatever the YAML file contains.
CONFIG.update({
    'ONTOLOGY': "MF",
    'TOP_TERMS_NPY': "features/top_terms_by_aspect/top_terms_MF.npy",
})

# Output directory for the checkpoint and the submission file.
os.makedirs(CONFIG['SAVE_DIR'], exist_ok=True)
print(f"üöÄ CAFA 6 - DUAL MODEL (ANKH + ESM) | Device: {CONFIG['DEVICE']}")

# Seed the torch and numpy global RNGs. np.random.seed stays last so the cell's
# final expression evaluates to None and nothing is echoed.
torch.manual_seed(CONFIG['SEED'])
np.random.seed(CONFIG['SEED'])

üöÄ CAFA 6 - DUAL MODEL (ANKH + ESM) | Device: cuda


# 1. MEMORY-SAFE DATASET
CHÌA KHÓA ĐỂ KHÔNG TRÀN RAM

In [5]:
class MultiSourceDataset(Dataset):
    """Dataset serving one row from each of several memory-mapped embedding files.

    Every source in ``embedding_paths_dict`` is opened with
    ``np.load(..., mmap_mode='r')``, so rows are read from disk on access
    instead of loading whole arrays into RAM.

    Args:
        embedding_paths_dict: Mapping ``source name -> path`` of a ``.npy``
            file. All files must have the same number of rows, because row
            ``i`` of every source is returned together as one sample.
        y_tensor: Optional label container indexed by the same row numbers as
            the embedding files. When ``None`` the dataset yields inputs only.
        indices: Optional sequence of row numbers to expose (e.g. a train or
            validation split). Defaults to every row of the first source.

    Raises:
        ValueError: If no source is given, or if the sources do not all have
            the same number of rows.
    """

    def __init__(self, embedding_paths_dict, y_tensor=None, indices=None):
        if not embedding_paths_dict:
            raise ValueError("embedding_paths_dict must contain at least one embedding source")

        self.keys = list(embedding_paths_dict.keys())

        # Open every source as a read-only memory map.
        self.mmaps = {
            name: np.load(path, mmap_mode='r')
            for name, path in embedding_paths_dict.items()
        }

        # The first source defines the base length; all others must agree,
        # otherwise row i of one source would be paired with an unrelated (or
        # missing) row of another.
        row_counts = {name: len(arr) for name, arr in self.mmaps.items()}
        self.total_len = row_counts[self.keys[0]]
        if any(count != self.total_len for count in row_counts.values()):
            raise ValueError(
                f"Embedding sources have different numbers of rows: {row_counts}"
            )

        self.indices = indices if indices is not None else np.arange(self.total_len)
        self.y = y_tensor

    def __len__(self):
        """Return the number of exposed samples (length of ``indices``)."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` or ``(inputs,)`` for the ``idx``-th exposed row.

        ``inputs`` is a list holding one float32 tensor per source, in the
        order of ``self.keys``.
        """
        real_idx = self.indices[idx]

        # Copy each row out of its memory map before wrapping it in a tensor,
        # then cast to float32.
        inputs = [
            torch.from_numpy(self.mmaps[key][real_idx].copy()).float()
            for key in self.keys
        ]

        if self.y is not None:
            return inputs, self.y[real_idx]
        return (inputs,)

# 2. LOAD DATA & PROCESS LABELS

In [6]:
print("\n[1/5] Checking Files...")
# Array of IDs loaded from the configured TRAIN_ID_PATH; later cells build one
# label row per entry, in this order.
train_ids = np.load(CONFIG['TRAIN_ID_PATH'])
print("   ‚úì Train IDs: {}".format(len(train_ids)))


[1/5] Checking Files...
   ‚úì Train IDs: 82404


In [7]:
print("\n[2/5] Processing Labels (IA Strategy)...")

# 1. Load the term annotations and the IA table.
# header=0 together with names=... drops the header row found in the file and
# applies these three column names instead.
df_terms = pd.read_csv(
    CONFIG['TRAIN_TERMS'],
    sep='\t',
    header=0,
    names=['EntryID', 'term', 'aspect'],
)
# The IA file is read without a header row: two columns, term and ia.
df_ia = pd.read_csv(CONFIG['IA_FILE'], sep='\t', names=['term', 'ia'])
ia_dict = dict(zip(df_ia['term'], df_ia['ia']))

# 2. Restrict the annotations to the ontology selected in CONFIG.
# Ontology name -> single-character aspect code used in the 'aspect' column.
ONTOLOGY2ASPECT = dict(MF='F', BP='P', CC='C')
aspect_char = ONTOLOGY2ASPECT[CONFIG['ONTOLOGY']]

# Keep only the rows whose aspect matches this ontology.
df_terms_aspect = df_terms.loc[df_terms['aspect'].eq(aspect_char)]

# Target terms for this ontology.
# NOTE(review): allow_pickle=True unpickles the file's contents; only load
# top-terms files that come from a trusted source.
top_terms = np.load(CONFIG['TOP_TERMS_NPY'], allow_pickle=True).tolist()

# Keep only annotations whose term is one of the target terms, then collect
# the list of terms per EntryID.
df_filtered = df_terms_aspect.loc[df_terms_aspect['term'].isin(top_terms)]
id_to_terms = (
    df_filtered
    .groupby('EntryID')['term']
    .apply(list)
    .to_dict()
)

# Clean up: the intermediate frames are no longer needed.
del df_terms, df_terms_aspect, df_filtered
gc.collect()


[2/5] Processing Labels (IA Strategy)...


0

# 3. PREPARE LABELS (RAM OPTIMIZED)

In [8]:
print("\n[3/5] Preparing Labels (Sparse Mode)...")

# 1. Sparse multi-label binarizer with the column order fixed to top_terms.
mlb = MultiLabelBinarizer(classes=top_terms, sparse_output=True)
mlb.fit([top_terms])

# 2. One list of terms per training ID (a fresh empty list when the ID has no
#    annotation among the target terms), encoded as a sparse indicator matrix.
y_labels_list = list(map(lambda protein_id: id_to_terms.get(protein_id, []), train_ids))
y_train_sparse = mlb.transform(y_labels_list)
del y_labels_list, train_ids  # the ID list is not used again after this point
gc.collect()

# 3. Per-class positive weights (optional).
# Keep this when weighting by IA; to let the model learn without it, drop
# this step and remove pos_weight from the loss.
# Terms that are absent from ia_dict receive a weight of 0.0.
weights_list = [ia_dict.get(term, 0.0) for term in mlb.classes_]
pos_weight_tensor = torch.tensor(
    weights_list,
    dtype=torch.float32,
    device=CONFIG['DEVICE'],
)

# 4. Densify to float32 and apply label smoothing.
print("   ‚è≥ Converting Labels to Tensor...")
# The whole matrix is converted in one go; the original author estimates about
# 3.2 GB for 80k rows x 10k columns of float32, acceptable with more than 12 GB RAM.
y_train_binary = y_train_sparse.astype(np.float32).toarray()
# from_numpy shares memory with y_train_binary; no second copy is made.
y_train_tensor = torch.from_numpy(y_train_binary)

if CONFIG['LABEL_SMOOTHING'] > 0:
    # In place: y <- y * (1 - s) + s / number_of_terms
    y_train_tensor *= 1 - CONFIG['LABEL_SMOOTHING']
    y_train_tensor += CONFIG['LABEL_SMOOTHING'] / len(top_terms)

del y_train_sparse, y_train_binary
gc.collect()


[3/5] Preparing Labels (Sparse Mode)...
   ‚è≥ Converting Labels to Tensor...


12

# 4. CREATE DATA LOADERS

In [9]:
print("\n[4/5] Creating DataLoaders...")

# Source name -> path of that source's training embedding file.
train_paths = {
    source_name: split_paths['train']
    for source_name, split_paths in CONFIG['EMBEDDINGS'].items()
}

# Random 85/15 split over row positions, seeded from CONFIG for repeatability.
indices = np.arange(y_train_tensor.shape[0])
train_idx, val_idx = train_test_split(
    indices,
    test_size=0.15,
    random_state=CONFIG['SEED'],
)

# Both datasets read the same files and labels; only the exposed rows differ.
train_dataset = MultiSourceDataset(train_paths, y_train_tensor, indices=train_idx)
val_dataset = MultiSourceDataset(train_paths, y_train_tensor, indices=val_idx)

# --- Auto-detect the input width of every embedding source ---
print("   ‚è≥ Measuring input shapes from dataset...")
sample_inputs, _ = train_dataset[0]
INPUT_DIMS_LIST = [vec.shape[0] for vec in sample_inputs]
print(f"   ‚úì Detected Input Dims: {INPUT_DIMS_LIST}")

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG['BATCH_SIZE'],
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG['BATCH_SIZE'],
    shuffle=False,
    num_workers=2,
)


[4/5] Creating DataLoaders...
   ‚è≥ Measuring input shapes from dataset...
   ‚úì Detected Input Dims: [768, 1280, 1024]


# 5. MODEL & TRAINING

In [10]:
class EncoderBlock(nn.Module):
    """Per-source encoder: input LayerNorm followed by a stack of dense stages.

    Each stage is Linear -> BatchNorm1d -> GELU -> Dropout.

    Args:
        in_dim: Width of the input embedding.
        layers_config: Output width of each successive stage. May be empty, in
            which case the block is just the LayerNorm.
        dropout: Dropout probability used in every stage.

    Attributes:
        net: The assembled ``nn.Sequential``.
        out_dim: Width of the block's output (the last stage's width, or
            ``in_dim`` when ``layers_config`` is empty).
    """

    def __init__(self, in_dim, layers_config, dropout):
        super().__init__()

        # Input LayerNorm (required): balances the "volume" between embedding
        # sources, e.g. ESM (loud) versus Ankh (quiet).
        stages = [nn.LayerNorm(in_dim)]

        # Consecutive pairs of widths give (input, output) sizes of each stage.
        widths = [in_dim, *layers_config]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),  # stabilises training
                nn.GELU(),                # chosen as a more modern alternative to ReLU
                nn.Dropout(dropout),
            ]

        self.net = nn.Sequential(*stages)
        self.out_dim = widths[-1]

    def forward(self, x):
        """Run ``x`` through the LayerNorm and all dense stages."""
        return self.net(x)

In [11]:
# Noise-filtering (attention) module
class SEBlock(nn.Module):
    """Squeeze-and-Excitation style gate used to filter noise after fusion.

    A two-layer bottleneck (Linear -> ReLU -> Linear -> Sigmoid, no biases)
    produces one weight in (0, 1) per feature, and the input is multiplied by
    those weights element-wise.

    Args:
        in_dim: Width of the feature vector being gated.
        reduction: Divisor applied to ``in_dim`` to get the bottleneck width.
            The width is floored at 1 so that ``in_dim < reduction`` does not
            produce zero-width layers (which would gate everything by a
            constant 0.5).
    """

    def __init__(self, in_dim, reduction=4):
        super().__init__()
        # Never let the bottleneck collapse to zero units.
        hidden_dim = max(1, in_dim // reduction)
        self.fc = nn.Sequential(
            nn.Linear(in_dim, hidden_dim, bias=False),
            nn.ReLU(),
            nn.Linear(hidden_dim, in_dim, bias=False),
            nn.Sigmoid()  # yields a mask with values between 0 and 1
        )

    def forward(self, x):
        """Return ``x`` scaled feature-wise by the learned gate.

        ``x`` and the gate both have shape [Batch, Dim].
        """
        gate = self.fc(x)
        # Important features keep their magnitude; unhelpful ones are scaled
        # towards zero.
        return x * gate

In [12]:
class MultiModalNet(nn.Module):
    """Multi-branch classifier: one encoder per embedding source, fused and gated.

    Each input is passed through its own ``EncoderBlock``; the branch outputs
    are concatenated, re-weighted by an ``SEBlock`` and fed to an MLP head that
    produces one logit per class.

    Args:
        input_dims_list: Input width of each embedding source, in the order
            the inputs will be passed to ``forward``.
        encoder_layers: Stage widths shared by every encoder branch.
        dropout: Dropout probability used in the encoders and the head.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dims_list, encoder_layers, dropout, num_classes):
        super().__init__()

        self.encoders = nn.ModuleList()
        self.fusion_input_dim = 0

        print("\nüèóÔ∏è Building Advanced Architecture:")

        # 1. Build one encoder branch per input source.
        for i, in_dim in enumerate(input_dims_list):
            enc = EncoderBlock(in_dim, encoder_layers, dropout)
            # Report the branch's real output width. Indexing encoder_layers[-1]
            # would raise IndexError for an empty config, which EncoderBlock
            # itself accepts.
            print(f"   ‚û§ Branch {i+1}: Input {in_dim} -> Output {enc.out_dim}")
            self.encoders.append(enc)
            self.fusion_input_dim += enc.out_dim

        print(f"   ‚û§ Fusion Dim: {self.fusion_input_dim}")

        # 2. SE block: learned gate over the concatenated features.
        self.attention_filter = SEBlock(self.fusion_input_dim)

        # 3. Final aggregation head.
        self.head = nn.Sequential(
            nn.BatchNorm1d(self.fusion_input_dim),
            nn.Linear(self.fusion_input_dim, 512),
            nn.GELU(),
            nn.Dropout(dropout),

            # One extra hidden layer to increase capacity.
            nn.Linear(512, 512),
            nn.GELU(),
            nn.Dropout(dropout),

            nn.Linear(512, num_classes)
        )

    def forward(self, inputs_list):
        """Return class logits for a batch.

        Args:
            inputs_list: One tensor per encoder branch, in the same order as
                ``input_dims_list`` at construction time.

        Raises:
            ValueError: If the number of inputs differs from the number of
                encoder branches.
        """
        # Fail loudly rather than silently dropping surplus inputs.
        if len(inputs_list) != len(self.encoders):
            raise ValueError(
                f"Expected {len(self.encoders)} input tensors, got {len(inputs_list)}"
            )

        # Pass each source through its own branch.
        features = [
            encoder(source)
            for encoder, source in zip(self.encoders, inputs_list)
        ]

        # Concatenate the branch outputs along the feature axis.
        combined = torch.cat(features, dim=1)

        # Noise filtering (described by the author as the main difference of
        # this architecture): the gate lets the network learn to mute features
        # it finds unhelpful.
        refined = self.attention_filter(combined)

        return self.head(refined)

In [13]:
print("\n[5/5] Building Model...")

model = MultiModalNet(
    INPUT_DIMS_LIST,
    CONFIG['ENCODER_LAYERS'],
    CONFIG['DROPOUT_RATE'],
    len(top_terms),  # actual number of target terms after filtering
)
model = model.to(CONFIG['DEVICE'])

# Loss and optimiser.
# Dropping pos_weight is worth trying if the loss oscillates too strongly.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
optimizer = optim.AdamW(
    model.parameters(),
    lr=CONFIG['LEARNING_RATE'],
    weight_decay=0.01,
)
# Halve the learning rate when the validation loss has not improved for
# more than `patience` consecutive epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)

print(f"\nüöÄ START TRAINING ({CONFIG['EPOCHS']} Epochs)...")
best_val_loss = float("inf")
best_model_path = "{}/prott5_esm2_ankh_best_{}.pth".format(CONFIG['SAVE_DIR'], CONFIG['ONTOLOGY'])

for epoch in range(CONFIG['EPOCHS']):
    # ---- Training pass ----
    model.train()
    train_loss = 0
    for X_b, y_b in train_loader:
        X_b = [x.to(CONFIG['DEVICE']) for x in X_b]
        y_b = y_b.to(CONFIG['DEVICE'])

        optimizer.zero_grad()
        logits = model(X_b)
        loss = criterion(logits, y_b)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train = train_loss / len(train_loader)

    # ---- Validation pass (no gradients, eval-mode layers) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_b, y_b in val_loader:
            X_b = [x.to(CONFIG['DEVICE']) for x in X_b]
            y_b = y_b.to(CONFIG['DEVICE'])
            logits = model(X_b)
            loss = criterion(logits, y_b)
            val_loss += loss.item()

    avg_val = val_loss / len(val_loader)
    # The scheduler is driven by the validation loss.
    scheduler.step(avg_val)

    print(f"Epoch {epoch+1:02d} | Train: {avg_train:.4f} | Val: {avg_val:.4f} | LR: {optimizer.param_groups[0]['lr']:.1e}")

    # Checkpoint whenever the validation loss reaches a new minimum.
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        torch.save(model.state_dict(), best_model_path)
        print("  ‚≠ê New Best Model!")


[5/5] Building Model...

üèóÔ∏è Building Advanced Architecture:
   ‚û§ Branch 1: Input 768 -> Output 512
   ‚û§ Branch 2: Input 1280 -> Output 512
   ‚û§ Branch 3: Input 1024 -> Output 512
   ‚û§ Fusion Dim: 1536

üöÄ START TRAINING (50 Epochs)...
Epoch 01 | Train: 0.0330 | Val: 0.0032 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 02 | Train: 0.0034 | Val: 0.0031 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 03 | Train: 0.0033 | Val: 0.0030 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 04 | Train: 0.0032 | Val: 0.0030 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 05 | Train: 0.0031 | Val: 0.0029 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 06 | Train: 0.0030 | Val: 0.0028 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 07 | Train: 0.0029 | Val: 0.0027 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 08 | Train: 0.0028 | Val: 0.0027 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 09 | Train: 0.0027 | Val: 0.0026 | LR: 2.0e-04
  ‚≠ê New Best Model!
Epoch 10 | Train: 0.0027 | Val: 0.0026 | LR: 2.0e-04
  ‚≠ê New Best Model!

# 6. INFERENCE (STREAMING)

In [14]:
print("\nüîÆ PREDICTING...")

# D·ªçn d·∫πp Training
del train_loader, val_loader, train_dataset, val_dataset, y_train_tensor, optimizer
gc.collect()
torch.cuda.empty_cache()

model.load_state_dict(torch.load(best_model_path))
model.eval()

# Load Test IDs & Paths
test_paths = {k: v['test'] for k, v in CONFIG['EMBEDDINGS'].items()}
test_ids = np.load(CONFIG['TEST_ID_PATH'])

# Test Dataset (Dual Memmap)
test_dataset = MultiSourceDataset(test_paths)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['BATCH_SIZE']*2, shuffle=False, num_workers=2)

submission_path = f"{CONFIG['SAVE_DIR']}/submission_{CONFIG['ONTOLOGY']}.tsv"
n_predictions = 0

with open(submission_path, 'w') as f:
    current_idx = 0
    with torch.no_grad():
        for (X_b,) in tqdm(test_loader, desc="Inference"):
            X_b = [x.to(CONFIG['DEVICE']) for x in X_b]
            logits = model(X_b)
            probs_batch = torch.sigmoid(logits).cpu().numpy()

            ids_batch = test_ids[current_idx : current_idx + len(probs_batch)]
            current_idx += len(probs_batch)

            for i, pid in enumerate(ids_batch):
                probs = probs_batch[i]

                # TOP-K C·ª®NG (Chi·∫øn thu·∫≠t 0.27 ƒëi·ªÉm)
                top_k = CONFIG['MAX_PREDS_PER_PROTEIN']
                ind = np.argpartition(probs, -top_k)[-top_k:]
                ind = ind[np.argsort(probs[ind])][::-1]

                for idx in ind:
                    score = probs[idx]
                    if score > CONFIG['MIN_CONFIDENCE']:
                        f.write(f"{pid}\t{top_terms[idx]}\t{score:.3f}\n")
                        n_predictions += 1

            del probs_batch, X_b, logits

print(f"\n‚úÖ DONE! File: {submission_path}")
print(f"\n‚úÖ DONE! Predictions: {n_predictions:,}")


üîÆ PREDICTING...


Inference:   0%|          | 0/877 [00:00<?, ?it/s]


‚úÖ DONE! File: output/base_model/submission_MF.tsv

‚úÖ DONE! Predictions: 2,820,571


In [15]:
# # Chạy lệnh này trong một cell mới
# !kaggle competitions submit \
#     -c cafa-6-protein-function-prediction \
#     -f /content/drive/MyDrive/CAFA6_Results/prott5_esm2_ankh_Run/submission.tsv \
#     -m "esm2 ankh prot t5 new model"