# CAFA-5 Protein Function Prediction

First of all, we install the necessary libraries.

In [3]:
!pip install biopython progressbar transformers


Collecting biopython
  Downloading biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting progressbar
  Downloading progressbar-2.5.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hBuilding wheels for collected packages: progressbar
  Building wheel for progressbar (setup.py) ... [?25l[?25hdone
  Created wheel for progressbar: filename=progressbar-2.5-py3-none-any.whl size=12066 sha256=aea487b75f9f7a3c0e39468e757f1f2c1e8e6b5ca81cb8691bcfd3d900184879
  Stored in directory: /root/.cache/pip/wheels/cd/17/e5/765d1a3112ff3978f70223502f6047e06c43a24d7c5f8ff95b
Successfully built progressbar
Installing collected packages: progressbar, biopython
Successfully installed biopython-1.85 progressbar-2.5


We check if all our files are uploaded.

In [4]:
import os

print(os.listdir('/kaggle/input/cafa-5-protein-function-prediction/Train/'))


['train_terms.tsv', 'train_sequences.fasta', 'train_taxonomy.tsv', 'go-basic.obo']


##  Protein Sequence Loading and Environment Setup

This cell performs the following tasks:

- **Imports essential libraries** for:
  - Deep learning: `torch`
  - Data handling: `numpy`, `pandas`
  - Biological sequence parsing: `Bio.SeqIO`
  - Pre-trained transformer models: `transformers`
  - Progress tracking: `progressbar`

- **Sets the computation device**:
  - Uses GPU (`cuda`) if available, otherwise falls back to CPU.

- **Specifies the Protein Language Model**:
  - `Rostlab/prot_bert_bfd`, a transformer model trained to understand protein sequences.

- **Loads protein sequences** from the `train_sequences.fasta` file:
  - Parses the FASTA file using `Bio.SeqIO`.
  - Stores the sequences in a dictionary with:
    - **Keys**: Protein IDs
    - **Values**: Corresponding amino acid sequences

This step prepares the dataset for generating embeddings from the pre-trained protein language model.


In [6]:
import torch
import numpy as np
import pandas as pd
from Bio import SeqIO
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import progressbar

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_NAME = 'Rostlab/prot_bert_bfd'

# Load sequences
train_fasta_path = '/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta'
sequences = {record.id: str(record.seq) for record in SeqIO.parse(train_fasta_path, 'fasta')}
print(f"Loaded {len(sequences)} protein sequences.")


Loaded 142246 protein sequences.


This cell defines the file paths used to access the training data inside the Kaggle environment.

In [7]:
# Paths (use kaggle's built-in paths)
train_fasta_path = '/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta'
train_terms_path = '/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv'


## Tokenization & Data Preparation

This code defines a custom `ProteinDataset` class to:

- Tokenize protein sequences using a pre-trained **Protein Language Model** (ProtBERT).
- Format the data for use with a PyTorch `DataLoader`.

We then:
- Load the tokenizer and model (`ProtBERT`) in half-precision for efficiency.
- Create the dataset and batch loader to process sequences in batches of 64.


In [None]:
class ProteinDataset(Dataset):
    def __init__(self, seq_dict, tokenizer, max_len=512):
        self.ids = list(seq_dict.keys())
        self.sequences = list(seq_dict.values())
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        pid = self.ids[idx]
        sequence = self.sequences[idx]
        tokens = self.tokenizer(sequence,
                                padding='max_length',
                                truncation=True,
                                max_length=self.max_len,
                                return_tensors='pt')
        tokens = {k: v.squeeze(0) for k, v in tokens.items()}
        return pid, tokens

# Instantiate tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
model = AutoModel.from_pretrained(MODEL_NAME).half().to(device).eval()

# Create dataset and loader
batch_size = 64  # Adjust if necessary based on GPU
dataset = ProteinDataset(sequences, tokenizer, max_len=512)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

## Generate Protein Embeddings with ProtBERT

This block:

- Loads the **ProtBERT model** and tokenizer to process protein sequences.
- Defines a custom PyTorch `Dataset` for tokenizing sequences and batching them efficiently.
- Uses **mean pooling** over the model's last hidden states to create fixed-size embeddings for each protein.
- Collects all embeddings in a dictionary and saves them as a CSV file (`protein_embeddings.csv`), ready for downstream tasks like classification or clustering.


In [None]:
MODEL_NAME = 'Rostlab/prot_bert_bfd'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
model = AutoModel.from_pretrained(MODEL_NAME).half().to(device)
model.eval()

max_len = 512  # Reduce length for faster embeddings
batch_size = 128  # Set batch size based on your GPU capacity

class ProteinDataset(Dataset):
    def __init__(self, seq_dict, tokenizer, max_len=512):
        self.ids = list(seq_dict.keys())
        self.sequences = list(seq_dict.values())
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        pid = self.ids[idx]
        inputs = self.tokenizer(sequence,
                                padding='max_length',
                                truncation=True,
                                max_length=self.max_len,
                                return_tensors="pt")
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        return pid, inputs

dataset = ProteinDataset(sequences, tokenizer, max_len=max_len)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

embeddings_dict = {}

with torch.no_grad():
    bar = progressbar.ProgressBar(maxval=len(loader)).start()
    for idx, (pids, inputs) in enumerate(loader):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        
        embeddings = outputs.last_hidden_state.mean(dim=1).float().cpu().numpy()
        embeddings_dict.update(dict(zip(pids, embeddings)))
        
        bar.update(idx + 1)
    bar.finish()

embedding_df = pd.DataFrame.from_dict(embeddings_dict, orient='index')
embedding_df.index.name = 'Protein Id'
embedding_df.reset_index(inplace=True)

# Save optimized embeddings
embedding_df.to_csv('protein_embeddings.csv', index=False)


100% |############################################################################################|


✅ Optimized embeddings generated and saved!


In [12]:
embedding_df.head()

Unnamed: 0,Protein Id,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,P20536,0.030762,0.024231,0.136475,0.03302,-0.065308,-0.136108,-0.046295,0.03244,0.010941,...,0.056915,-0.042145,-0.028549,0.025146,-0.128296,-0.130981,0.026077,-0.08667,-0.114502,-0.013115
1,O73864,0.030762,0.024231,0.136475,0.03302,-0.065308,-0.136108,-0.046295,0.03244,0.010941,...,0.056915,-0.042145,-0.028549,0.025146,-0.128296,-0.130981,0.026077,-0.08667,-0.114502,-0.013115
2,O95231,0.030762,0.024231,0.136475,0.03302,-0.065308,-0.136108,-0.046295,0.03244,0.010941,...,0.056915,-0.042145,-0.028549,0.025146,-0.128296,-0.130981,0.026077,-0.08667,-0.114502,-0.013115
3,A0A0B4J1F4,0.030762,0.024231,0.136475,0.03302,-0.065308,-0.136108,-0.046295,0.03244,0.010941,...,0.056915,-0.042145,-0.028549,0.025146,-0.128296,-0.130981,0.026077,-0.08667,-0.114502,-0.013115
4,P54366,0.030762,0.024231,0.136475,0.03302,-0.065308,-0.136108,-0.046295,0.03244,0.010941,...,0.056915,-0.042145,-0.028549,0.025146,-0.128296,-0.130981,0.026077,-0.08667,-0.114502,-0.013115
