#**AMP Classification using ProtBERT Embeddings + Fast MLP**
This notebook extracts ProtBERT embeddings for  peptide sequences and trains a simple Multi-Layer Perceptron (MLP) to classify antimicrobial peptides (AMPs) vs non-AMPs.

In [1]:
!pip install torch transformers scikit-learn numpy pandas tqdm



In [2]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
import sys

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

Device: cpu


##Load Dataset

In [None]:
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    file_path = '/content/drive/MyDrive/ampData.csv'
else:
    file_path = 'ampData.csv'

df = pd.read_csv(file_path)
df['sequence'] = df['sequence'].astype(str).str.upper().str.strip()
df = df.dropna(subset=['sequence','label']).reset_index(drop=True)
df.head()

Mounted at /content/drive


Unnamed: 0,sequence,label
0,GIPCGESCVWIPCISSAIGCSCKSKVCYRN,0
1,SLQYVMSAGPYTWYKDTRTGKTLCKQTIDTASYTFGVMAEGWGKTFH,1
2,MSKRTYQPSRIRRKRTHGFRSRMQTKNGQAVIRRRRARGRKRLVVT...,0
3,GFMDTAKNVAKNVAVTLLDNLKCKITKAC,1
4,MNRIGMITTIITTTITTGNGAG,0


## Extract ProtBERT Embeddings

In [5]:
tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert')
model = AutoModel.from_pretrained('Rostlab/prot_bert').to(device)

def get_embedding(sequence):
    seq = ' '.join(list(sequence))
    tokens = tokenizer(seq, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**tokens)
    emb = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return emb

embeddings = []
for seq in tqdm(df['sequence'], desc='Extracting Embeddings'):
    embeddings.append(get_embedding(seq))

X = np.array(embeddings)
y = df['label'].values

np.save('X_embeddings.npy', X)
np.save('y_labels.npy', y)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Extracting Embeddings:  22%|██▏       | 1412/6410 [21:45<1:05:06,  1.28it/s][A
Extracting Embeddings:  22%|██▏       | 1413/6410 [21:46<1:06:42,  1.25it/s][A
Extracting Embeddings:  22%|██▏       | 1414/6410 [21:46<1:01:10,  1.36it/s][A
Extracting Embeddings:  22%|██▏       | 1415/6410 [21:47<59:10,  1.41it/s]  [A
Extracting Embeddings:  22%|██▏       | 1416/6410 [21:48<1:07:12,  1.24it/s][A
Extracting Embeddings:  22%|██▏       | 1417/6410 [21:49<1:10:18,  1.18it/s][A
Extracting Embeddings:  22%|██▏       | 1418/6410 [21:50<1:07:07,  1.24it/s][A
Extracting Embeddings:  22%|██▏       | 1419/6410 [21:50<1:07:28,  1.23it/s][A
Extracting Embeddings:  22%|██▏       | 1420/6410 [21:51<1:01:46,  1.35it/s][A
Extracting Embeddings:  22%|██▏       | 1421/6410 [21:52<1:01:11,  1.36it/s][A
Extracting Embeddings:  22%|██▏       | 1422/6410 [21:52<58:50,  1.41it/s]  [A
Extracting Embeddings:  22%|██▏       | 1423/6410 [21:5

## Train-Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1).to(device)

## Define MLP Classifier

In [7]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.layers(x)

model_mlp = MLPClassifier(X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_mlp.parameters(), lr=1e-4)

print(model_mlp)

MLPClassifier(
  (layers): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=128, bias=True)
    (4): ReLU()
    (5): Linear(in_features=128, out_features=1, bias=True)
    (6): Sigmoid()
  )
)


## Train MLP

In [8]:
epochs = 20
batch_size = 64

for epoch in range(epochs):
    model_mlp.train()
    perm = torch.randperm(X_train.size(0))
    total_loss = 0
    for i in range(0, X_train.size(0), batch_size):
        idx = perm[i:i+batch_size]
        x_batch, y_batch = X_train[idx], y_train[idx]
        optimizer.zero_grad()
        outputs = model_mlp(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 1/20, Loss: 51.6445
Epoch 2/20, Loss: 35.9113
Epoch 3/20, Loss: 29.1021
Epoch 4/20, Loss: 26.7737
Epoch 5/20, Loss: 25.1965
Epoch 6/20, Loss: 24.3843
Epoch 7/20, Loss: 23.7409
Epoch 8/20, Loss: 23.0682
Epoch 9/20, Loss: 22.7707
Epoch 10/20, Loss: 21.7106
Epoch 11/20, Loss: 21.3713
Epoch 12/20, Loss: 21.1235
Epoch 13/20, Loss: 20.5017
Epoch 14/20, Loss: 20.1280
Epoch 15/20, Loss: 19.8047
Epoch 16/20, Loss: 19.4195
Epoch 17/20, Loss: 18.9381
Epoch 18/20, Loss: 18.7335
Epoch 19/20, Loss: 18.6848
Epoch 20/20, Loss: 18.2806


## Evaluate

In [9]:
model_mlp.eval()
with torch.no_grad():
    preds = model_mlp(X_test).cpu().numpy().flatten()

pred_labels = (preds >= 0.5).astype(int)
print('ROC-AUC:', roc_auc_score(y_test.cpu(), preds))
print('PR-AUC:', average_precision_score(y_test.cpu(), preds))
print('\nClassification Report:\n', classification_report(y_test.cpu(), pred_labels))
print('Confusion Matrix:\n', confusion_matrix(y_test.cpu(), pred_labels))

ROC-AUC: 0.9586802504861505
PR-AUC: 0.956566926348837

Classification Report:
               precision    recall  f1-score   support

         0.0       0.90      0.90      0.90       641
         1.0       0.90      0.90      0.90       641

    accuracy                           0.90      1282
   macro avg       0.90      0.90      0.90      1282
weighted avg       0.90      0.90      0.90      1282

Confusion Matrix:
 [[574  67]
 [ 67 574]]


## Save Model

In [11]:
torch.save(model_mlp.state_dict(), 'fast_mlp_amp.pt')
print('Model saved as fast_mlp_amp.pt')

Model saved as fast_mlp_amp.pt


In [None]:
from google.colab import files
files.download('fast_mlp_amp.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>