In [2]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_cosine_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import numpy as np
import random
import os

# Set random seeds for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(42)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load the pretrained ProtBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertForSequenceClassification.from_pretrained("Rostlab/prot_bert", num_labels=2)  # Change num_labels based on your task
model = model.to(device)

# Define the PeptideDataset class
class PeptideDataset(Dataset):
    def __init__(self, sequences, labels, tokenizer, max_length=512, augmentation=None):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augmentation = augmentation

    def __len__(self):
        return len(self.sequences)

    def augment_sequence(self, sequence):
        # Implement data augmentation techniques here if needed
        if self.augmentation == 'random_swap':
            sequence = list(sequence)
            idx = random.sample(range(len(sequence) - 1), k=int(0.02 * len(sequence)))
            for i in idx:
                sequence[i], sequence[i+1] = sequence[i+1], sequence[i]
            sequence = ''.join(sequence)
        return sequence

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]

        # Data augmentation
        if self.augmentation:
            sequence = self.augment_sequence(sequence)

        # Tokenization
        sequence = ' '.join(list(sequence))
        encoding = self.tokenizer(sequence,
                                  add_special_tokens=True,
                                  truncation=True,
                                  max_length=self.max_length,
                                  padding='max_length',
                                  return_tensors='pt')

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids.to(device),
            'attention_mask': attention_mask.to(device),
            'labels': torch.tensor(label, dtype=torch.long).to(device)
        }

# Training function
def train_epoch(model, loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc='Training'):
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(loader)
    return avg_loss

# Evaluation function
def evaluate(model, loader):
    model.eval()
    y_true = []
    y_pred = []
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            outputs = model(input_ids=batch['input_ids'],
                            attention_mask=batch['attention_mask'],
                            labels=batch['labels'])
            loss = outputs.loss
            logits = outputs.logits
            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=1)
            y_true.extend(batch['labels'].cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    avg_loss = total_loss / len(loader)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return avg_loss, accuracy, precision, recall, f1

# Main function
def main():
    # Example data (Replace this with your actual data)
    # sequences: List of peptide sequences (strings)
    # labels: List of labels (0 or 1)
    sequences = [
        'ARNDCEQGHILKMFPSTWYV',
        'MKVIFLTLFLAALAAF',
        'GAVLIPFYWTSNQDEHRKM',
        'LAGVQAHW',
        # Add more sequences
    ]
    labels = [
        1,
        0,
        1,
        0,
        # Corresponding labels
    ]

    # Ensure that sequences and labels have the same length
    assert len(sequences) == len(labels), "Sequences and labels must have the same length."
    model = BertForSequenceClassification.from_pretrained("Rostlab/prot_bert", num_labels=num_classes)

    # Create the dataset
    dataset = PeptideDataset(sequences, labels, tokenizer, augmentation=None)

    # Split dataset into train, validation, and test sets
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

    # Data augmentation for the training dataset (if needed)
    # You can create a new PeptideDataset with augmentation for the training set
    train_sequences = [dataset.sequences[i] for i in train_dataset.indices]
    train_labels = [dataset.labels[i] for i in train_dataset.indices]
    train_dataset = PeptideDataset(train_sequences, train_labels, tokenizer, augmentation='random_swap')

    # Data loaders
    batch_size = 16
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Optimizer and scheduler
    epochs = 10
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * epochs
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(0.1 * total_steps),
                                                num_training_steps=total_steps)

    # Training loop
    best_val_accuracy = 0
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, val_loader)
        scheduler.step()
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, "
              f"Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1 Score: {val_f1:.4f}")

        # Save the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            model.save_pretrained('best_peptidebert_model')
            tokenizer.save_pretrained('best_peptidebert_model')

    # Load the best model
    model = BertForSequenceClassification.from_pretrained('best_peptidebert_model')
    model = model.to(device)

    # Final test evaluation
    test_loss, test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader)
    print(f"Test Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, "
          f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1 Score: {test_f1:.4f}")

    # Save the final model
    model.save_pretrained('final_peptidebert_model')
    tokenizer.save_pretrained('final_peptidebert_model')

if __name__ == "__main__":
    main()


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'num_classes' is not defined