# Supervised fine-tuning on experimental data greatly improves performance of language models compared to using supervised few-shot prediction with embeddings

In [None]:
#@title Mount drive and load libraries
import os
from google.colab import drive

# Project folder on Google Drive that holds the data and outputs for this
# notebook.
path = '/content/drive/MyDrive/msc-project-mbalmf01'

# Mount Drive, then switch the working directory to the project folder so the
# relative paths used in later cells resolve inside it.
drive.mount('/content/drive/')
os.chdir(path)

In [None]:
import pandas as pd
import torch
import transformers
from torch.optim import lr_scheduler
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoModelForSequenceClassification, AutoTokenizer

#Load Prot-T5 model
# num_labels is passed at load time because that is when the classification
# head is built; assigning model.config.num_labels afterwards does not resize
# an already-constructed head.
# NOTE(review): "Rostlab/prot-t5" does not look like a published Hub id (the
# Rostlab checkpoints are named like "Rostlab/prot_t5_xl_uniref50") — confirm
# it resolves, e.g. as a local directory.
model = AutoModelForSequenceClassification.from_pretrained(
    "Rostlab/prot-t5", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot-t5")
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# 80/20 train/validation split; the fixed seed keeps the split reproducible.
df = pd.read_csv("antibody_data.csv")
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

#Fine-tuning hyperparameters
epochs = 10
batch_size = 32
learning_rate = 1e-4

# NOTE(review): the Rostlab ProtT5 model card asks for residues separated by
# spaces and for the rare residues U, Z, O, B to be mapped to X — confirm this
# applies to the checkpoint in use.
_RARE_TO_X = str.maketrans("UZOB", "XXXX")


def _collate(batch):
    """Tokenise a list of (sequence, label) pairs into one padded batch.

    Returns a plain dict of tensors (tokenizer outputs plus "labels") that can
    be passed to the model as keyword arguments.
    """
    sequences, labels = zip(*batch)
    spaced = [" ".join(str(seq).translate(_RARE_TO_X)) for seq in sequences]
    encoded = dict(tokenizer(spaced, padding=True, return_tensors="pt"))
    # assumes the "label" column holds integer class ids — TODO confirm
    encoded["labels"] = torch.tensor(
        [int(label) for label in labels], dtype=torch.long)
    return encoded


def _make_dataloader(frame, shuffle):
    """Build a DataLoader over the "sequence" and "label" columns of *frame*."""
    pairs = list(zip(frame["sequence"], frame["label"]))
    return torch.utils.data.DataLoader(
        pairs, batch_size=batch_size, shuffle=shuffle, collate_fn=_collate)


def train_model(model, dataloader, optimizer, epoch):
    """Run one training epoch and return the mean loss over its batches.

    *epoch* is accepted for the caller's bookkeeping and is not used here.
    """
    device = next(model.parameters()).device
    model.train()
    running = 0.0
    for batch in dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        optimizer.zero_grad()
        batch_loss = model(**batch).loss
        batch_loss.backward()
        optimizer.step()
        running += batch_loss.item()
    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
    return running / max(len(dataloader), 1)


def evaluate_model(model, val_dataloader):
    """Return the mean loss over *val_dataloader* without updating weights."""
    device = next(model.parameters()).device
    model.eval()
    running = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {key: value.to(device) for key, value in batch.items()}
            running += model(**batch).loss.item()
    return running / max(len(val_dataloader), 1)


dataloader = _make_dataloader(train_df, shuffle=True)
val_dataloader = _make_dataloader(val_df, shuffle=False)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The Hugging Face PyTorch model has no Keras-style fit()/evaluate(), so the
# training and evaluation passes are written out explicitly.
for epoch in range(epochs):
    train_loss = train_model(model, dataloader, optimizer, epoch)
    print(f"epoch {epoch + 1}/{epochs}: train_loss={train_loss:.4f}")

print(f"val_loss={evaluate_model(model, val_dataloader):.4f}")


In [None]:
#@title Define callback and learning rate scheduler
class LossMonitor:
    """Write per-epoch training and validation loss to TensorBoard.

    Wraps a ``SummaryWriter`` created for ``log_dir``. Call ``close()`` when
    training is finished, or use the instance as a context manager, so the
    underlying writer is closed.
    """

    def __init__(self, log_dir):
        """Create the TensorBoard writer for ``log_dir``."""
        self.writer = SummaryWriter(log_dir)

    def on_epoch_end(self, epoch, loss, val_loss):
        """Log ``loss`` as "train_loss" and ``val_loss`` as "val_loss" at step ``epoch``."""
        self.writer.add_scalar("train_loss", loss, epoch)
        self.writer.add_scalar("val_loss", val_loss, epoch)

    def close(self):
        """Close the underlying writer."""
        self.writer.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always close the writer; returning None lets any exception propagate.
        self.close()

# StepLR multiplies the learning rate by gamma once every step_size calls to
# scheduler.step().
# NOTE(review): optimizer, epochs, dataloader, val_dataloader, train_model and
# evaluate_model are not defined in this cell — confirm they are defined
# before it runs.
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
loss_monitor = LossMonitor("logs")

#Train model
for epoch in range(epochs):
    # NOTE(review): assumes train_model returns the training loss for the
    # epoch; `loss` was otherwise never assigned before being logged — confirm.
    loss = train_model(model, dataloader, optimizer, epoch)
    val_loss = evaluate_model(model, val_dataloader)
    #Log loss to TensorBoard
    loss_monitor.on_epoch_end(epoch, loss, val_loss)
    # Step the scheduler only after this epoch's optimizer updates; stepping
    # it first skips the initial learning-rate value of the schedule.
    scheduler.step()

# Make sure buffered TensorBoard events reach disk.
loss_monitor.writer.flush()

#Save fine-tuned model
# The tokenizer is saved next to the weights so the output directory can be
# loaded on its own.
model.save_pretrained("finetuned_prot-t5")
tokenizer.save_pretrained("finetuned_prot-t5")
