In [1]:
import numpy as np
import torch.nn as nn
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

The following notebook is mostly unannotated.  

Starting with the **RoBERTa checkpoint**, the following script:  
- Builds a **PyTorch dataloader**  
- Adds a **classification head** to the RoBERTa model  
- Designs a **forward propagation** routine that embeds text pairs, concatenates embeddings, and predicts human preferences over the combined embedding sequences  

Finally, it performs **gradient descent** with:  
- Batch size = `16`  
- Optimizer = **Adam**  
- Learning rate = `5e-5`  
- Weight decay = `15e-2`  

**Results:**  
- Training fit — *CCE = 0.9758203125, Accuracy = 51.8%*  
- Validation fit — *CCE = 1.0341614906832297, Accuracy = 48%*  
- Initial metrics — *Loss ≈ 1.10, Accuracy ≈ 33%*  

These results demonstrate the ability of a **pretrained encoder model (2019)** to be fine-tuned for modeling and predicting **human attitudes and judgments**.


In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
training_data = pd.read_csv('llm_classification_finetuning/train.csv').iloc[:,3:]
training_data.head()

Unnamed: 0,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [4]:
from sklearn.model_selection import ShuffleSplit

splitter = ShuffleSplit(n_splits=1, test_size = .1, random_state=1234567)

for train_idx, validation_idx in splitter.split(training_data):
    train = training_data.iloc[train_idx]
    val = training_data.iloc[validation_idx]

In [5]:
# from huggingface_hub import notebook_login

# notebook_login()

In [5]:
model_id = "FacebookAI/roberta-base"
dtype = torch.bfloat16
device = torch.device('cuda')

tokenizer = AutoTokenizer.from_pretrained(model_id)
encoder = AutoModel.from_pretrained(model_id,
                                    torch_dtype = dtype)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from torch.utils.data import Dataset

class ClassifierDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=None, pad_token_id=tokenizer.pad_token_id):
        self.data = dataframe
        self.pad_token_id = pad_token_id

        self.encoded_texts = [encoded_text for row in 
                              [[tokenizer.encode('Prompt: ' + row['prompt']),
                                tokenizer.encode('Response A: ' + row['response_a']),
                                tokenizer.encode('Response B: ' + row['response_b'])]
                                for _, row in self.data.iterrows()] for encoded_text in row]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            self.encoded_texts = [
                encoded_text[:self.max_length] for encoded_text in self.encoded_texts #truncation
            ]

        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text)) for encoded_text in self.encoded_texts #padding
        ]

        self.triplet_encoded_texts = [np.array([
            self.encoded_texts[i] + [2],
            self.encoded_texts[i+1] + [2],
            self.encoded_texts[i+2] + [2]
            ])
            for i in range(0, len(self.encoded_texts), 3)]

    def __getitem__(self, index):
        triplet = self.triplet_encoded_texts[index]

        Prompt = torch.tensor(triplet[0], dtype=torch.long)
        prompt_mask = (Prompt != self.pad_token_id).long()

        Response_A = torch.tensor(triplet[1], dtype=torch.long)
        response_A_mask = (Response_A != self.pad_token_id).long()

        Response_B = torch.tensor(triplet[2], dtype=torch.long)
        response_B_mask = (Response_B != self.pad_token_id).long()

        
        inputs = [Prompt, Response_A, Response_B]
        masks = [prompt_mask, response_A_mask, response_B_mask]
        label = self.data.iloc[index][["winner_model_a", "winner_model_b", "winner_tie"]].astype(int).values
        
        return (
            inputs,
            masks,
            torch.tensor(label, dtype=torch.long)
        )
    
    def __len__(self):
        return len(self.data)
    
    def _longest_encoded_length(self):
        max_length=0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length>max_length:
                max_length = encoded_length
        return max_length

max_length = 511

train_dataset = ClassifierDataset(train, tokenizer, max_length=max_length)
val_dataset = ClassifierDataset(val, tokenizer, max_length=max_length)

print(max_length)

511


In [None]:
from torch.utils.data import DataLoader

torch.manual_seed(1234567)

num_workers = 0
batch_size = 16

train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = num_workers,
    drop_last = True
)

val_loader = DataLoader(
    dataset = val_dataset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = num_workers,
    drop_last = True
)

In [13]:
for inputs, masks, labels in train_loader:
    pass

print(len(train_loader))
print("Train Inputs Batch Shape: ", inputs[0].shape)
print("Train Target Batch Shape: ", labels.shape )

for inputs, masks, labels in val_loader:
    pass

print(len(val_loader))
print("Validation Inputs Batch Shape: ", inputs[0].shape)
print("Validation Target Batch Shape: ", labels.shape )

3233
Train Inputs Batch Shape:  torch.Size([16, 512])
Train Target Batch Shape:  torch.Size([16, 3])
359
Validation Inputs Batch Shape:  torch.Size([16, 512])
Validation Target Batch Shape:  torch.Size([16, 3])


In [None]:
import torch
import torch.nn as nn

encoder.score = nn.Identity() # Silence the original automodel output layer

class TripletClassifier(torch.nn.Module):
    def __init__(self, encoder, hidden_dim_1, output_dim):
        super().__init__()
        self.encoder = encoder
        self.hidden_dim_1 = hidden_dim_1
        self.output_dim = output_dim

        # Write a new classification head (simple)

        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_dim_1, self.output_dim),
        )

    # Write encoder model

    def encode(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        emb = outputs.pooler_output

        return emb

    # Write a forward pass that encodes each of the three inputs,
    # concats the sequence encodings, then classifies them as a unit.

    def forward(self, inputs, attention_masks):
        # Extract inputs and masks
        prompt = inputs[0]
        response_A = inputs[1]
        response_B = inputs[2]

        prompt_mask = attention_masks[0]
        response_A_mask = attention_masks[1]
        response_B_mask = attention_masks[2]

        # Pass each input to RoBERTa and get encodings
        prompt_emb = self.encode(input_ids = prompt, attention_mask = prompt_mask)
        response_A_emb = self.encode(input_ids = response_A, attention_mask = response_A_mask)
        response_B_emb = self.encode(input_ids = response_B, attention_mask = response_B_mask)

        combined = torch.cat([prompt_emb, response_A_emb, response_B_emb], dim=1) # concatenate embeddings

        logits = self.classifier(combined) # classify embeddings

        return logits

In [15]:
model = TripletClassifier(encoder = encoder, hidden_dim_1 = 3 * 768, output_dim = 3)

model = model.to(device).to(dtype)

inputs, masks = [i.to(device) for i in inputs], [i.to(device) for i in masks]
example_output = model.forward(inputs, masks)

In [16]:
print("Example Output Shape: ", example_output.shape)

Example Output Shape:  torch.Size([16, 3])


In [107]:
print(model)

TripletClassifier(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [17]:
for param in model.parameters():
    param.requires_grad = False

for param in model.encoder.embeddings.parameters():
    param.requires_grad = True

for param in model.encoder.encoder.layer[10].parameters():
    param.requires_grad = True

for param in model.encoder.encoder.layer[11].parameters():
    param.requires_grad = True   

for param in model.encoder.pooler.parameters():
    param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True

In [19]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0
    
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (inputs, attention_masks, target_batch) in enumerate(data_loader):
        if i < num_batches:
            inputs[:] = [t.to(device) for t in inputs]
            attention_masks[:] = [t.to(device) for t in attention_masks]
            target_batch = target_batch.to(device)

            with torch.no_grad():
                logits = model(inputs, attention_masks = attention_masks)
           
            predicted_labels = torch.argmax(logits, dim=-1)
            target_batch = torch.argmax(target_batch, dim=-1)
            
            correct_predictions += (predicted_labels == target_batch).sum().item()
            num_examples += logits.shape[0]

        else:
            break

    return correct_predictions / num_examples

In [20]:
train_accuracy = calc_accuracy_loader(train_loader, model, device = device, num_batches=20)
print("Train accuracy: ", train_accuracy)

validation_accuracy = calc_accuracy_loader(val_loader,  model, device = device, num_batches=20)
print("Validation accuracy: ", validation_accuracy)

Train accuracy:  0.296875
Validation accuracy:  0.378125


In [25]:
def calc_loss_batch(inputs, attention_masks, target_batch, model):
    target_batch = target_batch.argmax(dim=1)
    logits = model(inputs,attention_masks=attention_masks)
    
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if len(data_loader)==0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (inputs, attention_masks, target_batch) in enumerate(data_loader):
        inputs[:] = [t.to(device) for t in inputs]
        attention_masks[:] = [t.to(device) for t in attention_masks]
        target_batch = target_batch.to(device)

        if i < num_batches:
            loss = calc_loss_batch(inputs, attention_masks, target_batch, model)
            total_loss += loss.item()

        else:
            break

    return total_loss / num_batches

In [None]:
train_loss = calc_loss_loader(train_loader, model, device, num_batches = 20)
print("Train Loss: ", train_loss)

validation_loss = calc_loss_loader(val_loader,model, device, num_batches = 20)
print("Validation Loss: ", validation_loss)

In [81]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches = eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches = eval_iter)
    model.train()
    return train_loss, val_loss

In [82]:
def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs):

    # Initialize lists
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train() # Set model to training mode

        for inputs, attention_masks, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            inputs[:], attention_masks[:], target_batch = [t.to(device) for t in inputs], [t.to(device) for t in attention_masks], target_batch.to(device)

            loss = calc_loss_batch(inputs, attention_masks, target_batch, model, device)
            loss.backward() # Calculate loss gradients

            optimizer.step()
            examples_seen += inputs[0].shape[0]
            global_step += 1

            # Evaluation step
            if global_step % 50 == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter = 30)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")
  
            if global_step % 500 == 0:
                # Calculate accuracy after each epoch
                train_accuracy = calc_accuracy_loader(train_loader,model,device, num_batches = 30)
                val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches = 30)
                print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
                print(f"Validation accuracy: {val_accuracy*100:.2f}%")
                train_accs.append(train_accuracy)
                val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [None]:
import time

start_time = time.time()

torch.manual_seed(1234567)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=15e-2)

num_epochs = 1

torch.cuda.empty_cache()

train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs
)

end_time = time.time()
execution_time_minutes = (end_time - start_time)/60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.974, Val loss 1.012
Training accuracy: 49.38% | Validation accuracy: 49.27%
Ep 1 (Step 000050): Train loss 0.997, Val loss 1.056
Ep 1 (Step 000100): Train loss 1.007, Val loss 1.036
Ep 1 (Step 000150): Train loss 1.006, Val loss 1.055
Ep 1 (Step 000200): Train loss 0.989, Val loss 1.069
Ep 1 (Step 000250): Train loss 1.005, Val loss 0.997
Ep 1 (Step 000300): Train loss 0.988, Val loss 1.027
Ep 1 (Step 000350): Train loss 0.980, Val loss 1.009
Ep 1 (Step 000400): Train loss 1.009, Val loss 1.015
Ep 1 (Step 000450): Train loss 0.976, Val loss 1.037
Ep 1 (Step 000500): Train loss 0.990, Val loss 1.026
Training accuracy: 49.58% | Validation accuracy: 46.04%
Ep 1 (Step 000550): Train loss 0.991, Val loss 1.054
Ep 1 (Step 000600): Train loss 1.002, Val loss 1.029
Ep 1 (Step 000650): Train loss 0.965, Val loss 1.049
Ep 1 (Step 000700): Train loss 0.997, Val loss 1.038
Ep 1 (Step 000750): Train loss 0.969, Val loss 1.024
Ep 1 (Step 000800): Train loss 1.014, Va

In [None]:
print(calc_accuracy_loader(train_loader, model, device = device, num_batches=None))
print(calc_accuracy_loader(val_loader, model, device = device, num_batches=None))

print(calc_loss_loader(train_loader, model, device, num_batches=None))
print(calc_loss_loader(val_loader, model, device, num_batches=None))

0.517625
0.4765139751552795
0.9758203125
1.0341614906832297


In [24]:
torch.save(model.state_dict(), 'model.pth')