<a href="https://colab.research.google.com/github/ndarr/bert-poem-regressor/blob/main/BertPoems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!wget example.com  # Replace with the actual link to the data (the training cell reads normalized_scores.csv)
!mkdir models
!mkdir losses

In [None]:
import torch
import numpy as np

# Seed the PyTorch and NumPy global random number generators so that
# repeated runs are more reproducible
torch.manual_seed(42)
np.random.seed(42)

import torch.nn as nn
from torch import tensor
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

# Choose between bert-large or distilbert-base ("-uncased" is appended when
# the pretrained model and tokenizer are loaded)
bert = "bert-large"
class RegressionModel(nn.Module):
    """Pretrained BERT encoder topped with a single-output regression head.

    The encoder's pooled output is batch-normalised, passed through dropout,
    projected to one value by a linear layer and squashed by a sigmoid.
    """

    def __init__(self, bertname):
        """Load the pretrained encoder and build the regression head.

        Args:
            bertname: Checkpoint name without the "-uncased" suffix; the
                suffix is appended when the weights are loaded.
        """
        super().__init__()
        self.bertname = bertname
        self.bert = BertModel.from_pretrained(f"{bertname}-uncased")

        # The head is sized after the encoder's hidden size
        hidden_size = self.bert.config.hidden_size
        self.emb_size = hidden_size
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        self.drop_out = nn.Dropout()
        self.dense1 = nn.Linear(hidden_size, 1)
        self.sig = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized sequences.

        Args:
            input_ids: Token ids handed to the encoder.
            attention_mask: Attention mask handed to the encoder.

        Returns:
            Float tensor of sigmoid-activated scores whose last dimension
            has size 1.
        """
        # Pooled representation of the whole sequence
        encoder_out = self.bert(input_ids, attention_mask)
        pooled = encoder_out['pooler_output']

        regularised = self.drop_out(self.batch_norm(pooled))

        # Project hidden size -> 1, then squash with the sigmoid
        score = self.sig(self.dense1(regularised))

        # Cast to float for better compatibility downstream
        return score.float()

class PoemScoreDataset(Dataset):
    """Dataset that pairs each poem with its target score.

    Poems are tokenized on access with the pretrained BERT tokenizer and
    padded or truncated to a fixed length of 80 tokens.
    """

    def __init__(self, poems, targets, bertname):
        """Store the data and load the tokenizer.

        Args:
            poems: Sequence of poem texts.
            targets: Sequence of scores, one per poem (same length as
                `poems`).
            bertname: Checkpoint name without the "-uncased" suffix; the
                suffix is appended when the tokenizer is loaded.
        """
        assert len(poems) == len(targets)
        self.bertname = bertname
        self.poems = poems
        self.targets = targets
        self.tokenizer = BertTokenizer.from_pretrained(f"{self.bertname}-uncased")
        self.max_len = 80

    def __len__(self):
        """Return the number of poems in the dataset."""
        return len(self.poems)

    def __getitem__(self, idx):
        """Return `(tokenized_poem, target)` for the poem at position `idx`.

        The first element is whatever the tokenizer's `encode_plus` returns
        for the options below; the second is the target as a float tensor.
        """
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer.encode_plus(self.poems[idx], **encode_options)
        score = tensor(self.targets[idx], dtype=torch.float)
        return encoding, score


In [None]:
from statistics import mean
from tqdm.notebook import tqdm
from torch.optim import AdamW
import csv
from sklearn.model_selection import train_test_split
 
# Train one regression model per score column of normalized_scores.csv and
# store, for each column, the trained weights (models/) and the per-epoch
# train and test losses (losses/).
columns = [
    "bws_all",
    "bws_coherent",
    "bws_grammatical",
    "bws_melodious",
    "bws_moved",
    "bws_real",
    "bws_rhyming",
    "bws_readable",
    "bws_comprehensible",
    "bws_intense",
    "bws_liking",
    "crowdgppl_all",
    "crowdgppl_coherent",
    "crowdgppl_grammatical",
    "crowdgppl_melodious",
    "crowdgppl_moved",
    "crowdgppl_real",
    "crowdgppl_rhyming",
    "crowdgppl_readable",
    "crowdgppl_comprehensible",
    "crowdgppl_intense",
    "crowdgppl_liking",
]


def _load_poems_and_targets(csv_path, target_column):
    """Read the poems and the scores of one column from a CSV file.

    The first row is treated as the header and used to locate
    `target_column`. The first field of every following row is the poem
    text, in which "<br>" markers are replaced by newlines.

    Args:
        csv_path: Path of the CSV file to read.
        target_column: Header name of the score column to extract.

    Returns:
        A `(poems, targets)` pair of equally long lists; targets are floats.

    Raises:
        ValueError: If `target_column` is not in the header or a score
            cannot be parsed as a float.
    """
    poems = []
    targets = []
    # newline="" is what the csv module expects for files handed to a reader
    with open(csv_path, newline="") as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)
        target_idx = header.index(target_column)
        for row in csv_reader:
            # Keep the cleaned text (the raw row[0] used to be appended,
            # which silently discarded the "<br>" replacement)
            poems.append(row[0].replace("<br>", "\n"))
            targets.append(float(row[target_idx]))
    return poems, targets


def _batch_loss(model, loss_fn, input_, targets_, device):
    """Run the model on one batch and return its loss against the targets."""
    # The dataset tokenizes with return_tensors='pt', so every field carries
    # an extra dimension at position 1 after batching; drop it
    input_ids = input_['input_ids'].squeeze(1).to(device)
    attention_mask = input_['attention_mask'].squeeze(1).to(device)

    # Only drop the trailing size-1 output dimension so that a batch holding
    # a single poem keeps its batch dimension and still matches the targets
    pred = model(input_ids, attention_mask).squeeze(-1)
    return loss_fn(pred, targets_.to(device))


# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# L1 Loss as it worked best among the candidates
loss_fn = nn.L1Loss()
epochs = 10

for target_column in columns:
    print(f"======{target_column}======")
    # Read scores from csv file
    poems, targets = _load_poems_and_targets("normalized_scores.csv", target_column)

    model = RegressionModel(bertname=bert)
    model = model.to(device)

    # Split poems and targets into train and test set
    poems, test_poems, targets, test_targets = train_test_split(poems, targets, test_size=0.1)

    dataset = PoemScoreDataset(poems, targets, bertname=bert)
    test_dataset = PoemScoreDataset(test_poems, test_targets, bertname=bert)

    # NOTE(review): AdamW runs with its default hyper-parameters here
    optimizer = AdamW(model.parameters())

    # Create dataloaders with their respective data (fixed batch order)
    dataloader = DataLoader(dataset, batch_size=24, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=24, shuffle=False)

    # Dictionary with losses for each epoch
    epoch_losses = {}
    epoch_test_losses = {}

    for epoch in range(epochs):
        model.train()
        tqdm_loader = tqdm(dataloader)
        losses = []

        # Iterate over training batches
        for input_, targets_ in tqdm_loader:
            optimizer.zero_grad()

            loss = _batch_loss(model, loss_fn, input_, targets_, device)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            tqdm_loader.set_description(f"Loss: {round(mean(losses),4)}")

        epoch_losses[epoch] = mean(losses)

        # Test on testing data set for generalization capability; no
        # gradients are needed here, so skip building the autograd graph
        test_losses = []
        model.eval()
        with torch.no_grad():
            for input_, targets_ in test_dataloader:
                loss = _batch_loss(model, loss_fn, input_, targets_, device)
                test_losses.append(loss.item())

        print(f"Test loss epoch {epoch}:  {mean(test_losses)}")
        epoch_test_losses[epoch] = mean(test_losses)

    # Save the current model
    torch.save(model.state_dict(), f"models/{bert}_model_{target_column}.pt")

    # Save losses into file
    with open(f"losses/{bert}_model_losses_{target_column}.txt", "w") as loss_file:
        loss_file.write(str(epoch_losses))

    # Save test losses into file
    with open(f"losses/{bert}_test_losses_{target_column}.txt", "w") as test_loss_file:
        test_loss_file.write(str(epoch_test_losses))

    # Clear memory as far as possible for next iteration
    del dataset
    del dataloader
    del optimizer
    del model
    torch.cuda.empty_cache()
    