### Lyrics model.

This notebook contains the code for training the lyrics model, which is built on
a pretrained BERT model.

In [1]:
import pandas as pd
import numpy as np

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

### Load and split the data

In [2]:
# Both lists share the same order -- train, test, validation -- because later
# cells pick the splits by position (index 0, 1 and 2).
lyrics_path_list = [
    "../../data/train/lyrics/lyrics.txt",
    "../../data/test/lyrics/lyrics.txt",
    "../../data/validation/lyrics/lyrics.txt",
]
labels_path_list = [
    "../../data/train/lyrics/labels.txt",
    "../../data/test/lyrics/labels.txt",
    "../../data/validation/lyrics/labels.txt",
]

In [3]:
def _read_lines(path):
    """Read a text file and return its newline-separated entries, without trailing blanks."""
    with open(path, "r") as f:
        lines = f.read().split("\n")

    # Strip only the empty entries at the end of the file (left behind by a
    # final newline). Unlike ``list.remove('')`` this neither raises when the
    # file has no trailing newline nor deletes a blank entry from the middle,
    # which would shift every following entry out of alignment.
    while lines and lines[-1] == "":
        lines.pop()

    return lines


def split_data(lyrics_path, labels_path):
    """
    Load a lyrics file and a labels file and split each on newlines.

    :param lyrics_path: path to lyrics .txt file
    :param labels_path: path to labels .txt file
    :return: tuple ``(lyrics_split, labels_split)``; each is a list of strings
             with trailing empty entries removed
    """
    return _read_lines(lyrics_path), _read_lines(labels_path)

In [4]:
# The path lists are ordered train, test, validation, so unpack in that order.
(train_d, train_l), (test_d, test_l), (valid_d, valid_l) = [
    split_data(lyrics_file, labels_file)
    for lyrics_file, labels_file in zip(lyrics_path_list, labels_path_list)
]

In [5]:
# Pool up to the first 7000 training examples with up to the first 2000
# validation examples; labels are sliced identically so they stay aligned.
data = [*train_d[:7000], *valid_d[:2000]]
labels = [*train_l[:7000], *valid_l[:2000]]

In [6]:
# Pair each lyric with its label, then convert the label strings to numbers.
data = pd.DataFrame({"lyrics": data, "quadrant": labels}).assign(
    quadrant=lambda frame: pd.to_numeric(frame["quadrant"])
)

In [7]:
# Class balance: number of rows per quadrant label.
data.quadrant.value_counts()

0    3035
2    2527
3    1788
1    1650
Name: quadrant, dtype: int64

In [8]:
# Split row indices (not the rows themselves) 80/20, stratified on the label
# so each quadrant keeps the same share in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    data.index.values,
    data["quadrant"].values,
    test_size=0.20,
    random_state=42,
    stratify=data["quadrant"].values,
)

In [9]:
# Tag every row with the split it belongs to; rows in neither index set keep
# the "not_set" placeholder.
data["data_type"] = "not_set"
data.loc[X_train, "data_type"] = "train"
data.loc[X_val, "data_type"] = "val"

# Row counts per (quadrant, split) pair.
data.groupby(["quadrant", "data_type"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,lyrics
quadrant,data_type,Unnamed: 2_level_1
0,train,2428
0,val,607
1,train,1320
1,val,330
2,train,2022
2,val,505
3,train,1430
3,val,358


### Tokenize and Encode the Data

In [10]:
# Load the tokenizer for the "bert-base-uncased" checkpoint (lower-casing input),
# the same checkpoint name the classifier is created from further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [11]:
def _encode_lyrics(texts):
    """Tokenize a batch of lyric strings with the settings shared by train and val.

    Special tokens are added, inputs are truncated to at most 128 tokens,
    padded, and returned as PyTorch tensors together with an attention mask.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )


# Select each split once and reuse it for both the text and the labels.
train_rows = data[data.data_type == 'train']
val_rows = data[data.data_type == 'val']

encoded_data_train = _encode_lyrics(train_rows.lyrics.values)
encoded_data_val = _encode_lyrics(val_rows.lyrics.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.quadrant.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.quadrant.values)

In [12]:
def encode_data(lyrics_split, labels_split, max_length=128):
    """
    Tokenize lyrics with the notebook-level ``tokenizer`` and convert labels to numbers.

    :param lyrics_split: sequence of lyric strings
    :param labels_split: sequence of labels, aligned with ``lyrics_split`` and
                         convertible by ``pd.to_numeric``
    :param max_length: maximum number of tokens per lyric; longer inputs are
                       truncated (defaults to 128, the previously hard-coded value)
    :return: tuple ``(encoded_data, labels)`` -- the tokenizer output (PyTorch
             tensors, including an attention mask) and a numpy array of
             numeric labels
    """

    # Named `frame` rather than `data` so it does not shadow the notebook-level
    # `data` DataFrame.
    frame = pd.DataFrame({"lyrics": lyrics_split, "quadrant": labels_split})
    frame["quadrant"] = pd.to_numeric(frame["quadrant"])  # labels to numbers

    encoded_data = tokenizer.batch_encode_plus(
        frame.lyrics.values,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    return encoded_data, frame.quadrant.values

In [13]:
# Tokenize the held-out test split and get its labels as numbers.
encoded_data_test, labels_test = encode_data(lyrics_split=test_d, labels_split=test_l)

### Create datasets and dataloaders

In [14]:
# Each dataset item is an (input_ids, attention_mask, label) triple.
trainset, validset = (
    TensorDataset(input_ids_train, attention_masks_train, labels_train),
    TensorDataset(input_ids_val, attention_masks_val, labels_val),
)

In [15]:
batch_size = 16

# Training batches are drawn in random order; validation keeps dataset order.
trainloader = DataLoader(
    trainset,
    batch_size=batch_size,
    sampler=RandomSampler(trainset),
)
validloader = DataLoader(
    validset,
    batch_size=batch_size,
    sampler=SequentialSampler(validset),
)

In [16]:
def create_dataloader(encoded_data, labels, batch_size):
    """
    Build a sequential (non-shuffled) DataLoader from encoded inputs and labels.

    :param encoded_data: mapping with "input_ids" and "attention_mask" tensors
    :param labels: labels aligned with the encoded inputs; converted with
                   ``torch.tensor``
    :param batch_size: number of examples per batch
    :return: dataloader yielding (input_ids, attention_mask, labels) batches
             in dataset order
    """
    tensors = (
        encoded_data["input_ids"],
        encoded_data["attention_mask"],
        torch.tensor(labels),
    )
    dataset = TensorDataset(*tensors)

    return DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

In [17]:
# Test batches use the same batch size as training/validation and keep dataset order.
testloader = create_dataloader(encoded_data_test, labels_test, batch_size)

### BERT Pre-trained Model

In [18]:
# Pretrained BERT with a 4-way sequence-classification head (one output per
# quadrant); attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [19]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = model.to(device)

In [20]:
# optimizer and scheduler
epochs = 4
# NOTE(review): AdamW here is the one imported from `transformers`; newer
# releases are believed to deprecate it in favour of torch.optim.AdamW --
# confirm before upgrading the library.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Linear schedule with zero warm-up steps, sized to the total number of
# optimizer steps in the run (batches per epoch * number of epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(trainloader) * epochs,
)

In [21]:
# performance metrics
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max class predictions.

    :param preds: 2-D array of per-class scores, one row per example
    :param labels: array of true class ids
    :return: weighted-average F1 as computed by ``sklearn.metrics.f1_score``
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()

    return f1_score(true_classes, predicted_classes, average="weighted")

def accuracy_per_class(preds, labels):
    """Print the accuracy for every class that occurs in ``labels``.

    :param preds: 2-D array of per-class scores, one row per example
    :param labels: array of true class ids (0-3, mapped to Q1-Q4)
    :return: None; one line per class is printed
    """
    quadrant_dict = {0: "Q1", 1: "Q2", 2: "Q3", 3: "Q4"}

    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        of_this_class = labels_flat == label
        total = int(of_this_class.sum())
        # Correct predictions among the examples whose true class is `label`.
        hits = int((pred_flat[of_this_class] == label).sum())
        print('Accuracy of %5s: %2d%% (%2d/%2d)' % (
            quadrant_dict[label], 100 * hits / total, hits, total))

### Training

In [22]:
import random

# Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random number generators
# with the same value.
# NOTE(review): the model and the data loaders are created in earlier cells,
# i.e. before this seed is set, so their random initialisation is not covered.
seed_val = 17
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(seed_val)

def evaluate(data_loader):
    """Run the notebook-level ``model`` over ``data_loader`` without gradient tracking.

    The model is switched to eval mode and is left in that mode afterwards.

    :param data_loader: loader yielding (input_ids, attention_mask, labels) batches
    :return: tuple ``(loss_avg, predictions, true_vals)`` -- the mean of the
             per-batch losses, the model's second output (logits) stacked over
             all batches, and the matching label ids, both as numpy arrays
    """
    model.eval()

    running_loss = 0
    logit_chunks, label_chunks = [], []

    for batch in data_loader:
        on_device = [tensor.to(device) for tensor in batch]
        label_batch = on_device[2]

        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=label_batch,
            )

        # With labels supplied, the first output is the loss and the second the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(label_batch.cpu().numpy())

    loss_avg = running_loss / len(data_loader)

    return loss_avg, np.concatenate(logit_chunks, axis=0), np.concatenate(label_chunks, axis=0)

In [23]:
# val_inf = np.Inf

# Fine-tune for `epochs` passes over the training set. After every epoch a
# checkpoint is written and the model is scored on the validation set.
for epoch in tqdm(range(1, epochs+1)):
    model.train()

    total_loss = 0

    # Label the inner bar with the current epoch (`epoch`), not the total
    # number of epochs (`epochs`), which made every bar read the same.
    progress_bar = tqdm(trainloader,
                        desc="Epoch {:1d}".format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
                  }

        model.zero_grad()
        output = model(**inputs)
        loss = output[0]
        total_loss += loss.item()
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. `len(batch)` is the
        # number of tensors in the batch tuple (3), not the number of samples,
        # so dividing by it only distorted the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    tqdm.write(f"\nEpoch {epoch}")

    loss_avg = total_loss / len(trainloader)
    tqdm.write(f"Training Loss: {loss_avg}")

    # One checkpoint per epoch; the testing cell loads these files by epoch number.
    torch.save(model.state_dict(), f"finetuned_BERT_model-epoch-v.{epoch}.pt")

    model.eval()
    val_loss, predictions, true_vals = evaluate(validloader)
    val_f1_score = f1_score_func(predictions, true_vals)
    tqdm.write(f"Validation Loss: {val_loss}")
    tqdm.write(f"F1 Score (Weighted): {val_f1_score}")

#     if val_loss <= val_inf:
#         print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
#             val_inf, val_loss))
#         torch.save(model.state_dict(), f"finetuned_BERT_model.pt")
#         val_inf = val_loss


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 4:   0%|          | 0/450 [00:00<?, ?it/s]


Epoch 1
Training Loss: 1.3257842148674859
Validation Loss: 1.3012993314624888
F1 Score (Weighted): 0.32853245560694')


Epoch 4:   0%|          | 0/450 [00:00<?, ?it/s]


Epoch 2
Training Loss: 1.2408481771416133
Validation Loss: 1.277445177061368
F1 Score (Weighted): 0.38621973227666434')


Epoch 4:   0%|          | 0/450 [00:00<?, ?it/s]


Epoch 3
Training Loss: 1.0726551144652896
Validation Loss: 1.333635472618373
F1 Score (Weighted): 0.39562440877998045')


Epoch 4:   0%|          | 0/450 [00:00<?, ?it/s]


Epoch 4
Training Loss: 0.8836718943383959
Validation Loss: 1.444184146096221
F1 Score (Weighted): 0.4045886011608124')


### Model loading and Testing

In [24]:
# Fresh model with the same 4-class head; the fine-tuned weights are loaded
# into it from the per-epoch checkpoints in the next cell.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [25]:
# Move the model to the target device once; loading a state dict afterwards
# copies the weights into the parameters that are already on that device.
model.to(device)

# Evaluate every per-epoch checkpoint written by the training loop. The range
# follows `epochs` so it stays in sync with the number of checkpoints saved.
for i in range(1, epochs + 1):

    print(f"Model {i}")
    model.load_state_dict(torch.load(f"finetuned_BERT_model-epoch-v.{i}.pt", map_location="cpu"))
    model.eval()

    _, predictions, true_vals = evaluate(testloader)
    accuracy_per_class(predictions, true_vals)

    print()

Model 1
Accuracy of    Q1: 82% (863/1050)
Accuracy of    Q2:  7% (48/666)
Accuracy of    Q3: 48% (467/968)
Accuracy of    Q4:  0% ( 1/596)

Model 2
Accuracy of    Q1: 68% (720/1050)
Accuracy of    Q2: 25% (168/666)
Accuracy of    Q3: 54% (532/968)
Accuracy of    Q4:  7% (42/596)

Model 3
Accuracy of    Q1: 62% (652/1050)
Accuracy of    Q2: 25% (170/666)
Accuracy of    Q3: 55% (538/968)
Accuracy of    Q4: 12% (72/596)

Model 4
Accuracy of    Q1: 60% (631/1050)
Accuracy of    Q2: 33% (225/666)
Accuracy of    Q3: 47% (456/968)
Accuracy of    Q4: 14% (88/596)

