### Lyrics model.

This notebook contains code for training the lyrics models, which is a built using
a pretrained BERT model.

In [1]:
import pandas as pd
import numpy as np

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

### Load and split the data function

In [2]:
df_train = pd.read_csv("../../data/train.csv", header=None, names=["quadrant", "lyrics"], skiprows=1)
df_valid = pd.read_csv("../../data/validation.csv", header=None, names=["quadrant", "lyrics"], skiprows=1)
df_test = pd.read_csv("../../data/test.csv", header=None, names=["quadrant", "lyrics"], skiprows=1)

data = pd.concat([df_train, df_valid])

In [3]:
data["quadrant"] = pd.to_numeric(data["quadrant"]) # labels to numbers

In [4]:
data["quadrant"].value_counts()

0    4557
2    4109
1    2745
3    2683
Name: quadrant, dtype: int64

In [5]:
X_train, X_val, y_train, y_val = train_test_split(data.index.values, data.quadrant.values,
                                                 test_size=0.10, random_state=42, stratify=data.quadrant.values)

In [6]:
data["data_type"] = ["not_set"]*data.shape[0]
data.loc[X_train, "data_type"] = "train"
data.loc[X_val, "data_type"] = "val"

data.groupby(["quadrant", "data_type"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,lyrics
quadrant,data_type,Unnamed: 2_level_1
0,train,4101
0,val,456
1,train,2470
1,val,275
2,train,3698
2,val,411
3,train,2415
3,val,268


### Tokenize and Encode the Data

In [7]:
# initialize a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

In [8]:
encoded_data_train = tokenizer.batch_encode_plus(
    data[data.data_type=='train'].lyrics.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    data[data.data_type=='val'].lyrics.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data.data_type=='train'].quadrant.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(data[data.data_type=='val'].quadrant.values)

In [9]:
def encode_data(data):
    """
    Encodes the split data/lyrics
    :param lyrics_split:
    :param labels_split:
    :return: encoded data and labels
    """
    
    data["quadrant"] = pd.to_numeric(data["quadrant"]) # labels to numbers

    encoded_data = tokenizer.batch_encode_plus(
        data.lyrics.values,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=126,
        return_tensors="pt"
    )

    return encoded_data, data.quadrant.values

In [10]:
encoded_data_test, labels_test = encode_data(df_test)

### Create datasets and dataloaders

In [11]:
trainset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
validset = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [12]:
batch_size = 8
trainloader = DataLoader(trainset, sampler=RandomSampler(trainset), batch_size=batch_size)
validloader = DataLoader(validset, sampler=SequentialSampler(validset), batch_size=batch_size)

In [13]:
def create_dataloader(encoded_data, labels, batch_size):
    """

    :param batch_size:
    :param labels:
    :param encoded_data:
    :return: dataloader
    """

    input_ids = encoded_data["input_ids"]
    attention_masks= encoded_data["attention_mask"]
    labels = torch.tensor(labels)

    dataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    return dataloader

In [14]:
testloader = create_dataloader(encoded_data_test, labels_test, batch_size=batch_size)

### BERT Pre-trained Model

In [15]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=4,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

In [18]:
# optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 4
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=7,
                                            num_training_steps=len(trainloader)*epochs)

### Training

In [19]:
# performance metrics
def f1_score_func(preds, labels):
    pred_flat = np.argmax(preds, 1).flatten()
    labels_flat = labels.flatten()

    return f1_score(labels_flat, pred_flat, average="weighted")

# def accuracy_per_class(preds, labels):
#     quadrant_dict = {0: "Q1", 1: "Q2", 2: "Q3", 3: "Q4"}

#     pred_flat = np.argmax(preds, 1).flatten()
#     labels_flat = labels.flatten()

#     for label in np.unique(labels_flat):
#         y_preds = pred_flat[labels_flat == label]
#         y_true = labels_flat[labels_flat == label]
#         print('Accuracy of %5s: %2d%% (%2d/%2d)' % (
#             quadrant_dict[label], 100 * len(y_preds[y_preds==label]) / len(y_true),
#             len(y_preds[y_preds==label]), len(y_true)))

In [20]:
import random

# seed_val = 17
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed(seed_val)

def evaluate(data_loader):
    model.eval()

    total_loss = 0
    y_pred, y_true = [], []

    for batch in data_loader:
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
                  }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        total_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs["labels"].cpu().numpy()
        y_pred.append(logits)
        y_true.append(label_ids)

    loss_avg = total_loss / len(data_loader)
    predictions = np.concatenate(y_pred, axis=0)
    true_vals = np.concatenate(y_true, axis=0)

    return loss_avg, predictions, true_vals

In [21]:
# val_inf = np.Inf

for epoch in tqdm(range(1, epochs+1)):
    model.train()

    total_loss = 0

    progress_bar = tqdm(trainloader,
                        desc="Epoch {:1d}".format(epochs),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2]
                  }

        model.zero_grad()
        output = model(**inputs)
        loss = output[0]
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    tqdm.write(f"\nEpoch {epoch}")

    loss_avg = total_loss / len(trainloader)
    tqdm.write(f"Training Loss: {loss_avg}")

    model.eval()
    val_loss, predictions, true_vals = evaluate(validloader)
    val_f1_score = f1_score_func(predictions, true_vals)
    tqdm.write(f"Validation Loss: {val_loss}")
    tqdm.write(f"F1 Score (Weighted): {val_f1_score}')")

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 4:   0%|          | 0/1586 [00:00<?, ?it/s]


Epoch 1
Training Loss: 1.2965606913665806
Validation Loss: 1.2823544589139648
F1 Score (Weighted): 0.319852974118007')


Epoch 4:   0%|          | 0/1586 [00:00<?, ?it/s]


Epoch 2
Training Loss: 1.1628414841197328
Validation Loss: 1.3100608466035228
F1 Score (Weighted): 0.37894910878716803')


Epoch 4:   0%|          | 0/1586 [00:00<?, ?it/s]


Epoch 3
Training Loss: 0.8874075147611046
Validation Loss: 1.489142568434699
F1 Score (Weighted): 0.38894374249364705')


Epoch 4:   0%|          | 0/1586 [00:00<?, ?it/s]


Epoch 4
Training Loss: 0.6166834300865441
Validation Loss: 1.7948258677444888
F1 Score (Weighted): 0.381631836134988')


In [None]:
# save the trained model
torch.save(model.state_dict(), "model_bert.bin")

### Model loading and Testing

In [22]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=4,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)
model.load_state_dict(torch.load("model_bert.bin", map_location="cpu"))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [26]:
from sklearn.metrics import classification_report

model.eval()
val_loss, pred_class, actual_class = evaluate(testloader)
pred_class = [_.argmax(0) for _ in pred_class]
actual_class = [_ for _ in actual_class]

In [42]:
print(classification_report(actual_class, pred_class))

              precision    recall  f1-score   support

           0       0.49      0.53      0.51      1048
           1       0.38      0.33      0.35       663
           2       0.38      0.43      0.40       968
           3       0.28      0.22      0.25       594

    accuracy                           0.41      3273
   macro avg       0.38      0.38      0.38      3273
weighted avg       0.40      0.41      0.40      3273

