In [55]:
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer

In [6]:
# Read the training CSV and keep only its first row (a Series) to prototype on.
df = (
    pd.read_csv(
        "../../data/train.csv",
        header=None,
        names=["lyrics", "quadrant"],
        skiprows=1,
    )
    .iloc[0]
)

In [7]:
# Display the selected row (a Series with `lyrics` and `quadrant` entries).
df

lyrics      Gently hold our hands\nGently hold our heads o...
quadrant                                                    1
Name: (213754, -0.6827250804970001, 0.316757791845, Dark Tranquillity, Insanity's Crescendo), dtype: object

In [8]:
# Length (in tokens) every encoded lyric is padded or truncated to.
MAX_LEN = 128
# Pretrained cased BERT tokenizer; only used here to turn text into token ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [17]:
# Collapse every run of whitespace (newlines and carriage returns included)
# into a single space so the lyric becomes one flat line of text.
lyrics = " ".join(df.lyrics.split())
lyrics

"Gently hold our hands Gently hold our heads on high Aimless time in fear new hide Overthrow the plan Confusion lies in all my words Mad is the soul We barricade ourselves in holes of temperament This is the dawning of a new age A heart that beats the wrong way Insanity's crescendo Windcolour, second sight A touch of silence and the violence of dark Illusion span, the aroma of time Shadowlife and the scent of nothingness Infinite fall of instinct Order of one spells deceit Infin"

In [19]:
# Encode the cleaned lyric as a single sequence (no second segment), padded or
# truncated to exactly MAX_LEN tokens and returned as PyTorch tensors.
inputs = tokenizer.encode_plus(
    lyrics,
    None,
    add_special_tokens=True,
    truncation=True,
    max_length=MAX_LEN,
    padding="max_length",
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors="pt",
)

In [43]:
# Add a leading axis and cast the token ids to float32 so they can be fed
# straight into nn.LSTM, then check the resulting shape.
input_ids = inputs["input_ids"].unsqueeze(0).float()
input_ids.shape

torch.Size([1, 1, 128])

In [49]:
# Bare single-layer LSTM for a shape sanity check: each timestep is expected to
# carry MAX_LEN features, i.e. the whole padded id vector is one timestep.
model = nn.LSTM(input_size=MAX_LEN, hidden_size=4, num_layers=1, batch_first=True)

In [50]:
# Forward pass: `out` holds the per-timestep outputs, `ht` / `ct` the final
# hidden and cell states. Display the final hidden state.
out, (ht, ct)  = model(input_ids)
ht

tensor([[[ 0.0000, -0.7616,  0.0000,  0.0000]]], grad_fn=<StackBackward>)

In [51]:
class LSTM_Model(nn.Module):
    """Single-layer LSTM followed by a linear head producing 4 class logits.

    Args:
        vocab_size: passed to ``nn.LSTM`` as ``input_size``, i.e. the number of
            features carried by every timestep of the input.
        hidden_dim: size of the LSTM hidden state fed to the linear head.

    NOTE(review): callers in this notebook pass raw token ids cast to float as
    the input features. An embedding layer is presumably what is wanted, but
    that would change the input contract, so it is left as-is here.
    """

    def __init__(self, vocab_size, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(vocab_size, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 4)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return logits of shape (batch, 4) for a batch-first float input."""
        _, (ht, _) = self.lstm(x)
        # Regularise the learned representation, not the inputs: dropout on the
        # raw input would zero some token ids and rescale the others, so the
        # LSTM would see different "tokens" on every training step.
        last_hidden = self.dropout(ht[-1])
        return self.linear(last_hidden)

In [52]:
# Build the classifier: the LSTM input width is the padded sequence length,
# with a 3-unit hidden state.
model = LSTM_Model(vocab_size=MAX_LEN, hidden_dim=3)

In [53]:
# Smoke-test the classifier on the single encoded example; the result has one
# logit per class (4 columns).
model(input_ids)

tensor([[-0.4910, -0.5687, -0.2336,  0.1143]], grad_fn=<AddmmBackward>)

In [91]:
# Custom Dataset
class LyricsDataset(Dataset):
    """Map-style dataset that tokenizes one lyrics row per item.

    Args:
        dataframe: frame with a ``lyrics`` text column and a ``quadrant``
            integer label column.
        tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return ``{"input_ids": float tensor, "labels": long scalar tensor}``."""
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # only coincide with index *labels* if the frame was re-indexed first.
        lyrics = self.data["lyrics"].iloc[idx]
        label = self.data["quadrant"].iloc[idx]

        # str.split() with no argument already splits on newlines and carriage
        # returns, so this flattens the lyric onto one single-spaced line.
        lyrics = " ".join(lyrics.split())
        inputs = self.tokenizer.encode_plus(
            lyrics,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )

        return {
            # Token ids are cast to float because they are fed directly to an LSTM.
            "input_ids": inputs["input_ids"].float(),
            "labels": torch.tensor(int(label), dtype=torch.long),
        }

    def __len__(self):
        return len(self.data)

In [92]:
# Concatenate data: read the three CSV splits with identical options, stack
# them into one frame with a fresh 0..n-1 index, and make the label numeric.
df_train, df_valid, df_test = frames = [
    pd.read_csv(csv_path, header=None, names=["lyrics", "quadrant"], skiprows=1)
    for csv_path in (
        "../../data/train.csv",
        "../../data/validation.csv",
        "../../data/test.csv",
    )
]

df_combined = pd.concat(frames, ignore_index=True).assign(
    quadrant=lambda combined: pd.to_numeric(combined["quadrant"])
)
df_combined.head()

Unnamed: 0,lyrics,quadrant
0,Gently hold our hands\nGently hold our heads o...,1
1,We are the Sun\nWe are the dead stars\nWe are ...,1
2,You're out of touch\nI'm out of time\nBut I'm ...,0
3,You finally close the door\nYou've left open w...,0
4,随分先に行ってしまった 光の下のキャラバン\nトンネルに残響 塞いだ耳 自分嫌いな自分が好き...,3


In [93]:
# Hyper Parameters
MAX_LEN = 128          # tokens per encoded lyric (pad / truncate target)
BATCH_SIZE = 16        # examples per DataLoader batch
EPOCHS = 10            # passes over the training set
LEARNING_RATE = 1e-5   # optimizer step size

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [94]:
# Create Datasets: reproducible 80/20 random split of the combined frame.
trainsize = 0.8
train_rows = df_combined.sample(frac=trainsize, random_state=42)
test_rows = df_combined.drop(train_rows.index)

print(
    f"Full Dataset {df_combined.shape}\n"
    f"Train Dataset {train_rows.shape}\n"
    f"Test Dataset {test_rows.shape}"
)

# Both subsets get a fresh 0..n-1 index before being wrapped as datasets.
trainset = LyricsDataset(train_rows.reset_index(drop=True), tokenizer, MAX_LEN)
testset = LyricsDataset(test_rows.reset_index(drop=True), tokenizer, MAX_LEN)

Full Dataset (17427, 2)
Train Dataset (13942, 2)
Test Dataset (3485, 2)


In [98]:
# Fetch the first training example once and show its input shape and label.
first_item = trainset[0]
first_item["input_ids"].shape, first_item["labels"]

(torch.Size([1, 128]), tensor(3))

In [99]:
# Create Dataloaders
parameters = {
    "batch_size": BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0
}

# Training batches are reshuffled every epoch.
trainloader = DataLoader(trainset, **parameters)
# Evaluation gains nothing from shuffling; a fixed order keeps per-step
# validation logs reproducible from run to run.
testloader = DataLoader(testset, **{**parameters, "shuffle": False})

In [100]:
# Pull one batch to inspect the collated structure: a dict with stacked
# `input_ids` and a 1-D `labels` tensor.
next(iter(trainloader))

{'input_ids': tensor([[[ 101., 3414., 3075.,  ..., 1104., 1602.,  102.]],
 
         [[ 101., 1109.,  181.,  ...,    0.,    0.,    0.]],
 
         [[ 101.,  146., 1286.,  ...,    0.,    0.,    0.]],
 
         ...,
 
         [[ 101., 1188., 1461.,  ...,  117.,  146.,  102.]],
 
         [[ 101.,  146.,  112.,  ...,    0.,    0.,    0.]],
 
         [[ 101., 2119.,  117.,  ...,    0.,    0.,    0.]]]),
 'labels': tensor([2, 3, 1, 3, 2, 3, 3, 0, 2, 0, 0, 2, 0, 1, 1, 0])}

In [101]:
# Batches are moved to `device` before the forward pass, so the model's
# parameters must live there too; otherwise a CUDA run fails with a device
# mismatch. Done before the optimizer is built from model.parameters().
model = model.to(device)

# Multi-class loss over raw logits and integer class labels.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# StepLR multiplies the learning rate by `gamma` after every `step_size`
# calls to scheduler.step().
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [102]:
# Fine tuning
def calcuate_accu(big_idx, targets):
    """Count how many predicted class indices equal their targets.

    Despite the name, the result is a raw number of matches (a Python
    number), not a ratio; callers divide by the number of examples.
    """
    hits = big_idx == targets
    return hits.sum().item()

In [103]:
# Training the Model
def train(epoch):
    """Run one training epoch over ``trainloader`` and print running metrics.

    Uses the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``criterion``, ``trainloader`` and ``device``.

    Args:
        epoch: epoch number, used only in the printed summary.

    Returns:
        None. Loss and accuracy are printed, not returned.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    model.train()
    for step, batch in enumerate(trainloader):

        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        tr_loss += loss.item()
        # detach(): the predictions are only used for bookkeeping.
        _, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accu(big_idx, labels)

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if step % 5000 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        loss.backward()
        optimizer.step()

    # Advance the LR schedule once per epoch. Stepping it per batch would decay
    # the learning rate every few batches and drive it to ~0 within the first
    # epoch, which stalls training.
    scheduler.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [104]:
# Run the training routine once per configured epoch.
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 5000 steps: 1.425758957862854
Training Accuracy per 5000 steps: 12.5
The Total Accuracy for Epoch 0: 19.975613254913213
Training Loss Epoch: 1.4555429522324046
Training Accuracy Epoch: 19.975613254913213
Training Loss per 5000 steps: 1.516968846321106
Training Accuracy per 5000 steps: 12.5
The Total Accuracy for Epoch 1: 20.585281882082914
Training Loss Epoch: 1.450472592077124
Training Accuracy Epoch: 20.585281882082914
Training Loss per 5000 steps: 1.5273208618164062
Training Accuracy per 5000 steps: 0.0
The Total Accuracy for Epoch 2: 20.52072873332377
Training Loss Epoch: 1.450851172072078
Training Accuracy Epoch: 20.52072873332377
Training Loss per 5000 steps: 1.5325345993041992
Training Accuracy per 5000 steps: 25.0
The Total Accuracy for Epoch 3: 19.932577822407115
Training Loss Epoch: 1.4522299318138612
Training Accuracy Epoch: 19.932577822407115
Training Loss per 5000 steps: 1.5450708866119385
Training Accuracy per 5000 steps: 18.75
The Total Accuracy for Epo

In [107]:
# Validating the Model
def valid(model, testloader, log_every=5000):
    """Evaluate ``model`` on ``testloader`` without tracking gradients.

    Uses the notebook-level ``criterion``, ``device`` and ``calcuate_accu``.

    Args:
        model: module mapping a batch of ``input_ids`` to class logits.
        testloader: iterable of dicts with ``"input_ids"`` and ``"labels"``.
        log_every: print running loss/accuracy every this many batches
            (a falsy value disables the running log).

    Returns:
        Accuracy over all evaluated examples, as a percentage.
    """
    total_loss = 0
    n_correct = 0
    n_steps = 0
    n_examples = 0

    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(testloader):

            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accu(big_idx, labels)

            n_steps += 1
            n_examples += labels.size(0)

            if log_every and step % log_every == 0:
                loss_step = total_loss/n_steps
                accu_step = (n_correct*100)/n_examples
                # The message reports the interval actually in use.
                print(f"Validation Loss per {log_every} steps: {loss_step}")
                print(f"Validation Accuracy per {log_every} steps: {accu_step}")

    epoch_loss = total_loss/n_steps
    epoch_accu = (n_correct*100)/n_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [108]:
# Score the trained model on the held-out loader and report the percentage.
acc = valid(model=model, testloader=testloader)
print("Accuracy on test data = %0.2f%%" % (acc,))

Validation Loss per 100 steps: 1.6015472412109375
Validation Accuracy per 100 steps: 0.0
Validation Loss Epoch: 1.4403768816125502
Validation Accuracy Epoch: 21.979913916786227
Accuracy on test data = 21.98%
