In [1]:
import pandas as pd

from time import time 
from gensim.models import KeyedVectors
from collections import namedtuple


# Pytorch
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader


# stanza
import stanza as st

import numpy as np

In [2]:

# Pretrained word2vec
import gensim.downloader as api
corpus = api.load('glove-wiki-gigaword-50', return_path=True)
pretrainedwvmodel = KeyedVectors.load_word2vec_format(corpus)
embedding_matrix = pretrainedwvmodel.wv.vectors
embedding_matrix = np.append(embedding_matrix, np.zeros((1,50)), axis=0) # Padding
embedding_matrix = np.append(embedding_matrix, np.zeros((1,50)), axis=0)
embedding_matrix = np.append(embedding_matrix, np.zeros((1,50)), axis=0) # Unknown word

In [3]:

TAG2CLASS = {
    '<PAD>': 0,
    'CC': 1,
    'CD': 2,
    'DT': 3,
    'EX': 4,
    'FW': 5,
    'IN': 6,
    'JJ': 7,
    'JJR': 8,
    'JJS': 9,
    'LS': 10,
    'MD': 11,
    'NN': 12,
    'NNS': 13,
    'NNP': 14,
    'NNPS': 15,
    'PDT': 16,
    'POS': 17,
    'PRP': 18,
    'PRP$': 19,
    'RB': 20,
    'RBR': 21,
    'RBS': 22,
    'RP': 23,
    'SYM': 24,
    'TO': 25,
    'UH': 26,
    'VB': 27,
    'VBD': 28,
    'VBG': 29,
    'VBN': 30,
    'VBP': 31,
    'VBZ': 32,
    'WDT': 33,
    'WP': 34,
    'WP$': 35,
    'WRB': 36,
    '-RRB-': 37,
    '-LRB-':38,
        '<UNK>': 0,
    
}
pos_tagger = st.Pipeline(lang='en')

2021-02-28 14:41:26 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-28 14:41:26 INFO: Use device: gpu
2021-02-28 14:41:26 INFO: Loading: tokenize
2021-02-28 14:41:31 INFO: Loading: pos
2021-02-28 14:41:32 INFO: Loading: lemma
2021-02-28 14:41:32 INFO: Loading: depparse
2021-02-28 14:41:33 INFO: Loading: sentiment
2021-02-28 14:41:33 INFO: Loading: ner
2021-02-28 14:41:34 INFO: Done loading processors!


In [4]:
class DataMapper1(Dataset):
    def __init__(self, sentence_lyrics, wvmodel, sequence_len):
        self.sents = sentence_lyrics
        self.sequence_len = sequence_len
        self.model = wvmodel

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        doc = pos_tagger(self.sents[idx])
        xl = []
        yl = []
        seq = np.zeros(self.sequence_len, dtype=np.int64)
        yseq = np.zeros(self.sequence_len, dtype=np.int64)
        for k in doc.sentences[0].words:
            if (self.model.wv.vocab.get(k.text) is None):
                xl.append(400002)
                yl.append(TAG2CLASS.get('<UNK>'))
                continue
            xl.append(self.model.wv.vocab.get(k.text).index)
            yl.append(TAG2CLASS.get(k.xpos, 0))
        seq[:len(xl)] = xl[:self.sequence_len]
        yseq[:len(yl)] = yl[:self.sequence_len]
        return seq, yseq


In [5]:
data = pd.read_csv('Sentences_15klyrics_mls_20.csv')
train_data = data.sent[:8000].to_numpy()
val_random = np.random.choice(data[:8000].to_numpy().flatten(), 800)
val_data = np.append(val_random, data.sent[10001:10801].to_numpy())
test_data = data.sent[8000:10001].to_numpy()

In [6]:
training_set = DataMapper1(train_data, pretrainedwvmodel, 20)
val_set = DataMapper1(val_data, pretrainedwvmodel, 20)
test_set = DataMapper1(test_data, pretrainedwvmodel, 20)

In [7]:
loader_training = DataLoader(training_set, batch_size=16)
loader_val = DataLoader(training_set, batch_size=16)
loader_test = DataLoader(test_set)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557648 entries, 0 to 557647
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   artist     557648 non-null  object
 1   song_name  557648 non-null  object
 2   song_id    557648 non-null  int64 
 3   sent       557646 non-null  object
dtypes: int64(1), object(3)
memory usage: 17.0+ MB


In [9]:
for x, y in loader_training:
    print('x', x)
    print('y', y)
    break

x tensor([[    20,      9,      7,  37701,   2895,    907,     81,    100,  46215,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [  8738,     20,      9,  11999,   4385,      5,    359,    364,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [   253,     81,  11229,     23,    253,     20,     13,     24,    392,
           3825,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [   392,  72488,   9085,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [    81,    267,      7,   6413,    300,     17,     48,  48271,   1812,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0],
        [   197,     39,    960,     81,     60,    147,  

In [10]:
class Simple_Sequence_LSTM(nn.Module):
    def __init__(self, args):
        super(Simple_Sequence_LSTM, self).__init__()

        # Hyperparameters
        # self.batch_size = args.batch_size
        self.hidden_dim = args.hidden_dim
        self.LSTM_layers = args.lstm_layers
        # self.input_size = args.input_size
        self.embedding_matrix = args.embedding_matrix.cuda()
        self.target_size = args.target_size

        self.dropout = nn.Dropout(0.5)
        # self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
        self.embedding = nn.Embedding.from_pretrained(
            self.embedding_matrix, padding_idx=args.padding_idx, freeze=True)
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim,
                            num_layers=self.LSTM_layers, batch_first=True)
        self.fc1 = nn.Linear(in_features=self.hidden_dim,
                             out_features=self.hidden_dim*2)
        self.fc2 = nn.Linear(self.hidden_dim*2, self.target_size)

    def forward(self, x):
        # Hidden and cell state definion
        h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).cuda()
        c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).cuda()

        # Initialization fo hidden and cell states
        torch.nn.init.xavier_normal_(h)
        torch.nn.init.xavier_normal_(c)

        # Each sequence "x" is passed through an embedding layer
        out = self.embedding(x)
        # Feed LSTMs
        out, (hidden, cell) = self.lstm(out, (h, c))
        out = self.dropout(out)
        # The last hidden state is taken
        out = torch.relu_(self.fc1(out[:, -1, :]))
        out = self.dropout(out)
        out = torch.sigmoid(self.fc2(out))

        return out

In [11]:
class Simple_Sequence_LSTMver2(nn.Module):

    def __init__(self, args):
        super(Simple_Sequence_LSTMver2, self).__init__()
        # Hyperparameters
        self.hidden_dim = args.hidden_dim
        self.LSTM_layers = args.lstm_layers
        self.embedding_matrix = args.embedding_matrix.cuda()
        self.target_size = args.target_size
        self.tag_class_size = args.class_number

        self.word_embeddings = nn.Embedding.from_pretrained(
            self.embedding_matrix, padding_idx=args.padding_idx, freeze=True)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(self.hidden_dim, self.hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(self.hidden_dim, self.tag_class_size)

    def forward(self, sentence):
        # # Hidden and cell state definion
        # h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).cuda()
        # c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).cuda()

        # # Initialization fo hidden and cell states
        # torch.nn.init.xavier_normal_(h)
        # torch.nn.init.xavier_normal_(c)

        embeds = self.word_embeddings(sentence)
        # lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1),(h, c))
        # print(embeds.view(len(sentence), 1, -1).shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        # print(lstm_out.view(len(sentence), -1).shape)
        # tag_space = tag_space.view(len(sentence), self.tag_class_size)
        tag_scores = torch.sigmoid_(tag_space)
        return tag_scores

In [12]:
embedding_matrix = torch.FloatTensor(embedding_matrix)
train_on_gpu = torch.cuda.is_available()
lstm_dict = {
    # 'batch_size':8,
    'hidden_dim': embedding_matrix.shape[1],
    'lstm_layers':3,
    # 'input_size':embedding_matrix.shape[0],
    'padding_idx': 400001,
    'target_size': 20,
    'class_number': 40,
    'embedding_matrix': embedding_matrix
}
lstm_args = namedtuple('lstm_args', lstm_dict.keys())(**lstm_dict)

In [13]:
# model = Simple_Sequence_LSTM(lstm_args).cuda()
model = Simple_Sequence_LSTMver2(lstm_args)

In [14]:
model.cuda()

Simple_Sequence_LSTMver2(
  (word_embeddings): Embedding(400003, 50, padding_idx=400001)
  (lstm): LSTM(50, 50)
  (hidden2tag): Linear(in_features=50, out_features=40, bias=True)
)

In [15]:
def categorical_accuracy(preds, y, tag_pad_idx=0):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements]).cuda()
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]]).cuda()

In [16]:
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.SGD(model.parameters(), lr=0.01,momentum=0.9,weight_decay=0.0001)
loss_function = nn.CrossEntropyLoss()

In [17]:
epochs = 1
for i in range(epochs):
    model.train()
    sum_loss = 0.0
    total = 0
    epoch_acc = 0
    for x, y in loader_training:
        x = torch.tensor(x).to(torch.long).cuda()
        y_pred = model(x)
        y = torch.tensor(y).to(torch.long).cuda()
        
        optimizer.zero_grad()
        y_pred_2 = y_pred.view(-1, y_pred.shape[-1])
        y_2 = y.view(-1)
        loss = loss_function(y_pred_2, y_2)
        loss.backward()
        optimizer.step()
        sum_loss += loss.item()*y.shape[0]
        total += y.shape[0]
        acc = categorical_accuracy(y_pred_2, y_2)
        # print(acc.item())
        epoch_acc += acc.item()
        break
    break

In [18]:
def validation_metrics (model, valid_dl):
    loss_function = nn.CrossEntropyLoss()
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    acc_total = 0.0
    for x, y in valid_dl:
        x = x.cuda()
        y = y.cuda()
        y_hat = model(x)
        y_pred_2 = y_pred.view(-1, y_hat.shape[-1])
        y_2 = y.view(-1)
        loss = loss_function(y_hy_pred_2at, y_2)
        pred = torch.max(y_hat, 0)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        acc = categorical_accuracy(y_pred_2, y_2)
        acc_total = acc.item()
#     torch.cuda.empty_cache()
    return sum_loss/total, correct/total

In [None]:
validation_metrics(model, loader_val)

In [104]:
y_pred.type

<function Tensor.type>

In [85]:
x.shape

torch.Size([16, 20])

In [86]:
temp_x = model.word_embeddings(x)

In [87]:
temp_x.shape

torch.Size([16, 20, 50])

In [88]:
temp_x.view(len(x), 1, -1).shape

torch.Size([16, 1, 1000])

In [89]:
len(x)

16

In [106]:
y.shape

torch.Size([16, 20])

In [107]:
y_pred.shape

torch.Size([16, 20, 40])

In [124]:
torch.argmax(y_pred, dim=2, keepdims=True).shape

torch.Size([16, 20, 1])

In [19]:
max_preds = y_pred_2.argmax(dim = 1, keepdim = True) # get the index of the max probability
non_pad_elements = (y_2 != 0).nonzero()
correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements]).cuda()

In [20]:
(y_2 != 0).nonzero().shape

RuntimeError: CUDA error: device-side assert triggered

In [21]:
max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])

RuntimeError: CUDA error: device-side assert triggered

In [108]:
y_pred.view(-1, y_pred.shape[-1]).shape

torch.Size([320, 40])

In [109]:
y.view(-1).shape

torch.Size([320])

In [56]:
loss

tensor(160.4284, device='cuda:0', grad_fn=<MseLossBackward>)

In [57]:
y

tensor([[18., 32.,  3., 12., 12., 32., 18., 20.,  7.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [26., 18., 32., 12., 12.,  1.,  3., 12.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [27., 18., 12., 38., 27., 18.,  6., 37., 19., 12.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [19., 12., 12.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [18., 31.,  3., 12., 12.,  6.,  2., 12., 12.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [36., 18., 31., 18., 23., 12.,  6., 12.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [ 1., 36., 18., 31., 23., 18., 31., 29., 25., 27.,  3., 12.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [27.,  3.,  7., 12., 34., 32., 20.,  7.,  6., 12.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.],
        [27., 18., 27., 19., 12.,  0.,  0.,  0.,

In [58]:
y_pred

tensor([[0.4804, 0.5022, 0.5154, 0.5029, 0.4657, 0.4731, 0.5068, 0.4814, 0.5173,
         0.5142, 0.4733, 0.5253, 0.4865, 0.5115, 0.5093, 0.5199, 0.5114, 0.5046,
         0.4833, 0.5047],
        [0.4905, 0.4938, 0.5104, 0.4779, 0.4656, 0.4833, 0.5162, 0.4951, 0.5312,
         0.5081, 0.4711, 0.5254, 0.4972, 0.4989, 0.5116, 0.5252, 0.5231, 0.5059,
         0.4760, 0.5167],
        [0.4889, 0.4954, 0.4981, 0.4817, 0.4880, 0.4883, 0.5224, 0.4950, 0.5327,
         0.5132, 0.4697, 0.5099, 0.4923, 0.4962, 0.5004, 0.5179, 0.5053, 0.5226,
         0.5011, 0.5132],
        [0.4924, 0.4848, 0.5028, 0.4912, 0.4702, 0.4852, 0.4890, 0.4843, 0.5231,
         0.5249, 0.4748, 0.5362, 0.5151, 0.5018, 0.5238, 0.5253, 0.5218, 0.5103,
         0.4878, 0.5152],
        [0.5097, 0.4828, 0.4946, 0.4645, 0.4819, 0.4738, 0.5039, 0.4654, 0.5052,
         0.5198, 0.4682, 0.5340, 0.5118, 0.4906, 0.5061, 0.5169, 0.5201, 0.5156,
         0.4672, 0.5013],
        [0.4909, 0.4949, 0.4876, 0.4803, 0.4870, 0.4810, 0.4

In [32]:
model.fc2

Linear(in_features=100, out_features=20, bias=True)

In [33]:
model.fc1

Linear(in_features=50, out_features=100, bias=True)

In [112]:
loss

tensor(3.6673, device='cuda:0', grad_fn=<NllLossBackward>)