In [1]:
import nltk
import pandas as pd
import heapq
import pprint

from nltk.tokenize import wordpunct_tokenize, blankline_tokenize, line_tokenize, word_tokenize
from itertools import combinations
from nltk.corpus import stopwords
from time import time 
from gensim.models import Word2Vec, KeyedVectors
import multiprocessing
from collections import namedtuple

# # tensorflow
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Pytorch
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# stanza
import stanza as st

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from lyrics_dataloader import DataMapper
from models import Simple_Sequence_LSTM
# Pretrained word2vec
import gensim.downloader as api

# Fetch the pretrained fastText vectors through gensim's downloader;
# `return_path=True` yields the path of the word2vec-format file instead of a
# loaded model, which is then parsed into a KeyedVectors instance.
corpus = api.load('fasttext-wiki-news-subwords-300', return_path=True)
pretrainedwvmodel = KeyedVectors.load_word2vec_format(corpus)

# A KeyedVectors object exposes its weights directly as `.vectors`. The `.wv`
# attribute on KeyedVectors was only a deprecated self-alias in gensim 3.x and
# no longer exists in gensim 4.x, so it is not used here.
embedding_dim = pretrainedwvmodel.vectors.shape[1]

# Append two extra all-zero rows after the pretrained vocabulary in a single
# stack (so the large matrix is copied once instead of twice):
#   row [-2] -> padding
#   row [-1] -> unknown word
embedding_matrix = np.vstack([
    pretrainedwvmodel.vectors,
    np.zeros((2, embedding_dim)),
])

In [5]:
# Max length known from 15K lyrics = 811
# Split into sentences of length 20
# Training : Test == 8 : 2
# Training : Val == 8 : 2
data = pd.read_csv('sentences_15klyrics_mls_20.csv')

# Seed for the validation sampling below so a re-run draws the same sentences.
SEED = 42
rng = np.random.default_rng(SEED)

# Only the `sent` column holds lyric sentences; the other columns (artist,
# song_name, song_id) must not leak into any of the splits.
sentences = data.sent
train_data = sentences[:8000].to_numpy()
test_data = sentences[8000:10001].to_numpy()

# Validation = 800 sentences re-sampled (with replacement) from the training
# range plus 800 sentences that are in neither the training nor the test range.
val_random = rng.choice(train_data, 800)
val_data = np.append(val_random, sentences[10001:10801].to_numpy())

training_set = DataMapper(train_data, pretrainedwvmodel, 20)
val_set = DataMapper(val_data, pretrainedwvmodel, 20)
test_set = DataMapper(test_data, pretrainedwvmodel, 20)

loader_training = DataLoader(training_set, batch_size=8)
# The validation loader must wrap the validation set, not the training set.
loader_val = DataLoader(val_set, batch_size=8)
loader_test = DataLoader(test_set)


train_on_gpu = torch.cuda.is_available()
lstm_dict = {
    # 'batch_size':8,
    'hidden_dim': embedding_matrix.shape[1],
    'lstm_layers':3,
    # 'input_size':embedding_matrix.shape[0],
    # NOTE(review): hard-coded index; it must agree with the index DataMapper
    # uses for padding and be smaller than embedding_matrix.shape[0] - confirm.
    'padding_idx': 1000001,
    'target_size': 20,
    'embedding_matrix': embedding_matrix
}
# Freeze the hyper-parameters into an immutable, attribute-accessible record.
lstm_args = namedtuple('lstm_args', lstm_dict.keys())(**lstm_dict)

2021-02-26 17:22:44 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-26 17:22:44 INFO: Use device: gpu
2021-02-26 17:22:44 INFO: Loading: tokenize
2021-02-26 17:22:48 INFO: Loading: pos
2021-02-26 17:22:49 INFO: Loading: lemma
2021-02-26 17:22:49 INFO: Loading: depparse
2021-02-26 17:22:49 INFO: Loading: sentiment
2021-02-26 17:22:50 INFO: Loading: ner
2021-02-26 17:22:51 INFO: Done loading processors!


In [7]:
# Column overview: dtypes and non-null counts per column of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557648 entries, 0 to 557647
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   artist     557648 non-null  object
 1   song_name  557648 non-null  object
 2   song_id    557648 non-null  int64 
 3   sent       557646 non-null  object
dtypes: int64(1), object(3)
memory usage: 17.0+ MB


In [10]:
# Peek at the first ten entries of the `sent` column as a NumPy array.
data.sent[:10].to_numpy()

array(["it's a junkie dream makes you so uptight",
       "yeah it's halloween tonight and every night",
       'see you scratch (see it on) your skin', 'your sandpaper throat',
       "you're a symphony man with one fucking note",
       'how they beat you up week after week',
       "and when you grow up you're going to be a freak",
       "want a violent girl who's not scared of anything",
       'help me kill my time', "'cause I'll never be fine"], dtype=object)

In [29]:
# Length of the first sentence; len() on a str counts characters, not words.
len("it's a junkie dream makes you so uptight")

40

In [21]:
# Sample only from the `sent` column: flattening the whole frame would also
# draw artist names, song names and song ids into the validation sentences.
val_data = np.append(np.random.choice(data.sent[:8000].to_numpy(), 800), data.sent[10001:10801].to_numpy())


In [22]:
# Sanity check on the number of validation entries.
val_data.shape

(1600,)

In [21]:
# Inspect the first mapped training example (the dataset variable is
# `training_set`; the capitalised name is not defined in this notebook).
training_set[0]

(array([   19,    24,     7, 45314,  3758,   618,    30,    57, 54995,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0], dtype=int64),
 array([18, 32, 3, 12, 12, 32, 18, 20, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=object))

In [None]:
model = Simple_Sequence_LSTM(lstm_args)
# The later training/validation cells send their input batches to CUDA, so the
# model has to live on the GPU as well whenever one is available.
if train_on_gpu:
    model = model.cuda()

In [None]:
def validation_metrics(model, valid_dl):
    """Evaluate `model` on every batch of `valid_dl`.

    Args:
        model: a torch module whose output has the class scores on dim 1.
        valid_dl: iterable of `(x, y)` tensor pairs; `y` holds integer class
            indices and is compared element-wise with the arg-max over dim 1
            of the model output.

    Returns:
        Tuple `(mean_loss, accuracy, mean_rmse)` of Python floats, each
        averaged over the number of target elements seen. RMSE is computed on
        the predicted vs. true class indices. All three are NaN when
        `valid_dl` yields no target elements.
    """
    model.eval()

    # Run on whatever device the model already lives on instead of assuming
    # CUDA, so the same function works on CPU-only machines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for x, y in valid_dl:
            x = x.to(device)
            # Targets must sit on the same device as the predictions.
            y = y.to(device=device, dtype=torch.long)
            y_hat = model(x)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]

            # Count target elements rather than batch rows so that accuracy
            # stays in [0, 1] for sequence targets as well as flat ones.
            n_items = y.numel()
            correct += (pred == y).sum().item()
            total += n_items
            sum_loss += loss.item() * n_items

            # RMSE computed with torch directly (no sklearn dependency).
            squared_error = (pred - y).double() ** 2
            sum_rmse += torch.sqrt(squared_error.mean()).item() * n_items

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if total == 0:
        # Nothing was evaluated: report NaN instead of dividing by zero.
        nan = float('nan')
        return nan, nan, nan
    return sum_loss / total, correct / total, sum_rmse / total

In [None]:
# NOTE(review): `lr` and `epochs` are not defined in any visible cell, so the
# loop could not run after a kernel restart. The values below are placeholders
# to be tuned - confirm the intended settings.
lr = 0.01
epochs = 10

# Optimise only the parameters that require gradients. Materialised as a list
# because a `filter` iterator can be consumed just once.
parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(parameters, lr=lr, momentum=0.9, weight_decay=0.0001)
loss_function = nn.NLLLoss()

# Train on whatever device the model already lives on instead of assuming CUDA.
first_param = next(model.parameters(), None)
device = first_param.device if first_param is not None else torch.device('cpu')

for i in range(epochs):
    model.train()
    sum_loss = 0.0
    total = 0
    for x, y in loader_training:
        # Inputs and targets must be on the same device as the model; token
        # ids and class indices are both expected as int64.
        x = torch.as_tensor(x).to(device=device, dtype=torch.long)
        y = torch.as_tensor(y).to(device=device, dtype=torch.long)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_function(y_pred, y)
        loss.backward()
        optimizer.step()
        sum_loss += loss.item()*y.shape[0]
        total += y.shape[0]
    val_loss, val_acc, val_rmse = validation_metrics(model, loader_val)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Report on epochs 1, 6, 11, ... The format string has four placeholders,
    # so all four metrics (including val_rmse) must be supplied.
    if i % 5 == 1:
        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

In [None]:
# Per-sentence training/debug loop: one optimiser step per (sentence, tags)
# pair, printing the encoded input, the targets and the arg-max predictions.
# NOTE(review): `training_data`, `prepare_sequence`, `word_to_ix` and
# `tag_to_ix` are not defined in any visible cell - confirm where they come
# from before running this cell. `model`, `loss_function` and `optimizer`
# are taken from the earlier cells.
for epoch in range(1):  # a single pass over the data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        print(sentence_in)
        print('targets: ', targets)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)
        # Predicted tag per position = arg-max over dim 1 of the scores.
        print('predicted: ',torch.argmax(tag_scores, 1))

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()