In [2]:
import torch
import torch.nn as nn
import csv
import random
from collections import Counter
import string
from torch.utils.data import Dataset, random_split, DataLoader
from nltk.translate.bleu_score import sentence_bleu
import time
from torch.distributions.categorical import Categorical

## Preprocessing & IDA
Dataset: https://www.kaggle.com/datasets/carlosgdcj/genius-song-lyrics-with-language-information
This dataset contains 9 GiB of lyrics spanning many different genres and languages. I am going to focus on pop music: the goal is to build a
generative pop-lyrics model from this dataset. To do so I will train an RNN, an LSTM, and a GRU and compare the results. I started by reducing the 9 GiB file to around 1.7 GiB of lyrics containing only English pop music.

In [44]:
DSET = "pop.csv"
# newline="" is what the csv module asks for, so that line breaks embedded in
# quoted fields (multi-line lyrics) reach the parser untouched. The `with`
# block closes the file even if parsing raises part-way through.
with open(DSET, newline="") as f:
    DATA = list(csv.reader(f))

In [45]:
class Tokenizer():
    """Whitespace word tokenizer with a fixed vocabulary.

    The vocabulary is either built from a corpus (``create_tokens``) or read
    back from a one-token-per-line text file (``load_dict``). The four special
    tokens ``<SOS>``, ``<EOS>``, ``<PAD>`` and ``<UNK>`` occupy ids 0-3 of a
    vocabulary built by ``create_tokens``.

    Args:
        DATA: iterable of rows whose element at index 6 is the lyrics string.
            Only read when ``file`` is None.
        min_app: a word is kept only if it appears strictly more than this
            many times in the corpus. Only used when ``file`` is None.
        file: optional path to a saved vocabulary; when given, ``DATA`` and
            ``min_app`` are ignored.
        strip_punctuation: when True, ASCII punctuation is removed before
            splitting. Defaults to False because the previous implementation
            called ``str.translate`` but discarded its result, so punctuation
            was never actually stripped; vocabularies produced that way
            contain punctuated tokens and need the default to round-trip.
    """

    # Translation table that deletes every character in string.punctuation.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)
    # Class-level default so instances created without __init__ still work.
    strip_punctuation = False

    def __init__(self, DATA, min_app, file=None, strip_punctuation=False):
        self.tokens = None
        self.token_map = None
        self.strip_punctuation = strip_punctuation
        if file is None:
            self.create_tokens(DATA, min_app)
        else:
            self.load_dict(file)

    def load_dict(self, file):
        """Load the vocabulary from ``file`` (one token per line, id = line index)."""
        with open(file, 'r') as f:
            lines = [line.strip() for line in f]
        self.tokens = lines
        self.token_map = {line: i for i, line in enumerate(lines)}

    def process_string(self, lyrics):
        """Lower-case ``lyrics`` and split it on whitespace into a list of words."""
        if self.strip_punctuation:
            # str.translate returns a new string; the result must be kept.
            lyrics = lyrics.translate(self._PUNCT_TABLE)
        return lyrics.lower().split()

    def create_tokens(self, D, min_app):
        """Build the vocabulary from the lyrics column (index 6) of every row in ``D``."""
        wc = Counter()
        for row in D:
            lyrics = row[6]
            wc.update(self.process_string(lyrics))
        # Counter preserves first-seen order, so ids follow corpus order.
        kept = [word for word, count in wc.items() if count > min_app]
        self.tokens = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"] + kept
        self.token_map = {token: i for i, token in enumerate(self.tokens)}

    def get_token(self, word):
        """Return the id of ``word``, or the id of ``<UNK>`` if it is not in the vocabulary."""
        if word in self.token_map:
            return self.token_map[word]
        return self.token_map["<UNK>"]

    def tokenize(self, lyrics):
        """Convert a lyrics string into a list of token ids."""
        return [self.get_token(word) for word in self.process_string(lyrics)]

    def stringify(self, tokens):
        """Convert an iterable of token ids back into a list of token strings."""
        return [self.tokens[i] for i in tokens]
    
        

In [46]:
# Load the vocabulary from the saved file instead of rebuilding it from DATA,
# then show its size and the first 100 entries.
tokenizer = Tokenizer(DATA=None, min_app=100, file="vocab.vocab")
print(len(tokenizer.tokens), tokenizer.tokens[:100], sep="\n")


45509
['<SOS>', '<EOS>', '<PAD>', '<UNK>', '[chorus]', 'what', 'are', 'words', 'worth?', '[verse', '1]', 'in', 'papers,', 'books', 'on', 'tv,', 'for', 'crooks', 'of', 'comfort,', 'peace', 'to', 'make', 'the', 'fighting', 'cease', 'tell', 'you', 'do', 'working', 'hard', 'eat', 'your', 'but', "don't", 'go', 'hungry', 'have', 'always', 'nearly', 'hung', 'me', 'a', 'ram', 'sam', 'sam,', 'hi', 'yay,', 'yippie', 'yi', 'yay', 'awoo', 'ayee', '2]', 'skill', 'and', 'romance', 'thrill', 'stupid,', 'fun', 'can', 'put', 'run', 'mots', 'qui', 'la', 'le', 'fruit', '3]', "it's", 'rap', 'race,', 'with', 'fast', 'pace', 'concrete', 'words,', 'abstract', 'crazy', 'lying', 'hazy', 'dying', 'faith', 'straight', 'rare', 'swear', 'good', 'bad', '4]', 'pay', 'four-letter', 'i', 'cannot', 'say', 'toilet,', 'dirty', 'devil', 'trouble,', 'subtle', 'anger,']


In [47]:
# Round-trip check: text -> ids -> tokens.
toks = tokenizer.tokenize("The quick brown fox jumped over the i forgot the rest")
print(toks, tokenizer.stringify(toks), sep="\n")

[23, 1509, 3399, 4974, 2998, 101, 23, 91, 5338, 23, 3697]
['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'i', 'forgot', 'the', 'rest']


In [48]:
class LyricSet(Dataset):
    """Dataset yielding one fixed-length window of token ids per song.

    Each item is a 1-D ``torch.long`` tensor of exactly ``sequence_size`` ids
    laid out as ``<SOS>, up to sequence_size-2 lyric tokens, <EOS>, <PAD>...``.
    Songs longer than ``sequence_size`` tokens contribute a window starting at
    a random offset, so repeated access to the same index can differ.

    Args:
        data: indexable collection of rows; element 6 of a row is the lyrics.
        tokenizer: object providing ``tokenize(str)`` and ``get_token(str)``.
        sequence_size: length of every returned tensor (must be >= 2 to leave
            room for ``<SOS>`` and ``<EOS>``).

    Raises:
        ValueError: if ``sequence_size`` is smaller than 2.
    """

    def __init__(self, data, tokenizer, sequence_size):
        if sequence_size < 2:
            raise ValueError("sequence_size must be at least 2 (<SOS> and <EOS>)")
        self.data = data
        self.tokenizer = tokenizer
        self.sequence_size = sequence_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Use the tokenizer this dataset was constructed with, not whatever
        # module-level `tokenizer` happens to exist in the notebook.
        tok = self.tokenizer
        sequence = tok.tokenize(self.data[idx][6])
        if len(sequence) > self.sequence_size:
            start_idx = random.randint(0, len(sequence) - self.sequence_size - 1)
            sequence = sequence[start_idx:]

        body = sequence[:self.sequence_size - 2]
        n_pad = self.sequence_size - 2 - len(body)  # 0 when the window is full
        ids = (
            [tok.get_token("<SOS>")]
            + body
            + [tok.get_token("<EOS>")]
            + [tok.get_token("<PAD>")] * n_pad
        )
        out = torch.tensor(ids, dtype=torch.long)
        assert out.shape[0] == self.sequence_size, str(out.shape) + " " + str(len(sequence))
        return out

In [49]:
# Build the dataset with 100-token windows and inspect the first item.
# Each dset[0] access calls __getitem__ again, so the two prints are separate draws.
dset = LyricSet(data=DATA, tokenizer=tokenizer, sequence_size=100)
print(dset[0])
print(dset[0].shape)

tensor([  0,   7, 113, 114, 115, 116, 117,  21,  23, 118, 119, 120,  93,  23,
        116, 121,  22,  42, 122,   7,   6, 112,  42, 123, 124, 125, 126,  93,
          5, 127, 128,  34, 128,   5, 127,  93,  72,  42, 129,  70, 130,  55,
         42, 129,  70, 104, 130,  42, 129, 104,  42,  70, 106,  42, 129,  70,
         70, 131, 132,  16,  23, 133,  86, 134, 135, 136,  23, 137,  69, 138,
        139,   3, 140, 141,  42,   3,   3, 138, 142,   4,  42,  43,  44,  45,
         42,  43,  44,  44,   3,   3,   3,   3,   3,  43,  44,  44,  46,   3,
         47,   1])
torch.Size([100])


In [50]:
# Seed the split so the same songs land in train/val/test after a kernel
# restart; with an unseeded split, a checkpoint reloaded in a fresh session
# could be evaluated on songs it was trained on.
SPLIT_SEED = 0
train_set, val_set, test_set = random_split(
    dset, [0.9, 0.05, 0.05], generator=torch.Generator().manual_seed(SPLIT_SEED)
)

In [51]:
# BLEU sanity check: the candidate differs from the single reference in one word.
reference = ["this is ae test rest pep did".split()]
candidate = "this is ad test rest pep did".split()
sentence_bleu(reference, candidate)

0.488923022434901

In [52]:
# BLEU sanity check on sentences of different length (10 vs 9 words).
test_1 = "the quick brown fox is really fast and jumps high".split()
reference = ["the brown fox is pretty fast and jumps high".split()]
sentence_bleu(reference, test_1)

0.392814650900513

## Model Creation
We will begin with an RNN model and then progress to more "complicated" models, which are expected to perform better. Our guiding metric
will be the BLEU score. The trained model will also serve as the source of the pre-trained embeddings for the later part of this assignment. Natural language is inherently a time/order-dependent sequence, with different permutations of the same words having different meanings. As a result, this problem could not be solved by a non-temporal model: time would have to be encoded into the network in some way, which would be really difficult for this specific task because each time point is itself a vector.

In [53]:
class _LyricalBase(nn.Module):
    """Shared next-token language model: embedding -> recurrent stack -> MLP head.

    Subclasses choose the recurrent layer type by setting ``_rnn_cls``. The
    submodule names (``emb``, ``rnn``, ``out``) and their construction order
    match the original stand-alone classes, so saved ``state_dict`` files keep
    loading.

    Args:
        n_toks: vocabulary size (embedding rows and output classes).
        embedding_size: width of the token embeddings.
        dim_hidden: hidden size of every recurrent layer.
        n_rnn_layers: number of stacked recurrent layers.
        device: optional device on which the parameters are created.
    """

    _rnn_cls = None  # set by subclasses: nn.RNN, nn.LSTM, ...

    def __init__(self, n_toks, embedding_size, dim_hidden, n_rnn_layers, device=None):
        super().__init__()
        self.dim_hidden = dim_hidden
        self.n_tok = n_toks
        self.emb = nn.Embedding(n_toks, embedding_size, device=device)
        self.rnn = self._rnn_cls(input_size=embedding_size, hidden_size=dim_hidden,
                                 num_layers=n_rnn_layers, batch_first=True, device=device)
        self.out = nn.Sequential(
            nn.Linear(dim_hidden, dim_hidden * 2, device=device),
            nn.ReLU(),
            nn.Linear(dim_hidden * 2, n_toks, device=device),
            # dim=1 is the vocabulary axis: forward() flattens the input to the
            # head to (batch * time, dim_hidden) before calling it.
            nn.LogSoftmax(dim=1),
        )

    def forward(self, seq):
        """Map token ids of shape (batch, time) to log-probabilities of shape (batch, time, n_toks)."""
        b, n = seq.shape
        hidden, _ = self.rnn(self.emb(seq))
        log_probs = self.out(hidden.reshape(-1, self.dim_hidden))
        return log_probs.reshape(b, n, self.n_tok)


class LyricalGenius(_LyricalBase):
    """Lyric language model built on a vanilla ``nn.RNN``."""
    _rnn_cls = nn.RNN


class LyricalGenius2(_LyricalBase):
    """Lyric language model built on an ``nn.LSTM``."""
    _rnn_cls = nn.LSTM

In [54]:
# Throw-away CPU model used only to smoke-test the forward pass.
test_model = LyricalGenius(
    n_toks=len(tokenizer.tokens),
    embedding_size=256,
    dim_hidden=1024,
    n_rnn_layers=2,
)

In [55]:
# Add a leading batch axis to one sample and check the output shape.
test = dset[0][None, :]
test_model(test).shape

torch.Size([1, 100, 45509])

In [56]:
def train(epochs, optim, loss, model, train_loader, val_loader, metric_fn, device, log_every=1000):
    """Train ``model`` for next-token prediction and report a metric after each epoch.

    Every batch ``seq`` of shape (batch, time) is split into inputs
    ``seq[:, :-1]`` and targets ``seq[:, 1:]``. Gradients are clipped to a
    total norm of 1 before each optimizer step.

    Args:
        epochs: number of passes over ``train_loader``.
        optim: optimizer already bound to ``model``'s parameters.
        loss: callable ``loss(pred, target)`` taking flattened
            (N, n_tok) predictions and (N,) targets.
        model: module exposing ``n_tok`` and returning (batch, time, n_tok).
        train_loader: iterable of id tensors of shape (batch, time).
        val_loader: passed through to ``metric_fn``.
        metric_fn: callable ``metric_fn(model, val_loader, device)`` whose
            result is printed after every epoch.
        device: device the batches are moved to.
        log_every: print the mean training loss every this many batches.

    Returns:
        The trained ``model`` (same object that was passed in).

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    for e in range(epochs):
        # metric_fn may leave the model in eval mode; make the mode explicit.
        model.train()
        running_loss = 0.0
        # Count batches from 1 so each printed average covers exactly
        # `log_every` batches (counting from 0 put 1001 losses in the first
        # window while still dividing by 1000).
        for i, seq in enumerate(train_loader, start=1):
            optim.zero_grad()
            X = seq[:, :-1].to(device)
            Y = seq[:, 1:].to(device)
            pred = model(X).reshape(-1, model.n_tok)
            batch_loss = loss(pred, Y.reshape(-1))
            running_loss += batch_loss.item()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optim.step()
            if i % log_every == 0:
                print("Epoch", e, "iteration", i, "Loss:", running_loss / log_every)
                running_loss = 0.0
        print("Epoch", e, "metric:", metric_fn(model, val_loader, device))
    return model

def BLEU(model, loader, device):
    """Mean sentence-level BLEU of greedy next-token predictions on one batch.

    Takes the first batch from ``loader``, feeds ``batch[:, :-1]`` to the
    model and compares the arg-max prediction at every position with the
    shifted targets ``batch[:, 1:]``. Ids are converted to strings with the
    module-level ``tokenizer``.

    Args:
        model: module returning (batch, time, n_tok) scores.
        loader: iterable yielding id tensors of shape (batch, time).
        device: device the batch is moved to.

    Returns:
        The average ``sentence_bleu`` score over the batch.

    NOTE(review): padding positions are scored like any other token.
    """
    was_training = model.training
    model.eval()
    try:
        batch = next(iter(loader)).to(device)
        # Predict from the inputs only, so that hypothesis position t lines up
        # with reference token t+1; running the model on the full batch gave a
        # hypothesis one token longer than the reference.
        X = batch[:, :-1]
        Y = batch[:, 1:].tolist()
        with torch.no_grad():  # evaluation only, no graph needed
            predictions = torch.argmax(model(X), dim=-1).tolist()
        scores = [
            sentence_bleu([tokenizer.stringify(ref)], tokenizer.stringify(hyp))
            for ref, hyp in zip(Y, predictions)
        ]
    finally:
        # Restore whichever mode the caller had instead of forcing train mode.
        model.train(was_training)
    return sum(scores) / len(scores)
    
    


In [59]:
# Fall back to the CPU when no GPU is present so the notebook still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = LyricalGenius2(len(tokenizer.tokens), 64, 256, 3, device=device)
# Padding positions must not contribute to the loss.
NLL = nn.NLLLoss(ignore_index=tokenizer.get_token("<PAD>"))
batch_size = 16
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)
# Evaluation loaders are not shuffled so metrics are comparable between runs.
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=0)
# The tuple assignment used to bind `train_loader` twice and never created a
# loader for the held-out test split.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0)
# Swap in the plain-RNN baseline instead of the LSTM:
#writer = LyricalGenius(len(tokenizer.tokens), 64, 256, 3, device=device)


In [155]:
# Adam over all of the model's parameters.
optim = torch.optim.Adam(writer.parameters(), lr=5e-4)


In [None]:
# Three epochs, reporting BLEU on the validation loader after each one.
writer = train(
    epochs=3,
    optim=optim,
    loss=NLL,
    model=writer,
    train_loader=train_loader,
    val_loader=val_loader,
    metric_fn=BLEU,
    device=device,
)

In [157]:
# Persist only the parameters (state_dict), not the pickled module object.
torch.save(obj=writer.state_dict(), f="LSTM.pth")

In [42]:
cpu = torch.device('cpu')
def generate_response(starting_sequence, model, max_len=50):
    """Sample a continuation of ``starting_sequence`` and print it as a token list.

    Tokens are drawn one at a time from the model's predictive distribution
    for the last position until ``<EOS>`` is sampled or the sequence holds
    ``max_len`` tokens (prompt included). Uses the module-level ``tokenizer``.

    Args:
        starting_sequence: prompt text; an empty prompt starts from ``<SOS>``.
        model: module returning (batch, time, n_tok) logits / log-probabilities.
        max_len: upper bound on the total number of tokens.

    Returns:
        None. The token strings are printed.
    """
    # Run on whatever device the model lives on rather than a notebook global.
    model_device = next(model.parameters()).device
    eos = tokenizer.get_token("<EOS>")
    seq = tokenizer.tokenize(starting_sequence)
    if not seq:
        # A zero-length input cannot be fed to the recurrent layers.
        seq = [tokenizer.get_token("<SOS>")]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # `<` rather than `== max_len`: a prompt already longer than the
            # limit used to skip the length check and loop until <EOS>.
            while len(seq) < max_len:
                X = torch.tensor([seq], dtype=torch.long, device=model_device)
                pred = model(X)[:, -1].reshape(-1)
                # .item() keeps `seq` a list of plain ints instead of mixing
                # in 0-d device tensors.
                next_token = Categorical(logits=pred).sample().item()
                seq.append(next_token)
                if next_token == eos:
                    break
    finally:
        model.train(was_training)
    print(tokenizer.stringify(seq))


In [159]:
# Vocabulary size, then one sampled continuation of the prompt "i know".
print(len(tokenizer.tokens))
generate_response(starting_sequence="i know", model=writer)


45509
['i', 'know', 'that', 'looked', 'said', 'how', 'bad', 'i', 'could', 'with', 'you,', 'something', 'new', 'maybe', 'free', 'to', 'be', 'surprised', 'seeing', 'me', 'lend', '<EOS>']


## Comparisons 
The output of the LSTM model is noticeably better. This is also evident from the value of the loss function and the convergence properties of the model: the LSTM quickly surpasses the RNN's loss value, and does so with a learning rate that is significantly higher. The main advantage of the LSTM/GRU over the regular RNN is the inductive biases baked into these models. By specifying exactly how we want the model to update its state (given that this update scheme is relatively good), the model doesn't have to learn these mechanisms on its own. For example, with the CNN there are the inductive biases of locality and weight sharing. These inductive biases result in temporal models which converge much faster, which is a really big help considering how difficult it is for these models to train in the first place as a result of
vanishing/exploding gradients.

## Similarities
Here is my implementation of two similarity/dissimilarity metrics. The similarity metric I used is the cosine similarity, which is computed as the cosine
of the angle between the two vectors. When you maximize the output of this function with respect to the second vector you get the most "similar" vectors. The dissimilarity metric I use is Euclidean distance. When you maximize the dissimilarity metric you get a vector that is very "dissimilar" from the first vector. Note that the pre-trained vectors are a result of training on pop lyrics, so the neighbours reflect how words are used in pop songs. This can be seen clearly in that the most similar word to love is hate, which checks out when it comes to the lyrics of pop.

In [151]:
#SIMILARITY METRIC
def cosine_simularity(vec_1, vec_2, eps=1e-8):
    """Cosine of the angle between two 1-D tensors.

    Args:
        vec_1, vec_2: 1-D tensors of equal length.
        eps: lower bound on the product of the norms. It prevents a 0/0
            division, so a zero vector yields 0.0 instead of NaN.

    Returns:
        A 0-d tensor in [-1, 1].
    """
    denom = (torch.norm(vec_1) * torch.norm(vec_2)).clamp_min(eps)
    return torch.dot(vec_1, vec_2) / denom
#DISSIMILARITY METRIC
def euclidean_distance(vec_1, vec_2):
    """Return the L2 norm of the difference between two tensors as a 0-d tensor."""
    delta = vec_1 - vec_2
    return delta.norm()
def _first_two_embeddings(words, writer):
    """Return the embedding vectors of the first two tokens of ``words``.

    Tokenizes with the module-level ``tokenizer`` and looks the ids up in
    ``writer.emb`` on the device where that embedding lives.

    Raises:
        ValueError: if ``words`` tokenizes to fewer than two tokens.
    """
    ids = tokenizer.tokenize(words)
    if len(ids) < 2:
        raise ValueError("expected at least two whitespace-separated words, got %r" % (words,))
    emb_device = writer.emb.weight.device
    with torch.no_grad():  # similarity probing never needs gradients
        vecs = writer.emb(torch.tensor([ids], device=emb_device))
    return vecs[0, 0], vecs[0, 1]

def get_cos_sim(words, writer):
    """Cosine similarity between the embeddings of the first two words in ``words``."""
    return cosine_simularity(*_first_two_embeddings(words, writer))

def get_euc_dist(words, writer):
    """Euclidean distance between the embeddings of the first two words in ``words``."""
    return euclidean_distance(*_first_two_embeddings(words, writer))

In [160]:
# Put the model in evaluation mode before probing its embeddings. As the
# cell's last expression, the returned module's repr is displayed.
writer.eval()

LyricalGenius2(
  (emb): Embedding(45509, 64)
  (rnn): LSTM(64, 256, num_layers=3, batch_first=True)
  (out): Sequential(
    (0): Linear(in_features=256, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=45509, bias=True)
    (3): LogSoftmax(dim=1)
  )
)

In [161]:
seed_word = "love "
# Rank by the signed cosine score. Taking abs() first made vectors pointing in
# the opposite direction (cosine near -1) rank as highly "similar".
words = sorted(
    ((get_cos_sim(seed_word + word, writer).item(), word) for word in tokenizer.tokens),
    reverse=True,
)
print(words[:10])

[(1.0, 'love'), (0.5264552235603333, 'hate'), (0.5208090543746948, 'forgiven'), (0.5175228714942932, 'torment'), (0.5052473545074463, 'fairyland'), (0.5013605952262878, 'less?'), (0.5011232495307922, 'sheena'), (0.4963341951370239, 'wheat'), (0.4951170086860657, 'di'), (0.4826781153678894, 'self-control')]


In [162]:
seed_word = "love "
# Ascending sort: the smallest distances (nearest embeddings) come first.
words = sorted(
    (abs(get_euc_dist(seed_word + word, writer).item()), word)
    for word in tokenizer.tokens
)
print(words[:10])

[(0.0, 'love'), (9.328536987304688, 'torment'), (9.346423149108887, '(sorry'), (9.377598762512207, 'mastery'), (9.39035415649414, 'fairyland'), (9.499368667602539, 'carousels'), (9.560234069824219, 'discipline'), (9.686524391174316, 'hate'), (9.752927780151367, 'make-believe'), (9.75964069366455, 'earth)')]
