## Load Dataset and Import Packages 

In [1]:
import re
import pickle
import random
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from os import walk
import numpy as np
import numpy.ma as ma
import json
import os
import pandas as pd
import re 
import seaborn as sns
import nltk
import sys
import numpy as np
from string import punctuation
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import sys
import numpy as np
from collections import Counter
import os
from argparse import Namespace

In [2]:
# Load the raw lyrics dump (column-oriented JSON) into a DataFrame.
dataset_path = r'/Users/martina/Desktop/lyricgen/amaury/final.json'
with open(dataset_path, mode='r', encoding="utf8") as json_file:
    df = pd.read_json(json_file, orient='columns')

In [None]:
# Quick overview of the loaded frame: row count, column count, per-column dtype.
n_records, n_attributes = df.shape[0], df.columns.size
print(f'There are {n_records} records')
print(f'There are {n_attributes} attributes:')
df_types = df.dtypes
print(df_types)

## Clean and Prepare the Dataset 

In [None]:
# Drop the rows that have no lyrics, then discard the metadata columns that
# are not used anywhere below.
df.dropna(subset=['lyrics'], axis=0, inplace=True)
redundant_columns = [
    'tag',
    'cohort_ids',
    'created_month',
    'created_year',
    'has_song_story',
    'song_story_id',
    'has_apple_match',
    'featured_video',
    'has_youtube_url',
    'nrm_target_date',
    'nrm_tier',
    'has_translation_QandA',
    'comment_count',
    'has_description',
]
df = df.drop(columns=redundant_columns)

In [None]:
# Discard lyrics of 10 characters or fewer, then lower-case everything.
# Note that the lower-casing step casts every column (including 'length') to str.
df['length'] = df['lyrics'].apply(len)
cond = df['length'] > 10
df = df.loc[cond]
new_df = df.apply(lambda column: column.astype(str).str.lower())

In [None]:
# Preview the first rows of the lower-cased frame.
new_df.head()

In [None]:
# Work only with the English part of the dataset: keep the rows whose
# 'lyrics_language' value contains 'en'.
is_english = new_df["lyrics_language"].str.contains('en')
df_english = new_df[is_english]

In [None]:
# Continue with a random 30% sample of the English rows. No random_state is
# passed, so a different sample is drawn on every run.
df_sample = df_english.sample(frac = 0.3) 

In [None]:
# Keep only lower-case letters, apostrophes and spaces.
# Every run of whitespace (newlines, tabs, ...) is first collapsed into a single
# space: newlines are not in the allowed character set, so stripping them
# directly would glue the last word of a lyric line to the first word of the
# next line and pollute the vocabulary with words that do not exist.
final_df = [
    re.sub(r"[^a-z' ]", "", re.sub(r"\s+", " ", lyric))
    for lyric in df_sample.lyrics
]

In [None]:
# Write the cleaned lyrics (one song per line) to the training file.
# - The cleaned text lives in `final_df`; dumping `df_sample` would put every
#   remaining column (metadata, the 'length' column, uncleaned lyrics) into
#   the training corpus.
# - The file is opened in 'w' mode so that re-running this cell replaces the
#   corpus instead of appending a second copy of it.
with open(r'/Users/martina/Desktop/lyricgen/amaury/final_df.txt', 'w', encoding='utf8') as out_file:
    out_file.write('\n'.join(final_df))
    out_file.write('\n')

## Deal with lyrics of different lengths [alternative to padding]

In [None]:
# Hyper-parameters and paths shared by data preparation, training and sampling.
flags = Namespace(
    train_file='final_df.txt',     # corpus file read by the training code
    seq_size=32,                   # tokens per training sequence
    batch_size=16,                 # sequences per mini-batch
    embedding_size=64,             # width of the word embeddings
    lstm_size=64,                  # LSTM hidden-state width
    gradients_norm=5,              # max gradient norm used for clipping
    # Seed words for text generation; main() reads flags.initial_words, so the
    # entry must exist. They are lower-case because the corpus is lower-cased
    # before the vocabulary is built.
    initial_words=['i', 'am'],
    predict_top_k=5,               # sample the next word among the k best
    checkpoint_path='checkpoint',  # directory for saved model weights
)

In [None]:
def get_data_from_file(train_file, batch_size, seq_size):
    """Read a whitespace-tokenised corpus and build the training arrays.

    Args:
        train_file: path of the text file holding the corpus.
        batch_size: number of parallel sequences (rows of the returned arrays).
        seq_size: number of tokens per training sequence.

    Returns:
        A tuple ``(int_to_vocab, vocab_to_int, n_vocab, in_text, out_text)``:
        the two look-up dictionaries, the vocabulary size, the input token
        indices reshaped to ``(batch_size, -1)`` and the targets (the inputs
        shifted left by one token, wrapping around at the end) in the same
        shape.

    Raises:
        ValueError: if the corpus holds fewer than ``batch_size * seq_size``
            tokens, i.e. not even one full batch.
    """
    with open(train_file, 'r', encoding='utf8') as f:
        text = f.read().split()

    # Two dictionaries: one converts integer indices back to word tokens, the
    # other converts words into integer indices. Index 0 is the most frequent
    # word.
    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = dict(enumerate(sorted_vocab))
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    # Convert the word tokens into integer indices; these are the network
    # input. A mini-batch is trained at each iteration, so the data must split
    # evenly into batches: the last, incomplete batch is chopped off.
    int_text = [vocab_to_int[w] for w in text]
    num_batches = len(int_text) // (seq_size * batch_size)
    if num_batches == 0:
        raise ValueError(
            'corpus has {} tokens, fewer than one batch of {} x {}'.format(
                len(int_text), batch_size, seq_size))
    in_text = np.array(int_text[:num_batches * batch_size * seq_size])

    # The target of every token is the token that follows it; the very last
    # target wraps around to the first token.
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))

    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text


# Build the arrays once at notebook level so the following cells can inspect them.
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
    '/Users/martina/Desktop/lyricgen/amaury/final_df.txt',
    flags.batch_size, flags.seq_size)

print(int_to_vocab, vocab_to_int, n_vocab, in_text, out_text)

In [None]:
# Peek at the top-left 10x10 corner of the input indices, then of the targets.
for token_matrix in (in_text, out_text):
    print(token_matrix[:10, :10])

## Batches

In [None]:
def get_batches(in_text, out_text, batch_size, seq_size):
    """Yield aligned (input, target) windows of ``seq_size`` columns.

    The number of windows is the total element count of ``in_text`` divided
    (integer division) by ``seq_size * batch_size``. Each yielded pair holds
    the same column range of ``in_text`` and ``out_text``, all rows included.
    """
    tokens_per_batch = seq_size * batch_size
    n_windows = np.prod(in_text.shape) // tokens_per_batch
    last_column = n_windows * seq_size
    for start in range(0, last_column, seq_size):
        columns = slice(start, start + seq_size)
        yield in_text[:, columns], out_text[:, columns]

## Model

In [None]:
class RNNModule(nn.Module):
    """Word-level language model: embedding layer -> LSTM layer -> dense layer."""

    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super().__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(num_embeddings=n_vocab,
                                      embedding_dim=embedding_size)
        self.lstm = nn.LSTM(input_size=embedding_size,
                            hidden_size=lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(in_features=lstm_size, out_features=n_vocab)

    def forward(self, x, prev_state):
        """Run one input sequence through the network.

        Takes the token indices ``x`` and the previous LSTM state, and returns
        the logits together with the LSTM state after the sequence.
        """
        lstm_out, next_state = self.lstm(self.embedding(x), prev_state)
        return self.dense(lstm_out), next_state

    def zero_state(self, batch_size):
        """Return an all-zero (hidden, cell) state pair, e.g. to reset the
        state at the beginning of every epoch."""
        state_shape = (1, batch_size, self.lstm_size)
        return torch.zeros(state_shape), torch.zeros(state_shape)

In [None]:
def get_loss_and_train_op(net, lr=0.001):
    """Return the pair (cross-entropy criterion, Adam optimizer over ``net``'s
    parameters with learning rate ``lr``)."""
    return nn.CrossEntropyLoss(), torch.optim.Adam(net.parameters(), lr=lr)

## Train

In [None]:
# Hyper-parameters and paths used by main().
flags = Namespace(
    train_file='final_df.txt',     # corpus file read by the training code
    seq_size=32,                   # tokens per training sequence
    batch_size=16,                 # sequences per mini-batch
    embedding_size=64,             # width of the word embeddings
    lstm_size=64,                  # LSTM hidden-state width
    gradients_norm=5,              # max gradient norm used for clipping
    # Seed words for text generation; main() reads flags.initial_words, so the
    # entry must exist. They are lower-case because the corpus is lower-cased
    # before the vocabulary is built.
    initial_words=['i', 'am'],
    predict_top_k=5,               # sample the next word among the k best
    checkpoint_path='checkpoint',  # directory for saved model weights
)

In [None]:
def main(n_epochs=50):
    """Train the word-level LSTM on the corpus named in ``flags``.

    Every 100 iterations the current loss is printed; every 1000 iterations a
    text sample is generated with ``predict`` and the model weights are saved
    as ``model-<iteration>.pth`` inside ``flags.checkpoint_path``.

    Args:
        n_epochs: number of passes over the training data (default 50).
    """
    # Use the GPU when one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # NOTE(review): get_data_from_file must be defined earlier in the file.
    int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
        flags.train_file, flags.batch_size, flags.seq_size)

    net = RNNModule(n_vocab, flags.seq_size,
                    flags.embedding_size, flags.lstm_size)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, 0.01)

    # Seed words for the periodic text samples. Fall back to a lower-case
    # default when flags does not define them (the corpus is lower-cased).
    initial_words = getattr(flags, 'initial_words', None) or ['i', 'am']

    # torch.save does not create missing directories.
    os.makedirs(flags.checkpoint_path, exist_ok=True)

    iteration = 0

    for e in range(n_epochs):
        batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)

        # Reset the LSTM state at the beginning of every epoch.
        state_h, state_c = net.zero_state(flags.batch_size)
        state_h = state_h.to(device)
        state_c = state_c.to(device)

        for x, y in batches:
            iteration += 1

            # predict() switches the network to eval mode, so training mode is
            # re-enabled on every iteration.
            net.train()

            # Reset all gradients.
            optimizer.zero_grad()

            # Transfer the batch to the device. CrossEntropyLoss needs int64
            # class indices, so the dtype is set explicitly instead of being
            # inherited from the platform-dependent numpy integer type.
            x = torch.tensor(x, dtype=torch.long).to(device)
            y = torch.tensor(y, dtype=torch.long).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            # logits are (batch, seq, vocab); the criterion expects the class
            # dimension second.
            loss = criterion(logits.transpose(1, 2), y)
            loss_value = loss.item()

            # Perform back-propagation.
            loss.backward()

            # The state is carried over to the next batch as plain values:
            # detaching it cuts it off from this batch's autograd graph, so the
            # next backward() does not try to go through a graph that has
            # already been freed.
            state_h = state_h.detach()
            state_c = state_c.detach()

            torch.nn.utils.clip_grad_norm_(
                net.parameters(), flags.gradients_norm)

            # Update the network's parameters.
            optimizer.step()

            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e + 1, n_epochs),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))

            if iteration % 1000 == 0:
                # Pass a copy: the seed list must stay the same for every sample.
                predict(device, net, list(initial_words), n_vocab,
                        vocab_to_int, int_to_vocab, top_k=flags.predict_top_k)
                torch.save(net.state_dict(),
                           os.path.join(flags.checkpoint_path,
                                        'model-{}.pth'.format(iteration)))

## Prediction

In [None]:
def _sample_top_k(output, top_k):
    """Pick one word index uniformly at random among the top_k highest scores."""
    _, top_ix = torch.topk(output[0], k=top_k)
    candidates = top_ix.tolist()[0]
    return int(np.random.choice(candidates))


def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    """Generate text from seed words and print it.

    The seed words are fed through the network one at a time to warm up the
    LSTM state. The word sampled from the final output is then used as the
    input of the next time step, and so on, until 101 words have been generated
    after the seed. The resulting sequence is printed to the console as one
    space-separated line.

    Args:
        device: torch device the state and input tensors are moved to.
        net: model exposing ``eval()``, ``zero_state(batch_size)`` and
            ``net(ix, (state_h, state_c)) -> (output, (state_h, state_c))``.
        words: non-empty sequence of seed words; it is not modified.
        n_vocab: vocabulary size (unused, kept for interface compatibility).
        vocab_to_int: mapping word -> index.
        int_to_vocab: mapping index -> word.
        top_k: the next word is drawn among the ``top_k`` best-scoring words.

    Raises:
        ValueError: if ``words`` is empty.
        KeyError: if a seed word is not in ``vocab_to_int``.
    """
    if not words:
        raise ValueError('predict() needs at least one seed word')

    # Work on a copy so the caller's list does not grow with every call.
    words = list(words)

    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Warm up the state on the seed words.
        for w in words:
            ix = torch.tensor([[vocab_to_int[w]]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))

        choice = _sample_top_k(output, top_k)
        words.append(int_to_vocab[choice])

        # Feed every sampled word back in as the next input.
        for _ in range(100):
            ix = torch.tensor([[choice]]).to(device)
            output, (state_h, state_c) = net(ix, (state_h, state_c))

            choice = _sample_top_k(output, top_k)
            words.append(int_to_vocab[choice])

    print(' '.join(words))

### Test and Try

In [None]:
# Start training only when this file is executed as a script.
if __name__ == '__main__':
    main()