<a href="https://colab.research.google.com/github/AntJuLRa/ANLP-project-/blob/dev/generation/LSTM_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pickle
import numpy as np
import torch
import nltk
import random
import torch.nn as nn
import torch.nn.functional as fnc
# Seed every random number generator the notebook draws from. The stdlib
# `random` module drives the train/test shuffle and the choice of training
# samples, so it has to be seeded as well for runs to be reproducible.
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

In [3]:
# Load the pre-tokenised corpus. The context manager closes the file handle
# once the data has been read.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("OHHLAdata_list.p", "rb") as data_file:
    OHHLA_data = pickle.load(data_file)

In [5]:
# Concatenate the tokens of every song into one flat list.
flat_data = []
for song in OHHLA_data:
    flat_data.extend(song)

In [6]:
# Count how often each token occurs across the flattened corpus.
freq_dict= nltk.FreqDist(flat_data)

In [7]:
# Split the vocabulary in one pass: tokens seen more than 30 times are kept,
# everything else is treated as too rare to model.
valid_vocab, invalid_vocab = [], []
for word, freq in freq_dict.items():
    (valid_vocab if freq > 30 else invalid_vocab).append(word)

In [8]:
# Number of distinct tokens that occur more than 30 times.
len(valid_vocab)

10844

In [9]:
# Token -> integer id. Ids start at 1 because 0 is used by phrase_to_tensor
# for tokens that are not in the vocabulary.
index_dict = {word: idx for idx, word in enumerate(valid_vocab, start=1)}

# Integer id -> token, the inverse of index_dict.
rev_index_dict = {idx: word for word, idx in index_dict.items()}

def phrase_to_tensor(phrase, ind_dict):
  """Encode a phrase (a list of lines, each a list of tokens) as token ids.

  Args:
    phrase: iterable of lines; every line is an iterable of tokens.
    ind_dict: mapping from token to integer id.

  Returns:
    A 1-D LongTensor with one id per token, in reading order. Tokens
    missing from `ind_dict` are encoded as 0.
  """
  token_ids = [ind_dict.get(token, 0) for line in phrase for token in line]
  return torch.tensor(token_ids, dtype=torch.long)

In [10]:
from itertools import groupby
# Split every song into verses at the 'nxtvrse' separator token; the
# separator itself is dropped.
verses = []
for song in OHHLA_data:
    for is_separator, chunk in groupby(song, key=lambda token: token == 'nxtvrse'):
        if not is_separator:
            verses.append(list(chunk))

In [11]:
def get_phrases(verse,num_lines=4,endline='endline'):
  """Return every window of `num_lines` consecutive lines in a verse.

  The verse is split into lines at each `endline` token, and every line is
  terminated with `endline` again (including a final line that had no
  terminator in the input).

  Args:
    verse: flat iterable of tokens.
    num_lines: number of consecutive lines per phrase.
    endline: token that marks the end of a line.

  Returns:
    A list of phrases; each phrase is a list of `num_lines` lines and each
    line is a list of tokens. Empty when the verse has fewer than
    `num_lines` lines.
  """
  lines = [list(tokens) + [endline]
           for is_separator, tokens in groupby(verse, lambda token: token == endline)
           if not is_separator]
  # Clamp at zero: a negative count used as a slice end would count from the
  # back of the list and emit phrases shorter than num_lines.
  n_windows = max(0, len(lines) - num_lines + 1)
  return [lines[first:first + num_lines] for first in range(n_windows)]

In [12]:
# Phrases grouped per verse, then flattened into one list that the index
# lists below refer into.
list_of_phrases = [get_phrases(verse) for verse in verses]
phrases = []
for verse_phrases in list_of_phrases:
    phrases.extend(verse_phrases)

In [13]:
# Indices of phrases that contain at least one rare token (seen 30 times or
# fewer); any() stops at the first rare token found in a phrase.
invalid_phrases = [
    idx
    for idx, phrase in enumerate(phrases)
    if any(freq_dict[word] <= 30 for line in phrase for word in line)
]

In [14]:
# Indices of the phrases made up exclusively of in-vocabulary tokens.
all_phrase_indices = set(range(len(phrases)))
valid_phrases = list(all_phrase_indices - set(invalid_phrases))

In [15]:
# Draw a random permutation of the usable phrase indices; the first 25000 go
# to training and the remainder to testing.
shuffled = random.sample(valid_phrases, k=len(valid_phrases))
train_set, test_set = shuffled[:25000], shuffled[25000:]

In [16]:
def random_valid_phrase(index_list):
  """Return the phrase stored at a randomly chosen index from `index_list`.

  The index is looked up in the module-level `phrases` list.
  """
  return phrases[random.choice(index_list)]

In [17]:
def random_training_tensor(index_list):
    """Sample one phrase and return it as an (input, target) tensor pair.

    The target is the input shifted one token to the left, so position i of
    the target holds the token that follows position i of the input.
    """
    encoded = phrase_to_tensor(random_valid_phrase(index_list), index_dict)
    return encoded[:-1], encoded[1:]

**LSTM for text generation**

In [18]:
class LSTM(nn.Module):
    """Word-level language model: embedding -> LSTM -> linear -> log-softmax.

    The network processes one token per forward call (sequence length 1,
    batch size 1); the caller threads the recurrent state through
    successive calls.
    """

    def __init__(self, input_dim, output_dim, hidden_dim, num_layers, embedding_dim=100):
        """Build the layers.

        Args:
            input_dim: number of rows in the embedding table.
            output_dim: number of output classes.
            hidden_dim: size of the LSTM hidden state.
            num_layers: number of stacked LSTM layers.
            embedding_dim: size of each token embedding.
        """
        super().__init__()
        self.i_dim = input_dim
        self.e_dim = embedding_dim
        self.h_dim = hidden_dim
        self.o_dim = output_dim
        self.n_layers = num_layers

        self.embedding = nn.Embedding(self.i_dim, self.e_dim)
        self.lstm = nn.LSTM(
            input_size=self.e_dim,
            hidden_size=self.h_dim,
            num_layers=self.n_layers,
        )
        self.out = nn.Linear(self.h_dim, self.o_dim)

    def forward(self, inp, hidden_cell):
        """Run one time step.

        Args:
            inp: tensor holding a single token id.
            hidden_cell: (hidden, cell) state tuple from the previous step.

        Returns:
            (log_probs, new_state), where log_probs has shape
            (1, output_dim) and new_state is the updated (hidden, cell) tuple.
        """
        # Reshape to (seq_len=1, batch=1, embedding_dim) as nn.LSTM expects.
        step_input = self.embedding(inp).view(1, 1, -1)
        lstm_out, new_state = self.lstm(step_input, hidden_cell)
        logits = self.out(lstm_out.view(1, -1))
        return fnc.log_softmax(logits, dim=1), new_state

    def init_hidden(self):
        """Return a zeroed (hidden, cell) pair, each (n_layers, 1, h_dim)."""
        state_shape = (self.n_layers, 1, self.h_dim)
        return torch.zeros(state_shape), torch.zeros(state_shape)

In [19]:
def generate(LSTM_model, start='i', max_len=150, num_lines=4, temp=0.8):
    """Sample a sequence of tokens from the model, beginning with `start`.

    Args:
        LSTM_model: model exposing `init_hidden()` and a forward call that
            maps (token tensor, (hidden, cell)) to (log-probs, (hidden, cell)).
        start: first token of the generated text (treated as a single token).
        max_len: maximum number of tokens to sample after `start`.
        num_lines: stop once this many 'endline' tokens have been produced.
        temp: sampling temperature; log-probabilities are divided by it
            before sampling, so lower values make sampling greedier.

    Returns:
        List of tokens, starting with `start`.
    """
    predicted = [start]

    # Pure inference: no autograd graph needs to be kept across steps.
    with torch.no_grad():
        hidden, cell = LSTM_model.init_hidden()
        prime_input = phrase_to_tensor([[start]], index_dict)

        # Warm the state up on every priming token except the last one; the
        # last token is the first input of the sampling loop below, so
        # feeding it here as well would present it to the model twice.
        for p in range(len(prime_input) - 1):
            _, (hidden, cell) = LSTM_model(prime_input[p], (hidden, cell))
        next_input = prime_input[-1]

        line_count = 0
        for _ in range(max_len):
            if line_count >= num_lines:
                break

            output, (hidden, cell) = LSTM_model(next_input, (hidden, cell))

            # Unnormalised sampling weights; torch.multinomial normalises.
            output_dist = output.view(-1).div(temp).exp()
            # Id 0 is the out-of-vocabulary id and has no entry in
            # rev_index_dict, so it must never be sampled.
            output_dist[0] = 0
            i = int(torch.multinomial(output_dist, 1)[0])
            predicted_next = rev_index_dict[i]

            if predicted_next == 'endline':
                line_count += 1

            predicted.append(predicted_next)
            next_input = phrase_to_tensor([[predicted_next]], index_dict)

    return predicted

In [20]:
import time, math

def time_since(since):
    """Format the time elapsed since `since` as '<minutes>m <seconds>s'.

    Args:
        since: a timestamp as returned by time.time().
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [21]:
def training_step(LSTM_model, optimizer, input, target, sample_len, criterion):
    """Run one optimisation step on a single sequence.

    The sequence is fed to the model token by token, the per-token losses
    are summed, and one backward pass plus one optimizer step is performed.

    Args:
        LSTM_model: model exposing `init_hidden()` and a per-token forward.
        optimizer: optimizer updating the model parameters.
        input: 1-D tensor of input token ids.
        target: 1-D tensor of target token ids, aligned with `input`.
        sample_len: number of positions to train on.
        criterion: loss taking (log-probs, target id tensor of shape (1,)).

    Returns:
        The mean per-token loss of the sequence as a Python float.
    """
    state = LSTM_model.init_hidden()
    LSTM_model.zero_grad()

    step_losses = []
    for position in range(sample_len):
        log_probs, state = LSTM_model(input[position], state)
        step_losses.append(criterion(log_probs, target[position].view(1)))

    # sum() starts from the integer 0, the same accumulation order as adding
    # each loss to a running total.
    total_loss = sum(step_losses)
    total_loss.backward()
    optimizer.step()

    return total_loss.item() / sample_len

In [30]:
#The simplest neural model we try out has a hidden size of 128 and just one lstm layer
hidden_size = 128
n_layers = 1
# Token ids run from 1 to len(index_dict), and 0 is the out-of-vocabulary id,
# so the embedding table and the output layer need len(index_dict) + 1 slots.
# With only len(index_dict) slots the highest id is out of range and raises
# IndexError in nn.Embedding / nn.NLLLoss.
n_items = len(index_dict) + 1

lr = 0.005
onelayer_LSTM_model = LSTM(n_items, n_items, hidden_size, n_layers, embedding_dim=100)
optimizer = torch.optim.Adam(onelayer_LSTM_model.parameters(), lr=lr)
criterion = nn.NLLLoss()

In [33]:
#pickle the object for further training sessions
# The context manager flushes and closes the file even if pickling fails.
with open("1lyr_word_level_LSTM.p", "wb") as model_file:
    pickle.dump(onelayer_LSTM_model, model_file)

In [24]:
# Reload the one-layer model from the file it is pickled to
# ("1lyr_word_level_LSTM.p"); the previous name lacked the leading "1".
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("1lyr_word_level_LSTM.p", "rb") as model_file:
    onelayer_LSTM_model = pickle.load(model_file)

In [35]:
#generic training loop: each "epoch" trains on one randomly drawn phrase

def train(model, optimizer, criterion, n_epochs,print_every, plot_every, index_list, return_loss_array):
  """Train `model` on randomly sampled phrases, one phrase per epoch.

  Args:
    model: the network to train.
    optimizer: optimizer bound to the parameters of `model`.
    criterion: loss function passed on to training_step.
    n_epochs: number of single-phrase training steps to run.
    print_every: print progress and a generated sample every this many epochs.
    plot_every: record the average loss every this many epochs.
    index_list: phrase indices to sample training examples from.
    return_loss_array: when true, return the list of recorded average losses.

  Returns:
    The list of averaged losses when `return_loss_array` is true, else None.
  """
  loss_sum = 0
  n_in_window = 0
  all_losses = []
  start = time.time()
  for epoch in range(1, n_epochs+1):
    phrase = random_training_tensor(index_list)
    # A sample can raise IndexError inside the training step. Report it and
    # leave it out of the statistics instead of reusing the previous loss
    # (or hitting an undefined name when the very first sample fails).
    try:
      loss = training_step(model, optimizer, phrase[0], phrase[1], len(phrase[1]), criterion)
    except IndexError:
      print("Error: ",phrase)
      loss = None

    if loss is not None:
      loss_sum += loss
      n_in_window += 1

    if epoch % print_every == 0:
        shown_loss = float('nan') if loss is None else loss
        print('[{} ({} {}%) {:.4f}]'.format(time_since(start), epoch, epoch/n_epochs * 100, shown_loss))
        print(nice_format(generate(model)), '\n')

    if epoch % plot_every == 0:
        # Average over the samples that actually trained in this window.
        all_losses.append(loss_sum / n_in_window if n_in_window else float('nan'))
        loss_sum = 0
        n_in_window = 0

  if return_loss_array:
    return all_losses

In [36]:
def nice_format(output_list):
    """Join tokens with spaces, rendering each 'endline' token as a newline."""
    rendered = []
    for token in output_list:
        rendered.append("\n" if token == 'endline' else token)
    return " ".join(rendered)

In [37]:
#A word level network with two hidden layers
hidden_size = 128
n_layers = 2
loss_array = []
# Token ids run from 1 to len(index_dict), and 0 is the out-of-vocabulary id,
# so the embedding table and the output layer need len(index_dict) + 1 slots.
# With only len(index_dict) slots the highest id is out of range and raises
# IndexError in nn.Embedding / nn.NLLLoss.
n_items = len(index_dict) + 1

lr = 0.005
twolayer_LSTM_model = LSTM(n_items, n_items, hidden_size, n_layers, embedding_dim=100)
optimizer = torch.optim.Adam(twolayer_LSTM_model.parameters(), lr=lr)
criterion = nn.NLLLoss()

In [None]:
# Train the two-layer model and append the newly recorded average losses to
# the history, so that re-running this cell continues the loss curve.
new_losses = train(
    twolayer_LSTM_model,
    optimizer,
    criterion,
    n_epochs=20000,
    print_every=1000,
    plot_every=10,
    index_list=train_set,
    return_loss_array=True,
)
loss_array = loss_array + new_losses

[5m 11s (1000 5.0%) 4.5836]
i go to beg 
 catch care of wallet 
 i smoke you in my couch 
 you know i 've been ridin ' on the passin ' eyes 
 

[10m 22s (2000 10.0%) 4.6040]
i stay away 
 you could go back 
 i would do wo n't know i am the gang 
 and you want it i'ma think there for real 
 

[15m 32s (3000 15.0%) 4.5529]
i ca n't take a plane 
 broken i am see i 'm ridin ' 
 i cant get this game 
 no less clocks with the sun more real 
 

[20m 40s (4000 20.0%) 5.5018]
i go to you real time to down 
 i often 
 but a beast 'fore you get like 
 i got nothing straight tryna pull up 
 

[25m 49s (5000 25.0%) 3.4650]
i 'm like i need to do your name 
 when you with else i earn the power i 'm slapped 
 but i 'm as hell i 'm not a thug up 
 it 's like a shame make a choice 
 

[31m 2s (6000 30.0%) 4.4782]
i made that i can give a fuck 
 you can not listen 
 hold in my chest 
 because i make a true day 
 

[36m 11s (7000 35.0%) 4.7560]
i eat stage hard 
 we do n't care the top 
 go in the futur

In [42]:
#pickle the model and its loss history for further training sessions
# Context managers flush and close each file even if pickling fails.
with open("2lyr_word_level_LSTM.p", "wb") as model_file:
    pickle.dump(twolayer_LSTM_model, model_file)
with open("2lyr_losses.p", "wb") as losses_file:
    pickle.dump(loss_array, losses_file)