In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import FreqDist
from nltk import ngrams
import torch
import nltk
import random
import pickle
import torch.nn as nn
import torch.nn.functional as fnc
import itertools
from sklearn.naive_bayes import MultinomialNB

**The code for the model classes**

In [2]:
class LSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> linear language model, stepped one token at a time.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size on the input side).
        output_dim: Size of the output layer (number of classes scored at every step).
        hidden_dim: Hidden size of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        embedding_dim: Size of the token embedding vectors.
    """

    def __init__(self, input_dim, output_dim, hidden_dim, num_layers, embedding_dim=100):
        super().__init__()
        # Attribute and submodule names are part of the object's state and are
        # read by forward()/init_hidden(), so they are kept exactly as they were.
        self.i_dim = input_dim
        self.e_dim = embedding_dim
        self.h_dim = hidden_dim
        self.o_dim = output_dim
        self.n_layers = num_layers

        self.embedding = nn.Embedding(self.i_dim, self.e_dim)
        self.lstm = nn.LSTM(input_size=self.e_dim, hidden_size=self.h_dim, num_layers=self.n_layers)
        self.out = nn.Linear(self.h_dim, self.o_dim)

    def forward(self, inp, hidden_cell):
        """Advance the model by one step.

        Args:
            inp: Index tensor for the current token. The embedding is reshaped to
                (1, 1, -1), so this assumes exactly one index per call -- TODO confirm
                against phrase_to_tensor.
            hidden_cell: Tuple ``(hidden, cell)`` as returned by ``init_hidden`` or by
                a previous call to ``forward``.

        Returns:
            ``(log_probs, (hidden, cell))`` where ``log_probs`` has shape
            (1, output_dim) and holds log-softmax scores.
        """
        embedded = self.embedding(inp)
        # Sequence length 1, batch size 1: the model is driven token by token.
        lstm_out, hidden = self.lstm(embedded.view(1, 1, -1), hidden_cell)
        res = self.out(lstm_out.view(1, -1))
        res = fnc.log_softmax(res, dim=1)

        return res, hidden

    def init_hidden(self):
        """Return an all-zero ``(hidden, cell)`` state for a batch of size 1.

        The tensors are created on the same device and with the same dtype as the
        model's weights, so the state stays usable after ``.to(device)`` or
        ``.double()``; fixed float32 CPU zeros would make ``forward`` fail there.
        """
        reference = self.out.weight
        shape = (self.n_layers, 1, self.h_dim)
        hidden = torch.zeros(*shape, device=reference.device, dtype=reference.dtype)
        cell = torch.zeros(*shape, device=reference.device, dtype=reference.dtype)
        return hidden, cell

In [3]:
def generate(LSTM_model, start=['i'], max_len=150, num_lines=4, temp=0.8):
    """Sample a token sequence from a step-wise LSTM language model.

    Relies on ``phrase_to_tensor``, ``index_dict`` and ``rev_index_dict`` being
    defined at module level (they are not defined in this cell).

    Args:
        LSTM_model: Model exposing ``init_hidden()`` and a call signature
            ``model(inp, (hidden, cell)) -> (log_probs, (hidden, cell))``.
        start: Tokens used to prime the hidden state; copied, never mutated.
        max_len: Maximum number of tokens to sample.
        num_lines: Stop once this many ``'endline'`` tokens have been sampled.
        temp: Sampling temperature; log-probabilities are divided by it before
            exponentiation.

    Returns:
        list: A copy of ``start`` followed by the sampled tokens (``'endline'``
        tokens included).
    """
    # Generation never backpropagates. Without no_grad the carried (hidden, cell)
    # state keeps the autograd graph of every previous step alive.
    with torch.no_grad():
        hidden, cell = LSTM_model.init_hidden()
        prime_input = phrase_to_tensor([start], index_dict)
        predicted = start[:]

        # NOTE(review): this loop feeds every element of prime_input, including the
        # last one, and the sampling loop below then feeds prime_input[-1] again.
        # Confirm against phrase_to_tensor's output shape whether that is intended.
        for p in range(len(prime_input)):
            _, (hidden, cell) = LSTM_model(prime_input[p], (hidden, cell))
        next_input = prime_input[-1]

        line_count = 0
        for _ in range(max_len):
            if line_count >= num_lines:
                break
            output, (hidden, cell) = LSTM_model(next_input, (hidden, cell))

            # Temperature-scaled, unnormalised weights; multinomial normalises them.
            output_dist = output.view(-1).div(temp).exp()
            i = int(torch.multinomial(output_dist, 1)[0])
            predicted_next = rev_index_dict[i]

            if predicted_next == 'endline':
                line_count = line_count + 1

            predicted.append(predicted_next)
            next_input = phrase_to_tensor([[predicted_next]], index_dict)

    return predicted

In [4]:
def nice_format(output_list, meta_list=[]):
    """Turn a token list into a printable string.

    Tokens listed in ``meta_list`` are dropped, each remaining ``'endline'`` token
    becomes a newline character, and everything is joined with single spaces.
    """
    return " ".join(
        "\n" if token == 'endline' else token
        for token in output_list
        if token not in meta_list
    )

In [5]:
class ngram_markov_generator(object):
    """Markov-chain text generator over token lists, conditioned on the last `order` tokens.

    ``freq_dict`` maps a context (tuple of ``order`` tokens) to a ``FreqDist`` that
    counts which token followed that context in the training texts.

    Args:
        order: Number of preceding tokens used as context (2 -> trigram statistics).
        end: Token that terminates generation.
        endline: Token that marks the end of a line.
        meta_list: Tokens removed by ``nice_format``.
    """

    def __init__(self, order,  end='nxtsng', endline='endline', meta_list=['nxtvrse','nxtsng']):
        self.end = end
        self.endline = endline
        # Copy the argument: the default list is a single object shared by every
        # call, so storing it directly would let one instance's edits leak into others.
        self.meta_list = list(meta_list)
        self.order = order
        self.freq_dict = dict()

    def train(self, tknzd_txt_list):
        """Count every (order+1)-gram of every tokenized text in ``tknzd_txt_list``."""
        for text in tknzd_txt_list:
            for gram in ngrams(text, self.order + 1):
                self.add_to_dict(gram)

    def add_to_dict(self, gram):
        """Record one n-gram: ``gram[:-1]`` is the context, ``gram[-1]`` the follower."""
        context, follower = gram[:-1], gram[-1]
        counts = self.freq_dict.get(context)
        if counts is None:
            self.freq_dict[context] = FreqDist([follower])
        else:
            counts[follower] += 1

    def _context(self, tokens):
        """Return the lookup key for the next token: the last ``order`` tokens as a tuple."""
        if self.order <= 0:
            # tokens[-0:] would be the whole list; an order-0 model has the empty context.
            return ()
        return tuple(tokens[-self.order:])

    def _sample_next(self, context, temp):
        """Draw the next token for ``context`` with counts raised to the power ``1/temp``.

        Raises:
            KeyError: If ``context`` was never seen during training.
            ZeroDivisionError: If ``temp`` is 0.
        """
        counts = self.freq_dict[context]
        tempered = {word: count ** (1 / temp) for word, count in counts.items()}
        dist = nltk.DictionaryProbDist(tempered, normalize=True)
        return str(dist.generate())

    def generate_text(self, start, max_len=20, temp=1):
        """Extend ``start`` by up to ``max_len`` sampled tokens, stopping after ``end``.

        ``start`` is copied and left unmodified. The ``end`` token, if sampled, is
        included in the result.

        Returns:
            list: ``start`` followed by the sampled tokens.

        Raises:
            KeyError: If the current context was never seen during training.
        """
        res_sent = list(start)

        for _ in range(max_len):
            nextword = self._sample_next(self._context(res_sent), temp)
            res_sent.append(nextword)

            if nextword == self.end:
                break

        return res_sent

    def generate_lines(self, start, num_lines, max_len=200, temp=1):
        """Extend ``start`` until ``num_lines`` line ends, the ``end`` token, or ``max_len`` draws.

        ``start`` is copied and left unmodified. The token that triggers the stop
        (the final ``endline`` or the ``end`` token) is not appended.

        Returns:
            list: ``start`` followed by the sampled tokens.

        Raises:
            KeyError: If the current context was never seen during training.
        """
        res_sent = list(start)
        linecount = 0
        for _ in range(max_len):
            nextword = self._sample_next(self._context(res_sent), temp)

            if nextword == self.endline:
                linecount = linecount + 1

            if linecount >= num_lines or nextword == self.end:
                break

            res_sent.append(nextword)

        return res_sent

    def nice_format(self, output_list):
        """Drop ``meta_list`` tokens, turn ``endline`` tokens into newlines, join with spaces."""
        no_meta = [x for x in output_list if x not in self.meta_list]
        with_linebreaks = ["\n" if x == self.endline else x for x in no_meta]
        return " ".join(with_linebreaks)

**Unpickling data and neural models**

In [6]:
# Load the pickled corpus; later cells pass it to ngram_markov_generator.train(),
# which iterates over it as a collection of tokenized texts.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# The `with` block closes the file handle, which the bare open() call never did.
with open("OHHLAdata_list.p", "rb") as data_file:
    OHHLA_list = pickle.load(data_file)

In [12]:
# Disabled cell: the four pickle loads below sit inside a string literal, so
# nothing is unpickled when this cell runs; the cell only evaluates the string.
# NOTE(review): the file names suggest pickled word-level LSTM models -- confirm
# before re-enabling.
'''
TwoLyrBig = pickle.load(open("2lyr_word_level_LSTM(2).p","rb"))
OneLyrBig = pickle.load(open("Biglyr_word_level_lstm(1).p","rb"))
TwoLyrSmall = pickle.load(open("smallvocab2(1).p","rb"))
OneLyrSmall = pickle.load(open("smallvocab(1).p","rb"))
'''

**Training the n-gram Markov models**

In [7]:
# Order 2: each next token is conditioned on the two preceding tokens,
# so train() counts 3-grams (order + 1) over the corpus.
trigram_mc = ngram_markov_generator(2)
trigram_mc.train(OHHLA_list)

In [103]:
# Order 3: each next token is conditioned on the three preceding tokens,
# so train() counts 4-grams (order + 1) over the corpus.
fgram_mc = ngram_markov_generator(3)
fgram_mc.train(OHHLA_list)

**Filtered generation function**  
As a proof of concept, we implemented a function that uses the classifier as a filter: it generates a batch of candidate lyrics and keeps only the one with the highest score (probability) according to the classifier. It does not work for the neural models yet.

In [10]:
# Load the pickled classifier and the object used to vectorize text for it.
# filtered_generate() calls tfidf_dict.transform(...) and classifier.predict_proba(...).
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# The `with` blocks close the file handles, which the bare open() calls never did.
with open("mnb_classifier.p", "rb") as classifier_file:
    classifier = pickle.load(classifier_file)
with open("tfidf_dict.p", "rb") as tfidf_file:
    tfidf_dict = pickle.load(tfidf_file)



In [44]:
def filtered_generate(generator, start_sequence= ["nxtvrse","hello","world"], num_lines=4, max_words=200, selection_size=100, temperature=0.8):
    """Generate several candidate lyrics and return the one the classifier scores highest.

    Uses the module-level ``tfidf_dict`` (its ``transform`` method) and
    ``classifier`` (its ``predict_proba`` method).

    Args:
        generator: Object providing ``generate_lines(start, num_lines, max_len, temp)``
            and ``nice_format(tokens)``.
        start_sequence: Tokens every candidate starts from; a fresh copy is handed
            to the generator for each candidate.
        num_lines: Number of lines requested per candidate.
        max_words: Upper bound on sampled tokens per candidate.
        selection_size: Number of candidates to generate and score.
        temperature: Sampling temperature passed to the generator.

    Returns:
        str: The formatted candidate with the largest value in column 1 of
        ``predict_proba``; the first such candidate wins a tie.
    """
    candidates = [
        generator.nice_format(
            generator.generate_lines(start_sequence[:], num_lines, max_words, temperature)
        )
        for _ in range(selection_size)
    ]

    # Score every candidate in one batch; column 1 of predict_proba is the score used.
    features = tfidf_dict.transform(candidates)
    probabilities = classifier.predict_proba(features)
    scores = [row[1] for row in probabilities]

    best_index = scores.index(max(scores))
    return candidates[best_index]

In [45]:
# Print 20 filtered samples from the trigram model. Each filtered_generate() call
# generates selection_size candidates (default 100) and returns only the best-scoring one.
for y in range(20):
    print(filtered_generate(trigram_mc))

hello world 
 and when i'm drinkin that goosey witta me brew 
 doin a di realest ting yuh know 
 if you ain't got a grudge i know i took this year
hello world 
 cause i ain't sayin' nothin' 
 reflectin' rhymes eternal like your balls and your nigga want beef then goget boy had this shit 
 i ain't talking 'bout
hello world 
 of a snake 
 pullin up in the hype 
 they say they a lil nigga with a real nigga
hello world how we livin 
 do you want weed holla at ya girl and please tell mi seh slew dem hit dat one yah name!!! cho!!!! 
 tell me what the fuck up the river wit yo body 
 my new honey dew
hello world alright 
 do them niggas feind for more 
 you ain't grabbin' the thang 
 when i get uh huh uh huh
hello world 
 oops caught me in the back of your friends know i was on a hoe like a boss i'm moving on the mic an bust 
 but i never gave a fuck if you let these niggaz acting like she on my line did i do it for the realest 
 man why me love
hello world 
 i ain't livin large checks comin a