In [25]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import nltk 
import copy
import os
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

%matplotlib inline

# The 39 phoneme symbols (no stress digits) that prepareData keeps; any other symbol is filtered out there.
list_of_phonemes = ['AA','AE','AH','AO','AW','AY','B','CH','D','DH','EH','ER','EY','F','G', 'HH', 'IH', 'IY','JH','K','L','M','N','NG','OW','OY','P','R','S','SH','T','TH','UH','UW','V','W','Y','Z','ZH']
# Symbols that find_vowels treats as vowels when clean_line scores alternative pronunciations.
# NOTE(review): 'Y' is included here although it is normally classed as a consonant/semivowel -- confirm this is intended.
vowels=[u'AA',u'AE',u'AH',u'AO',u'AW',u'AY',u'EH',u'ER',u'EY',u'IH',u'IY',u'OW',u'OY',u'UH',u'UW',u'Y']
# CMU pronouncing dictionary as a dict: word -> list of pronunciations, each a list of phoneme strings
# that may carry stress digits (see the arpabet['the'] example inside wordWithoutNum).
arpabet = nltk.corpus.cmudict.dict()
# True when a CUDA device is available; the rest of the notebook uses it to decide whether to call .cuda().
use_cuda = torch.cuda.is_available()

In [2]:
# Reserved vocabulary indices for the start-of-sequence and end-of-sequence markers.
# Lyrics.index2phonemes pre-populates exactly these two entries (0 -> "SOS", 1 -> "EOS").
SOS_token = 0
EOS_token = 1


class Lyrics:
    """Phoneme vocabulary plus bigram (adjacent-phoneme) counts for a corpus.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers; every new
    phoneme passed to addPhoneme receives the next free index.

    Attributes:
        name: free-form label given by the caller.
        phonemes2index: phoneme -> vocabulary index.
        phonemes2count: phoneme -> number of occurrences seen so far.
        index2phonemes: vocabulary index -> phoneme (includes SOS/EOS).
        phonemes2prob: first phoneme -> count vector; entry [index of second]
            is how often `second` directly followed `first`.
        phonemes2probReverse: second phoneme -> count vector; entry
            [index of first] is how often `first` directly preceded `second`.
        n_phonemes: current vocabulary size, including SOS and EOS.
    """

    # Length of every transition-count vector. 41 was hard-coded in the
    # original code; it leaves room for vocabulary indices 0..40, so indexing
    # fails with IndexError once more than 41 entries (incl. SOS/EOS) exist.
    TABLE_SIZE = 41

    def __init__(self, name):
        self.name = name
        self.phonemes2index = {}
        self.phonemes2count = {}
        self.index2phonemes = {0: "SOS", 1: "EOS"}
        self.phonemes2prob = {}
        self.phonemes2probReverse = {}

        self.n_phonemes = 2  # Count SOS and EOS

    def addLine(self, line):
        """Register every phoneme of `line` (a list) and every adjacent pair."""
        for phoneme in line:
            self.addPhoneme(phoneme)

        # Each adjacent pair updates both the forward and the reverse table.
        for first, second in zip(line, line[1:]):
            self.addProb(first, second)
            self.addProbReverse(first, second)

    def addPhoneme(self, phoneme):
        """Count one occurrence of `phoneme`, assigning it an index if new."""
        if phoneme not in self.phonemes2index:
            self.phonemes2index[phoneme] = self.n_phonemes
            self.phonemes2count[phoneme] = 1
            self.index2phonemes[self.n_phonemes] = phoneme
            self.n_phonemes += 1
        else:
            self.phonemes2count[phoneme] += 1

    def addProb(self, first, second):
        """Add one to the counter behind P(second|first).

        Both phonemes must already be registered (KeyError otherwise).
        """
        if first not in self.phonemes2prob:
            self.phonemes2prob[first] = np.zeros(self.TABLE_SIZE)
        self.phonemes2prob[first][self.phonemes2index[second]] += 1

    def addProbReverse(self, first, second):
        """Add one to the counter behind P(first|second).

        `first` is the phoneme that comes first when the line is read as it
        would be spoken, e.g. in [u'K', u'AE', u'N', ...] K is first and AE
        is second.
        """
        if second not in self.phonemes2probReverse:
            self.phonemes2probReverse[second] = np.zeros(self.TABLE_SIZE)
        self.phonemes2probReverse[second][self.phonemes2index[first]] += 1

    def getCondProb(self, first, second):
        """Return P(second|first); both arguments are vocabulary indices.

        Computed as count(first directly followed by second) / count(first).
        Raises KeyError if either index is unknown or `first` was never
        followed by anything.
        """
        f = self.index2phonemes[first]
        self.index2phonemes[second]  # validates the index, as the original lookup did
        # The count vector is indexed by vocabulary index. The original code
        # used the phoneme *string* here, which numpy rejects with IndexError.
        return self.phonemes2prob[f][second]/self.phonemes2count[f]

    def getCondProbReverse(self, first, second):
        """Return P(first|second); both arguments are vocabulary indices.

        Computed as count(first directly followed by second) / count(second).
        Raises KeyError if either index is unknown or `second` was never
        preceded by anything.
        """
        self.index2phonemes[first]  # validates the index, as the original lookup did
        s = self.index2phonemes[second]
        return self.phonemes2probReverse[s][first]/self.phonemes2count[s]

In [3]:
def readLyrics(name, reverse=False):
    """Load the rhyming line pairs from disk and create an empty Lyrics object.

    Reads 'rhyming_pairs2.csv' (row labels of the two lines of each pair, in
    its first two columns) and 'pruned_data' (the lyric lines; only column 0
    is used), both relative to the current working directory.

    Args:
        name: label passed to the Lyrics constructor.
        reverse: when True, each pair is returned as [second, first].

    Returns:
        (lyr, pairs): a fresh Lyrics(name) and a list of line pairs. Pairs are
        tuples, or lists when `reverse` is True.
    """
    print("Reading lines...")

    # header=1 makes the second row of the file the header, so the row above
    # it is skipped. NOTE(review): confirm this matches the CSV layout.
    indexes_of_pairs = pd.read_csv('rhyming_pairs2.csv', header=1)

    pruned_data = pd.read_csv('pruned_data', header=None)
    pruned_data = pruned_data[0]

    pairs = []
    for _, row in indexes_of_pairs.iterrows():
        # .iloc gives explicit positional access. Plain row[0] on a Series
        # with non-integer labels relies on a deprecated fallback in pandas.
        first_line = pruned_data[row.iloc[0]]
        second_line = pruned_data[row.iloc[1]]

        pairs.append((first_line, second_line))

    # Reverse pairs
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]

    lyr = Lyrics(name)

    return lyr, pairs

In [4]:
def convert_to_phonemes(s):
    """Convert a space-separated line of text into per-word pronunciations.

    Each word is looked up with wordWithoutNum. Words that are not found get
    two fallbacks:
      * a word ending in "in'" (e.g. "hangin'") is looked up as "...ing" and
        the last phoneme of every pronunciation is replaced by 'N';
      * any other word containing an apostrophe is retried with all
        apostrophes removed.
    Words that still cannot be resolved (including empty strings produced by
    consecutive spaces) are silently left out.

    Args:
        s: the line as a string.

    Returns:
        A list with one entry per resolved word; each entry is that word's
        list of pronunciations, and each pronunciation is a list of phoneme
        strings.
    """
    line_phonemes = []

    for word in s.split(' '):
        try:
            line_phonemes.append(wordWithoutNum(word))
        except KeyError:
            if word[-3:] == "in'":
                # The recursive call returns a *line* (list of words) holding
                # at most this one word; it is empty if "...ing" is unknown.
                ing_as_phonemes = convert_to_phonemes(word[:-3] + 'ing')
                if ing_as_phonemes:
                    pronunciations = ing_as_phonemes[0]
                    for pro in pronunciations:
                        pro[-1] = u'N'
                    # Append the word's pronunciations, not the wrapping line.
                    # The original appended the whole line, one nesting level
                    # too deep, so prepareData's phoneme filter dropped these words.
                    # NOTE(review): this changes the phoneme sequences produced
                    # for "in'" words -- check any saved model that was trained
                    # on the old data.
                    line_phonemes.append(pronunciations)
            elif "'" in word:
                try:
                    line_phonemes.append(wordWithoutNum(word.replace("'","")))
                except KeyError:
                    # Best effort: still unknown without the apostrophe, skip it.
                    pass
    return line_phonemes
# print(convert_to_phonemes("hangin' in the good day feelin' good"))
# print(convert_to_phonemes("a whitened sandwich and again it stopped"))
# print(convert_to_phonemes("a derelick makes a real long speech"))
#l = convert_to_phonemes("after hours 'it' was cool")
#l2 = convert_to_phonemes("ten after one i think i'll hop the horse")

In [None]:
def traverseLineForMatches(sound, line):
    """Count the pronunciations in `line` that contain `sound`.

    Args:
        sound: the phoneme to look for.
        line: list of words, each word being a list of pronunciations (each a
            list of phonemes). A word with several matching pronunciations is
            counted once per matching pronunciation.

    Returns:
        The number of matching pronunciations. A word that cannot be
        traversed is reported on stdout and skipped; matches counted for it
        before the failure are kept.
    """
    count = 0
    for word in line:
        try:
            for pronunciation in word:
                if sound in pronunciation:
                    count += 1
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit the way the original bare `except:` did.
            print('Something went wrong. Skipping this bit....')
    return count


def find_vowels(pronunciation):
    """Return the sounds of `pronunciation` that appear in the module-level
    `vowels` list, in their original order and with repeats kept."""
    return [sound for sound in pronunciation if sound in vowels]

def wordWithoutNum(word):
    """Return the dictionary pronunciations of `word` without stress digits.

    `word` is looked up in the module-level `arpabet` dict exactly as given
    (a missing word raises KeyError). Stripping the digits can make two
    pronunciations identical, for example
        arpabet['the'] = [[u'DH', u'AH0'], [u'DH', u'AH1'], [u'DH', u'IY0']]
    becomes [[u'DH', u'AH'], [u'DH', u'IY']]; only the first of such
    duplicates is kept. No "best" pronunciation is chosen here.
    """
    unique_pronunciations = []
    for raw_pronunciation in arpabet[word]:
        # Rebuild each phoneme from its non-digit characters.
        digitless = [
            ''.join(ch for ch in sound if not ch.isdigit())
            for sound in raw_pronunciation
        ]
        if digitless not in unique_pronunciations:
            unique_pronunciations.append(digitless)
    return unique_pronunciations



def clean_line(line, prev_line):
    """Reduce every word of `line` to a single pronunciation.

    `line` is the output of convert_to_phonemes: a list of words, each word a
    list of alternative pronunciations, e.g. for "the":
        [[u'DH', u'AH'], [u'DH', u'IY'], [u'TH', u'AH'], [u'TH', u'IY']]

    For a word with several pronunciations, each candidate is scored by how
    often its vowels occur in `line` (all candidates of all words, flattened)
    plus in `prev_line`, averaged over the candidate's number of vowels. The
    highest score wins; ties go to the earliest candidate. A candidate with
    no vowel scores 0 (the original divided by zero here).

    Args:
        line: uncleaned line as described above.
        prev_line: list whose elements are compared directly with phoneme
            strings via list.count, so only a flat list of phonemes can
            contribute. NOTE(review): prepareData passes a nested line here,
            for which every count is 0 -- confirm whether that is intended.

    Returns:
        A list with one entry per word, each entry a one-element list holding
        the chosen pronunciation. Words with no pronunciation at all are
        dropped.
    """
    # Every phoneme of every candidate pronunciation in the line, flattened.
    line_as_pure_phonemes = [
        sound for word in line for pronunciation in word for sound in pronunciation
    ]

    cleaned_line = []
    for word in line:
        if len(word) == 1:
            # Only one pronunciation, nothing to choose.
            cleaned_line.append(word)
            continue
        if not word:
            # No pronunciation to pick from (previously max() raised ValueError).
            continue

        pronunciation_vowels_scores = []
        for pronunciation in word:
            found = find_vowels(pronunciation)
            if not found:
                pronunciation_vowels_scores.append(0.0)
                continue
            count = 0
            for sound in found:
                count += line_as_pure_phonemes.count(sound)
                count += prev_line.count(sound)
            # Average, so a three-vowel candidate is comparable with a two-vowel one.
            pronunciation_vowels_scores.append(count / float(len(found)))

        max_score_index = pronunciation_vowels_scores.index(
            max(pronunciation_vowels_scores))
        cleaned_line.append([word[max_score_index]])

    return cleaned_line

#this is a sample starting line
#prev_line = remove_lists(convert_to_phonemes("ten after one i think i'll hop the horse"))
#clean_line(convert_to_phonemes("hangin' in the good day feelin' good"), prev_line)
# print(convert_to_phonemes("a whitened sandwich and again it stopped"))
# print(convert_to_phonemes("a derelick makes a real long speech"))
# print(convert_to_phonemes("in the mist though but the rhyth's move in"))

In [None]:
MAX_LENGTH=30
def prepareData(name, reverse=False):
    """Read the line pairs, convert them to phoneme sequences and build the vocabulary.

    Each line of a pair is converted with convert_to_phonemes, reduced to one
    pronunciation per word with clean_line, flattened to a single list of
    phonemes and filtered to the symbols in list_of_phonemes. A pair is kept
    only if both phoneme lists are shorter than MAX_LENGTH; kept pairs are
    added to the Lyrics object.

    Returns:
        (lyr, pairs_as_phonemes, pairs_not_as_phonemes): the populated Lyrics
        object, the kept pairs as [first_phonemes, second_phonemes] and the
        matching original text pairs, index-aligned with each other.
    """
    def known_phonemes(cleaned_line):
        # Flatten word -> pronunciation -> sound, keeping only known symbols.
        return [
            sound
            for word in cleaned_line
            for pronunciation in word
            for sound in pronunciation
            if sound in list_of_phonemes
        ]

    lyrics, text_pairs = readLyrics(name, reverse)
    print("Reading %s sentence pairs" % len(text_pairs))
    print("Counting words...")

    pairs_as_phonemes = []
    pairs_not_as_phonemes = []
    num_of_pairs = 0
    for text_pair in text_pairs:
        raw_first = convert_to_phonemes(text_pair[0])
        raw_second = convert_to_phonemes(text_pair[1])
        # The first line is cleaned against the raw second line; the second
        # line is then cleaned against the already-cleaned first line.
        cleaned_first = clean_line(raw_first, raw_second)
        cleaned_second = clean_line(raw_second, cleaned_first)

        phoneme_pair = [known_phonemes(cleaned_first), known_phonemes(cleaned_second)]

        if len(phoneme_pair[0]) >= MAX_LENGTH or len(phoneme_pair[1]) >= MAX_LENGTH:
            continue

        num_of_pairs += 1
        pairs_as_phonemes.append(phoneme_pair)
        pairs_not_as_phonemes.append(text_pair)
        lyrics.addLine(phoneme_pair[0])
        lyrics.addLine(phoneme_pair[1])

    print("Pairs under ", MAX_LENGTH)
    print(num_of_pairs)

    return lyrics, pairs_as_phonemes, pairs_not_as_phonemes


# Build the notebook-wide globals used by the cells below:
#   lyr          - the populated Lyrics vocabulary,
#   pairs        - the kept pairs as phoneme lists,
#   normal_pairs - the same pairs as original text, index-aligned with `pairs`.
lyr, pairs, normal_pairs = prepareData('rap')
print('done')

In [None]:
def indexesFromLine(lyr, line):
    """Map every phoneme of `line` to its index in `lyr.phonemes2index`.

    Raises KeyError for a phoneme that is not in the vocabulary.
    """
    return [lyr.phonemes2index[phoneme] for phoneme in line]

#[lyr.phonemes2index[phoneme] for phoneme in line]


def variableFromLine(lyr, line):
    """Encode a phoneme line as a column of vocabulary indices.

    The indices are framed as SOS_token, <line indices...>, EOS_token and
    returned as a LongTensor Variable viewed as (length, 1), moved to the GPU
    when `use_cuda` is set.
    """
    token_ids = [SOS_token] + indexesFromLine(lyr, line) + [EOS_token]
    column = Variable(torch.LongTensor(token_ids).view(-1, 1))
    return column.cuda() if use_cuda else column

def variablesFromPair(pair):
    """Return (input_variable, target_variable) for a phoneme pair.

    Both lines are encoded with variableFromLine against the global `lyr`.
    """
    source_line, target_line = pair[0], pair[1]
    return (variableFromLine(lyr, source_line), variableFromLine(lyr, target_line))

In [None]:
# Display whether CUDA was detected (the flag is set in the first cell).
use_cuda

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a task started at `since`.

    `since` is a time.time() timestamp and `percent` is the fraction of the
    work already done; the remaining time is extrapolated linearly from it
    (a `percent` of 0 raises ZeroDivisionError).
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(estimated_total - elapsed))

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH+2):
    """Greedily decode a response to `sentence` (a list of phonemes).

    The sentence is encoded one token at a time, then the decoder is started
    from EOS_token and run until it predicts SOS_token or `max_length` steps
    have been taken, always following the single most likely output.

    Returns:
        (decoded_words, attentions): the predicted phonemes in the order they
        were produced ('<SOS>' is appended when the stop token is predicted)
        and one row of attention weights per decoding step.

    NOTE(review): the modules are not switched to eval mode here, so the
    decoder's dropout layer presumably stays active -- confirm.
    """
    input_variable = variableFromLine(lyr, sentence)
    input_length = input_variable.size()[0]
    encoder_hidden = encoder.initHidden()

    # One row per input position; rows beyond input_length stay zero.
    # An input longer than max_length would index past the end of this buffer.
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_variable[ei],
                                                 encoder_hidden)
        encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]

    # Decoding starts from EOS and stops on SOS (see the loop below).
    decoder_input = Variable(torch.LongTensor([[EOS_token]]))  # EOS
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # The decoder continues from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)

    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        decoder_attentions[di] = decoder_attention.data
        #print(decoder_attention)
        # Greedy choice: index of the highest-scoring output.
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0].cpu()
        ni = ni.numpy()
        if ni == SOS_token:
            decoded_words.append('<SOS>')
            break
        else:
            #print(decoder_output.data.topk(1))
            #print(int(ni))
            decoded_words.append(lyr.index2phonemes[int(ni)])

        # Feed the prediction back in as the next decoder input.
        decoder_input = Variable(torch.LongTensor([[int(ni)]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # Keep only the attention rows of the steps that were actually taken.
    return decoded_words, decoder_attentions[:di + 1]

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix as a heat map with a colour bar.

    `input_sentence` and `output_words` are lists of labels for the x and y
    axes; `attentions` is a tensor converted with .numpy().
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Tick labels; the leading '' is a placeholder label ahead of the data labels.
    axes.set_xticklabels([''] + input_sentence + ['<EOS>'], rotation=90)
    axes.set_yticklabels([''] + output_words)

    # One tick (and therefore one label) per matrix cell on both axes.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))
    plt.rcParams['figure.figsize'] = [10, 5]
    plt.show()


def evaluateAndShowAttention(input_sentence, encoder, decoder):
    """Greedily decode `input_sentence`, print the result and plot the attention.

    The plot labels use the decoded words in reverse order. Returns the
    decoded words in the order `evaluate` produced them.
    """
    decoded, attention_rows = evaluate(encoder, decoder, input_sentence)
    print(decoded)
    print("WARNING: Attentions graph does NOT use beam_search, meaning predictions will be severely worsened")

    flipped = decoded[::-1]
    showAttention(input_sentence, flipped, attention_rows)
    return decoded

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot `points` (e.g. averaged losses) with y-axis ticks every 0.2.

    The original also called plt.figure() before plt.subplots(), which
    created a second, empty figure on every call; that call is removed.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` randomly chosen pairs next to the beam-search prediction.

    For every sample the input ('>') and reference ('=') are shown both as
    phonemes and as the original text, followed by the model output ('<').
    """
    for _ in range(n):
        # Draw an index rather than a pair: `pairs` and `normal_pairs` are
        # index-aligned, and pairs.index(pair) was a linear scan that returned
        # the text of the *first* equal pair when phoneme pairs repeat.
        pair_index = random.randrange(len(pairs))
        pair = pairs[pair_index]
        pair_as_text = normal_pairs[pair_index]
        print('>', pair[0])
        print('>', pair_as_text[0])
        print('=', pair[1])
        print('=', pair_as_text[1])
        phos = evaluateBeamSearch(encoder, decoder, pair[0])
        output_sentence = ' '.join(phos)
        print('<', output_sentence)
        print('')

In [None]:
total_plot_losses = []
def trainItersAttentionBackwards(encoder, decoder, n_iters, plot_losses, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the encoder/decoder for `n_iters` steps on reversed targets.

    Every step draws a random pair from the global `pairs`, reverses the
    target sequence (so it runs from EOS back to SOS) and performs one SGD
    update through trainAttentionBackwards.

    Args:
        encoder, decoder: the modules to train.
        n_iters: number of training steps.
        plot_losses: list that receives the average loss of every
            `plot_every` steps; the same values are also appended to the
            global `total_plot_losses`.
        print_every: progress is printed every this many steps.
        plot_every: a loss point is recorded every this many steps.
        learning_rate: SGD learning rate for both optimizers.
    """
    start = time.time()
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [variablesFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iteration in range(1, n_iters + 1):
        input_variable, target_variable = training_pairs[iteration - 1]

        # Reverse the target along dimension 0. The index tensor is created on
        # the CPU and moved to the GPU only when CUDA is in use; the original
        # built a torch.cuda.LongTensor unconditionally, which fails without CUDA.
        idx = list(range(target_variable.size(0) - 1, -1, -1))
        idx = Variable(torch.LongTensor(idx))
        idx = idx.cuda() if use_cuda else idx
        target_variable = target_variable.index_select(0, idx)

        loss = trainAttentionBackwards(input_variable, target_variable, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iteration % print_every == 0:
            #torch.save(encoder.state_dict(), 'encoder_128_tfr_80.pt')
            #torch.save(decoder.state_dict(), 'decoder_128_tfr_80_do_50.pt')
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iteration / n_iters),
                                         iteration, iteration / n_iters * 100, print_loss_avg))

        if iteration % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            total_plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
# Probability that a training step feeds the reference token (instead of the
# model's own prediction) back into the decoder. Re-assigned to .5 in the cell
# that instantiates the models, so the value in effect depends on cell order.
teacher_forcing_ratio = 0.1
def trainAttentionBackwards(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH+2):
    """Run one training step (forward, backward, optimizer update) on one pair.

    `target_variable` is expected to be already reversed by the caller, so
    position 0 holds EOS and decoding proceeds from the end of the line
    towards SOS.

    Returns:
        The summed per-step loss divided by the target length.
        NOTE(review): `loss.data[0]` depends on the installed PyTorch version
        (0-dim tensors cannot be indexed in newer releases) -- confirm.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # One row per input position; rows beyond input_length stay zero.
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    loss = 0

    for ei in range(input_length):
        #print(input_variable[ei])
        encoder_output, encoder_hidden = encoder(
            input_variable[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0][0]

    #print(input_length)
    #print(encoder_outputs.size())
    #print(encoder_outputs)

    # first input is EOS because we start predicting from the end of the sentence
    decoder_input = Variable(torch.LongTensor([[EOS_token]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # The decoder continues from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        # Starts at 1 because target position 0 is the EOS already fed as the first input.
        for di in range(1, target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(1, target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]

            decoder_input = Variable(torch.LongTensor([[ni]]))
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

            loss += criterion(decoder_output, target_variable[di])
            # Stop early once the model predicts SOS, the end marker of a reversed line.
            if ni == SOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / target_length

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed-size buffer of encoder outputs.

    Args:
        hidden_size: size of the embeddings and of the GRU state.
        output_size: vocabulary size (number of output classes).
        dropout_p: dropout probability applied to the input embedding.
        max_length: number of rows in the `encoder_outputs` buffer, i.e. the
            number of positions the attention layer scores.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH+2):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

        # Move the whole module to the GPU only when one exists. The original
        # called .cuda() on every layer unconditionally, which raises on
        # CPU-only machines. This is the same condition the notebook's
        # `use_cuda` flag is computed from.
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            (output, hidden, attn_weights): log-probabilities over the
            vocabulary, the new GRU state and the attention weights over the
            `max_length` encoder positions.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention weights are computed from the current input embedding and
        # the previous hidden state, then applied to the encoder outputs.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

In [None]:
class EncoderRNN(nn.Module):
    """Embedding followed by a single-layer GRU, fed one token at a time.

    Args:
        input_size: vocabulary size.
        hidden_size: size of the embeddings and of the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

        # Move the module to the GPU only when one exists. The original called
        # .cuda() unconditionally, which raises on CPU-only machines. This is
        # the same condition the notebook's `use_cuda` flag is computed from.
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, input, hidden):
        """Encode one token; returns (output, hidden), each viewed as (1, 1, hidden_size)."""
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

In [None]:
# Model size and the teacher-forcing probability used by trainAttentionBackwards
# (this overrides the 0.1 assigned in the cell that defines that function).
hidden_size = 512
teacher_forcing_ratio = .5
# Both networks share the phoneme vocabulary size of the global `lyr`.
encoder_512_tfr_50 = EncoderRNN(lyr.n_phonemes, hidden_size)
decoder_512_tfr_50_do_20 = AttnDecoderRNN(hidden_size, lyr.n_phonemes, dropout_p=0.2)

# Restore previously saved weights from the working directory.
# NOTE(review): torch.load is called without map_location, so a checkpoint saved
# from GPU tensors presumably cannot be loaded on a CPU-only machine -- confirm.
# NOTE(review): the weights only make sense if `lyr` assigns the same index to
# every phoneme as it did when the checkpoints were written -- confirm.
encoder_512_tfr_50.load_state_dict(torch.load('encoder_512_tfr_50.pt'))
decoder_512_tfr_50_do_20.load_state_dict(torch.load('decoder_512_tfr_50_do_20.pt'))

# Re-binds the global list that trainItersAttentionBackwards appends to.
total_plot_losses = []
# To train, uncomment this. 
#trainItersAttentionBackwards(encoder_512_tfr_50, decoder_512_tfr_50_do_20, 250000, total_plot_losses, print_every=1000)

In [None]:
beam_size = 5
def evaluateBeamSearch(encoder, decoder, line, reverse=True, max_length=MAX_LENGTH+2):
    """Decode a response to `line` (a list of phonemes) with beam search.

    Args:
        encoder, decoder: the trained modules.
        line: input phonemes.
        reverse: when True (the default, matching how the model is trained)
            decoding starts from EOS and runs backwards towards SOS, and the
            result is flipped before it is returned. When False decoding
            starts from SOS.
        max_length: number of rows in the encoder output buffer.

    Returns:
        The phonemes (vocabulary entries, possibly including the "SOS"/"EOS"
        marker) of the beam returned by beam_search.
    """
    input_variable = variableFromLine(lyr, line)
    input_length = input_variable.size()[0]
    encoder_hidden = encoder.initHidden()

    # One row per input position; rows beyond input_length stay zero.
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_variable[ei],
                                                 encoder_hidden)
        encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]

    token = EOS_token if reverse else SOS_token
    decoder_input = Variable(torch.LongTensor([[token]]))  # SOS or EOS
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # The decoder continues from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    # Pass `reverse` through. The original hard-coded reverse=True here, so
    # with reverse=False the search still terminated on (and scored for) the
    # backwards direction although decoding had been started from SOS.
    beam = beam_search(beam_size, encoder, decoder, line, decoder_input, decoder_hidden, encoder_outputs, reverse=reverse)
    beam_list = [lyr.index2phonemes[b.tolist()] for b in beam.pho]

    if reverse:
        beam_list = beam_list[::-1]
    return beam_list

In [None]:
def beam_search(beam_size, encoder, decoder, line, first_input, first_hidden, encoder_outputs, old_beams=None, reverse=False):
    """Search for a high-scoring decoding, keeping `beam_size` hypotheses per round.

    Args:
        beam_size: number of hypotheses kept after every round.
        encoder: only forwarded to Beam.extend_beams.
        decoder: decoder module, called as decoder(input, hidden, encoder_outputs).
        line: input phonemes; only used to build `input_variable`, which is
            not read afterwards.
        first_input, first_hidden: decoder input and hidden state for the
            very first step.
        encoder_outputs: encoder output buffer passed to every decoder call.
        old_beams: optional list of Beam objects to continue from instead of
            seeding new ones.
        reverse: True when decoding runs backwards, i.e. SOS_token ends a
            hypothesis; otherwise EOS_token does.

    Returns:
        A single Beam: the first of the kept beams (in descending `total_sum`
        order) that ends in the stop token or is longer than MAX_LENGTH+2, or
        the best-scoring beam once more than 31 rounds have run.
    """
    #print('using beam search...')
    input_variable = variableFromLine(lyr, line)

    # Get initial decoder outputs. The input is the not up for debate, so it starts every beam as well.
    dec_out, dec_hidden, dec_attention = decoder(
        first_input, first_hidden, encoder_outputs)


    #This will start off all of our beams.
    # Every extend_beams call replays its beam's phonemes from this hidden state.
    dec_hidden_start = dec_hidden

    # take out the predictions, these are our beams
    proposed_v, proposed_i = dec_out.data.topk(beam_size)

    #convert the indices to list of lists w/ one item
    proposed_i = [x for x in proposed_i[0]]
    proposed_v = [x for x in proposed_v[0]]


    if(old_beams is not None):
        beams = old_beams
    else:
        # Seed one beam per top-k first prediction.
        beams = []
        for i in range(beam_size):
            beam = Beam(beam_size)

            beam.pho.append(proposed_i[i])
            beam.prob.append(proposed_v[i])
            beam.update_prob(reverse)

            beams.append(beam)

    #this for loop should go until all beams are EOS
    # `beams_finished` is only set immediately before a return, so the loop is
    # always left through one of the return statements inside it.
    beams_finished = False
    count = 0
    while not beams_finished:
        count += 1
        extended_beams = []

        for j in range(len(beams)):
            extended_beams.append(beams[j].extend_beams(beam_size, encoder, decoder, dec_hidden_start, encoder_outputs, reverse))

        # we get the extended beams in lists of 5, so now extended beams is a matrix.
        # we flatten it to find the highest value easier.

        flat_list = []
        for sublist in extended_beams:
            for item in sublist:
                flat_list.append(item)

        # Best score first; only the top `beam_size` candidates survive the round.
        flat_list = sorted(flat_list, key=lambda beam: beam.total_sum, reverse=True)

        beams = flat_list[:beam_size]

        # infinite loop, something went wrong.
        if count > 31:
            return beams[0]

        for beam in beams:
            if reverse:
                prediction_end = SOS_token
            else:
                prediction_end = EOS_token

            if beam.pho[-1] == prediction_end or len(beam.pho) > MAX_LENGTH+2:
                beams_finished = True
                #print('success')
                return beam

        #print('On search %d:' % count)
        #for b in beams:
            #print(b.pho, b.total_sum)

    # Not reached in practice (see the note on `beams_finished` above).
    return beams


    #       When expanding this beam, check if it's valid. 
    #       if valid is false and the final pho isn't EOS, 
    #            dont expand the beam. 
    #       if valid is false and the final pho is EOS, don't expand but keep

In [None]:
class Beam(object):
    """One hypothesis of the beam search.

    Attributes:
        prob: decoder scores (log-probabilities) of the chosen phonemes.
        pho: vocabulary indices of the phonemes chosen so far.
        total_sum: score used for ranking, maintained by update_prob.
        valid: set to False when the bigram score of the hypothesis could not
            be computed; invalid beams are no longer extended.
        beam_width: number of successors created per extension.
        attentions: one row of attention weights per position of `pho`.
    """

    def __init__(self, beam_width):
        self.prob = []
        self.pho = []
        self.total_sum = 0
        self.valid = True
        self.beam_width = beam_width

        # Original author's note: "this will create an error later".
        self.attentions = torch.zeros(MAX_LENGTH+2, MAX_LENGTH+2)

    def extend_beams(self, beam_width, encoder, decoder, first_hidden, encoder_outputs, reverse):
        """Return the successors of this beam.

        A beam that already ends in the stop token (SOS_token when `reverse`,
        otherwise EOS_token) or that is invalid is returned unchanged as
        [self]. Otherwise the beam's phonemes are replayed through the decoder
        starting from `first_hidden`, and one new Beam is created for each of
        the `beam_width` most likely next phonemes.
        """
        if reverse:
            token = SOS_token
        else:
            token = EOS_token

        if self.pho[-1] == token or self.valid == False:
            return [self]

        guess_hidden = first_hidden

        # First, run the phonemes of the beam through the decoder, using
        # teacher forcing the whole way.
        for position, phoneme_index in enumerate(self.pho):
            dec_input = Variable(torch.LongTensor([[phoneme_index]]))
            dec_input = dec_input.cuda() if use_cuda else dec_input

            guess_out, guess_hidden, guess_attention = decoder(
                dec_input, guess_hidden, encoder_outputs)

            # Store the attention at this step's own position. The original
            # used self.pho.index(phoneme_index), which returns the FIRST
            # occurrence and so overwrote the wrong row for repeated phonemes.
            self.attentions[position] = guess_attention.data

        # Second, take the top predictions of the final output and put them in
        # new beams. Uses the `beam_width` argument; the original read the
        # global `beam_size` here while looping over `beam_width` below.
        guess_v, guess_i = guess_out.topk(beam_width)

        guess_i = [x for x in guess_i[0]]
        guess_v = [x for x in guess_v[0]]

        extended_beams = []

        for i in range(beam_width):
            new_beam = Beam(beam_width)
            new_beam.pho.extend(self.pho)
            new_beam.prob.extend(self.prob)

            new_beam.pho.append(guess_i[i].data[0])
            new_beam.prob.append(guess_v[i].data[0])
            new_beam.update_prob(reverse)

            extended_beams.append(new_beam)

        return extended_beams

    def update_prob(self, reverse=False):
        """Recompute `total_sum` for the current contents of the beam.

        The score is the sum of the decoder scores in `prob` plus the log of
        the corpus bigram probability of every adjacent phoneme pair, divided
        by the number of entries in `prob`.
        """
        s = 0
        for p in self.prob:
             s += p
        if len(self.pho) > 1:
            for i in range(1, len(self.pho)):
                prev = self.pho[i-1]
                cur = self.pho[i]

                if reverse:
                    # Decoding runs backwards, so `cur` is spoken before `prev`.
                    try:
                        s += math.log(lyr.getCondProbReverse(cur.tolist(), prev.tolist()))
                    except Exception:
                        # The bigram score could not be computed (e.g. log of
                        # a zero probability): stop extending this beam.
                        self.valid = False
                else:
                    # P(2|1)
                    # NOTE(review): getCondProb(first, second) returns
                    # P(second|first); passing (cur, prev) looks swapped for
                    # forward decoding -- confirm before using this path.
                    s += math.log(lyr.getCondProb(cur, prev))

        self.total_sum = s/len(self.prob)
        

In [None]:
#TODO
#parse new lyric JSONs <--- more data
#train GloVe word embeddings <--- needed to output words
#get g2p-seq2seq working <--- more usable words, so more data
#finalize the new double network <--- needed to output words
#order training examples from short to long <--- fix attention attending beyond the sentence
#evaluateAndShowAttention doesn't use beam_search  <--- ...

In [None]:
# Hand-picked indices into normal_pairs to decode and inspect.
good_pairs = [2030,8350,3271,3013,13575,371]
for p in good_pairs:
    pair = normal_pairs[p]
    print(pair)
    print(normal_pairs.index(pair))

    # Flatten: for every item produced by clean_line, take the elements of
    # its first entry and collect them into one flat list.
    l = [s for w in clean_line(convert_to_phonemes(pair[0]), []) for s in w[0]]

    out = evaluateAndShowAttention(l, encoder_512_tfr_50, decoder_512_tfr_50_do_20)
    # reverse() works in place and returns None for a list, so printing its
    # return value would show "None"; reverse first, then print the result.
    # NOTE(review): assumes `out` is a list - confirm against evaluateAndShowAttention.
    out.reverse()
    print(out)

evaluateRandomly(encoder_512_tfr_50, decoder_512_tfr_50_do_20)

In [None]:
class CombinedEncoderRNN(nn.Module):
    """Encoder that blends a word embedding with a phoneme-derived vector.

    For one input word the forward pass
      1. looks up the word's embedding (``gloveEmbedding``),
      2. runs the word's phonemes (from ``phonemesFromWord``) through
         ``phonemeLSTM`` and projects the final top-layer hidden state to
         ``glove_size`` with ``phonemeLinear`` (``xchar``),
      3. computes a scalar gate ``scale = sigmoid(scaleVector . xchar + b)``,
      4. feeds ``(1 - scale) * glove + scale * xchar`` to the main LSTM.

    Relies on the module-level names ``glove_size``, ``use_cuda`` and
    ``phonemesFromWord``.

    Args:
        input_size: number of rows of both embedding tables.
        phoneme_embedding_size: width of a phoneme embedding.
        hidden_size: hidden width of both LSTMs.
    """

    # Both LSTMs are stacked this deep; initHidden must agree with it.
    NUM_LAYERS = 3

    def __init__(self, input_size, phoneme_embedding_size, hidden_size):
        # super() must name this class; naming another class raises TypeError.
        super(CombinedEncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.gloveEmbedding = nn.Embedding(input_size, glove_size)

        # NOTE(review): phoneme ids are looked up in a table with input_size
        # rows - confirm this covers the phoneme index range.
        self.phonemeEmbedding = nn.Embedding(input_size, phoneme_embedding_size)
        self.phonemeLSTM = nn.LSTM(phoneme_embedding_size, hidden_size, num_layers=self.NUM_LAYERS)
        self.phonemeLinear = nn.Linear(hidden_size, glove_size)

        # Gate parameters. The gate vector is dotted with xchar, which has
        # glove_size entries, so it must have glove_size entries as well.
        # Zero init makes the gate start at sigmoid(0) = 0.5 (an even blend).
        self.scaleVector = nn.Parameter(torch.zeros(1, glove_size))
        self.b = nn.Parameter(torch.zeros(1, 1))

        # The blended vector fed to this LSTM is glove_size wide.
        self.lstm = nn.LSTM(glove_size, hidden_size, num_layers=self.NUM_LAYERS)

        # Move every sub-module and parameter together, and only when CUDA
        # is actually available.
        if use_cuda:
            self.cuda()

    def forward(self, input, hidden):
        """Encode one word.

        Args:
            input: index tensor of the word; also passed to
                ``phonemesFromWord`` to obtain its phoneme index tensors.
            hidden: ``(h, c)`` state of the main LSTM, e.g. from ``initHidden``.

        Returns:
            ``(output, hidden)`` as returned by the main LSTM.
        """
        glove = self.gloveEmbedding(input).view(1, 1, -1)

        # The phoneme LSTM starts from a zero state for every word.
        phonemeHidden = self.initHidden()
        for phoneme in phonemesFromWord(input):
            phonemeEmbedding = self.phonemeEmbedding(phoneme).view(1, 1, -1)
            _, phonemeHidden = self.phonemeLSTM(phonemeEmbedding, phonemeHidden)

        # phonemeHidden is (h, c); h[-1] is the top layer's final hidden state.
        xchar = self.phonemeLinear(phonemeHidden[0][-1]).view(1, 1, -1)
        # torch.dot needs 1-D operands; the (1, 1) result broadcasts below.
        scale = torch.sigmoid(torch.dot(self.scaleVector.view(-1), xchar.view(-1)) + self.b)

        lstmInput = (1-scale)*glove + scale*xchar
        output, hidden = self.lstm(lstmInput, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zero ``(h, c)`` state sized for the stacked LSTMs."""
        def zeros():
            result = Variable(torch.zeros(self.NUM_LAYERS, 1, self.hidden_size))
            if use_cuda:
                return result.cuda()
            else:
                return result

        return (zeros(), zeros())

In [None]:
class CombninedAttnDecoderRNN(nn.Module):
    """Attention decoder over word indices, built around a single-layer LSTM.

    Each step embeds the previous token, computes attention weights over
    ``max_length`` encoder positions from the embedding and the current
    hidden state, mixes the attended encoder output back into the embedding,
    runs one LSTM step and returns log-probabilities over the output classes.

    Relies on the module-level names ``MAX_LENGTH`` and ``use_cuda``.

    Args:
        hidden_size: width of the embedding, the LSTM and the encoder outputs.
        phonme_size: currently unused.
        vocab_size: number of rows of the embedding table; also used as the
            number of output classes.
        dropout_p: dropout probability applied to the embedded input.
        max_length: number of encoder positions the attention layer covers.
    """

    def __init__(self, hidden_size, phonme_size, vocab_size, dropout_p=0.1, max_length=MAX_LENGTH+2):
        # super() must name this class; naming another class raises TypeError.
        super(CombninedAttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        # NOTE(review): the output layer is sized with vocab_size so that a
        # predicted index can be embedded again on the next step - confirm
        # this is the intended output vocabulary.
        self.output_size = vocab_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.gloveEmbedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.combinedLSTM = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

        # Move all layers at once, and only when CUDA is actually available.
        if use_cuda:
            self.cuda()

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: index tensor of the previous token.
            hidden: ``(h, c)`` LSTM state, e.g. from ``initHidden``.
            encoder_outputs: tensor of shape ``(max_length, hidden_size)``.

        Returns:
            ``(output, hidden, attn_weights)``: log-probabilities of shape
            ``(1, output_size)``, the new ``(h, c)`` state and the attention
            weights of shape ``(1, max_length)``.
        """
        embedded = self.gloveEmbedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # hidden is (h, c); hidden[0][0] is h of the single layer, (1, hidden_size).
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.combinedLSTM(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return a zero ``(h, c)`` state for the single-layer LSTM."""
        def zeros():
            result = Variable(torch.zeros(1, 1, self.hidden_size))
            if use_cuda:
                return result.cuda()
            else:
                return result

        return (zeros(), zeros())

In [1]:
import json
from pprint import pprint

# Load the lyrics dump. The text contains non-ASCII characters (curly quotes,
# ellipses, non-breaking spaces), so decode explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open('Lyrics_Eminem.json', encoding='utf-8') as f:
    data = json.load(f)

# NOTE: this prints the whole document, which makes the cell output very
# large; pprint(data, depth=2) gives a structural overview instead.
pprint(data)

{'artist': 'Eminem',
 'songs': [{'album': None,
            'artist': 'Eminem',
            'image': 'https://images.genius.com/cae72672d0f5e0b565e0576b7e5be55b.676x676x1.jpg',
            'lyrics': 'Thank you for your interest in Revival, the number one '
                      'slightly invasive treatment for Atrox Rithimis. You '
                      "only get one shot to beat AR. Don't miss your chance "
                      'with Revival. Please hold to speak with a patient care '
                      'representative.\n'
                      "While you wait, here's some additional information:\n"
                      'You should discuss the potential benefits and risks of '
                      'Revival with your doctor or clergyman. Revival is a '
                      'non-injectable medication given through the ear canal '
                      'and is not intended for anal use. Revival is a '
                      'non-narcotic medication intended for daily use that can '

                              'and Bizarre\n'
                              '(unintelligible)\n'
                              "That's why you beg to differ me,you ain't got "
                              'no style\n'
                              'Plus you lack delivery, not to brag\n'
                              "I don't need to boast\n"
                              'Look, my face is pale, but you look like you '
                              'seen a ghost\n'
                              '(unintelligible)\n'
                              "You couldn't slip up in the zone, if I put "
                              'banana peels\n'
                              'Around this fucking microphone\n'
                              'So get a bulldozer, start from bottom to top\n'
                              'And tear this building down\n'
                              "'cause that's the only way you 'recking shop",
                    'lyrics_owner_id': 4275619,
                    'lyr

                                          'header_image_url': 'https://images.genius.com/3fbb95c267d2db73322a3948de228974.1000x563x1.jpg',
                                          'id': 45,
                                          'image_url': 'https://images.genius.com/b379811c1415017b9f3e3a71b9d26060.400x400x1.jpg',
                                          'iq': 144990,
                                          'is_meme_verified': True,
                                          'is_verified': True,
                                          'name': 'Eminem',
                                          'url': 'https://genius.com/artists/Eminem'}],
                    'pyongs_count': None,
                    'recording_location': None,
                    'release_date': '2009-07-17',
                    'song_art_image_thumbnail_url': 'https://images.genius.com/b379811c1415017b9f3e3a71b9d26060.300x300x1.jpg',
                    'song_art_image_url': 'https://images.genius.com/b37981

                                                       'api_path': '/songs/19709',
                                                       'full_title': 'My '
                                                                     'Lifestyle '
                                                                     'by\xa0'
                                                                     'Fat\xa0'
                                                                     'Joe',
                                                       'header_image_thumbnail_url': 'https://images.genius.com/4a7618aab6a23480b8790a58ffad66c8.300x300x1.jpg',
                                                       'header_image_url': 'https://images.genius.com/4a7618aab6a23480b8790a58ffad66c8.1000x1000x1.jpg',
                                                       'id': 19709,
                                                       'lyrics_owner_id': 50,
                                                       'lyrics_stat

                                          'image_url': 'https://images.genius.com/b379811c1415017b9f3e3a71b9d26060.400x400x1.jpg',
                                          'iq': 144990,
                                          'is_meme_verified': True,
                                          'is_verified': True,
                                          'name': 'Eminem',
                                          'url': 'https://genius.com/artists/Eminem'}],
                    'pyongs_count': 20,
                    'recording_location': None,
                    'release_date': '2011-03-09',
                    'song_art_image_thumbnail_url': 'https://images.genius.com/b379811c1415017b9f3e3a71b9d26060.300x300x1.jpg',
                    'song_art_image_url': 'https://images.genius.com/b379811c1415017b9f3e3a71b9d26060.400x400x1.jpg',
                    'song_relationships': [{'songs': [], 'type': 'samples'},
                                           {'songs': [], 'type': 'sampled

                    'api_path': '/songs/603',
                    'current_user_metadata': {'excluded_permissions': ['follow',
                                                                       'award_transcription_iq',
                                                                       'remove_transcription_iq',
                                                                       'pyong',
                                                                       'edit_lyrics',
                                                                       'view_annotation_engagement_data',
                                                                       'publish',
                                                                       'unpublish',
                                                                       'edit_spotify_details',
                                                                       'hide',
                                                                  

                                                               'url': 'https://genius.com/Eminem-4-verses-lyrics'},
                                               'annotations': [{'api_path': '/annotations/12995290',
                                                                'authors': [{'attribution': 1.0,
                                                                             'pinned_role': None,
                                                                             'user': {'api_path': '/users/53993',
                                                                                      'avatar': {'medium': {'bounding_box': {'height': 400,
                                                                                                                             'width': 300},
                                                                                                            'url': 'https://images.genius.com/e1ae896204efa260fcb6e186445c09a1.600x338x48.gif'},
 

                                              'iq_by_action': {},
                                              'permissions': ['see_pageviews',
                                                              'create_comment',
                                                              'view_song_story_gallery'],
                                              'relationships': {}},
                    'custom_performances': [{'artists': [{'api_path': '/artists/45128',
                                                          'header_image_url': 'https://assets.genius.com/images/default_avatar_300.png?1527887249',
                                                          'id': 45128,
                                                          'image_url': 'https://assets.genius.com/images/default_avatar_300.png?1527887249',
                                                          'is_meme_verified': False,
                                                          'is_verified': False,
    

                                                                                                          'url': 'https://images.genius.com/avatars/tiny/ba3edc56c07c146c508e28ac1ec892e3'}},
                                                                                      'current_user_metadata': {'excluded_permissions': ['follow'],
                                                                                                                'features': [],
                                                                                                                'interactions': {'following': False},
                                                                                                                'permissions': []},
                                                                                      'header_image_url': 'https://s3.amazonaws.com/filepicker-images-rapgenius/iwbNamgGRkughbBsX7YA_374385_10100485763333475_1933986828_n.jpg',
                           

                                                                                      'avatar': {'medium': {'bounding_box': {'height': 400,
                                                                                                                             'width': 300},
                                                                                                            'url': 'https://s3.amazonaws.com/rapgenius/avatars/medium/21Y1rRnbTiGjtEsQPxmk.jpg'},
                                                                                                 'small': {'bounding_box': {'height': 100,
                                                                                                                            'width': 100},
                                                                                                           'url': 'https://s3.amazonaws.com/rapgenius/avatars/small/21Y1rRnbTiGjtEsQPxmk.jpg'},
                                                        

                                               'annotator_id': 11272,
                                               'annotator_login': 'Diirty_Dishes',
                                               'api_path': '/referents/3540687',
                                               'classification': 'accepted',
                                               'fragment': '8 Mile: '
                                                           'Lyckety-Splyt vs '
                                                           'B-Rabbit',
                                               'id': 3540687,
                                               'is_description': True,
                                               'path': '/3540687/Eminem-8-mile-lyckety-splyt-vs-b-rabbit/8-mile-lyckety-splyt-vs-b-rabbit',
                                               'range': {'content': '8 Mile: '
                                                                    'Lyckety-Splyt '
                                

                                                                                                            'as '
                                                                                                            'a '
                                                                                                            'birthday '
                                                                                                            'gift '
                                                                                                            'from '
                                                                                                            'his '
                                                                                                            'mother. '
                                                                                                            'He '
                                                                         

                                                                                                 'thumb': {'bounding_box': {'height': 32,
                                                                                                                            'width': 32},
                                                                                                           'url': 'https://images.rapgenius.com/avatars/thumb/1dc47ef89214adb3bb5e64e8eb221553'},
                                                                                                 'tiny': {'bounding_box': {'height': 16,
                                                                                                                           'width': 16},
                                                                                                          'url': 'https://images.rapgenius.com/avatars/tiny/1dc47ef89214adb3bb5e64e8eb221553'}},
                                                               

                                                                'votes_total': 3}],
                                               'annotator_id': 19312,
                                               'annotator_login': 'Shockie',
                                               'api_path': '/referents/8599648',
                                               'classification': 'accepted',
                                               'fragment': "'99 Tim Westwood "
                                                           'Freestyle '
                                                           '(Unreleased)',
                                               'id': 8599648,
                                               'is_description': True,
                                               'path': '/8599648/Eminem-99-tim-westwood-freestyle-unreleased/99-tim-westwood-freestyle-unreleased',
                                               'range': {'content': "'99 Tim "
                         

                                                                                                   'Horror'],
                                                                                      'tag': 'em'}],
                                                                        'data': {'api_path': '/referents/70322'},
                                                                        'tag': 'a'},
                                                                       '.'],
                                                          'tag': 'p'},
                                                         '',
                                                         {'children': ['Pretty '
                                                                       'sick '
                                                                       'place… '
                                                                       'but Em '
                                                           

                                                                                                            'David '
                                                                                                            'Axelrod’s ',
                                                                                                            {'attributes': {'href': 'http://www.whosampled.com/sample/view/18439/Eminem-Any%20Man_David%20Axelrod-The%20Mental%20Traveler/',
                                                                                                                            'rel': 'nofollow'},
                                                                                                             'children': ['“The '
                                                                                                                          'Mental '
                                                                                                                      

                                        'is_verified': True,
                                        'name': 'Eminem',
                                        'url': 'https://genius.com/artists/Eminem'}]},
            'title': 'Armaggedon (The Invasion Part 3)',
            'year': None},
           {'album': 'Revival',
            'artist': 'Eminem',
            'image': 'https://images.genius.com/0ea4069ebfe956d3b80296fda13eccde.1000x1000x1.jpg',
            'lyrics': 'Part I: Arose\n'
                      'If I could rewind time like a tape\n'
                      'Inside a boombox, one day for every pill or Percocet '
                      'that I ate\n'
                      "Cut down on the Valium, I'da heard everything\n"
                      'But death is turning so definite—wait!\n'
                      'They got me all hooked up to some machine\n'
                      "I love you, Bean, didn't want you to know I was "
                      'struggling\n'
                

                                           {'songs': [],
                                            'type': 'interpolated_by'},
                                           {'songs': [], 'type': 'cover_of'},
                                           {'songs': [], 'type': 'covered_by'},
                                           {'songs': [], 'type': 'remix_of'},
                                           {'songs': [], 'type': 'remixed_by'},
                                           {'songs': [],
                                            'type': 'live_version_of'},
                                           {'songs': [],
                                            'type': 'performed_live_as'}],
                    'stats': {'accepted_annotations': 0,
                              'contributors': 1,
                              'hot': False,
                              'iq_earners': 1,
                              'transcribers': 1,
                              'unreviewed_annota

                                                                                                          'url': 'https://s3.amazonaws.com/rapgenius/avatars/tiny/183_me.PNG'}},
                                                                                      'current_user_metadata': {'excluded_permissions': ['follow'],
                                                                                                                'features': [],
                                                                                                                'interactions': {'following': False},
                                                                                                                'permissions': []},
                                                                                      'header_image_url': 'https://s3.amazonaws.com/rapgenius/avatars/medium/1358288268_183_me.PNG',
                                                                                    

                      'No one was there to catch you fall\n'
                      'You pick yourself back up, you dust your jacket off\n'
                      "You grab your balls like they're gargantuan\n"
                      "And ask yourself how fuckin' bad you want it\n"
                      "Pull out your pass for whoopin' ass and flash it on "
                      "'em\n"
                      "Nobody's gonna back you in that corner, you're a "
                      'hornet\n'
                      "No one's more ignorant than you fuckin' four in the "
                      "mornin'\n"
                      "You're at the la-bor-atory\n"
                      "Stormin' like there's nothin' that's more important\n"
                      "Emcees, you better consider this a formal warnin', "
                      "you're in for it!\n"
                      'Girl, what would you do if I said your body was off the '
                      'chain\n'
                      'And I to

                                                                       'fellow '
                                                                       'Detroit '
                                                                       'rapper '
                                                                       'and ',
                                                                       {'attributes': {'href': 'http://rap.genius.com/Soul-intent-what-color-is-soul-lyrics'},
                                                                        'children': ['collaborator'],
                                                                        'data': {'api_path': '/songs/123240'},
                                                                        'tag': 'a'},
                                                                       '.'],
                                                          'tag': 'p'},
                                                         '',
           

                                                                                                                'permissions': []},
                                                                                      'header_image_url': 'https://assets.genius.com/images/default_avatar_300.png?1527887249',
                                                                                      'human_readable_role_for_display': 'Verified '
                                                                                                                         'Artist',
                                                                                      'id': 194307,
                                                                                      'iq': 6285,
                                                                                      'login': 'Seedy',
                                                                                      'name': 'Seedy',
                       

                                                                       'she '
                                                                       'argues '
                                                                       'that '
                                                                       'Eminem '
                                                                       'is “an '
                                                                       'excellent '
                                                                       'father,” '
                                                                       'but '
                                                                       'she '
                                                                       '“[doesn’t] '
                                                                       'know '
                                                                       'if he '
                                 

                                                                                                            'realises '
                                                                                                            'he ',
                                                                                                            {'children': ['is'],
                                                                                                             'tag': 'em'},
                                                                                                            ' '
                                                                                                            'a '
                                                                                                            'role '
                                                                                                            'model '
                                                       

                                                       'path': '/Pop-tops-mamy-blue-lyrics',
                                                       'primary_artist': {'api_path': '/artists/404678',
                                                                          'header_image_url': 'https://images.genius.com/95e6c7fe478a9920e194ce89dcf68934.263x263x1.jpg',
                                                                          'id': 404678,
                                                                          'image_url': 'https://images.genius.com/95e6c7fe478a9920e194ce89dcf68934.263x263x1.jpg',
                                                                          'is_meme_verified': False,
                                                                          'is_verified': False,
                                                                          'name': 'Pop '
                                                                                  'Tops',
     

                                                                                          'iq_by_action': {},
                                                                                          'permissions': ['create_comment']},
                                                                'custom_preview': None,
                                                                'has_voters': True,
                                                                'id': 3494360,
                                                                'pinned': False,
                                                                'rejection_comment': None,
                                                                'share_url': 'https://genius.com/3494360',
                                                                'source': None,
                                                                'state': 'accepted',
                                                                'url'

                                                                                                                'interactions': {'following': False},
                                                                                                                'permissions': []},
                                                                                      'header_image_url': 'https://s3.amazonaws.com/rapgenius/avatars/medium/1358288268_183_me.PNG',
                                                                                      'human_readable_role_for_display': 'Contributor',
                                                                                      'id': 183,
                                                                                      'iq': 22739,
                                                                                      'login': 'Shadowcast',
                                                                                      'name': 'S

                              'Find the light, find the light, find the light\n'
                              "I'm standin' in the flames\n"
                              'It’s a beautiful kind of pain\n'
                              "Settin' fire to yesterday\n"
                              'Find the light, find the light, find the light\n'
                              '\n'
                              "Today's like the morning after: your world is "
                              'torn in half\n'
                              'You wake in its wake to start the mourning '
                              'process\n'
                              "And rebuilding, you're still a work in "
                              'progress\n'
                              "Today's a whole new chapter\n"
                              "It's like an enormous asthma\n"
                              'Thunderstorm has passed ya\n'
                              'You weathered it and poked its eye out\n'


                                                                       'making '
                                                                       'use of '
                                                                       'his '
                                                                       'trademark '
                                                                       'wordplay '
                                                                       'and '
                                                                       'imagery, '
                                                                       'no '
                                                                       'matter '
                                                                       'how '
                                                                       'corny '
                                                                       'it '
                                       

                                                          'id': 34878,
                                                          'image_url': 'https://images.genius.com/d36cf5ab776b96bf105d7a46387ce12c.512x512x1.jpg',
                                                          'is_meme_verified': False,
                                                          'is_verified': False,
                                                          'name': 'Jason Lader',
                                                          'url': 'https://genius.com/artists/Jason-lader'}],
                                             'label': 'Guitar'},
                                            {'artists': [{'api_path': '/artists/34878',
                                                          'header_image_url': 'https://images.genius.com/d36cf5ab776b96bf105d7a46387ce12c.512x512x1.jpg',
                                                          'id': 34878,
                                                 

                                        'is_meme_verified': False,
                                        'is_verified': False,
                                        'name': 'Billy Squier',
                                        'url': 'https://genius.com/artists/Billy-squier'},
                                       {'api_path': '/artists/27794',
                                        'header_image_url': 'https://images.genius.com/c10b9ff27fddc3000e3298dd22d6c886.300x300x1.jpg',
                                        'id': 27794,
                                        'image_url': 'https://images.genius.com/c10b9ff27fddc3000e3298dd22d6c886.300x300x1.jpg',
                                        'iq': 24791,
                                        'is_meme_verified': True,
                                        'is_verified': True,
                                        'name': 'Rick Rubin',
                                        'url': 'https://genius.com/artists/Rick-rubin'

                                       {'api_path': '/artists/179',
                                        'header_image_url': 'https://images.genius.com/ac6c8ba8d6aa88b06e5886a2840ed9f0.1000x428x1.jpg',
                                        'id': 179,
                                        'image_url': 'https://images.genius.com/43f9c82f7269ffb7397a0cc13844ec8d.513x513x1.jpg',
                                        'iq': 10270,
                                        'is_meme_verified': True,
                                        'is_verified': True,
                                        'name': 'Joe Budden',
                                        'url': 'https://genius.com/artists/Joe-budden'},
                                       {'api_path': '/artists/262',
                                        'header_image_url': 'https://images.genius.com/21ca2955e7c1038e10b6d882c758be86.475x336x1.jpg',
                                        'id': 262,
                             

                                                                        'children': ['track '
                                                                                     '(featuring '
                                                                                     'Xzibit '
                                                                                     'and '
                                                                                     'Nate '
                                                                                     'Dogg)'],
                                                                        'data': {'api_path': '/songs/1383'},
                                                                        'tag': 'a'},
                                                                       ' that '
                                                                       'appeared '
                                                                       'on 

                                                                'pinned': False,
                                                                'rejection_comment': None,
                                                                'share_url': 'https://genius.com/3763922',
                                                                'source': None,
                                                                'state': 'accepted',
                                                                'url': 'https://genius.com/3763922/Eminem-b-rabbit-vs-lotto-8-mile/B-rabbit-vs-lotto-8-mile',
                                                                'verified': False,
                                                                'verified_by': None,
                                                                'votes_total': 0}],
                                               'annotator_id': 1933056,
                                               'annotator_login': 'G-Nyus'

                      '"Who can say for sure? Perhaps a frontal lobotomy would '
                      'be the answer. If science could operate on this '
                      'distorted brain and put it to good use, society would '
                      'reap a great benefit."\n'
                      '\n'
                      'I walk around like a space cadet, place your bets\n'
                      "Who's likely to become a serial killer? Case of "
                      'Tourettes\n'
                      "Fuck, fuck-fuck, can't take the stress\n"
                      'I make a mess as the day progresses\n'
                      "Angry and take it out on the neighbors' hedges\n"
                      "Like this is how I'll cut your face up, bitches\n"
                      "With these hedge-trimmin' scissors with razor edges\n"
                      "Imagination's dangerous\n"
                      "It's the only way to escape this mess\n"
                      'And make the best

                                                          'tag': 'p'}],
                                            'tag': 'root'}},
                    'description_annotation': {'_type': 'referent',
                                               'annotatable': {'api_path': '/songs/739391',
                                                               'client_timestamps': {'lyrics_updated_at': 1432428464,
                                                                                     'updated_by_human_at': 1432428463},
                                                               'context': 'Eminem',
                                                               'id': 739391,
                                                               'image_url': 'https://images.genius.com/0b1c9568017c50ed4a1b0e810ff2a60c.480x360x1.png',
                                                               'link_title': 'Brenda '
                                                                    

                      "Any worse, we're gon' lose another soldier to this "
                      'game\n'
                      'And if I get killed for this rap I got a million in '
                      'cash\n'
                      "That says I will get you back in Hailie's name\n"
                      '\n'
                      "You ain't no motherfucking (bully)\n"
                      "And I ain't bowing to no motherfucking (bully)\n"
                      "I won't allow it, ain't gon' cower to no (bully)\n"
                      "I'll be damned if I don't stand up to a (bully)\n"
                      'Fight like a man and throw my hands up to a (bully...)\n'
                      'Irvine, get your boy off that (E...)\n'
                      '\n'
                      'Get at me, dog\n'
                      'Holla',
            'raw': {'album': {'api_path': '/albums/11406',
                              'artist': {'api_path': '/artists/45',
                                

KeyboardInterrupt: 

In [23]:
# Show the title and the lyrics of one song from the currently loaded `data`.
num = 2
song = data['songs'][num]  # look the song up once, then print both fields
print(song['title'])
print(song['lyrics'])

1997 Freestyle Live at Wetlands, NYC
Everybody, duck down, all you hear is the sound
Of Slim Shady spittin' rounds at the Lyricist Lounge
What year is this now? I don't know, I forgot
All I know is that I'd die for pot, make the cypher hot
Smoke weed and fuck up my life a lot
Give me the money now, bitch
Did you want to see your wife or not?
'Fore I stab her wit' this knife I got
I got a sci-fi plot like a science fiction flick that I'm a hit you wit'
I got the Outsidaz up in this bitch
Peace to Thirstin' Howl, A.L. and Wordsworth
My mother smoked crack, I had a premature birth
I'm just a nerd cursed wit' badly disturbed nerves
Who wanna step to this and get served first?
Ninety-nine percent of aliens prefer Earth
So I'm here to rule the planet, startin' wit' your turf
I hid a secret message inside of a word search
Wit' smeared letters, runnin' together in blurred spurts
Rappers can't fuck wit' me, it hurts worth
When they get abolished, polyps'll murk worse
Wrote a secret message insi

In [40]:
# Report, for every artist JSON dump in `lyric_files`, the artist name and the
# number of songs it holds, then print the total number of songs.
directory = os.fsencode('lyric_files')
s = 0  # running total of songs across all artist files

# os.listdir() returns entries in arbitrary order; sort case-insensitively so
# the report is reproducible from run to run.
for file in sorted(os.listdir(directory), key=bytes.lower):
    filename = os.fsdecode(file)

    # Only parse lyric dumps; any other directory entry would make json.load()
    # fail. This mirrors the ".json" filter used when listing the files.
    if not filename.endswith('.json'):
        continue

    # Build the path from `directory` instead of repeating the folder name.
    with open(os.path.join(os.fsdecode(directory), filename)) as f:
        data = json.load(f)

    songs = data['songs']
    # An empty song list has no songs[0]; fall back to the file name so the
    # loop keeps going instead of raising IndexError.
    print(songs[0]['artist'] if songs else filename)
    print(len(songs))
    s += len(songs)

print(s)
   

2Pac
562
50 Cent
387
A$AP Rocky
141
A Tribe Called Quest
123
Busta Rhymes
334
Cardi B
52
Chance The Rapper
114
Childish Gambino
266
Common
217
Danny Brown
159
DMX
277
Drake
489
Eminem
458
Future
396
Ghostface Killah
239
J. Cole
272
JAY-Z
405
Kanye West
494
Kendrick Lamar
333
Kid Cudi
220
Lil' Kim
166
Lil Wayne
1074
Mac Miller
357
Migos
356
Missy Elliott
153
N.W.A
73
Nas
365
Nicki Minaj
254
OutKast
162
Playboi Carti
159
Pusha-T
109
ScHoolboy Q
131
Snoop Dogg
616
The Notorious B.I.G.
142
The Roots
217
Travis Scott
181
Tyler, The Creator
174
Vince Staples
108
Wiz Khalifa
519
Wu-Tang Clan
200
11454


In [33]:
# Print the name of every ".json" entry found in the lyric_files directory.
directory = os.fsencode('lyric_files')

for file in os.listdir(directory):
    filename = os.fsdecode(file)

    # Guard clause: anything that is not a JSON file is skipped silently.
    if not filename.endswith(".json"):
        continue
    print(filename)

Lyrics_2Pac.json
Lyrics_50Cent.json
Lyrics_A$APRocky.json
Lyrics_ATribeCalledQuest.json
Lyrics_BustaRhymes.json
Lyrics_CardiB.json
Lyrics_ChanceTheRapper.json
Lyrics_ChildishGambino.json
Lyrics_Common.json
Lyrics_DannyBrown.json
Lyrics_DMX.json
Lyrics_Drake.json
Lyrics_Eminem.json
Lyrics_Future.json
Lyrics_GhostfaceKillah.json
Lyrics_J.Cole.json
Lyrics_JAY-Z.json
Lyrics_KanyeWest.json
Lyrics_KendrickLamar.json
Lyrics_KidCudi.json
Lyrics_Lil'Kim.json
Lyrics_LilWayne.json
Lyrics_MacMiller.json
Lyrics_Migos.json
Lyrics_MissyElliott.json
Lyrics_N.W.A.json
Lyrics_Nas.json
Lyrics_NickiMinaj.json
Lyrics_OutKast.json
Lyrics_PlayboiCarti.json
Lyrics_Pusha-T.json
Lyrics_ScHoolboyQ.json
Lyrics_SnoopDogg.json
Lyrics_TheNotoriousB.I.G..json
Lyrics_TheRoots.json
Lyrics_TravisScott.json
Lyrics_Tyler,TheCreator.json
Lyrics_VinceStaples.json
Lyrics_WizKhalifa.json
Lyrics_Wu-TangClan.json
