# Lyrical RNN
### JT Wolohan, Uteerna Koul and Paritosh Prakash

**{jwolohan, ukoul, pmorpari}@indiana.edu**

*[Copyright (c) 2018 - Mozilla Public License v. 2.0](https://www.mozilla.org/en-US/MPL/2.0/)*

####  Imports and data loading
We're just bringing in the data we're going to need later and some modules for our data loading steps.
The actual modeling is **way** below.

In [1]:
import spacy
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
from gensim.models.fasttext import FastText
from functools import reduce
import numpy as np
import nltk
from math import log10
import csv
import pickle
import Lyrics2Vectors as l2v

In [2]:
# Load the spaCy pipeline "en_core_web_md"; it is handed to Lyr2Mat below as
# its `nlp` argument and is also queried directly via `_nlp.vocab`.
_nlp = spacy.load("en_core_web_md")

In [3]:
# Load the saved FastText model from disk. Its `.wv` attribute is what gets
# passed to Lyr2Mat as the word-vector lookup.
lyricVectors = FastText.load("LyricVectors.pkl")

In [5]:
# Titles and lyrics, indexed later as titlesAndLyrics['Titles'][i] and
# titlesAndLyrics['Lyrics'][i].
titlesAndLyrics = l2v.loadTitlesLyrics()

# Token -> IDF score mapping, consumed by Lyr2Mat via `.get(token, 0)`.
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(security): pickle.load can execute arbitrary code; only load files you
# produced yourself or otherwise trust.
with open("LyricTokenIDFs.pkl", "rb") as idf_file:
    lyr_idfs = pickle.load(idf_file)

#### Lyr2Mat Class
This class is the workhorse of our preprocessing efforts. Feed it a spaCy NLP model, a word vector model, and a dict of IDF scores from the training data, and it will produce input matrices for use in the sequence-to-sequence RNN.

In [6]:
class Lyr2Mat:
  """Converts lyrics and titles into numeric matrices (and back to words).

  Parameters
  ----------
  nlp  : callable that tokenizes a string into tokens exposing ``.text`` and
         ``.pos`` (a spaCy pipeline in this notebook).
  vecs : word-vector lookup supporting ``vocab``, ``vecs[token]``,
         ``most_similar`` and ``similar_by_vector`` (gensim keyed vectors here).
  idfs : dict mapping token text to its IDF score.
  """
  def __init__(self,nlp,vecs,idfs):
    self._nlp = nlp
    self._wv = vecs
    self._idfs = idfs
    # Raw token counts of the text most recently passed to createInputMatrix.
    self._lyrics_tf = {}

  def _vectorSize(self):
    """Dimensionality of the word vectors (falls back to the historical 100)."""
    return getattr(self._wv, "vector_size", 100)

  def _cleanLookup(self,tkn):
    """Returns the word vector for tkn, or a zero vector if it is out of vocabulary"""
    if tkn in self._wv.vocab:
      return self._wv[tkn]
    return np.zeros(self._vectorSize())

  def vectorize(self,token):
    """Turns token into wordspace, pos, tf, idf vector"""
    tkn = token.text
    pos = np.array(token.pos)
    idf = np.array(self._idfs.get(tkn,0))
    # Log-scaled term frequency; unseen tokens give log10(1) == 0.
    tf = np.array(log10(1+self._lyrics_tf.get(tkn,0)))
    v = self._cleanLookup(tkn)
    return np.hstack((v,pos,tf,idf))

  def title2Seq(self,title):
    """Converts a title to a sequence of vectors"""
    return np.array([self._cleanLookup(tkn) for tkn in nltk.tokenize.word_tokenize(title)])

  def decodeMatrix(self,M):
    """Maps each row of M to its nearest vocabulary word and joins them with spaces"""
    return " ".join([self._wv.similar_by_vector(row,topn=1)[0][0] for row in M])

  def creatify(self,words,creative="happy"):
    """Replaces each whitespace-separated word with the word most similar to it plus `creative`"""
    # Uses the vectors this instance was built with rather than a notebook
    # global, so the class works with any vector model passed to __init__.
    return " ".join([self._wv.most_similar(positive=[word,creative],topn=1)[0][0]
                     for word in words.split()])

  def createInputMatrix(self,lyrics):
    """Converts lyrics into input matrix (one row per token; word vector + pos, tf, idf)"""
    # Single pass over the nltk tokens instead of calling list.count per token.
    counts = {}
    for tkn in nltk.tokenize.word_tokenize(lyrics):
      counts[tkn] = counts.get(tkn,0) + 1
    self._lyrics_tf = counts
    rows = [self.vectorize(token) for token in self._nlp(lyrics)]
    if not rows:
      # Keep the result 2-D so callers can still stack/append along axis 0.
      return np.zeros((0, self._vectorSize() + 3))
    return np.array(rows)

##### Lyr2Mat example

In [158]:
L2M = Lyr2Mat(_nlp,lyricVectors.wv,lyr_idfs)
lyrics = titlesAndLyrics['Lyrics'][:100]
title = titlesAndLyrics['Titles'][:100]

# Fixed sequence lengths (in rows, including the start and end marker rows).
max_encoder_seq_length = 1000
max_decoder_seq_length = 50

print(max_encoder_seq_length)

# Each sample is laid out as:
#   row 0            all-zero start marker
#   rows 1..n        one 103-feature row per token
#   row n+1          all-ones end marker
#   remaining rows   zero padding
# The tensor starts out as zeros, so only the token rows and the end marker
# need to be written.
inputs = np.zeros((100,max_encoder_seq_length,103))
for i in range(100):
    token_rows = L2M.createInputMatrix(titlesAndLyrics['Lyrics'][i])
    # Reserve two rows for the markers; longer lyrics are truncated instead of
    # raising a shape error on assignment.
    n_rows = min(len(token_rows), max_encoder_seq_length - 2)
    if n_rows:
        inputs[i, 1:n_rows + 1] = token_rows[:n_rows]
    inputs[i, n_rows + 1] = 1.0


1000


In [161]:
max_decoder_seq_length = 50

# Same layout as the encoder inputs: all-zero start row, one 103-feature row
# per title token, all-ones end row, then zero padding up to the fixed length.
targets = np.zeros((100,max_decoder_seq_length,103))
for i in range(100):
    title_rows = L2M.createInputMatrix(titlesAndLyrics['Titles'][i])
    # Reserve two rows for the markers; overly long titles are truncated
    # instead of raising a shape error on assignment.
    n_rows = min(len(title_rows), max_decoder_seq_length - 2)
    if n_rows:
        targets[i, 1:n_rows + 1] = title_rows[:n_rows]
    targets[i, n_rows + 1] = 1.0
targets.shape

(100, 50, 103)

In [163]:
# Decoder targets for teacher forcing: the same sequences as `targets`, moved
# one timestep earlier, so that step t of the target is step t+1 of the decoder
# input. The final timestep has no successor and stays all-zero.
t_shift_target=np.zeros(targets.shape)
t_shift_target[:, :-1, :] = targets[:, 1:, :]

In [8]:
# Look up the string "apple" in the spaCy vocabulary's string store and print
# the value it maps to (a large integer, as the output below shows).
print(_nlp.vocab.strings["apple"])


8566208034543834098


In [9]:
# Round-trip one title through the vector space, then print a "creatified"
# variant of it for each seed word, in the same order as before.
t = titlesAndLyrics["Titles"][45]
print(t)
t = L2M.decodeMatrix(L2M.title2Seq(t))
for seed_word in ("love", "lust", "loss", "happy", "sad"):
    print(L2M.creatify(t, seed_word))

I've Been Waiting For You
I-love-you i've Aberdeen waiting Foreclosure You-you-you
'just i've spleen Awaiting Force You'se
los i've Eileen longing Foreclosure oiou
'happy you've Eileen waiting Unhappy You-you-you
afriad i've Eileen longing Forsaken oiou


### Actual Seq2Seq RNN begins here...

In [10]:
### The cells below are boilerplate adapted from the Keras blog's seq2seq tutorial:
### https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
### TODO: Adapt it to our vector-valued (non-one-hot) inputs and outputs.

In [12]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense,Embedding


Using TensorFlow backend.


In [124]:
# Sanity-check the encoder input tensor built earlier with
# np.zeros((100, max_encoder_seq_length, 103)): (samples, rows per sample, features).
inputs.shape

(100, 1000, 103)

In [126]:
# Number of units passed to both the encoder LSTM and the decoder LSTM below.
latent_dim=64

In [143]:
num_encoder_tokens = 200
num_decoder_tokens = 10
batch_size = len(inputs)
# Width of every timestep row: word vector plus the pos, tf and idf features.
num_parameters = 103

# Encoder: consume the lyric sequence and keep only the final LSTM states.
encoder_inputs = Input(shape=(None, num_parameters))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: start from the encoder states and emit one vector per title timestep.
decoder_inputs = Input(shape=(None, num_parameters))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# The targets are real-valued feature vectors (not one-hot rows) and the model
# is trained with MSE, so the output layer must be linear: a softmax is forced
# to sum to 1 and can never reproduce those targets.
decoder_dense = Dense(num_parameters, activation='linear')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [145]:
# Run training
model.compile(optimizer='rmsprop', loss='mse')
model.fit([inputs, targets], targets,
          batch_size=64,
          epochs=1,
          validation_split=0.2)

Train on 80 samples, validate on 20 samples
Epoch 1/1


<keras.callbacks.History at 0x19e566ea5f8>

In [156]:
# Inference-time encoder: maps an input sequence to the LSTM state pair.
encoder_model = Model(encoder_inputs, encoder_states)

# Inference-time decoder: same trained layers as the training graph, but the
# initial states arrive through explicit inputs so the model can be stepped one
# timestep at a time, feeding the returned states back in.
decoder_states_inputs = [Input(shape=(latent_dim,)) for _ in range(2)]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs, *decoder_states_inputs],
    [decoder_outputs, *decoder_states])

In [157]:
def decode_sequence(input_seq):
    """Greedily decode a title for one encoded lyric.

    Parameters
    ----------
    input_seq : array of shape (rows, features) for a single sample, or
        (1, rows, features) with the batch axis already present.

    Returns
    -------
    str
        The decoded words joined by spaces (empty if nothing was produced).

    Relies on the notebook globals `encoder_model`, `decoder_model`,
    `max_decoder_seq_length` and `L2M`.
    """
    input_seq = np.asarray(input_seq)
    if input_seq.ndim == 2:
        # The encoder expects a batch axis; add one for a single sample.
        input_seq = input_seq[np.newaxis, ...]
    n_features = input_seq.shape[-1]
    # The last three features of every row are pos, tf and idf; the rest is
    # the word vector that can be mapped back to a word.
    word_dim = n_features - 3
    # Heuristic: a predicted row this close to all-ones counts as the end marker.
    end_tolerance = 0.1

    # Encode the input as state vectors.
    states_value = list(encoder_model.predict(input_seq))

    # The training sequences start with an all-zero row, so decoding does too.
    target_seq = np.zeros((1, 1, n_features))

    decoded_rows = []
    while len(decoded_rows) < max_decoder_seq_length:
        output_vecs, h, c = decoder_model.predict(
            [target_seq] + states_value)
        step_vec = output_vecs[0, -1, :]

        # Exit condition: the model emitted (approximately) the all-ones end row.
        if np.allclose(step_vec, 1.0, atol=end_tolerance):
            break
        decoded_rows.append(step_vec)

        # Feed the prediction back in as the next decoder input.
        target_seq = step_vec.reshape(1, 1, n_features)
        states_value = [h, c]

    if not decoded_rows:
        return ''
    return L2M.decodeMatrix(np.array(decoded_rows)[:, :word_dim])

In [155]:
# decode_sequence passes its argument straight to encoder_model.predict, which
# expects a 3-D batch. Slicing (rather than indexing) keeps the leading batch
# axis, so the sample has shape (1, 1000, 103) instead of (1000, 103).
decode_sequence(inputs[1:2])