In [142]:
# Imports

import io
import os
import sys
import string
import numpy as np
import pandas as pd
import tensorflow
from collections import Counter
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras import Input, Model, backend, utils
from keras.layers import *
backend.clear_session()

In [143]:
# Put song data into dataframe

translator = str.maketrans('', '', string.punctuation)

df = pd.read_csv("/data/lyrics.csv", sep="\t", engine="python", encoding="utf-8", error_bad_lines=False)
# Drops lyrics with NaN as their value
df.index.name = 'id'
df = df.dropna()

df.head()




  exec(code_obj, self.user_global_ns, self.user_ns)
Skipping line 2188: unexpected end of data


Unnamed: 0_level_0,song_id,lyrics
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."
5,0bYg9bo50gSsH3LtXe2SQn,"[""[Intro]\nI-I-I don't want a lot for Christma..."


In [144]:
def split_text(x):

   text = x['lyrics']

   sections = text.split('\\n\\n')

   keys = {'Verse 1': np.nan,'Verse 2':np.nan,'Verse 3':np.nan,'Verse 4':np.nan, 'Chorus':np.nan}

   lyrics = str()

   single_text = []

   res = {}

   
   for s in sections:

       key = s[s.find('[') + 1:s.find(']')].strip()

       if ':' in key:

           key = key[:key.find(':')]
          

       if key in keys:
          single_text += [x.lower().replace('(','').replace(')','').translate(translator) for x in s[s.find(']')+1:].split('\\n') if len(x) > 1]

       res['single_text'] =  ' \n'.join(single_text)
   return pd.Series(res)


df = df.join( df.apply(split_text, axis=1), lsuffix="_Left", rsuffix="_Right")

df.head()

Unnamed: 0_level_0,song_id,lyrics,single_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3e9HZxeyfWwjeyPAMmWSSQ,['[Verse 1]\nThought I\'d end up with Sean\nBu...,thank you next next \nthank you next next \nth...
1,5p7ujcrUXASCNwRaWNHR1C,"[""[Verse 1]\nFound you when your heart was bro...",tell me hows it feel sittin up there \nfeelin ...
2,2xLMifQCjDGFmkHkpNLD9h,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun...",woo made this here with all the ice on in the ...
4,1rqqCSm0Qe4I9rUvWncaom,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t...",had to have high high hopes for a living \nsho...
5,0bYg9bo50gSsH3LtXe2SQn,"[""[Intro]\nI-I-I don't want a lot for Christma...",i dont want a lot for christmas \nthere is jus...


In [151]:
# Filtering our dataset for common vs. uncommon words
text_list = []
word_frequencies = Counter()
uncommon_words = set()
MIN_FREQUENCY = 7
MIN_SEQ = 5
BATCH_SIZE = 32


def extract_text(text):
   global text_list
   text_list += [w for w in text.split(' ') if w.strip() != '' or w == '\n']


df['single_text'].apply( extract_text )

print('Total words: ', len(text_list))


for w in text_list:

   word_frequencies[w] = word_frequencies.get(w, 0) + 1
  

uncommon_words = set([key for key in word_frequencies.keys() if word_frequencies[key] < MIN_FREQUENCY])

words = sorted(set([key for key in word_frequencies.keys() if word_frequencies[key] >= MIN_FREQUENCY]))


num_words = len(words)

word_indices = dict((w, i) for i, w in enumerate(words))

indices_word = dict((i, w) for i, w in enumerate(words))

print('Words with less than {} appearances: {}'.format( MIN_FREQUENCY, len(uncommon_words)))

print('Words with more than {} appearances: {}'.format( MIN_FREQUENCY, len(words)))


valid_seqs = []

end_seq_words = []

for i in range(len(text_list) - MIN_SEQ ):

   end_slice = i + MIN_SEQ + 1

   if len( set(text_list[i:end_slice]).intersection(uncommon_words) ) == 0:

       valid_seqs.append(text_list[i: i + MIN_SEQ])

       end_seq_words.append(text_list[i + MIN_SEQ])
      

print('Valid sequences of size {}: {}'.format(MIN_SEQ, len(valid_seqs)))


X_train, X_test, y_train, y_test = train_test_split(valid_seqs, end_seq_words, test_size=0.02, random_state=42)

print(X_train[2:5])

Total words:  493654
Words with less than 7 appearances: 14398
Words with more than 7 appearances: 4252
Valid sequences of size 5: 368773
[['that', '\ni', 'had', 'someone', 'tell'], ['see', 'why', '\nnowadays', 'theres', 'still'], ['\ni', 'like', 'my', 'kisses', 'down']]


In [152]:
# Data generator for fit and evaluate

def generator(sentence_list, next_word_list, batch_size):

   index = 0

   while True:

       x = np.zeros((batch_size, MIN_SEQ), dtype=np.int32)

       y = np.zeros((batch_size), dtype=np.int32)

       for i in range(batch_size):

           for t, w in enumerate(sentence_list[index % len(sentence_list)]):

               x[i, t] = word_indices[w]

           y[i] = word_indices[next_word_list[index % len(sentence_list)]]

           index = index + 1

       yield x, y


# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py

def sample(preds, temperature=1.0):

   # helper function to sample an index from a probability array

   preds = np.asarray(preds).astype('float64')

   preds = np.log(preds) / temperature

   exp_preds = np.exp(preds)

   preds = exp_preds / np.sum(exp_preds)

   probas = np.random.multinomial(1, preds, 1)

   return np.argmax(probas)


def on_epoch_end(epoch, logs):

   # Function invoked at end of each epoch. Prints generated text.

   examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)


   # Randomly pick a seed sequence

   seed_index = np.random.randint(len(X_train+X_test))

   seed = (X_train+X_test)[seed_index]


   for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:

       sentence = seed

       examples_file.write('----- Diversity:' + str(diversity) + '\n')

       examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')

       examples_file.write(' '.join(sentence))


       for i in range(50):

           x_pred = np.zeros((1, MIN_SEQ))

           for t, word in enumerate(sentence):

               x_pred[0, t] = word_indices[word]


           preds = model.predict(x_pred, verbose=0)[0]

           next_index = sample(preds, diversity)

           next_word = indices_word[next_index]


           sentence = sentence[1:]

           sentence.append(next_word)


           examples_file.write(" "+next_word)

       examples_file.write('\n')

   examples_file.write('='*80 + '\n')

   examples_file.flush()

In [156]:
def getModel():

   print('Build model...')

   model = Sequential()

   model.add(Embedding(input_dim=len(words), output_dim=1024))

   model.add(Bidirectional(LSTM(128)))

   model.add(Dense(len(words)))

   model.add(Activation('softmax'))

   return model

def getEncoder():
   # int sequences.
   enc_inputs = Input(shape=(20,), name='enc_inputs')

   
   # Embedding lookup and GRU
   embedding = Embedding(input_dim=100, output_dim=64)(enc_inputs)
   whole_sequence = GRU(4, return_sequences=True)(embedding)

   # Query-value attention of shape [batch_size, Tq, filters].
   query_value_attention_seq = Attention()([whole_sequence, whole_sequence])

   # build encoder model 
   encoder = Model(enc_inputs, query_value_attention_seq, name='encoder')

   return encoder

def getDecoder():
  # int sequences.
  dec_input = Input(shape=(20, 4), name='dec_inputs')

  # LSTM
  whole_sequence = LSTM(4, return_sequences=True)(dec_input)

  # Query-value attention of shape [batch_size, Tq, filters].
  query_value_attention_seq = AdditiveAttention()([whole_sequence, dec_input])

  # Reduce over the sequence axis to produce encodings of shape
  # [batch_size, filters].
  query_value_attention = GlobalAveragePooling1D()(query_value_attention_seq)

  # classification
  dec_output = Dense(1, activation='sigmoid')(query_value_attention)

  # build decoder model
  decoder = Model(dec_input, dec_output, name='decoder')
  return decoder

def getAutoEncoder():
  encoder = getEncoder()
  encoder_init = Input(shape=(20, ))
  encoder_output = encoder(encoder_init)
  print(encoder_output.shape)

  decoder = getDecoder()
  decoder_output = decoder(encoder_output)
  print(decoder_output.shape)

  autoencoder = Model(encoder_init, decoder_output)
  return autoencoder


In [None]:
# model = getAutoEncoder()
model = getModel()

model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])


file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}-acc{accuracy:.4f}-val_loss{val_loss:.4f}-val_acc{val_accuracy:.4f}" % \
            (len(words), MIN_SEQ, MIN_FREQUENCY)


checkpoint = ModelCheckpoint(file_path, monitor='val_accuracy', save_best_only=True)

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

early_stopping = EarlyStopping(monitor='val_accuracy', patience=20)

callbacks_list = [checkpoint, print_callback, early_stopping]


examples_file = open('examples.txt', "w")

model.fit(generator(X_train, y_train, BATCH_SIZE),

                   steps_per_epoch=int(len(valid_seqs)/BATCH_SIZE) + 1,

                   epochs=20,

                   callbacks=callbacks_list,

                   validation_data=generator(X_test, y_train, BATCH_SIZE),

                   validation_steps=int(len(y_train)/BATCH_SIZE) + 1)