In [0]:
# Lyrics Generator

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.callbacks import LambdaCallback
from keras.models import Sequential, Model
from keras.layers import Dense, Input, GRU
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import random
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.layers import Dropout

Using TensorFlow backend.


In [0]:
# Extract data from zip file
import zipfile
with zipfile.ZipFile('lyrics.csv.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [0]:
# Read into dataframe and drop all the NAs
df = pd.read_csv('lyrics.csv')
df = df.dropna()
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [0]:
# Removing songs without lyrics
pop = df.loc[df['genre'] == 'Pop']
pop = pop.loc[pop['lyrics'].str.len() >= 100]
pop = pop.loc[pop['lyrics'].str.len() <= 1000]
pop = pop.head(2000)

In [0]:
# concating and splitting the lyrics into tokens
lyrics = pop['lyrics'].str.cat(sep = ' ')
lyrics = lyrics.replace('\n', ' \n ')
lyrics = lyrics.lower()
tokens = [w for w in lyrics.split(' ') if w.strip() != '' or w == '\n']

In [0]:
# chekc tokenized lyrics
lyrics[0:100], tokens[0:20]
ts_tokens = tokens
len(ts_tokens)

331595

In [0]:
words = set(ts_tokens)
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

In [0]:
# splitting tokens into sequences
seq_leng = 6
sentences = []
next_words = []
for i in range(0, len(ts_tokens) - seq_leng, 1):
  sentences.append(ts_tokens[i:i+seq_leng])
  next_words.append(ts_tokens[i+seq_leng])
  


In [0]:
sentences[0], next_words[0]

(['if', 'you', 'search', '\n', 'for', 'tenderness'], '\n')

In [0]:
len(words)

20037

In [0]:
# Training, testing split
sentences_train, sentences_test, words_train, words_test = train_test_split(sentences, next_words, random_state = 43)

In [0]:
# vectorize data
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, seq_leng, len(words)), dtype=np.bool)
        y = np.zeros((batch_size, len(words)), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, word_indices[w]] = 1
            y[i, word_indices[next_word_list[index]]] = 1

            index = index + 1
            if index == len(sentence_list):
                index = 0
        yield x, y

In [0]:
# LSTM Model

"""
model = Sequential()
model.add(LSTM((128), input_shape=(seq_leng, len(words))))
Dense(128, activation='relu')
model.add(Dropout(0.3))
model.add(Dense(len(words), activation='softmax'))
"""
inp = Input(shape=(seq_leng, len(words)))
x = LSTM(128, return_sequences=True)(inp)
x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
outp = Dense(len(words), activation='softmax')(x)

model = Model(inputs=[inp], outputs=[outp])
model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy']
    )
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 6, 20037)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 6, 128)            10324992  
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 20037)             2584773   
Total params: 13,057,861
Trainable params: 13,057,861
Non-trainable params: 0
________________________________________________________________

In [0]:
# optimizer = RMSprop(lr=0.01)
# model.compile(loss='categorical_crossentropy', optimizer=optimizer)

filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [0]:
BATCH_SIZE = 128
EPOCHS = 20

model.fit_generator(generator(sentences_train, words_train, BATCH_SIZE),
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
    epochs=EPOCHS,
    validation_data=generator(sentences_test, words_test, BATCH_SIZE),
                   validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1,
                   callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 6.24011, saving model to weights-improvement-01-6.2401.hdf5
Epoch 2/20

Epoch 00002: loss improved from 6.24011 to 5.79983, saving model to weights-improvement-02-5.7998.hdf5
Epoch 3/20

Epoch 00003: loss improved from 5.79983 to 5.67467, saving model to weights-improvement-03-5.6747.hdf5
Epoch 4/20

Epoch 00004: loss improved from 5.67467 to 5.64053, saving model to weights-improvement-04-5.6405.hdf5
Epoch 5/20

Epoch 00005: loss improved from 5.64053 to 5.63315, saving model to weights-improvement-05-5.6331.hdf5
Epoch 6/20

Epoch 00006: loss improved from 5.63315 to 5.60683, saving model to weights-improvement-06-5.6068.hdf5
Epoch 7/20

Epoch 00007: loss improved from 5.60683 to 5.52559, saving model to weights-improvement-07-5.5256.hdf5
Epoch 8/20

Epoch 00008: loss improved from 5.52559 to 5.40340, saving model to weights-improvement-08-5.4034.hdf5
Epoch 9/20

Epoch 00009: loss improved from 5.40340 to 5.36965, saving model to weig

In [0]:
# load the network weights from the best result
filename = "weights-improvement-09-5.3684.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [0]:
# Generate Starting Seed from keywords
import nltk
bigram = list(nltk.ngrams(ts_tokens, 6))
fdist = nltk.FreqDist(bigram)

In [0]:
keyword = "sky"
grams = []
for ngrams, freq in fdist.items():
  if keyword in ngrams and ngrams[5] == '\n':
    grams.append(ngrams)


pattern = list(grams[random.randint(0, len(grams))])
pattern
# pattern = ['you', 'are', 'my', 'sunshine', '\n', 'my']

['by', 'helicopter', 'to', 'the', 'sky', '\n']

In [0]:
# Generate texts

texts = pattern

for i in range(200):
  pred = np.zeros((seq_leng, len(words)), dtype=np.bool)
  for i, w in enumerate(pattern):
    pred[i, word_indices[w]] = 1
  pred = np.reshape(pred, (1, seq_leng, len(words)))
  prediction = model.predict(pred, verbose=0)
  index = np.argmax(prediction)
  pattern.append(indices_word[index])
  pattern = pattern[1:]
  texts.append(indices_word[index])
 
print(' '.join(texts))


by helicopter to the sky 
 and and i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be 
 i want to be


In [0]:
# vectorize the starting seed

In [0]:
######################################
######################################
# Try new model new embeddings
dataX = [[word_indices[w] for w in s] for s in sentences]
dataY = [word_indices[w] for w in next_words]

In [0]:
len(dataX), len(dataY)

(91987, 91987)

In [0]:
X = np.reshape(dataX, (len(dataX), seq_leng, 1))
X = X / len(words)
y = np_utils.to_categorical(dataY)
X.shape

(91987, 6, 1)

In [0]:
# Compile model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# define the checkpoint
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [0]:
model.fit(X, y, epochs=50, batch_size=128, callbacks=callbacks_list)

In [0]:
# load weights and predict
model.load_weights('weights-improvement-50-1.9784.hdf5')
model.compile(loss='categorical_crossentropy', optimizer='adam')
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print(' '.join([indices_word[value] for value in pattern]))

pattern


Seed:
me lord, if you think there's


[2270, 2710, 63, 3262, 292, 5799]

In [0]:
start = [indices_word[value] for value in pattern]

for i in range(100):
  x = np.reshape(pattern, (1, len(pattern), 1))
  x = x / float(len(words))
  prediction = model.predict(x, verbose=0)
  index = np.argmax(prediction)
  result = indices_word[index]
  seq_in = [indices_word[value] for value in pattern]
  pattern.append(index)
  pattern = pattern[1:len(pattern)]
  start.append(result)
  
start

['me',
 'lord,',
 'if',
 'you',
 'think',
 "there's",
 'me',
 'lost',
 '"yoncã©"',
 'ha',
 'boy',
 '\n',
 'make',
 'spit',
 '\n',
 'mind)',
 '\n',
 '\n',
 'virgo,',
 'aaaaaaaaa-oooo',
 '\n',
 'you',
 'without',
 'toughest',
 "that's",
 '\n',
 '\n',
 'with',
 'works',
 'a',
 '\n',
 'si',
 '\n',
 'one',
 'you',
 'you',
 '\n',
 'to',
 'goes',
 'the',
 '\n',
 'on',
 'h,',
 'left',
 'a',
 '\n',
 'into',
 '\n',
 '\n',
 '(*',
 '50',
 'cent',
 '\n',
 'my',
 'killing',
 '\n',
 'easy',
 'you',
 'la',
 'you',
 'what?',
 'party',
 'i',
 '\n',
 '\n',
 'i',
 'low',
 'my',
 'to',
 'you',
 '\n',
 'boy,',
 '\n',
 'cat-walks,',
 '\n',
 'baby,',
 '\n',
 '(oh)',
 '\n',
 'refuse',
 '\n',
 '(oh)',
 '\n',
 'lost',
 '\n',
 '(lost,',
 '\n',
 '(oh)',
 '\n',
 'you',
 '\n',
 '(oh)',
 '\n',
 'must',
 '\n',
 '(oh)',
 '\n',
 'have',
 '\n',
 '(oh)',
 '\n',
 'lost',
 '\n',
 '(oh)',
 '\n',
 'yo']