In [2]:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import sys, re
import random
from collections import Counter


In [4]:
base_dir = '/home/ubuntu/'
filename = 'all_lyrics_no-umlaut.txt'

path = base_dir + filename

try: 
    text = open(path).read().lower()
except UnicodeDecodeError:
    import codecs
    text = codecs.open(path, encoding='utf-8', errors='ignore').read().lower()



In [5]:


text = re.sub('\".*?\"', '', text)  # remove titles of songs
text = text.replace('\n\n\n', '\n')
text = text.replace('\n\n', '\n')
text = text.replace('\r\n', '\n')
    
text = text.replace('  ', ' ')


print('corpus length:', len(text))

chars = set(text)
words = set(text.split())

print("chars:",type(chars))
print("words",type(words))
print("total number of unique words",len(words))
print("total number of unique chars", len(chars))


('corpus length:', 144902)
('chars:', <type 'set'>)
('words', <type 'set'>)
('total number of unique words', 3361)
('total number of unique chars', 55)


In [19]:
def sample(a, temperature=1.0):
    # helper function to sample an index from a probability array
    a = np.log(a) / temperature
    a = np.exp(a) / (np.sum(np.exp(a)+0.0000001))
    return np.argmax(np.random.multinomial(1, a, 1))
#    return np.argmax( a )


In [6]:

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

print("word_indices", type(word_indices), "length:",len(word_indices) )
print("indices_words", type(indices_word), "length", len(indices_word))

maxlen = 30
step = 3
print("maxlen:",maxlen,"step:", step)
sentences = []
next_words = []
list_words = []

sentences2=[]
list_words=text.lower().split()

#split the text into sentences of length maxlen --> sentences 
#split the list of words into list of "next_words" 
for i in range(0,len(list_words)-maxlen, step):
    sentences2 = ' '.join(list_words[i: i + maxlen])
    sentences.append(sentences2)
    next_words.append((list_words[i + maxlen]))
    
    

('word_indices', <type 'dict'>, 'length:', 3361)
('indices_words', <type 'dict'>, 'length', 3361)
('maxlen:', 30, 'step:', 3)


In [12]:
print('nb sequences(length of sentences):', len(sentences))
print("length of next_word",len(next_words))

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(words)), dtype=np.bool)
y = np.zeros((len(sentences), len(words)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for (t, word) in enumerate(sentence.split()):
        print(i,t,word)
        X[i, t, word_indices[word]] = 1
    y[i, word_indices[next_words[i]]] = 1


('nb sequences(length of sentences):', 9634)
('length of next_word', 9634)
Vectorization...
(0, 0, 'when')
(0, 1, "i'm")
(0, 2, "lyin'")
(0, 3, 'in')
(0, 4, 'my')
(0, 5, 'bed')
(0, 6, 'at')
(0, 7, 'night')
(0, 8, 'i')
(0, 9, "don't")
(0, 10, 'wanna')
(0, 11, 'grow')
(0, 12, 'up')
(0, 13, 'nothing')
(0, 14, 'ever')
(0, 15, 'seems')
(0, 16, 'to')
(0, 17, 'turn')
(0, 18, 'out')
(0, 19, 'right')
(0, 20, 'i')
(0, 21, "don't")
(0, 22, 'wanna')
(0, 23, 'grow')
(0, 24, 'up')
(0, 25, 'how')
(0, 26, 'do')
(0, 27, 'you')
(0, 28, 'move')
(0, 29, 'in')
(1, 0, 'in')
(1, 1, 'my')
(1, 2, 'bed')
(1, 3, 'at')
(1, 4, 'night')
(1, 5, 'i')
(1, 6, "don't")
(1, 7, 'wanna')
(1, 8, 'grow')
(1, 9, 'up')
(1, 10, 'nothing')
(1, 11, 'ever')
(1, 12, 'seems')
(1, 13, 'to')
(1, 14, 'turn')
(1, 15, 'out')
(1, 16, 'right')
(1, 17, 'i')
(1, 18, "don't")
(1, 19, 'wanna')
(1, 20, 'grow')
(1, 21, 'up')
(1, 22, 'how')
(1, 23, 'do')
(1, 24, 'you')
(1, 25, 'move')
(1, 26, 'in')
(1, 27, 'a')
(1, 28, 'world')
(1, 29, 'of')
(2, 

In [13]:
#build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(words))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(words)))
#model.add(Dense(1000))
model.add(Activation('softmax'))

history = model.compile(loss='categorical_crossentropy', optimizer='rmsprop')


Build model...


In [None]:
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min', period=1)
callbacks_list = [checkpoint]

history = model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)
plot_logs(history)

Epoch 1/20
Epoch 2/20
Epoch 3/20
1152/9634 [==>...........................] - ETA: 17s - loss: 0.0362

In [24]:

start_index = random.randint(0, len(list_words) - maxlen - 1)

for diversity in [0.2, 0.5, 1.0, 1.2]:
    print()
    print('----- diversity:', diversity)
    generated = ''
    sentence = list_words[start_index: start_index + maxlen]
    generated += ' '.join(sentence)
    print('----- Generating with seed: "' , sentence , '"')
    print()
    sys.stdout.write(generated)
    print()

    for i in range(128):
        x = np.zeros((1, maxlen, len(words)))
        for t, word in enumerate(sentence):
            x[0, t, word_indices[word]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_word = indices_word[next_index]
        generated += next_word
        del sentence[0]
        sentence.append(next_word)
        sys.stdout.write(' ')
        sys.stdout.write(next_word)
        sys.stdout.flush()
    print()



()
('----- diversity:', 0.2)
('----- Generating with seed: "', ['crummy', 'crummy', 'stuff', 'crummy', 'crummy', 'stuff', 'crummy', 'music,', 'crummy', 'tv', 'crummy', 'people,', 'crummy', 'movies', 'like', 'a', 'cat', 'caught', 'up', 'a', 'tree', 'this', 'could', 'only', 'happen', 'to', 'me', 'i', 'had', 'enough,'], '"')
()
crummy crummy stuff crummy crummy stuff crummy music, crummy tv crummy people, crummy movies like a cat caught up a tree this could only happen to me i had enough,()
 i tell i rough i can't

  app.launch_new_instance()


 enough of the crummy are crummy crummy stuff you're crummy real good not not you've it's life we had baby baby love her stay i know this had we don't need a 8th more i've 8th cretin good my gun. and the crusher is my 8th everything i a party well, 8th i was wanna around in a 8th more gonna really 8th she never why down to girl some 8th with a part tonight for something her break of 'til as the drink of 8th in the last rock for then there place out for 8th in the feel this when i can't wait from me i'm the promise more he got a car of a 8th good a makes her()
()
('----- diversity:', 0.5)
('----- Generating with seed: "', ['crummy', 'crummy', 'stuff', 'crummy', 'crummy', 'stuff', 'crummy', 'music,', 'crummy', 'tv', 'crummy', 'people,', 'crummy', 'movies', 'like', 'a', 'cat', 'caught', 'up', 'a', 'tree', 'this', 'could', 'only', 'happen', 'to', 'me', 'i', 'had', 'enough,'], '"')
()
crummy crummy stuff crummy crummy stuff crummy music, crummy tv crummy people, crummy movies like a cat cau