In [1]:
import os
import sys

# Put the parent directory on the module search path (presumably so that the
# project-local `preprocessing` module imported later can be found — verify).
sys.path.append(os.pardir)

In [23]:
import numpy
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [3]:
from preprocessing import get_dataframe, set_seed

In [4]:
# Fix the random seed up front for reproducibility. set_seed is a project helper
# from `preprocessing`; which RNGs it seeds is not visible here — verify.
set_seed(42)

In [5]:
# Build the lyrics DataFrame from ../data/lyrics using the project helper and
# preview the first rows.
lyrics_dir = os.path.join('..', 'data', 'lyrics')
df = get_dataframe(lyrics_dir)
df.head()

Unnamed: 0,album,song,text
0,AHardDaysNight,Ill_Be_Back.txt,"You know, if you break my heart I'll go But I'..."
1,AHardDaysNight,Cant_Buy_Me_Love.txt,"Can't buy me love, oh Love, oh Can't buy me lo..."
2,AHardDaysNight,Any_Time_At_All.txt,Any time at all Any time at all Any time at al...
3,AHardDaysNight,A_Hard_Days_Night.txt,It's been a hard day's night And I've been wor...
4,AHardDaysNight,Ill_Cry_Instead.txt,I've got every reason on earth to be mad 'Caus...


In [7]:
# Concatenate every entry of the `text` column into one long string separated
# by single spaces; str() coerces any non-string cell.
texts = ' '.join(map(str, df.text))

In [8]:
def tokenize_words(input):
    """Lowercase `input`, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to clean. The name shadows the builtin `input`; it is kept
        so that existing keyword callers keep working.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    # lowercase everything to standardize it
    text = input.lower()

    # split on runs of word characters, which discards punctuation
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Load the stop-word list once and keep it in a set. Calling
    # stopwords.words() inside the per-token filter re-fetched the list for
    # every token and tested membership against a list.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)

In [9]:
# Clean the whole corpus in one pass: lowercased, tokenized, stop words removed,
# and re-joined into a single space-separated string.
processed_inputs = tokenize_words(texts)

In [10]:
# Character-level vocabulary: every distinct character of the cleaned text,
# in sorted order.
chars = sorted(set(processed_inputs))
# character -> integer id (its position in `chars`)
char_to_num = {char: idx for idx, char in enumerate(chars)}

In [11]:
# Corpus size in characters and number of distinct characters.
vocab_len = len(chars)
input_len = len(processed_inputs)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 99824
Total vocab: 36


In [12]:
# Window size: number of consecutive characters in each input sequence.
seq_length = 100
# Accumulators for the encoded input windows and their target characters.
x_data, y_data = [], []

In [13]:
# Slide a window of `seq_length` characters over the cleaned text, one character
# at a time. Each window is one training input; the character immediately after
# the window is its target.
#
# The lists are rebuilt from scratch here so that re-running this cell does not
# append a second copy of every pattern.
x_data = []
y_data = []
for i in range(0, input_len - seq_length, 1):
    # Input: the `seq_length` characters starting at position i
    in_seq = processed_inputs[i:i + seq_length]

    # Target: the single character that follows the input window
    out_seq = processed_inputs[i + seq_length]

    # Encode characters as integers using the mapping built previously
    # and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [15]:
# Number of (input window, target character) training pairs that were built.
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 99724


In [16]:
# Arrange the integer-encoded windows as (n_patterns, seq_length, 1) and divide
# by the vocabulary size so every value lies in [0, 1).
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [17]:
# Sanity check: expect (n_patterns, seq_length, 1).
X.shape

(99724, 100, 1)

In [19]:
# One-hot encode the target characters. num_classes is pinned to the vocabulary
# size so the label width always matches the vocabulary; without it,
# to_categorical infers the width as max(y_data) + 1, which is too narrow if the
# highest-numbered character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [24]:
# Three stacked LSTM layers, each followed by 20% dropout, ending in a softmax
# over the one-hot target width. The first two LSTMs return the full sequence
# so the next LSTM receives one vector per time step.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [26]:
# Next-character prediction is a multi-class problem over one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [30]:
# A single epoch is only a quick smoke test of the pipeline.
model.fit(X, y, epochs=1, batch_size=256)  # increase epochs to >100 to get acceptable results



<tensorflow.python.keras.callbacks.History at 0x7f84ce34af50>

In [36]:
# Generate text: start from a randomly chosen training window and repeatedly
# predict the next character, sliding the window forward by one each time.
set_seed(111)
# NOTE: randint's upper bound is exclusive, so the last pattern is never picked;
# the call is left unchanged so the sampled start stays the same.
start = numpy.random.randint(0, len(x_data) - 1)
# Work on a copy: x_data[start] is the training list itself, and appending to it
# directly would leave that training window one element too long.
pattern = list(x_data[start])
num_to_char = dict(enumerate(chars))

for i in range(10):  # number of characters to generate
    # same shape and scaling as the training input X
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # greedy decoding: take the most probable next character
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # slide the window: drop the oldest character, append the prediction
    pattern = pattern[1:] + [index]

oe  o   e 