In [10]:
from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Activation
from keras.callbacks import ModelCheckpoint

import numpy as np
import random
import io
import json
import re

In [11]:
# Loading the training data
path = '../data/manele-merged.json'
text = []
with io.open(path, encoding="utf-8") as f:
    file = json.load(f)
    for manea in file:
        for lyric in manea['lyrics']:
            text.append(lyric)
text = ''.join(text)

# Cleaning the text
text = text.lower()
to_replace = list('!"$&()*+/:;<=>@[]^_~{}#%\\|–…\ufeff\xa0§«»')
to_replace.append("'")
to_replace.append("refren")
to_replace.append("ref")
to_replace.append("florin salam")
to_replace.append("salam")
to_replace.append("bis")
to_replace.append("augustin")
to_replace.append("nicolae guta")
to_replace.append("nicoleta guta")
to_replace.append("guta")
to_replace.append("costel biju")
to_replace.append("liviu pustiu")
to_replace.append("dani mocanu")
to_replace.append("vali vijelie")
to_replace.append("solo")
to_replace.append("x2")
to_replace.append("2x")

for word in to_replace:
    text = text.replace(word, '')

text = re.sub('â|ă|а', 'a', text)
text = re.sub('í|î|ï|і|ἰ', 'i', text)
text = re.sub('ş|ș|ѕ', 's', text)
text = re.sub('ţ', 't', text)
text = re.sub('ν', 'v', text)
text = re.sub('в', 'b', text)
text = re.sub('е', 'e', text)
text = re.sub('к', 'k', text)
text = re.sub('м', 'm', text)
text = re.sub('н', 'h', text)
text = re.sub('о', 'o', text)
text = re.sub('р', 'p', text)
text = re.sub('с', 'c', text)
text = re.sub('т', 't', text)
text = re.sub('у', 'y', text)
text = re.sub('х', 'x', text)
text = re.sub('ј', 'j', text)


text = re.sub(r'\d\.', '', text)
text = re.sub(r'st?rofa \d*', '', text)
text = re.sub(r'-{2,}', '', text)
text = re.sub(r'sh', 's', text)
text = re.sub(r'\.{4,}', '...', text)
text = re.sub(r'\n\s*\n', '\n\n', text)
print("Corpus length:", len(text))

chars = sorted(list(set(text)))
print(chars)
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


Corpus length: 1577240
['\n', ' ', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Total chars: 42


In [12]:
text = text.replace('\n', ' \n ')
text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']

In [13]:
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1
MIN_WORD_FREQUENCY = 50
ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)
words = set(text_in_words)
print('Unique words before ignoring:', len(words))
# print('Ignoring words with frequency')
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

Unique words before ignoring: 15386
Unique words after ignoring: 380


In [14]:
# cut the text in semi-redundant sequences of SEQUENCE_LEN words
STEP = 1
SEQUENCE_LEN = 10
sentences = []
next_words = []
ignored = 0
for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored+1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

Ignored sequences: 351390
Remaining sequences: 24084


In [15]:
def generator(sentence_list, next_words_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.bool)
        y = np.zeros((batch_size), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t] = word_indices[w]
                y[i] = next_words_list[index % len(sentence_list)]
                index = index + 1
                yield x, y


In [20]:
def generator_embedding(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t] = 1
            y[i] = 1

            index = index + 1
            if index == len(sentence_list):
                index = 0
        yield x, y

In [17]:
model = Sequential()
model.add(Embedding(len(words), 64))
model.add(Dropout(0.2))
model.add(GRU(64))
model.add(Dropout(0.2))
model.add(Dense(len(words)))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [18]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [21]:
epochs = 3
BATCH_SIZE = 128 

model.fit(generator(sentences, next_words, BATCH_SIZE),
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
    epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2bb80757d30>

In [22]:
generated_length = 30
for diversity in [0.2, 0.5]:
    print("...Diversity:", diversity)

    generated = ""
    # original_sentence = text[start_index : start_index + maxlen]
    original_sentence = ["iubire"]
    sentence = original_sentence
    print('...Generating with seed:\n "' + str(sentence) + '"')

    for i in range(generated_length):
        x_pred = np.zeros((1, generated_length, len(word_indices)))
        for t, char in enumerate(sentence):
            x_pred[0, t, word_indices[char]] = 1.0
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_word = indices_word[next_index]
        sentence = sentence[1:] + next_word
        generated += next_word

    print("...Generated:\n", original_sentence + generated)
    print()

...Diversity: 0.2
...Generating with seed:
 "['iubire']"


ValueError: in user code:

    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\training.py:1586 predict_function  *
        return step_function(self, iterator)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\training.py:1576 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\training.py:1569 run_step  **
        outputs = model.predict_step(data)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\training.py:1537 predict_step
        return self(x, training=False)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\base_layer.py:1037 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\sequential.py:369 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\functional.py:415 call
        inputs, training=training, mask=mask)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\functional.py:550 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\layers\recurrent.py:659 __call__
        return super(RNN, self).__call__(inputs, **kwargs)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\base_layer.py:1020 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    E:\ProgramData\Anaconda3\envs\dl\lib\site-packages\keras\engine\input_spec.py:218 assert_input_compatibility
        str(tuple(shape)))

    ValueError: Input 0 of layer gru_1 is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 30, 380, 64)


In [39]:
model.save("../models/model-15-1.05.h5")

In [15]:
# Save weights in hdf5 format
filepath="model-{epoch:02d}-{loss:.4f}.h5"
checkpoint = ModelCheckpoint(filepath + ".hdf5", monitor='loss', verbose=1, save_best_only=True, mode='min')