In [1]:
from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Activation
from keras.callbacks import ModelCheckpoint

import numpy as np
import random
import io
import json
import re

In [32]:
# Loading the training data
path = '../data/manele-merged.json'
text = []
with io.open(path, encoding="utf-8") as f:
    file = json.load(f)
    for manea in file:
        for lyric in manea['lyrics']:
            text.append(lyric)
text = ''.join(text)

# Cleaning the text
text = text.lower()
to_replace = list('!"$&()*+/:;<=>@[]^_~{}#%\\|–…\ufeff\xa0§«»')
to_replace.append("'")
to_replace.append("refren")
to_replace.append("ref")
to_replace.append("florin salam")
to_replace.append("salam")
to_replace.append("bis")
to_replace.append("augustin")
to_replace.append("nicolae guta")
to_replace.append("nicoleta guta")
to_replace.append("guta")
to_replace.append("costel biju")
to_replace.append("liviu pustiu")
to_replace.append("dani mocanu")
to_replace.append("vali vijelie")
to_replace.append("solo")
to_replace.append("x2")
to_replace.append("2x")

for word in to_replace:
    text = text.replace(word, '')

text = re.sub('â|ă|а', 'a', text)
text = re.sub('í|î|ï|і|ἰ', 'i', text)
text = re.sub('ş|ș|ѕ', 's', text)
text = re.sub('ţ', 't', text)
text = re.sub('ν', 'v', text)
text = re.sub('в', 'b', text)
text = re.sub('е', 'e', text)
text = re.sub('к', 'k', text)
text = re.sub('м', 'm', text)
text = re.sub('н', 'h', text)
text = re.sub('о', 'o', text)
text = re.sub('р', 'p', text)
text = re.sub('с', 'c', text)
text = re.sub('т', 't', text)
text = re.sub('у', 'y', text)
text = re.sub('х', 'x', text)
text = re.sub('ј', 'j', text)


text = re.sub(r'\d\.', '', text)
text = re.sub(r'st?rofa \d*', '', text)
text = re.sub(r'-{2,}', '', text)
text = re.sub(r'sh', 's', text)
text = re.sub(r'\.{4,}', '...', text)
text = re.sub(r'\n\s*\n', '\n\n', text)
print("Corpus length:", len(text))

chars = sorted(list(set(text)))
print(chars)
print("Total chars:", len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


Corpus length: 1577240
['\n', ' ', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Total chars: 42


In [42]:
text = text.replace('\n', ' \n ')
text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']

In [None]:
word_freq = {}
for word in text_in_words:
    word_freq{word} = word_freq.get(word, 0) + 1
MIN_WORD_FREQUENCY = 450
ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)
words = set(text_in_words)
print('Unique words before ignoring:', len(words))
# print('Ignoring words with frequency')
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

In [44]:

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 25
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


Number of sequences: 609665


In [5]:
model = Sequential()
model.add(LSTM(256, input_shape=(maxlen, len(chars)), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
model2 = Sequential()
model2.add(Embedding(len(words), 64))
model2.add(Dropout(0.2))
model2.add(GRU(64))
model2.add(Dropout(0.2))
model2.add(Dense(len(words)))
model2.add(Activation('softmax'))
model2.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [6]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [7]:
epochs = 3
batch_size = 128 

model.fit(x, y, batch_size=batch_size, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1bd8ce117b8>

In [11]:
generated_length = 300
start_index = random.randint(0, len(text) - maxlen - 1)
for diversity in [0.2, 0.5]:
    print("...Diversity:", diversity)

    generated = ""
    # original_sentence = text[start_index : start_index + maxlen]
    original_sentence = "iubire"
    sentence = original_sentence
    print('...Generating with seed:\n "' + sentence + '"')

    for i in range(generated_length):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.0
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        sentence = sentence[1:] + next_char
        generated += next_char

    print("...Generated:\n", original_sentence + generated)
    print()

...Diversity: 0.2
...Generating with seed:
 "iubire"
...Generated:
 iubire      oo    yyy966q9758599984y666965y6q96959999888699666w9996796969650yq996166969959899986999999699999968969666995996w9866699q55y9994w9866998q9999w9999699669q965965y895q9994y666967q5y555y69y6555y5yy9956969996999999699696995q9999y969996969675y5q55y5yq89696975y5qq595y58699696689996999969999999844y6696

...Diversity: 0.5
...Generating with seed:
 "iubire"
...Generated:
 iubire s y g hyyhyy65q555yy58y5q6858y5888969986979975w99w66qq95559yy695q594yy966699w9996671886448hy81hgy999hdqy669q55hy1y.yy668357y8q989q86969564yq696755hyyyy.9q666865118y864w7996695495y689667936y9568996hyw81ww90997w64w689645.w80yw0668w86999699656y97q955y599yq64966w9q5558y9619666406161wy5969q641648144y798



In [39]:
model.save("../models/model-15-1.05.h5")

In [15]:
# Save weights in hdf5 format
filepath="model-{epoch:02d}-{loss:.4f}.h5"
checkpoint = ModelCheckpoint(filepath + ".hdf5", monitor='loss', verbose=1, save_best_only=True, mode='min')