In [10]:
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding, GRU
import numpy as np
import sys
import io
import os
import re
import json
import tensorflow as tf

In [11]:
# Parameters: change to experiment different configurations
SEQUENCE_LEN = 8
MIN_WORD_FREQUENCY = 18
STEP = 1
BATCH_SIZE = 32
GENERATED_LENGTH = 80
examples_filepath = "../data/examples_18.txt"

In [12]:
path = '../data/songs.json'
text = []
with io.open(path, encoding="utf-8") as f:
    file = json.load(f)
    for manea in file:
        for lyric in manea['lyrics']:
            text.append(lyric)
text = ''.join(text)

# Cleaning the text
text = text.lower()
to_replace = list('!"$&()*+/:;<=>@[]^_~{}#%\\|–…\ufeff\xa0§«»')
to_replace.append("'")
to_replace.append("refren")
to_replace.append("ref")
to_replace.append("florin salam")
to_replace.append("salam")
to_replace.append("bis")
to_replace.append("augustin")
to_replace.append("nicolae guta")
to_replace.append("nicoleta guta")
to_replace.append("guta")
to_replace.append("costel biju")
to_replace.append("liviu pustiu")
to_replace.append("dani mocanu")
to_replace.append("vali vijelie")
to_replace.append("solo")
to_replace.append("x2")
to_replace.append("2x")
to_replace.append("x4")
to_replace.append("x 2")

for word in to_replace:
    text = text.replace(word, '')

text = re.sub('â|ă|а', 'a', text)
text = re.sub('í|î|ï|і|ἰ', 'i', text)
text = re.sub('ş|ș|ѕ', 's', text)
text = re.sub('ţ', 't', text)
text = re.sub('ν', 'v', text)
text = re.sub('в', 'b', text)
text = re.sub('е', 'e', text)
text = re.sub('к', 'k', text)
text = re.sub('м', 'm', text)
text = re.sub('н', 'h', text)
text = re.sub('о', 'o', text)
text = re.sub('р', 'p', text)
text = re.sub('с', 'c', text)
text = re.sub('т', 't', text)
text = re.sub('у', 'y', text)
text = re.sub('х', 'x', text)
text = re.sub('ј', 'j', text)
text = re.sub('k', 'ca', text)

text = re.sub(r'viatza', 'viata', text)
text = re.sub(r'lai', 'l-ai', text)
text = re.sub(r'pt', 'pentru', text)
text = re.sub(r'nam', 'n-am', text)
text = re.sub(r'spunemi', 'spune-mi', text)

text = re.sub(r'(?<!\S)((\S+)(?:\s+\2))(?:\s+\2)+(?!\S)', r'\1\1\1', text)
text = re.sub(r'\d\.', '', text)
text = re.sub(r'st?rofa \d*', '', text)
text = re.sub(r'-{2,}', '', text)
text = re.sub(r'sh', 's', text)
text = re.sub(r'\.{4,}', '...', text)
text = re.sub(r'\n\s*\n', '\n\n', text)
text = text.replace('\n', ' \n ')
print("Corpus length:", len(text))

chars = sorted(list(set(text)))
print(text)


Corpus length: 1717431
 
  
 te cunosc de ani de zile, viata mea rachi tachi tarara  
 sunt mai fericit pe lume rachi tachi tarara, rachi tachi tarara  
 nu pleca iubirea mea rachi tachi tachi tachi, rachi tachi tarara  
 ai grija de viata mea 
 viata mea, iubirea mea 
  
 zig zag zagala  
 tu esti frumusetea mea 
 zaga zaga zagala  
 miai cucerit inima 
 da, da, da-i adevarat  
 frumusete ca a ta 
 n-am vazut la nimenea 
 tig, da, pa  
 copilu vagabont 
 zig zag zagala  
 bate, bate inima 
 zig zag zagala  
 te iubesc dragostea mea 
 zig zag zagala  
 fara tine nu pot sta  
 ca tu esti toata fericirea mea 
  
 esti tot ce am mai spump pe lume, viata mea zig zag zagala  
 ce mas face fara tine 
 zig zag zagala  
 sa-mpartim viata in doi 
 zig zag zagala  
 se mira lumea de noi 
 cum ne iubim amandoi 
  
 zig zag zagala  
  
 zig zag zagala  
 tu esti frumusetea mea 
 zaga zaga zagala  
 mi-ai cucerit inima 
 da, da, da-i adevarat 
 frumusete ca a ta 
 n-am vazut la nimenea 
 tig, da, p

In [13]:
text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
text_in_lyrics = [w for w in text.split('\n')]
print('Corpus length in words:', len(text_in_words))
print('Corpus length in lyrics:', len(text_in_lyrics))

# Calculate word frequency
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True))
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)
    print(k, v)

words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))

with open("../data/vocabulary.txt", "w") as f:
    for w in words:
        f.write("%s\n" % w)
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

# cut the text in semi-redundant sequences of SEQUENCE_LEN words
sentences_original = []
next_words_original = []
ignored = 0
for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add the sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences_original.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words_original.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored + 1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences_original))

Corpus length in words: 375318

 62870
sa 9604
si 9088
de 8162
nu 8156
ca 7113
ma 4591
mai 4570
te 4449
la 4352
cu 4035
ce 3855
in 3595
am 3431
mea 3331
pe 3083
eu 2861
o 2820
viata 2533
tu 2531
e 2470
tine 2410
mine 2057
ai 1812
cand 1785
pentru 1780
dar 1753
sunt 1652
inima 1608
daca 1596
din 1590
esti 1528
da 1526
imi 1464
un 1442
ta 1429
cum 1349
tot 1275
doar 1202
vreau 1176
fi 1118
numai 1104
as 1096
iubirea 1085
bine 1042
fac 1014
a 983
pot 981
iubesc 943
asa 933
stiu 903
iti 899
nici 894
se 888
zi 870
cat 848
meu 842
fara 803
ea 780
va 768
vrea 755
ne 752
mult 746
iubire 718
dau 714
cine 697
n-am 693
fost 681
le 676
mereu 672
vrei 654
voi 631
bani 621
hai 613
mare 599
tau 595
toata 592
stii 587
acum 582
banii 575
lumea 549
langa 534
te-am 524
sa-mi 520
poate 516
dragostea 498
toate 497
mea, 491
care 490
rau 489
fata 487
toti 474
dor 473
mor 472
m-ai 470
sufletul 468
face 462
mi-e 460
orice 458
facut 457
mie 454
este 449
mi-ai 442
noi 437
sa-ti 431
vad 430
mei 430
unde 430
tare 

In [14]:
# shuffle at unison
print('Shuffling sentences')
percentage_test = 2

tmp_sentences = []
tmp_next_word = []
for i in np.random.permutation(len(sentences_original)):
# for i in range(0, len(sentences_original)):
    tmp_sentences.append(sentences_original[i])
    tmp_next_word.append(next_words_original[i])

cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
sentences, sentences_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
next_words, next_words_test= tmp_next_word[:cut_index], tmp_next_word[cut_index:]

print("Size of training set = %d" % len(sentences))
print("Size of test set = %d" % len(sentences_test))


Shuffling sentences
Size of training set = 140191
Size of test set = 2862


In [15]:
# Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t] = word_indices[w]
            y[i] = word_indices[next_word_list[index % len(sentence_list)]]
            index = index + 1
        yield x, y

In [16]:
model = Sequential()
model.add(Embedding(input_dim=len(words), output_dim=1024))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.2))
model.add(Dense(len(words)))
model.add(Activation('softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [17]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [18]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file = open(examples_filepath, "a")
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.3, 0.5]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(GENERATED_LENGTH):
            x_pred = np.zeros((1, SEQUENCE_LEN))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [19]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
callbacks_list = [print_callback]
model.fit(generator(sentences, next_words, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                    epochs=15,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x24d9a959128>

In [14]:
model.save("../models/word-model-seq-" + str(SEQUENCE_LEN) + "-min-" + str(MIN_WORD_FREQUENCY) + "_v2.h5", save_format="h5")