Separador de silabas extraido de https://github.com/mabodo/sibilizador/blob/master/Silabizator.ipynb

In [3]:
class char():
    def __init__(self):
        pass
    
class char_line():
    def __init__(self, word):
        self.word = word
        self.char_line = [(char, self.char_type(char)) for char in word]
        self.type_line = ''.join(chartype for char, chartype in self.char_line)
        
    def char_type(self, char):
        if char in set(['a', 'á', 'e', 'é','o', 'ó', 'í', 'ú']):
            return 'V' #strong vowel
        if char in set(['i', 'u']):
            return 'v' #week vowel
        if char=='x':
            return 'x'
        if char=='s':
            return 's'
        else:
            return 'c'
            
    def find(self, finder):
        return self.type_line.find(finder)
        
    def split(self, pos, where):
        return char_line(self.word[0:pos+where]), char_line(self.word[pos+where:])
    
    def split_by(self, finder, where):
        split_point = self.find(finder)
        if split_point!=-1:
            chl1, chl2 = self.split(split_point, where)
            return chl1, chl2
        return self, False
     
    def __str__(self):
        return '<'+self.word+':'+self.type_line+'>'
    
    def __repr__(self):
        return '<'+repr(self.word)+':'+self.type_line+'>'

class silabizer():
    def __init__(self):
        self.grammar = []
        
    def split(self, chars):
        rules  = [('VV',1), ('cccc',2), ('xcc',1), ('ccx',2), ('csc',2), ('xc',1), ('cc',1), ('vcc',2), ('Vcc',2), ('sc',1), ('cs',1),('Vc',1), ('vc',1), ('Vs',1), ('vs',1), ('vxv',1), ('VxV',1), ('vxV',1), ('Vxv',1)]
        for split_rule, where in rules:
            first, second = chars.split_by(split_rule,where)
            if second:
                if first.type_line in set(['c','s','x','cs']) or second.type_line in set(['c','s','x','cs']):
                    #print 'skip1', first.word, second.word, split_rule, chars.type_line
                    continue
                if first.type_line[-1]=='c' and second.word[0] in set(['l','r']):
                    continue
                if first.word[-1]=='l' and second.word[-1]=='l':
                    continue
                if first.word[-1]=='r' and second.word[-1]=='r':
                    continue
                if first.word[-1]=='c' and second.word[-1]=='h':
                    continue
                return self.split(first)+self.split(second)
        return [chars]
        
    def __call__(self, word):
        return list(map(lambda x: x.word, self.split(char_line(word))))

def silabize_line(line):
    s = silabizer()
    flat_list = []
    for word in line.split(" "):
        flat_list.extend(s(word))
        flat_list.append("\\")
    return flat_list

Creacion de set de entrenamiento (sentences, next_words)



In [4]:

# Parameters: change to experiment different configurations
SEQUENCE_LEN = 10
MIN_WORD_FREQUENCY = 3
STEP = 1
BATCH_SIZE = 32

import os,io
import codecs
import numpy as np
import unicodedata
from keras.preprocessing.text import Tokenizer



def print_vocabulary(words_file_path, words_set):
    words_file = codecs.open(words_file_path, 'w', encoding='utf8')
    for w in words_set:
        if w != "\n":
            words_file.write(w+"\n")
        else:
            words_file.write(w)
    words_file.close()

def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    # shuffle at unison
    print('Shuffling sentences')

    tmp_sentences = []
    tmp_next_word = []
    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])

    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)



corpus = "merge.txt"
examples = "examples.txt"
vocabulary = "vocabulary.txt"

if not os.path.isdir('./checkpoints/'):
    os.makedirs('./checkpoints/')

symbols_keep = ['\n','?',',',':',"'"]
symbols_delete = ['¿','.','"','(',')','-']
with io.open(corpus, encoding='utf-8') as f:
    text = f.read().lower().replace('\xa0', ' ')
    for s in symbols_keep:
        text = text.replace(s,' '+s+' ')
    for s in symbols_delete:
        text = text.replace(s,'')

text = unicodedata.normalize('NFC',text)
print('Corpus length in characters:', len(text))


# Separar por palabras
text_in_words = [w for w in text.split(' ') if (w.strip() != '' or w == '\n')]

#Separar por silabas
s = silabizer()
text_in_words = list(map(s,text_in_words))

flat_list = []
for i, word in enumerate(text_in_words):
    for sylab in word:
        flat_list.append(sylab)
    if (word != ["\n"] and (i+1 < len(text_in_words) and text_in_words[i+1] != ["\n"])):
        flat_list.append('\ ')

text_in_words = flat_list

print(text_in_words)
print('Corpus length in words:', len(text_in_words))

# Calculate word frequency
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)


words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(words) - ignored_words)
print(words)
print('Unique words after ignoring:', len(words))
print_vocabulary(vocabulary, words)

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

# cut the text in semi-redundant sequences of SEQUENCE_LEN words
sentences = []
next_words = []
ignored = 0
for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add the sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i + 1: i + SEQUENCE_LEN + 1])
        next_words.append(text_in_words[i])
    else:
        ignored = ignored + 1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

# x, y, x_test, y_test
(sentences, next_words), (sentences_test, next_words_test) = shuffle_and_split_training_set(
    sentences, next_words
)

Corpus length in characters: 370700
['\ufeff', '\n', '\n', 'en', '\\ ', 'la', '\\ ', 'im', 'pro', 'vi', 'sa', 'ción', '\n', 'la', 'men', 'ta', 'ble', 'men', 'te', '\\ ', 'yo', '\\ ', 'muer', 'do', '\\ ', 'co', 'mo', '\\ ', 'le', 'ón', '\n', 'yo', '\\ ', 'te', '\\ ', 'pren', 'do', '\\ ', 'fue', 'go', '\\ ', ',', '\\ ', 'el', '\\ ', 'con', 'cep', 'to', '\\ ', 'es', '\\ ', 'jue', 'go', '\n', 'y', '\\ ', 'con', 'tra', '\\ ', 'mí', '\\ ', 'es', '\\ ', 'co', 'mo', '\\ ', 'un', '\\ ', 'par', 'que', '\\ ', 'de', '\\ ', 'di', 'ver', 'sión', '\n', '\n', 'te', '\\ ', 'fal', 'ta', '\\ ', 'a', 'pren', 'der', '\\ ', 'que', '\\ ', 'ten', 'go', '\\ ', 'cre', 'a', 'ción', '\n', 'y', '\\ ', 'que', '\\ ', 'cuan', 'do', '\\ ', 'yo', '\\ ', 'can', 'to', '\\ ', 'le', '\\ ', 'pon', 'go', '\\ ', 'la', '\\ ', 're', 'la', 'ción', '\n', 'es', 'to', '\\ ', 'es', '\\ ', 'jue', 'go', '\\ ', 'de', '\\ ', 'ju', 'gar', '\n', 'pe', 'ro', '\\ ', 'si', '\\ ', 'tú', '\\ ', 'me', '\\ ', 'ga', 'nas', '\\ ', 'im', 'po', 'si'

Creacion de modelo y entrenamiento extraido de https://github.com/enriqueav/lstm_lyrics

In [7]:
"""
Example script to train a network to generate text with the style of a given corpus
--By word--

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

Based on
https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

20 epochs should be enough to get decent results.
Uses data generator to avoid loading all the test set into memory.
Saves the weights and model every epoch.
"""

from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
import numpy as np
import sys
import io
import os
from google.colab import files

# Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t] = word_indices[w]
            y[i] = word_indices[next_word_list[index % len(sentence_list)]]
            index = index + 1
        yield x, y


def get_model(dropout=0.2):
    print('Build model...')
    model = Sequential()
    model.add(Embedding(input_dim=len(words), output_dim=1024))
    model.add(Bidirectional(LSTM(256)))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    return model


# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(50):
            x_pred = np.zeros((1, SEQUENCE_LEN))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()


model = get_model(0.4)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-" \
            "loss{loss:.4f}-acc{accuracy:.4f}-val_loss{val_loss:.4f}-val_acc{val_accuracy:.4f}" % \
            (len(words), SEQUENCE_LEN, MIN_WORD_FREQUENCY)

checkpoint = ModelCheckpoint(filepath=file_path, monitor='val_accuracy', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='accuracy', patience=5)
callbacks_list = [checkpoint, print_callback, early_stopping]

examples_file = open(examples, "w")

if 'weights_clean_bk.h5' in os.listdir():
  print("Weights loaded")
  model.load_weights('weights_clean_bk.h5')

model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                    epochs=10,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

model.save_weights('weights.h5')
#files.download('weights_10_10_bk.h5')

Build model...
Weights loaded
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.save_weights('weights.h5')
files.download('weights.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Generacion de texto con el modelo anterior

In [8]:
"""
Script to generate text from an already trained network (with lstm_train.py)
--By word--

It is necessary to at least provide the trained model and the vocabulary file
(generated also by lstm_train.py).
"""


import argparse
import numpy as np
import re
from keras.models import load_model

import collections


def validate_syllable(s, vocabulary, debug = False):
    if s in vocabulary:
        if debug: print(s, " ✓ in vocabulary")
        return True
    else:
        if debug: print(s, " ✗ NOT in vocabulary")
        return False


def validate_seed(vocabulary, seed, debug = False):
    if debug: print("Validating that all the words in the seed are part of the vocabulary: ", seed)
    
    return all([validate_syllable(s, vocabulary, debug) for s in seed])


# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def rima(old_syl, next_syl):        
    vowels = ['a','e','i','o','u','á', 'é', 'í', 'ó', 'ú', 'y']
    if old_syl==next_syl:
        return 0.6
    old_strip="".join(filter(lambda c: c in vowels, old_syl))
    next_strip="".join(filter(lambda c: c in vowels, next_syl))
    return 0.2 if (old_strip != "" and old_strip == next_strip) else 0


def rhyme_score(preds, previous_syllables, indices_word, debug = False):
    #Previous syllables es la ultima o dos ultimas silabas concatenadas
    prev = preds.copy()
    for index in range(len(preds)):
        #if(preds[index]) > 0.5: #Modificar a verdaderas candidatas a sílaba
        preds[index] += (1.0 - preds[index])*rima(previous_syllables,indices_word[index])

    if debug: print("Rimando con ", previous_syllables, [indices_word[x] for x in range(len(preds)) if (prev[x] != preds[x])])
    return preds


def generate_text(model, indices_word, word_indices, seed,
                  sequence_length, diversity, quantity):
    """
    Similar to lstm_train::on_epoch_end
    Used to generate text using a trained model

    :param model: the trained Keras model (with model.load)
    :param indices_word: a dictionary pointing to the words
    :param seed: a string to be used as seed (already validated and padded)
    :param sequence_length: how many words are given to the model to generate
    :param diversity: is the "temperature" of the sample function (usually between 0.1 and 2)
    :param quantity: quantity of words to generate
    :return: Nothing, for now only writes the text to console
    """
    print("----- Generating text")
    print("----- Diversity:" + str(diversity))
    print("----- Generating with seed:", seed)

    text = seed.copy()
    sentence = list(reversed(seed))
    prev_preds = []
    last_syllables = [[sentences[-2:]]]
    diff = []

    for i in range(quantity):
        #x_pred = np.zeros((1, sequence_length, len(vocabulary)))
        x_pred = np.zeros((1, SEQUENCE_LEN))
        for t, word in enumerate(sentence):
              x_pred[0, t] = word_indices[word]

        preds = model.predict(x_pred, verbose=0)[0]
        
        '''for t, word in enumerate(sentence):
            x_pred[0, t, word_indices[word]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]'''
        # preds tiene arreglo resultado de softmax

        next_index = sample(preds, diversity)
        next_word = indices_word[next_index]

        if len(text) > 1 and text[0] == '\n' and last_syllables != []:
                changed_index = sample(rhyme_score(preds, last_syllables[-1][-1], indices_word))
                new_word = indices_word[changed_index]
                if new_word != next_word:
                    diff.insert(0,(next_word, new_word.upper()))
                    next_word = new_word
                    text.insert(0,next_word.upper())
                else:
                    text.insert(0,next_word)
        else:
            text.insert(0,next_word)
        
        sentence = sentence[:-1]
        sentence.insert(0,next_word)
        
        if len(text) > 2 and text[2] == '\n' and '\n' not in text[:2]:
            last_syllables.append(sentence[:2])

    print("\nPares (silabas reemplazadas, silabas originales):")
    print(diff,"\n")
    return text

vocabulary_file = "vocabulary.txt"
#model_file = args.network
seed = "voy a ser el que en el microfono"
seed = silabize_line(seed)
sequence_length = SEQUENCE_LEN
diversity = 0.5
quantity = 500

#model = load_model(model_file)
print("\nSummary of the Network: ")
model.summary()
print()

vocabulary = open(vocabulary_file, "r").readlines()
# remove the \n at the end of the word, except for the \n word itself
vocabulary = [re.sub(r'(\S+)\s+', r'\1', w) for w in vocabulary]

#vocabulary = sorted(set(vocabulary))
vocabulary = sorted(vocabulary)

word_indices = dict((c, i) for i, c in enumerate(vocabulary))
indices_word = dict((i, c) for i, c in enumerate(vocabulary))

if validate_seed(vocabulary, seed, False):
    print("\nSeed is correct.\n")
    # repeat the seed in case is not long enough, and take only the last elements
    seed = (seed*sequence_length)[-sequence_length:]
    text = generate_text(
        model, indices_word, word_indices, seed, sequence_length, diversity, quantity
    )
    for word in text: print(word if word != '\\' else ' ', end="")
else:
    print('\033[91mERROR: Please fix the seed string\033[0m')



Summary of the Network: 
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 1024)        1159168   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               2623488   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1132)              580716    
_________________________________________________________________
activation_2 (Activation)    (None, 1132)              0         
Total params: 4,363,372
Trainable params: 4,363,372
Non-trainable params: 0
_________________________________________________________________


Seed is correct.

----- Generating text
----- Diversity:0.5
----- Generating with 




Pares (silabas reemplazadas, silabas originales):
[('\n', 'RA'), ('\n', 'HEN'), ('\n', 'PRE'), ('\n', 'DRE'), ('\n', 'DRA'), ('\n', 'AT'), ('\n', 'LAS'), ('\n', 'CHAS'), ('to', 'RRAS'), ('\n', 'MA'), ('\n', 'PAS'), ('\n', 'CAN'), ('\n', 'MA')] 

soy el sador , que eres arrogante , no voy a hacer nada , que está escrito este rapero , cuando yo estoy afueRA
ey , es lo que lo ecHEN
ay , madre mía , cuando lo de siemPRE
ey , hermano , no no tiene nada , tonto , te vas a hablarme de mi estilo , y a tu maDRE
yo me meto el culo , la verdad es que te está jugando a mí para darle la pista , el flow , otalaDRA
viene con rap directamente , eso en el beAT
oh , en el momento , solamente de toda lo que te pinta , mejor que son goLAS
pero bueno , es lo que me voy a sacar de toda la careCHAS
así que sí , yo te suelto en baRRAS
lo siento , hermano , te ayudo , de la temática porque creo que yo me habla del alMA
dice que sabe que lo hace cuando las la triPAS
yo no lo sé , que no me pliCAN
te pregunto a