In [1]:
import joblib
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Read the training corpus from the Parquet file into a DataFrame.
texts = pd.read_parquet(path="datos.parquet")

In [3]:
# Fit the tokenizer on the corpus column (label 0) and encode every text
# as a list of integer word indices.
corpus = texts[0]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

# Pad every sequence to the length of the longest one so they stack into
# a single rectangular array.
maxlen = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# Inputs are all columns but the last; the target is the last column
# (the final token of each padded sequence).
x_train, y_train = padded_sequences[:, :-1], padded_sequences[:, -1]

In [8]:
# Vocabulary size: one output slot per word known to the tokenizer, plus
# one extra for index 0, which the padded sequences use as filler.
vocab_size = len(tokenizer.word_index) + 1

# Language model: embedding -> LSTM -> softmax over the vocabulary.
# The input length is declared with an explicit Input layer rather than
# Embedding's `input_length` argument, which Keras 3 deprecates; each
# training row has maxlen - 1 tokens (the last one is the target).
embedding_dim = 50
model = Sequential([
    Input(shape=(maxlen - 1,)),
    Embedding(vocab_size, embedding_dim),
    LSTM(100),
    Dense(vocab_size, activation='softmax')
])



In [38]:
# Rebind `model` to the network previously saved on disk.
model = load_model(filepath="modelo_text.h5")



In [41]:
# Print the layer-by-layer description of the current model.
model.summary()

# Compile with Adam and sparse categorical cross-entropy (targets are
# integer word indices), then train for 100 epochs.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(x=x_train, y=y_train, epochs=100)

Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8718 - loss: 1.5805
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8810 - loss: 1.4606
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8907 - loss: 1.4333
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9148 - loss: 1.3872
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9297 - loss: 1.2949
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9522 - loss: 1.1933
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9546 - loss: 1.1075
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9675 - loss: 1.0937
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x211623937d0>

In [55]:
def generate_text(model, tokenizer, input_text, maxlen, num_words):
    """Extend ``input_text`` by sampling words from the model one at a time.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns, for a
            ``(1, maxlen - 1)`` batch of word indices, one row of
            next-word probabilities.
        tokenizer: Object providing ``texts_to_sequences`` and the
            ``index_word`` mapping from index to word.
        input_text: Seed text; it is returned as the prefix of the result.
        maxlen: Sequence length used at training time; the model input
            window has ``maxlen - 1`` tokens.
        num_words: Number of sampling steps to perform.

    Returns:
        ``input_text`` followed by the sampled words, space-separated.

    Raises:
        ValueError: If the model assigns no finite, positive probability
            to any non-padding index.
    """
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=maxlen-1)

    generated_sentence = input_text
    for _ in range(num_words):
        # Copy to float64: raw float32 softmax output often misses the
        # sum-to-one tolerance that np.random.choice enforces.
        probabilities = np.array(
            model.predict(input_seq, verbose=0)[0], dtype=np.float64
        )
        # Index 0 is the padding filler and maps to no word; sampling it
        # would silently waste a step and return fewer words than asked.
        probabilities[0] = 0.0
        total = probabilities.sum()
        if not np.isfinite(total) or total <= 0.0:
            raise ValueError(
                "model produced no usable probability mass for the next word"
            )
        probabilities /= total

        predicted_word_idx = int(
            np.random.choice(len(probabilities), p=probabilities)
        )
        generated_word = tokenizer.index_word.get(predicted_word_idx, "")
        if not generated_word:
            # Index unknown to the tokenizer: leave the window unchanged.
            continue

        generated_sentence += " " + generated_word
        # Slide the window: drop the oldest token, append the new one,
        # keeping the (1, maxlen - 1) shape.
        input_seq = np.append(input_seq[:, 1:], [[predicted_word_idx]], axis=1)

    return generated_sentence

# Seed phrase for the demo; three more words are sampled after it.
input_text = "el amor es"
generated_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    input_text=input_text,
    maxlen=maxlen,
    num_words=3,
)
print("Texto generado:", generated_text)

Texto generado: el amor es montañas primavera poderoso


In [27]:
# Persist the current model to disk in HDF5 format.
model.save(filepath="modelo_text.h5")



In [59]:
# Serialize the fitted tokenizer so the same word/index mapping can be
# restored alongside the saved model.
joblib.dump(value=tokenizer, filename="tokenizer.pkl")

['tokenizer.pkl']