In [11]:
import numpy as np
import huffman
from collections import Counter
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
import tensorflow_datasets as tfds
import glob
import os
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [6]:
texto = "este es un ejemplo de codificación de Huffman"
frequencies = Counter(texto)

In [10]:
codigo_huffman = huffman.codebook(frequencies.items())
texto_codificado = " || ".join(codigo_huffman[char] for char in texto)

print("Texto codificado:", texto_codificado)

# Paso 4: Decodificar el texto codificado
# Crear un diccionario inverso para decodificar
codigo_inverso = {v: k for k, v in codigo_huffman.items()}
texto_decodificado = ""
temp = ""
for bit in texto_codificado:
    temp += bit
    if temp in codigo_inverso:
        texto_decodificado += codigo_inverso[temp]
        temp = ""

print("Texto decodificado:", texto_decodificado)

Texto codificado: 110 || 0100 || 111101 || 110 || 101 || 110 || 0100 || 101 || 0010 || 0111 || 101 || 110 || 111100 || 110 || 0011 || 00010 || 01010 || 0000 || 101 || 1110 || 110 || 101 || 0110 || 0000 || 1110 || 1001 || 1000 || 1001 || 0110 || 11111 || 0110 || 1001 || 00011 || 0111 || 101 || 1110 || 110 || 101 || 01011 || 0010 || 1000 || 1000 || 0011 || 11111 || 0111
Texto decodificado: e


In [13]:
def preprocess_text(text, seq_length=40):
    chars = sorted(set(text))
    char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
    idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}
    vocab_size = len(chars)

    # Crear secuencias de entrada y salida
    input_seq = []
    target_seq = []

    for i in range(len(text) - seq_length):
        in_seq = text[i : i + seq_length]
        out_seq = text[i + seq_length]
        input_seq.append([char_to_idx[char] for char in in_seq])
        target_seq.append(char_to_idx[out_seq])

    # Convertir listas de índices en matrices de entrada y salida adecuadas para el entrenamiento
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(
        input_seq, maxlen=seq_length, padding="pre"
    )
    target_seq = tf.keras.utils.to_categorical(target_seq, num_classes=vocab_size)

    return input_seq, target_seq, vocab_size, char_to_idx, idx_to_char

In [27]:
# Datos de ejemplo
text = """ Hola, ¿cómo estás? Soy un ejemplo de texto que se utilizará para entrenar un modelo de lenguaje.
"""  
seq_length = 10  
input_seq, target_seq, vocab_size, char_to_idx, idx_to_char = preprocess_text(
    text, seq_length
)
print("Input sequence shape:", input_seq)
print("Target sequence shape:", target_seq)
print("Vocabulary size:", idx_to_char)
# Construir el modelo
model = Sequential(
    [
        Embedding(input_dim=vocab_size, output_dim=64, input_length=seq_length),
        LSTM(128),
        Dense(vocab_size, activation="softmax"),
    ]
)

# Compilar el modelo
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Entrenar el modelo
model.fit(input_seq, target_seq, epochs=100, verbose=2)

Input sequence shape: [[ 1  0  4 ... 15  8 16]
 [ 0  4  1 ...  8 16  1]
 [ 4  1  7 ... 16  1 22]
 ...
 [ 1 14  9 ... 18 18  5]
 [14  9  1 ... 18  5  6]
 [ 9  1  5 ...  5  6  5]]
Target sequence shape: [[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
Vocabulary size: {0: '\n', 1: ' ', 2: 'A', 3: 'P', 4: 'Y', 5: 'a', 6: 'b', 7: 'c', 8: 'd', 9: 'e', 10: 'g', 11: 'h', 12: 'i', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'z', 24: 'á', 25: 'é', 26: 'í'}
Epoch 1/100




7/7 - 1s - 132ms/step - accuracy: 0.1618 - loss: 3.2793
Epoch 2/100
7/7 - 0s - 10ms/step - accuracy: 0.1863 - loss: 3.1925
Epoch 3/100
7/7 - 0s - 11ms/step - accuracy: 0.1863 - loss: 2.9225
Epoch 4/100
7/7 - 0s - 10ms/step - accuracy: 0.1912 - loss: 2.8323
Epoch 5/100
7/7 - 0s - 11ms/step - accuracy: 0.1422 - loss: 2.7818
Epoch 6/100
7/7 - 0s - 9ms/step - accuracy: 0.2157 - loss: 2.7413
Epoch 7/100
7/7 - 0s - 9ms/step - accuracy: 0.1863 - loss: 2.7347
Epoch 8/100
7/7 - 0s - 9ms/step - accuracy: 0.1863 - loss: 2.7149
Epoch 9/100
7/7 - 0s - 11ms/step - accuracy: 0.1912 - loss: 2.6929
Epoch 10/100
7/7 - 0s - 9ms/step - accuracy: 0.1961 - loss: 2.6745
Epoch 11/100
7/7 - 0s - 13ms/step - accuracy: 0.1961 - loss: 2.6526
Epoch 12/100
7/7 - 0s - 9ms/step - accuracy: 0.1961 - loss: 2.6338
Epoch 13/100
7/7 - 0s - 9ms/step - accuracy: 0.2451 - loss: 2.5988
Epoch 14/100
7/7 - 0s - 11ms/step - accuracy: 0.2500 - loss: 2.5682
Epoch 15/100
7/7 - 0s - 11ms/step - accuracy: 0.2941 - loss: 2.5319
Epoch 

<keras.src.callbacks.history.History at 0x3130e3590>

In [28]:
def predict_next_char(model, input_text, char_to_idx, idx_to_char):
    input_indices = [
        char_to_idx[ch] for ch in input_text[-seq_length:]
    ]  # Asegurarse de usar solo los últimos 'seq_length' caracteres
    input_indices = tf.keras.preprocessing.sequence.pad_sequences(
        [input_indices], maxlen=seq_length, padding="pre"
    )
    predictions = model.predict(input_indices)[0]
    return {idx_to_char[i]: prob for i, prob in enumerate(predictions)}


# Ejemplo de uso
context = "[]Hola"
predicted_probs = predict_next_char(model, context, char_to_idx, idx_to_char)
print(predicted_probs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
{'\n': 2.4446226e-06, ' ': 0.00012292077, 'A': 3.9709444e-06, 'P': 0.00018478041, 'Y': 0.00021594099, 'a': 0.002025231, 'b': 0.0045404113, 'c': 2.0484175e-07, 'd': 4.2705764e-05, 'e': 0.9816928, 'g': 2.1880491e-07, 'h': 2.7613967e-05, 'i': 0.000121175595, 'l': 0.0043668956, 'm': 0.0023769338, 'n': 2.608391e-05, 'o': 5.560564e-06, 'q': 2.5820671e-05, 'r': 0.0002562776, 's': 0.0029980824, 't': 2.9368462e-05, 'u': 0.00015539383, 'v': 0.00076011376, 'z': 7.896544e-07, 'á': 5.8225937e-06, 'é': 1.0626007e-05, 'í': 1.7690279e-06}


In [31]:
def load_texts(directory):
    texts = []
    # Recorre todos los archivos en el directorio
    for filename in glob.glob(os.path.join(directory, "*")):
        if os.path.isfile(filename):  # Check if the path is a file
            with open(filename, "r", encoding="utf-8", errors="ignore") as file:
                texts.append(file.read())
    return texts


# Suponiendo que '20news-bydate-train' es tu directorio descomprimido
train_dir = "./20news-bydate/20news-bydate-train/alt.atheism"
test_dir = "./20news-bydate/20news-bydate-test/alt.atheism"
texts = load_texts(train_dir)
texts_test = load_texts(test_dir)

In [32]:
len(texts)

480

In [33]:
def create_sequences(texts, seq_length=40):
    input_chars = []
    output_char = []
    for text in texts:
        for i in range(len(text) - seq_length):
            in_seq = text[i:i + seq_length]
            out_seq = text[i + seq_length]
            input_chars.append(in_seq)
            output_char.append(out_seq)
    return input_chars, output_char

In [38]:
seq_length = 30
texts_train_part = texts[:100]
texts_test_part = texts_test[:25]
input_seqs, output_seqs = create_sequences(texts_train_part, seq_length)
input_seqs_test, output_seqs_test = create_sequences(texts_test_part, seq_length)

In [39]:

# Tokenización
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts+texts_test)  # Ajuste del tokenizer al corpus completo
vocab_size = len(tokenizer.word_index) + 1

# Convertir textos a secuencias numéricas
input_seqs = tokenizer.texts_to_sequences(input_seqs)
input_seqs = pad_sequences(input_seqs, maxlen=seq_length, truncating='pre')

# Preparar datos de salida (etiquetas) usando one-hot encoding
output_seqs = tokenizer.texts_to_sequences(output_seqs)
output_seqs = to_categorical(output_seqs, num_classes=vocab_size)

# Test
input_seqs_test = tokenizer.texts_to_sequences(input_seqs_test)
input_seqs_test = pad_sequences(input_seqs_test, maxlen=seq_length, truncating='pre')

# Preparar datos de salida (etiquetas) usando one-hot encoding
output_seqs_test = tokenizer.texts_to_sequences(output_seqs_test)
output_seqs_test = to_categorical(output_seqs_test, num_classes=vocab_size)

# Construcción del modelo LSTM
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(128),
    Dense(vocab_size, activation='softmax')
])

# Compilación del modelo
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Resumen del modelo
model.summary()

# Entrenamiento del modelo
model.fit(input_seqs, output_seqs, epochs=50, batch_size=64,validation_data=(input_seqs_test, output_seqs_test))

Epoch 1/50
[1m2681/2681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 66ms/step - accuracy: 0.2129 - loss: 2.9440 - val_accuracy: 0.3511 - val_loss: 2.3590
Epoch 2/50
[1m2681/2681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 59ms/step - accuracy: 0.3700 - loss: 2.2322 - val_accuracy: 0.4063 - val_loss: 2.1578
Epoch 3/50
[1m2681/2681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 65ms/step - accuracy: 0.4254 - loss: 2.0310 - val_accuracy: 0.4390 - val_loss: 2.0505
Epoch 4/50
[1m2681/2681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 68ms/step - accuracy: 0.4568 - loss: 1.9027 - val_accuracy: 0.4585 - val_loss: 1.9800
Epoch 5/50
[1m2681/2681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 69ms/step - accuracy: 0.4769 - loss: 1.8230 - val_accuracy: 0.4678 - val_loss: 1.9428
Epoch 6/50
[1m2681/2681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 73ms/step - accuracy: 0.4931 - loss: 1.7608 - val_accuracy: 0.4755 - val_loss: 1.913

<keras.src.callbacks.history.History at 0x14890f230>

In [None]:
# Codificar texto de entrada 
def codificar_texto(texto:str):
    texto_codificado = tokenizer.texts_to_sequences([texto])[0]
    texto_codificado = pad_sequences([texto_codificado], maxlen=seq_length, truncating='pre')
    return texto_codificado

In [40]:
def predict_next_char(model, tokenizer, input_text, seq_length):
    # Paso 1: Convertir la cadena de entrada en una secuencia de enteros
    input_seq = tokenizer.texts_to_sequences([input_text])[0]
    
    # Paso 2: Ajustar la longitud de la secuencia
    input_seq = pad_sequences([input_seq], maxlen=seq_length, truncating='pre')
    
    # Paso 3: Hacer la predicción
    prediction = model.predict(input_seq, verbose=0)[0]
    
    # Paso 4: Convertir las probabilidades en un diccionario
    int_to_char = {i: char for char, i in tokenizer.word_index.items()}
    int_to_char[0] = ''  # Añadir el carácter vacío para el padding
    return {int_to_char[i]: prob for i, prob in enumerate(prediction)}

# Ejemplo de uso
input_text = "this i"
predictions = predict_next_char(model, tokenizer, input_text, seq_length)
print(predictions)

{'': 1.5950792e-10, ' ': 0.012281137, 'e': 8.72832e-05, 't': 0.0036291692, 'i': 2.0899139e-05, 'a': 0.00014589165, 'o': 0.00045588036, 's': 0.22694807, 'n': 0.37558985, 'r': 0.014647618, 'h': 7.445102e-05, 'l': 0.0008568549, 'd': 0.035022456, 'u': 0.0016275678, '\n': 0.0018883744, 'c': 0.0009687037, 'm': 0.011669023, '.': 0.027806953, 'y': 9.848865e-06, 'f': 0.15203448, 'p': 0.00027516356, 'g': 0.0010468533, 'w': 4.818458e-05, 'b': 0.0007719995, '>': 1.5344362e-05, 'v': 0.0024651545, ',': 0.00018973678, '-': 4.819494e-06, 'k': 0.0017686542, ':': 1.6008191e-09, "'": 0.015728982, '"': 2.8503813e-08, '*': 1.258957e-08, 'j': 1.6582248e-06, ')': 0.00017777098, '(': 0.00013986464, '1': 2.6450449e-05, '@': 1.101738e-05, 'x': 0.00051517796, '^': 8.9487695e-13, '?': 7.769665e-06, '0': 6.468009e-05, 'z': 5.496394e-07, '|': 1.7282577e-05, '2': 2.8803897e-05, 'q': 4.2125785e-06, '3': 0.110196695, '=': 4.4372505e-06, '9': 3.362148e-05, '/': 9.07967e-07, '5': 3.551089e-05, '4': 6.4445226e-06, '#': 2

In [42]:
max_key = max(predictions, key=predictions.get)
print("Key con mayor valor:", max_key)

Key con mayor valor: n
