In [29]:
import numpy as np
import huffman
from collections import Counter
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

In [6]:
texto = "este es un ejemplo de codificación de Huffman"
frequencies = Counter(texto)

In [10]:
codigo_huffman = huffman.codebook(frequencies.items())
texto_codificado = " || ".join(codigo_huffman[char] for char in texto)

print("Texto codificado:", texto_codificado)

# Paso 4: Decodificar el texto codificado
# Crear un diccionario inverso para decodificar
codigo_inverso = {v: k for k, v in codigo_huffman.items()}
texto_decodificado = ""
temp = ""
for bit in texto_codificado:
    temp += bit
    if temp in codigo_inverso:
        texto_decodificado += codigo_inverso[temp]
        temp = ""

print("Texto decodificado:", texto_decodificado)

Texto codificado: 110 || 0100 || 111101 || 110 || 101 || 110 || 0100 || 101 || 0010 || 0111 || 101 || 110 || 111100 || 110 || 0011 || 00010 || 01010 || 0000 || 101 || 1110 || 110 || 101 || 0110 || 0000 || 1110 || 1001 || 1000 || 1001 || 0110 || 11111 || 0110 || 1001 || 00011 || 0111 || 101 || 1110 || 110 || 101 || 01011 || 0010 || 1000 || 1000 || 0011 || 11111 || 0111
Texto decodificado: e


In [13]:
def preprocess_text(text, seq_length=40):
    chars = sorted(set(text))
    char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
    idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}
    vocab_size = len(chars)

    # Crear secuencias de entrada y salida
    input_seq = []
    target_seq = []

    for i in range(len(text) - seq_length):
        in_seq = text[i : i + seq_length]
        out_seq = text[i + seq_length]
        input_seq.append([char_to_idx[char] for char in in_seq])
        target_seq.append(char_to_idx[out_seq])

    # Convertir listas de índices en matrices de entrada y salida adecuadas para el entrenamiento
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(
        input_seq, maxlen=seq_length, padding="pre"
    )
    target_seq = tf.keras.utils.to_categorical(target_seq, num_classes=vocab_size)

    return input_seq, target_seq, vocab_size, char_to_idx, idx_to_char

In [27]:
# Datos de ejemplo
text = """ Hola, ¿cómo estás? Soy un ejemplo de texto que se utilizará para entrenar un modelo de lenguaje.
"""  
seq_length = 10  
input_seq, target_seq, vocab_size, char_to_idx, idx_to_char = preprocess_text(
    text, seq_length
)
print("Input sequence shape:", input_seq)
print("Target sequence shape:", target_seq)
print("Vocabulary size:", idx_to_char)
# Construir el modelo
model = Sequential(
    [
        Embedding(input_dim=vocab_size, output_dim=64, input_length=seq_length),
        LSTM(128),
        Dense(vocab_size, activation="softmax"),
    ]
)

# Compilar el modelo
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Entrenar el modelo
model.fit(input_seq, target_seq, epochs=100, verbose=2)

Input sequence shape: [[ 1  0  4 ... 15  8 16]
 [ 0  4  1 ...  8 16  1]
 [ 4  1  7 ... 16  1 22]
 ...
 [ 1 14  9 ... 18 18  5]
 [14  9  1 ... 18  5  6]
 [ 9  1  5 ...  5  6  5]]
Target sequence shape: [[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
Vocabulary size: {0: '\n', 1: ' ', 2: 'A', 3: 'P', 4: 'Y', 5: 'a', 6: 'b', 7: 'c', 8: 'd', 9: 'e', 10: 'g', 11: 'h', 12: 'i', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'z', 24: 'á', 25: 'é', 26: 'í'}
Epoch 1/100




7/7 - 1s - 132ms/step - accuracy: 0.1618 - loss: 3.2793
Epoch 2/100
7/7 - 0s - 10ms/step - accuracy: 0.1863 - loss: 3.1925
Epoch 3/100
7/7 - 0s - 11ms/step - accuracy: 0.1863 - loss: 2.9225
Epoch 4/100
7/7 - 0s - 10ms/step - accuracy: 0.1912 - loss: 2.8323
Epoch 5/100
7/7 - 0s - 11ms/step - accuracy: 0.1422 - loss: 2.7818
Epoch 6/100
7/7 - 0s - 9ms/step - accuracy: 0.2157 - loss: 2.7413
Epoch 7/100
7/7 - 0s - 9ms/step - accuracy: 0.1863 - loss: 2.7347
Epoch 8/100
7/7 - 0s - 9ms/step - accuracy: 0.1863 - loss: 2.7149
Epoch 9/100
7/7 - 0s - 11ms/step - accuracy: 0.1912 - loss: 2.6929
Epoch 10/100
7/7 - 0s - 9ms/step - accuracy: 0.1961 - loss: 2.6745
Epoch 11/100
7/7 - 0s - 13ms/step - accuracy: 0.1961 - loss: 2.6526
Epoch 12/100
7/7 - 0s - 9ms/step - accuracy: 0.1961 - loss: 2.6338
Epoch 13/100
7/7 - 0s - 9ms/step - accuracy: 0.2451 - loss: 2.5988
Epoch 14/100
7/7 - 0s - 11ms/step - accuracy: 0.2500 - loss: 2.5682
Epoch 15/100
7/7 - 0s - 11ms/step - accuracy: 0.2941 - loss: 2.5319
Epoch 

<keras.src.callbacks.history.History at 0x3130e3590>

In [28]:
def predict_next_char(model, input_text, char_to_idx, idx_to_char):
    input_indices = [
        char_to_idx[ch] for ch in input_text[-seq_length:]
    ]  # Asegurarse de usar solo los últimos 'seq_length' caracteres
    input_indices = tf.keras.preprocessing.sequence.pad_sequences(
        [input_indices], maxlen=seq_length, padding="pre"
    )
    predictions = model.predict(input_indices)[0]
    return {idx_to_char[i]: prob for i, prob in enumerate(predictions)}


# Ejemplo de uso
context = "[]Hola"
predicted_probs = predict_next_char(model, context, char_to_idx, idx_to_char)
print(predicted_probs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
{'\n': 2.4446226e-06, ' ': 0.00012292077, 'A': 3.9709444e-06, 'P': 0.00018478041, 'Y': 0.00021594099, 'a': 0.002025231, 'b': 0.0045404113, 'c': 2.0484175e-07, 'd': 4.2705764e-05, 'e': 0.9816928, 'g': 2.1880491e-07, 'h': 2.7613967e-05, 'i': 0.000121175595, 'l': 0.0043668956, 'm': 0.0023769338, 'n': 2.608391e-05, 'o': 5.560564e-06, 'q': 2.5820671e-05, 'r': 0.0002562776, 's': 0.0029980824, 't': 2.9368462e-05, 'u': 0.00015539383, 'v': 0.00076011376, 'z': 7.896544e-07, 'á': 5.8225937e-06, 'é': 1.0626007e-05, 'í': 1.7690279e-06}
