In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Reshape, Dot, Dense
from tensorflow.keras.models import Model

# Model parameters
vocab_size = 10000  # Vocabulary size (input_dim of both Embedding layers)
embedding_size = 100  # Dimension of the embeddings
# Context radius: 2 words before and 2 words after the target word.
# Must be > 0: with 0 the pair-generation window only contains the target
# itself, so no positive (target, context) pair is ever produced.
context_window = 2

# Two single-id inputs per sample: the target word and the context word.
input_target = Input(shape=(1,), name="input_target")
input_context = Input(shape=(1,), name="input_context")

# Every looked-up vector is reshaped to this column shape so that the two
# sides can be contracted along axis 1 below.
column_shape = (embedding_size, 1)

# Embedding table for the target word
target_vectors = Embedding(input_dim=vocab_size, output_dim=embedding_size, name="embedding_target")(input_target)
embedding_target = Reshape(column_shape)(target_vectors)

# Separate embedding table for the context word
context_vectors = Embedding(input_dim=vocab_size, output_dim=embedding_size, name="embedding_context")(input_context)
embedding_context = Reshape(column_shape)(context_vectors)

# Similarity score: dot product of the two embeddings, flattened to one value per sample
similarity = Dot(axes=1)([embedding_target, embedding_context])
dot_product = Reshape((1,))(similarity)

# Single sigmoid unit on top of the similarity score
output = Dense(1, activation='sigmoid')(dot_product)

# Assemble, compile and summarise the model
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
from collections import defaultdict
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import random

# Random negative (label 0) pairs drawn per occurrence of a target word.
# With 0 every label is 1, so the classifier has nothing to discriminate
# against; 4 matches the number of positives of a +/-2 word window.
num_negative_samples = 4

# Seed the sampler so that re-running the cell regenerates the same negatives.
random.seed(0)

with open("datasets/game_of_thrones.txt", "r", encoding="utf-8") as f:
    text = f.read().lower().split()

# Load the target words for which training pairs are generated
with open("materiales/target_words_game_of_thrones.txt", "r", encoding="utf-8") as f:
    palabras_entrenamiento = set(f.read().lower().split())

# Corpus tokenisation
# NOTE(review): the output recorded for this cell is a ModuleNotFoundError for
# 'keras.preprocessing.text'; presumably the installed Keras no longer ships
# Tokenizer -- confirm the environment before relying on this cell.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts([" ".join(text)])
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
oov_index = word_index[tokenizer.oov_token]

# Let the tokenizer turn the corpus into ids instead of indexing word_index
# with raw whitespace tokens: the tokenizer strips punctuation, so a raw token
# such as "word," is not a key of word_index. texts_to_sequences also maps
# every id >= vocab_size to the OOV id, which keeps all ids inside the
# Embedding tables (input_dim=vocab_size).
sequence = tokenizer.texts_to_sequences([" ".join(text)])[0]

# Training pairs and their labels
pairs, labels = [], []

# Candidate negatives: real words whose id has a row in the Embedding tables
vocab_list = [idx for idx in word_index.values() if idx != oov_index and idx < vocab_size]

# Generate positive and negative pairs
for i, target_idx in enumerate(sequence):
    if index_word[target_idx] not in palabras_entrenamiento:  # Only listed target words
        continue

    window_start = max(i - context_window, 0)
    window_end = min(i + context_window + 1, len(sequence))

    # Positive pairs: every neighbour inside the window except the target position itself
    context_words = [sequence[j] for j in range(window_start, window_end) if j != i]
    for context_idx in context_words:
        pairs.append([target_idx, context_idx])
        labels.append(1)

    # Negative pairs: random words that are neither in the real context nor the target
    excluded = set(context_words) | {target_idx}
    for _ in range(num_negative_samples):
        negative_word = random.choice(vocab_list)
        while negative_word in excluded:
            negative_word = random.choice(vocab_list)

        pairs.append([target_idx, negative_word])
        labels.append(0)

# Fail with a clear message instead of an IndexError further down
if not pairs:
    raise ValueError(
        "No se generó ningún par de entrenamiento; revisa context_window, "
        "num_negative_samples y la lista de palabras objetivo."
    )

# Convert to numpy arrays
pairs = np.array(pairs)
labels = np.array(labels)

# Save the pairs as rows of "target context label"
np.savetxt("pares_entrenamiento_game_of_thrones.txt", np.column_stack((pairs, labels)), fmt="%d")

# Show up to ten examples
for i in range(min(10, len(pairs))):
    target, context = pairs[i]
    print(f"Entrada: [{index_word[target]}, {index_word[context]}] -> Salida: {labels[i]}")

ModuleNotFoundError: No module named 'keras.preprocessing.text'

In [None]:

# Re-read the corpus and report how many distinct whitespace-separated,
# lower-cased tokens it contains.
with open("datasets/game_of_thrones.txt", "r", encoding="utf-8") as corpus_file:
    raw_corpus = corpus_file.read()

text = raw_corpus.lower().split()
unique_words = {token for token in text}

print(f"El corpus tiene {len(unique_words)} palabras distintas.")

In [None]:





# Each row of the saved file is "target context label".
data = np.loadtxt("pares_entrenamiento_game_of_thrones.txt", dtype=int)

# Word ids go in as int32, labels as float32.
targets = tf.convert_to_tensor(data[:, 0], dtype=tf.int32)
contexts = tf.convert_to_tensor(data[:, 1], dtype=tf.int32)
labels = tf.convert_to_tensor(data[:, 2], dtype=tf.float32)

In [None]:
# Train on the (target, context) -> label pairs: 10 epochs, batches of 64.
model.fit(x=[targets, contexts], y=labels, batch_size=64, epochs=10)

In [None]:

import json

# First weight array of the layer named "embedding_target"
target_layer = model.get_layer("embedding_target")
embeddings = target_layer.get_weights()[0]

# Persist the embeddings next to the word -> id mapping they were trained with
np.save("word_embeddings_game_of_thrones.npy", embeddings)

with open("word_index_game_of_thrones.json", "w") as index_file:
    index_file.write(json.dumps(tokenizer.word_index))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

def visualize_tsne_embeddings(words, embeddings, word_index, filename=None):
    """
    Visualizes t-SNE embeddings of selected words.

    Words that are missing from ``word_index``, or whose index has no row in
    ``embeddings``, are reported and skipped instead of aborting the plot.

    Args:
        words (list): List of words to visualize.
        embeddings (numpy.ndarray): Array containing word embeddings, one row per index.
        word_index (dict): Mapping of words to their indices in the embeddings array.
        filename (str, optional): File to save the visualization. If None, plot is displayed.

    Returns:
        None

    Raises:
        ValueError: If fewer than two of the requested words have an embedding,
            since the perplexity passed to t-SNE would then be 0 or negative.
    """
    # Keep only the words that can actually be looked up in the embedding matrix
    n_rows = len(embeddings)
    kept_words, indices, skipped = [], [], []
    for word in words:
        idx = word_index.get(word)
        if idx is None or not 0 <= idx < n_rows:
            skipped.append(word)
        else:
            kept_words.append(word)
            indices.append(idx)

    if skipped:
        print(f"Skipping {len(skipped)} word(s) without an embedding: {skipped}")

    if len(kept_words) < 2:
        raise ValueError(
            f"Need at least 2 words with an embedding to run t-SNE, got {len(kept_words)}."
        )

    selected_embeddings = embeddings[indices]

    # Perplexity is capped at 5 and kept below the number of plotted words
    perplexity = min(5, len(kept_words) - 1)

    # Use t-SNE to reduce dimensionality
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
    reduced_embeddings = tsne.fit_transform(selected_embeddings)

    # Plotting: one scatter call per word so each point keeps its own colour
    fig = plt.figure(figsize=(10, 10))
    for i, word in enumerate(kept_words):
        plt.scatter(reduced_embeddings[i, 0], reduced_embeddings[i, 1])
        plt.annotate(word, xy=(reduced_embeddings[i, 0], reduced_embeddings[i, 1]), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')

    # Save or display the plot
    if filename:
        plt.savefig(filename)
        # Release the figure once it is on disk so repeated calls do not pile up open figures
        plt.close(fig)
    else:
        plt.show()

In [None]:
# Reload the saved embeddings and the word -> id mapping they belong to
embeddings = np.load("word_embeddings_game_of_thrones.npy")

with open("word_index_game_of_thrones.json", "r") as index_file:
    word_index = json.loads(index_file.read())

# The words to plot are the same target words used to build the training pairs
with open("materiales/target_words_game_of_thrones.txt", "r", encoding="utf-8") as words_file:
    words_to_visualize = words_file.read().lower().split()

visualize_tsne_embeddings(
    words=words_to_visualize,
    embeddings=embeddings,
    word_index=word_index,
)

## Harry Potter


In [None]:
def reset_weights(model):
    """
    Re-initialise the model's weights in place without changing its architecture.

    For every layer, each of the ``kernel``, ``bias`` and ``embeddings``
    variables that exists is overwritten with a fresh draw from the layer's
    matching initializer. Embedding layers have no ``kernel``; their table
    lives in ``embeddings``, so it has to be handled explicitly or the learned
    embeddings survive the reset.

    Args:
        model: Object exposing a ``layers`` iterable of layer objects.

    Returns:
        None

    Note:
        Only layer variables are touched; the optimizer state is left as it is.
    """
    # (variable attribute, initializer attribute) pairs to reset when present
    resettable = (
        ("kernel", "kernel_initializer"),
        ("bias", "bias_initializer"),
        ("embeddings", "embeddings_initializer"),
    )
    for layer in model.layers:
        for weight_attr, initializer_attr in resettable:
            weight = getattr(layer, weight_attr, None)
            initializer = getattr(layer, initializer_attr, None)
            # Skip layers without this variable (e.g. a layer built without bias)
            if weight is None or initializer is None:
                continue
            # The variable's own static shape and dtype are enough to draw new values
            weight.assign(initializer(weight.shape, dtype=weight.dtype))


# Re-initialise the shared model before it is trained on the next corpus.
reset_weights(model=model)
    

In [None]:

# Random negative (label 0) pairs drawn per occurrence of a target word.
# With 0 every label is 1, so the classifier has nothing to discriminate
# against; 4 matches the number of positives of a +/-2 word window.
num_negative_samples = 4

# Seed the sampler so that re-running the cell regenerates the same negatives.
random.seed(0)

with open("datasets/harry_potter_and_the_philosophers_stone.txt", "r", encoding="utf-8") as f:
    text = f.read().lower().split()

# Load the target words for which training pairs are generated
with open("materiales/target_words_harry_potter.txt", "r", encoding="utf-8") as f:
    palabras_entrenamiento = set(f.read().lower().split())

# Corpus tokenisation
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts([" ".join(text)])
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
oov_index = word_index[tokenizer.oov_token]

# Let the tokenizer turn the corpus into ids instead of indexing word_index
# with raw whitespace tokens: the tokenizer strips punctuation, so a raw token
# such as "word," is not a key of word_index. texts_to_sequences also maps
# every id >= vocab_size to the OOV id, which keeps all ids inside the
# Embedding tables (input_dim=vocab_size).
sequence = tokenizer.texts_to_sequences([" ".join(text)])[0]

# Training pairs and their labels
pairs, labels = [], []

# Candidate negatives: real words whose id has a row in the Embedding tables
vocab_list = [idx for idx in word_index.values() if idx != oov_index and idx < vocab_size]

# Generate positive and negative pairs
for i, target_idx in enumerate(sequence):
    if index_word[target_idx] not in palabras_entrenamiento:  # Only listed target words
        continue

    window_start = max(i - context_window, 0)
    window_end = min(i + context_window + 1, len(sequence))

    # Positive pairs: every neighbour inside the window except the target position itself
    context_words = [sequence[j] for j in range(window_start, window_end) if j != i]
    for context_idx in context_words:
        pairs.append([target_idx, context_idx])
        labels.append(1)

    # Negative pairs: random words that are neither in the real context nor the target
    excluded = set(context_words) | {target_idx}
    for _ in range(num_negative_samples):
        negative_word = random.choice(vocab_list)
        while negative_word in excluded:
            negative_word = random.choice(vocab_list)

        pairs.append([target_idx, negative_word])
        labels.append(0)

# Fail with a clear message instead of an IndexError further down
if not pairs:
    raise ValueError(
        "No se generó ningún par de entrenamiento; revisa context_window, "
        "num_negative_samples y la lista de palabras objetivo."
    )

# Convert to numpy arrays
pairs = np.array(pairs)
labels = np.array(labels)

# Save the pairs as rows of "target context label"
np.savetxt("pares_entrenamiento_harry_potter.txt", np.column_stack((pairs, labels)), fmt="%d")

# Show up to ten examples
for i in range(min(10, len(pairs))):
    target, context = pairs[i]
    print(f"Entrada: [{index_word[target]}, {index_word[context]}] -> Salida: {labels[i]}")

In [None]:
# Distinct whitespace-separated tokens of the corpus currently held in `text`
unique_words = {token for token in text}

print(f"El corpus tiene {len(unique_words)} palabras distintas.")

In [None]:
# Each row of the saved file is "target context label".
data = np.loadtxt("pares_entrenamiento_harry_potter.txt", dtype=int)

# Word ids go in as int32, labels as float32.
targets = tf.convert_to_tensor(data[:, 0], dtype=tf.int32)
contexts = tf.convert_to_tensor(data[:, 1], dtype=tf.int32)
labels = tf.convert_to_tensor(data[:, 2], dtype=tf.float32)

In [None]:
# Train on the (target, context) -> label pairs: 10 epochs, batches of 64.
model.fit(x=[targets, contexts], y=labels, batch_size=64, epochs=10)

In [None]:
import json

# First weight array of the layer named "embedding_target"
target_layer = model.get_layer("embedding_target")
embeddings = target_layer.get_weights()[0]

# Write the embeddings to disk
np.save("word_embeddings_harry_potter.npy", embeddings)

# Keep the word -> id mapping alongside them for later lookups
with open("word_index_harry_potter.json", "w") as index_file:
    index_file.write(json.dumps(tokenizer.word_index))

In [None]:
# Reload the saved embeddings and the word -> id mapping they belong to
embeddings = np.load("word_embeddings_harry_potter.npy")

with open("word_index_harry_potter.json", "r") as index_file:
    word_index = json.loads(index_file.read())

# The words to plot are the same target words used to build the training pairs
with open("materiales/target_words_harry_potter.txt", "r", encoding="utf-8") as words_file:
    words_to_visualize = words_file.read().lower().split()

visualize_tsne_embeddings(
    words=words_to_visualize,
    embeddings=embeddings,
    word_index=word_index,
)

In [None]:
# Re-initialise the shared model before it is trained on the next corpus.
reset_weights(model=model)

In [None]:
# Random negative (label 0) pairs drawn per occurrence of a target word.
# With 0 every label is 1, so the classifier has nothing to discriminate
# against; 4 matches the number of positives of a +/-2 word window.
num_negative_samples = 4

# Seed the sampler so that re-running the cell regenerates the same negatives.
random.seed(0)

with open("datasets/the_fellowship_of_the_ring.txt", "r", encoding="utf-8") as f:
    text = f.read().lower().split()

# Load the target words for which training pairs are generated
with open("materiales/target_words_the_fellowship_of_the_ring.txt", "r", encoding="utf-8") as f:
    palabras_entrenamiento = set(f.read().lower().split())

# Corpus tokenisation
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts([" ".join(text)])
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
oov_index = word_index[tokenizer.oov_token]

# Let the tokenizer turn the corpus into ids instead of indexing word_index
# with raw whitespace tokens: the tokenizer strips punctuation, so a raw token
# such as "word," is not a key of word_index. texts_to_sequences also maps
# every id >= vocab_size to the OOV id, which keeps all ids inside the
# Embedding tables (input_dim=vocab_size).
sequence = tokenizer.texts_to_sequences([" ".join(text)])[0]

# Training pairs and their labels
pairs, labels = [], []

# Candidate negatives: real words whose id has a row in the Embedding tables
vocab_list = [idx for idx in word_index.values() if idx != oov_index and idx < vocab_size]

# Generate positive and negative pairs
for i, target_idx in enumerate(sequence):
    if index_word[target_idx] not in palabras_entrenamiento:  # Only listed target words
        continue

    window_start = max(i - context_window, 0)
    window_end = min(i + context_window + 1, len(sequence))

    # Positive pairs: every neighbour inside the window except the target position itself
    context_words = [sequence[j] for j in range(window_start, window_end) if j != i]
    for context_idx in context_words:
        pairs.append([target_idx, context_idx])
        labels.append(1)

    # Negative pairs: random words that are neither in the real context nor the target
    excluded = set(context_words) | {target_idx}
    for _ in range(num_negative_samples):
        negative_word = random.choice(vocab_list)
        while negative_word in excluded:
            negative_word = random.choice(vocab_list)

        pairs.append([target_idx, negative_word])
        labels.append(0)

# Fail with a clear message instead of an IndexError further down
if not pairs:
    raise ValueError(
        "No se generó ningún par de entrenamiento; revisa context_window, "
        "num_negative_samples y la lista de palabras objetivo."
    )

# Convert to numpy arrays
pairs = np.array(pairs)
labels = np.array(labels)

# Save the pairs as rows of "target context label"
np.savetxt("pares_entrenamiento_fellowship.txt", np.column_stack((pairs, labels)), fmt="%d")

# Show up to ten examples
for i in range(min(10, len(pairs))):
    target, context = pairs[i]
    print(f"Entrada: [{index_word[target]}, {index_word[context]}] -> Salida: {labels[i]}")

In [None]:
# Distinct whitespace-separated tokens of the corpus currently held in `text`
unique_words = {token for token in text}

print(f"El corpus tiene {len(unique_words)} palabras distintas.")

In [None]:
# Each row of the saved file is "target context label".
data = np.loadtxt("pares_entrenamiento_fellowship.txt", dtype=int)

# Word ids go in as int32, labels as float32.
targets = tf.convert_to_tensor(data[:, 0], dtype=tf.int32)
contexts = tf.convert_to_tensor(data[:, 1], dtype=tf.int32)
labels = tf.convert_to_tensor(data[:, 2], dtype=tf.float32)

In [None]:
# Train on the (target, context) -> label pairs: 10 epochs, batches of 64.
model.fit(x=[targets, contexts], y=labels, batch_size=64, epochs=10)

In [None]:
import json

# First weight array of the layer named "embedding_target"
target_layer = model.get_layer("embedding_target")
embeddings = target_layer.get_weights()[0]

# Write the embeddings to disk
np.save("word_embeddings_fellowship.npy", embeddings)

# Keep the word -> id mapping alongside them for later lookups
with open("word_index_fellowship.json", "w") as index_file:
    index_file.write(json.dumps(tokenizer.word_index))

In [None]:
# Reload the saved embeddings and the word -> id mapping they belong to
embeddings = np.load("word_embeddings_fellowship.npy")

with open("word_index_fellowship.json", "r") as index_file:
    word_index = json.loads(index_file.read())

# The words to plot are the same target words used to build the training pairs
with open("materiales/target_words_the_fellowship_of_the_ring.txt", "r", encoding="utf-8") as words_file:
    words_to_visualize = words_file.read().lower().split()

visualize_tsne_embeddings(
    words=words_to_visualize,
    embeddings=embeddings,
    word_index=word_index,
)