# Desafio 2 - NLP

- Crear sus propios vectores con Gensim basado en lo visto en clase con otro dataset.
- Probar términos de interés y explicar similitudes en el espacio de embeddings.
- Intentar plantear y probar tests de analogías. 
- Graficar los embeddings resultantes.
- Sacar conclusiones.

## Imports

In [92]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing
from gensim.models import Word2Vec
import os
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models.callbacks import CallbackAny2Vec

from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE                   
import numpy as np
import plotly.graph_objects as go
import plotly.express as px                   

## Funciones generales

In [20]:
# By default gensim does not report the training loss after each epoch.
# This callback is passed to ``train`` so that the value gets printed.
class callback(CallbackAny2Vec):
    """Print the loss accumulated during each training epoch.

    At the end of every epoch the callback reads
    ``model.get_latest_training_loss()`` and prints the difference with the
    value read at the end of the previous epoch. For the first epoch the
    previous value is 0, so the reading itself is printed.
    """

    def __init__(self):
        # Number of epochs seen so far; used only to label the output.
        self.epoch = 0
        # Loss reading taken at the end of the previous epoch. Starting at 0
        # removes the need for a special case on the first epoch.
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        """Print the loss delta for the epoch that just finished.

        Args:
            model: object exposing ``get_latest_training_loss()``.
        """
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [None]:
def reduce_dimensions(model, num_dimensions = 2 ):
    """Project the model's word vectors with t-SNE.

    Args:
        model: object exposing ``wv.vectors`` and ``wv.index_to_key``.
        num_dimensions: number of components of the t-SNE projection.

    Returns:
        A ``(projected_vectors, labels)`` tuple: the output of
        ``TSNE.fit_transform`` and the vocabulary keys as a NumPy array.
    """
    words = np.asarray(model.wv.index_to_key)
    embeddings = np.asarray(model.wv.vectors)

    # Fixed random_state so repeated calls use the same seed.
    projector = TSNE(n_components=num_dimensions, random_state=0)
    return projector.fit_transform(embeddings), words

In [124]:
def max_words (model, vecs, labels, highlight_words, MAX_WORDS, topn=10):
    """Pick the subset of words (and their vectors) to draw.

    Selection order:
      1. every highlighted word,
      2. the ``topn`` most similar words of each highlighted word,
      3. words in ``labels`` order until ``MAX_WORDS`` words are selected.

    Words that do not appear in ``labels`` are ignored, so they never take
    one of the ``MAX_WORDS`` slots. Steps 1 and 2 are never truncated: if
    they alone exceed ``MAX_WORDS`` all of those words are still returned.

    Args:
        model: object exposing ``wv.most_similar(word, topn=...)``.
        vecs: array indexable by a list of row indices, aligned with ``labels``.
        labels: sequence of words, one per row of ``vecs``.
        highlight_words: iterable of words to prioritise; ``None`` or empty
            means no word is prioritised.
        MAX_WORDS: target number of words once step 3 is applied.
        topn: number of neighbours requested for each highlighted word.

    Returns:
        ``(filtered_vecs, filtered_labels)`` in selection order.
    """
    all_words = list(labels)

    # Word -> first row index, replacing repeated O(n) ``list.index`` scans.
    position = {}
    for idx, word in enumerate(all_words):
        position.setdefault(word, idx)

    # A dict is used as an insertion-ordered set so the output order is
    # deterministic (a plain set of strings iterates in arbitrary order).
    selected = {}

    def _select(word):
        if word in position:
            selected.setdefault(word, None)

    for word in (highlight_words or ()):
        _select(word)
        try:
            similar = model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Word is not in the model's vocabulary: nothing to expand.
            continue
        for neighbour, _ in similar:
            _select(neighbour)

    # Fill the remaining slots following the order of ``labels``.
    for word in all_words:
        if len(selected) >= MAX_WORDS:
            break
        selected.setdefault(word, None)

    indices = [position[word] for word in selected]
    filtered_vecs = vecs[indices]
    filtered_labels = [all_words[i] for i in indices]
    return filtered_vecs, filtered_labels

In [114]:
def plot_embeddings(model, vecs, labels, MAX_WORDS=200, highlight_words=None):
    """Draw a 2D or 3D labelled scatter plot of a subset of the embeddings.

    The subset is chosen by ``max_words``. Highlighted words are drawn in
    red, every other word in blue.

    Args:
        model: model forwarded to ``max_words``.
        vecs: array of shape ``(n_words, 2)`` or ``(n_words, 3)``.
        labels: sequence of words, one per row of ``vecs``.
        MAX_WORDS: word budget forwarded to ``max_words``.
        highlight_words: optional iterable of words to emphasise.

    Raises:
        AssertionError: if ``vecs`` and ``labels`` differ in length.
        ValueError: if ``vecs`` does not have 2 or 3 columns.
    """
    assert vecs.shape[0] == len(labels), "Vecs y labels deben tener la misma longitud"

    # Normalise first: the default ``None`` must not reach ``max_words``.
    highlighted = list(highlight_words) if highlight_words else []
    vecs, labels = max_words(model, vecs, labels, highlighted, MAX_WORDS)

    highlighted_set = set(highlighted)
    colors = ['red' if word in highlighted_set else 'blue' for word in labels]
    # Plotly Express treats the values of ``color`` as category names, not
    # as colours; the explicit map makes "red"/"blue" render as those colours
    # regardless of which category appears first.
    color_map = {'red': 'red', 'blue': 'blue'}

    if vecs.shape[1] == 2:
        fig = px.scatter(
            x=vecs[:, 0],
            y=vecs[:, 1],
            text=labels,
            color=colors,
            color_discrete_map=color_map,
            labels={"color": "Tipo de palabra"}
        )
    elif vecs.shape[1] == 3:
        fig = px.scatter_3d(
            x=vecs[:, 0],
            y=vecs[:, 1],
            z=vecs[:, 2],
            text=labels,
            color=colors,
            color_discrete_map=color_map,
            labels={"color": "Tipo de palabra"}
        )
        fig.update_traces(marker_size = 2)
    else:
        raise ValueError("Los vectores deben ser de dimensión 2 o 3")

    fig.update_traces(textposition='top center')
    fig.show()

## Preprocesamiento

In [6]:
# List the entries of the local "corpus" directory.
os.listdir("corpus")

['Indiana_Jones_And_The_Last_Crusade.txt',
 'Indiana_Jones_And_The_Temple_Of_Doom.txt',
 'Raiders_Of_The_Lost_Ark.txt']

In [39]:
def _read_script_lines(path):
    """Load a text file as a one-column DataFrame, one row per line.

    Each line is stripped of surrounding whitespace and blank lines are
    dropped. The text is kept verbatim as strings in column ``0``; no
    separator, quoting or missing-value parsing is applied.
    """
    with open(path, encoding='utf-8') as script:
        stripped = (line.strip() for line in script)
        lines = [line for line in stripped if line]
    return pd.DataFrame({0: lines})


# Each non-blank line of a screenplay is treated as one document.
df1 = _read_script_lines('corpus/Indiana_Jones_And_The_Last_Crusade.txt')
df2 = _read_script_lines('corpus/Raiders_Of_The_Lost_Ark.txt')
df3 = _read_script_lines('corpus/Indiana_Jones_And_The_Temple_Of_Doom.txt')
df = pd.concat([df1, df2, df3], ignore_index=True)
display(df.head())
display("Cantidad de documentos:", df.shape[0])

Unnamed: 0,0
0,"""INDIANA JONES AND THE LAST CRUSADE"""
1,Screenplay by
2,Jeffrey Boam
3,FADE IN:
4,EXT. DESERT OF THE AMERICAN SOUTHWEST - DAY


'Cantidad de documentos:'

13504

In [None]:
# Tokenize every document (column 0 of ``df``) into a list of words.
# Iterating the column directly avoids building a Series per row with
# ``iterrows`` and drops the no-op ``df[:None]`` slice.
sentence_tokens = [text_to_word_sequence(text) for text in df[0]]

## Crear los vectores (word2vec)

In [128]:
# Create the (still untrained) Word2Vec model with its hyperparameters.
w2v_model = Word2Vec(min_count=10,    # minimum frequency for a word to be included in the vocabulary
                     window=10,       # number of words before and after the predicted one
                     vector_size=500,       # dimensionality of the vectors
                     negative=20,    # number of negative samples... 0 means not used
                     workers=4,      # can be raised if more cores are available
                     sg=1)           # model 0:CBOW  1:skipgram

In [129]:
# Build the vocabulary from the tokenized documents, then show the number
# of documents and the number of distinct words kept in the vocabulary.
w2v_model.build_vocab(sentence_tokens)
display("Cantidad de docs en el corpus:", w2v_model.corpus_count)
display("Cantidad de words distintas en el corpus:", len(w2v_model.wv.index_to_key))

'Cantidad de docs en el corpus:'

13504

'Cantidad de words distintas en el corpus:'

1122

In [131]:
# Train for 35 epochs over the tokenized documents. ``compute_loss`` is
# enabled and the ``callback`` instance prints a loss line after each epoch.
w2v_model.train(sentence_tokens,
                 total_examples=w2v_model.corpus_count,
                 epochs=35,
                 compute_loss = True,
                 callbacks=[callback()]
                 )

Loss after epoch 0: 148688.84375
Loss after epoch 1: 154944.375
Loss after epoch 2: 156500.25
Loss after epoch 3: 148608.59375
Loss after epoch 4: 128065.0625
Loss after epoch 5: 150578.125
Loss after epoch 6: 143886.9375
Loss after epoch 7: 148801.3125
Loss after epoch 8: 141390.125
Loss after epoch 9: 140532.5
Loss after epoch 10: 140342.625
Loss after epoch 11: 120196.75
Loss after epoch 12: 139894.375
Loss after epoch 13: 121061.0
Loss after epoch 14: 140422.125
Loss after epoch 15: 134675.5
Loss after epoch 16: 130876.75
Loss after epoch 17: 131672.75
Loss after epoch 18: 132998.75
Loss after epoch 19: 137115.75
Loss after epoch 20: 136035.75
Loss after epoch 21: 130298.75
Loss after epoch 22: 129759.5
Loss after epoch 23: 111475.25
Loss after epoch 24: 132522.75
Loss after epoch 25: 134057.75
Loss after epoch 26: 125037.25
Loss after epoch 27: 126986.75
Loss after epoch 28: 133873.0
Loss after epoch 29: 126423.0
Loss after epoch 30: 128231.75
Loss after epoch 31: 119981.5
Loss af

(1667495, 2975910)

## Pruebas

In [132]:
# Top 10 words most similar to "grail".
w2v_model.wv.most_similar(positive=["grail"], topn=10)

[('diary', 0.39182910323143005),
 ('knight', 0.3685998320579529),
 ('vial', 0.33626991510391235),
 ('dr', 0.31551000475883484),
 ('late', 0.31352177262306213),
 ('professor', 0.3104715049266815),
 ('doctor', 0.30360856652259827),
 ('schneider', 0.3028962314128876),
 ('touch', 0.3020654320716858),
 ('breath', 0.2945864200592041)]

En este caso podemos ver que la palabra más similar a "grail" (Grial) es "diary" (diario): en la película "Indiana Jones y la última cruzada" se menciona varias veces el diario del Grial, donde estaba la información sobre cómo encontrarlo.

In [133]:
# Top 10 words most similar to "ark".
w2v_model.wv.most_similar(positive=["ark"], topn=10)

[('covenant', 0.5358842015266418),
 ('lid', 0.3385184109210968),
 ("let's", 0.31291845440864563),
 ('stuff', 0.2976982593536377),
 ('indians', 0.2872527837753296),
 ('pages', 0.28354933857917786),
 ('souls', 0.27519848942756653),
 ('silver', 0.2683049440383911),
 ('penitent', 0.2653197646141052),
 ('until', 0.26150017976760864)]

En "Indiana Jones y los cazadores del arca perdida", la famosa arca se llama "el Arca de la Alianza" ("Ark of the Covenant"); por eso la palabra más similar a "ark" es "covenant".

In [134]:
# Top 10 words most similar to "sankara".
w2v_model.wv.most_similar(positive=["sankara"], topn=10)

[('stones', 0.6419008374214172),
 ('bag', 0.5245639681816101),
 ('sacred', 0.4658103585243225),
 ('priest', 0.3602324426174164),
 ('priests', 0.3548106253147125),
 ('punches', 0.3434930741786957),
 ('places', 0.3341156840324402),
 ('says', 0.31909072399139404),
 ('shoulder', 0.3147355318069458),
 ('ma', 0.31326180696487427)]

En "Indiana Jones y el templo de la perdición", el objeto que Indy debe rescatar es la Piedra de Sankara; por eso "stones" es la palabra más similar a "sankara".

In [161]:
# Top 10 results when "hat" is given only as a negative example.
w2v_model.wv.most_similar(negative=["hat"], topn=10)

[('frightened', 0.07920803129673004),
 ('seen', 0.04068366438150406),
 ('wide', 0.03377776965498924),
 ("they're", 0.025555318221449852),
 ('lovely', 0.022509140893816948),
 ('lift', 0.022206634283065796),
 ('far', 0.02162557654082775),
 ('power', 0.016364354640245438),
 ('large', 0.013841045089066029),
 ('tunnels', 0.013216033577919006)]

Para la relación inversa no se detectan antónimos, sino palabras que están poco relacionadas en el espacio de embeddings. No podemos sacar muchas conclusiones de este ejemplo.

## Visualizar agrupación de vectores

In [152]:
# Project the embeddings with the default number of dimensions (2).
vecs, labels = reduce_dimensions(w2v_model)

# Plot with a budget of 500 words, highlighting "grail", "ark" and "sankara".
MAX_WORDS=500
plot_embeddings(w2v_model, vecs, labels, MAX_WORDS, highlight_words=["grail","ark","sankara"])

In [145]:
# Project the embeddings to 3 dimensions.
vecs, labels = reduce_dimensions(w2v_model,3)

# Plot with a budget of 200 words, highlighting "grail", "ark" and "sankara".
MAX_WORDS=200
plot_embeddings(w2v_model, vecs, labels, MAX_WORDS, highlight_words=["grail","ark","sankara"])