# Desafío 2 - NLP

- Crear sus propios vectores con Gensim basado en lo visto en clase con otro dataset.
- Probar términos de interés y explicar similitudes en el espacio de embeddings.
- Intentar plantear y probar tests de analogías. 
- Graficar los embeddings resultantes.
- Sacar conclusiones.

## Imports

In [92]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing
from gensim.models import Word2Vec
import os
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models.callbacks import CallbackAny2Vec

from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE                   
import numpy as np
import plotly.graph_objects as go
import plotly.express as px                   

## Funciones generales

In [20]:
# By default gensim does not report the training loss at each epoch,
# so we subclass the callback hook to print that information ourselves.
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch.

    At the end of every epoch the value of ``model.get_latest_training_loss()``
    is read. The first epoch of a training run prints that reading as-is; every
    later epoch prints the difference from the previous reading (the reading is
    presumably cumulative within one ``train()`` call, which is what the
    subtraction relies on).
    """
    def __init__(self):
        self.epoch = 0
        # Created here (not lazily in on_epoch_end) so the attribute always
        # exists. None means "no reading yet in the current training run".
        self.loss_previous_step = None

    def on_train_begin(self, model):
        # Forget the reading from any earlier train() call so a reused callback
        # instance does not subtract a stale value from the first epoch of the
        # new run. NOTE(review): relies on gensim restarting its loss counter on
        # each train() call; confirm for the installed version.
        self.loss_previous_step = None

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.loss_previous_step is None:
            # First epoch of the run: the reading itself is this epoch's loss.
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [None]:
def reduce_dimensions(model, num_dimensions = 2 ):
    """Project the model's word vectors into a low-dimensional space with t-SNE.

    Args:
        model: object exposing ``wv.vectors`` (the embedding matrix) and
            ``wv.index_to_key`` (the word for each row of that matrix).
        num_dimensions: number of output components requested from t-SNE.

    Returns:
        A ``(projected_vectors, words)`` tuple: the t-SNE output and a NumPy
        array with the corresponding vocabulary words.
    """
    embedding_matrix = np.asarray(model.wv.vectors)
    words = np.asarray(model.wv.index_to_key)

    # Fixed random_state so repeated runs use the same seed.
    projector = TSNE(n_components=num_dimensions, random_state=0)
    projected = projector.fit_transform(embedding_matrix)

    return projected, words

## Preprocesamiento

In [6]:
# List the entries of the local "corpus" directory that holds the input text files.
os.listdir("corpus")

['Indiana_Jones_And_The_Last_Crusade.txt',
 'Indiana_Jones_And_The_Temple_Of_Doom.txt',
 'Raiders_Of_The_Lost_Ark.txt']

In [39]:
def read_script_lines(path):
    """Read a text file into a one-column DataFrame, one non-blank line per row.

    The previous approach called ``pd.read_csv`` with ``sep='/n'``, which is the
    literal two-character pattern "/n" rather than a newline. It only produced
    one row per line because that pattern rarely occurs in the text. Reading the
    lines directly makes the intent explicit and avoids CSV-style value
    inference on the text.

    Args:
        path: location of the UTF-8 text file to read.

    Returns:
        A DataFrame whose single column (label ``0``) holds each non-blank line
        with surrounding whitespace removed.
    """
    with open(path, encoding='utf-8') as script_file:
        stripped_lines = (line.strip() for line in script_file)
        return pd.DataFrame([line for line in stripped_lines if line])


# Each line of every file is treated as one document of the corpus.
df1 = read_script_lines('corpus/Indiana_Jones_And_The_Last_Crusade.txt')
df2 = read_script_lines('corpus/Raiders_Of_The_Lost_Ark.txt')
df3 = read_script_lines('corpus/Indiana_Jones_And_The_Temple_Of_Doom.txt')
df = pd.concat([df1, df2, df3], ignore_index=True)
display(df.head())
display("Cantidad de documentos:", df.shape[0])

Unnamed: 0,0
0,"""INDIANA JONES AND THE LAST CRUSADE"""
1,Screenplay by
2,Jeffrey Boam
3,FADE IN:
4,EXT. DESERT OF THE AMERICAN SOUTHWEST - DAY


'Cantidad de documentos:'

13504

In [40]:
# Turn every document (column 0 of df) into a sequence of words.
# This could also be done with NLTK or spaCy.
# The column is iterated directly: the earlier `df[:None]` slice selected every
# row anyway, and `iterrows()` built a Series per row just to read one value.
sentence_tokens = [text_to_word_sequence(line) for line in df[0]]

## Crear los vectores (word2vec)

In [75]:
# Word2Vec hyperparameters. Note that sg=0 selects the CBOW architecture
# (sg=1 would select skip-gram instead).
w2v_model = Word2Vec(
    sg=0,             # architecture: 0 = CBOW, 1 = skip-gram
    vector_size=500,  # dimensionality of the word vectors
    window=5,         # number of words before and after the predicted one
    min_count=10,     # minimum word frequency required to enter the vocabulary
    negative=20,      # number of negative samples (0 means not used)
    workers=4,        # can be raised when more cores are available
)

In [76]:
# Build the vocabulary from the tokenized documents, then report the corpus
# size and the number of distinct words kept in the vocabulary.
w2v_model.build_vocab(sentence_tokens)

corpus_docs = w2v_model.corpus_count
vocab_words = w2v_model.wv.index_to_key
display("Cantidad de docs en el corpus:", corpus_docs)
display("Cantidad de words distintas en el corpus:", len(vocab_words))

'Cantidad de docs en el corpus:'

13504

'Cantidad de words distintas en el corpus:'

1122

In [79]:
# Train for 60 epochs over the tokenized documents; loss computation is enabled
# so the callback can print a loss figure at the end of each epoch.
loss_logger = callback()
w2v_model.train(
    sentence_tokens,
    epochs=60,
    total_examples=w2v_model.corpus_count,
    compute_loss=True,
    callbacks=[loss_logger],
)

Loss after epoch 0: 47842.10546875
Loss after epoch 1: 38177.33203125
Loss after epoch 2: 47550.828125
Loss after epoch 3: 47062.125
Loss after epoch 4: 36538.328125
Loss after epoch 5: 45938.0625
Loss after epoch 6: 38373.53125
Loss after epoch 7: 45495.96875
Loss after epoch 8: 45045.28125
Loss after epoch 9: 44888.84375
Loss after epoch 10: 44333.0625
Loss after epoch 11: 43566.03125
Loss after epoch 12: 42328.75
Loss after epoch 13: 32974.625
Loss after epoch 14: 32979.75
Loss after epoch 15: 41711.0
Loss after epoch 16: 40965.0
Loss after epoch 17: 40443.75
Loss after epoch 18: 40809.5
Loss after epoch 19: 40101.5625
Loss after epoch 20: 41755.3125
Loss after epoch 21: 42286.3125
Loss after epoch 22: 39695.5
Loss after epoch 23: 42692.1875
Loss after epoch 24: 33951.25
Loss after epoch 25: 39260.0
Loss after epoch 26: 37755.75
Loss after epoch 27: 38370.375
Loss after epoch 28: 38524.125
Loss after epoch 29: 38380.375
Loss after epoch 30: 32723.75
Loss after epoch 31: 37969.125
Lo

(2857968, 5101560)

## Pruebas

In [80]:
# Ten vocabulary words the trained embeddings rank as most similar to "grail".
w2v_model.wv.most_similar(positive=["grail"], topn=10)

[('vial', 0.32271936535835266),
 ('piece', 0.31661638617515564),
 ('breath', 0.30926579236984253),
 ('book', 0.29943010210990906),
 ('highness', 0.2666929066181183),
 ('pocket', 0.25716638565063477),
 ('lighter', 0.25180700421333313),
 ('cockpit', 0.2496134638786316),
 ('dad', 0.24548287689685822),
 ('mouth', 0.24481654167175293)]

In [81]:
# Ten vocabulary words the trained embeddings rank as most similar to "ark".
w2v_model.wv.most_similar(positive=["ark"], topn=10)

[('pages', 0.369402676820755),
 ('indians', 0.3191428780555725),
 ('pieces', 0.29997438192367554),
 ('covenant', 0.2840358018875122),
 ("we're", 0.28078508377075195),
 ('lid', 0.28059902787208557),
 ('world', 0.2795999348163605),
 ('place', 0.2715905010700226),
 ('beam', 0.2548985779285431),
 ('cab', 0.25403445959091187)]

In [84]:
# Ten vocabulary words the trained embeddings rank as most similar to "sankara".
w2v_model.wv.most_similar(positive=["sankara"], topn=10)

[('stones', 0.5401177406311035),
 ('sacred', 0.49849429726600647),
 ('bag', 0.4511288106441498),
 ('pieces', 0.3865293860435486),
 ('chest', 0.34016749262809753),
 ('shoulder', 0.29628708958625793),
 ('american', 0.27513179183006287),
 ('shaman', 0.2644558548927307),
 ('priest', 0.26418304443359375),
 ('ma', 0.26349884271621704)]

In [89]:
# Query with only a negative example: the ten words returned when "good" is
# supplied as the term to move away from (no positive term is given).
w2v_model.wv.most_similar(negative=["good"], topn=10)

[('fires', 0.2919274568557739),
 ('across', 0.28768402338027954),
 ('smiling', 0.28071507811546326),
 ('boulder', 0.2559468448162079),
 ('and', 0.25493016839027405),
 ('runs', 0.24979250133037567),
 ('toward', 0.24632732570171356),
 ('pilot', 0.24283920228481293),
 ('kicks', 0.24029484391212463),
 ('hurries', 0.23977002501487732)]

## Visualizar agrupación de vectores

In [97]:
# Upper bound on how many words (the first rows of the projection) are drawn.
MAX_WORDS = 500

# Project the embeddings to 2-D and draw a labelled scatter plot.
vecs, labels = reduce_dimensions(w2v_model)

fig = px.scatter(
    x=vecs[:MAX_WORDS, 0],
    y=vecs[:MAX_WORDS, 1],
    text=labels[:MAX_WORDS],
)
fig.show()  # pass renderer="colab" when running plotly on Colab

In [99]:
# Project the embeddings to 3-D and draw a labelled 3-D scatter plot.
# MAX_WORDS comes from the 2-D plotting cell and caps the number of words drawn.
vecs, labels = reduce_dimensions(w2v_model, num_dimensions=3)

fig = px.scatter_3d(
    x=vecs[:MAX_WORDS, 0],
    y=vecs[:MAX_WORDS, 1],
    z=vecs[:MAX_WORDS, 2],
    text=labels[:MAX_WORDS],
)
fig.update_traces(marker_size=2)  # small markers keep the point cloud readable
fig.show()