<a href="https://colab.research.google.com/github/pedromperezc/CEIA/blob/master/NLP/Desafio_3_Custom_embedding_con_Gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Custom embeddings con Gensim



### Objetivo
El objetivo es utilizar documentos / corpus para crear embeddings de palabras basados en ese contexto. Se utilizarán los títulos y resúmenes de películas para generar los embeddings, es decir, los vectores tendrán una forma que depende de cómo se utilizan las palabras en esas descripciones.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import multiprocessing
from gensim.models import Word2Vec
import spacy
import regex as re

import multiprocessing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Absolute path (Drive is mounted at /content/drive above) so this cell can be
# re-run safely: a relative chdir fails the second time because the working
# directory has already moved.
os.chdir('/content/drive/MyDrive/NLP/')

### Datos
Utilizaremos como dataset películas tomadas de TMDb (archivo `TMDb_updated.CSV`).

In [None]:
df = pd.read_csv("TMDb_updated.CSV")
df

Unnamed: 0.1,Unnamed: 0,title,overview,original_language,vote_count,vote_average
0,0,Ad Astra,"The near future, a time when both hope and har...",en,2853,5.9
1,1,Bloodshot,"After he and his wife are murdered, marine Ray...",en,1349,7.2
2,2,Bad Boys for Life,Marcus and Mike are forced to confront new thr...,en,2530,7.1
3,3,Ant-Man,Armed with the astonishing ability to shrink i...,en,13611,7.1
4,4,Percy Jackson: Sea of Monsters,"In their quest to confront the ultimate evil, ...",en,3542,5.9
...,...,...,...,...,...,...
9995,9995,Cargo,The story of CARGO takes place on rusty space-...,de,225,5.9
9996,9996,The Good Night,"Gary, a musician, is trapped in an unhappy rel...",en,67,5.6
9997,9997,The World Is Yours,"To escape his life of crime, a Paris drug deal...",fr,234,7.1
9998,9998,The Grand Seduction,A small fishing village must procure a local d...,en,169,6.7


In [None]:
print("Cantidad de documentos:", df.shape[0])

Cantidad de documentos: 10000


### 1 - Preprocesamiento

In [None]:
# Select the movie title and overview columns.
df1 = df[['title', 'overview']]

# Join title and overview into one text per row. Missing values are replaced
# with '' first; otherwise astype(str) turns NaN into the literal text "nan",
# which would end up as a token in the corpus.
df2 = df1.fillna('').astype(str).apply(','.join, axis=1)

# Wrap the joined text in a new single-column dataframe.
df_final = pd.DataFrame({'clean': df2})

In [None]:
df_final

Unnamed: 0,clean
0,"Ad Astra,The near future, a time when both hop..."
1,"Bloodshot,After he and his wife are murdered, ..."
2,"Bad Boys for Life,Marcus and Mike are forced t..."
3,"Ant-Man,Armed with the astonishing ability to ..."
4,"Percy Jackson: Sea of Monsters,In their quest ..."
...,...
9995,"Cargo,The story of CARGO takes place on rusty ..."
9996,"The Good Night,Gary, a musician, is trapped in..."
9997,"The World Is Yours,To escape his life of crime..."
9998,"The Grand Seduction,A small fishing village mu..."


In [None]:
# Load the English pipeline by package name: the 'en' shortcut link is
# obsolete and rejected by spaCy v3.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default) — confirm.
# Named-entity recognition and the dependency parser are disabled.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """
    Reduce a parsed document to a space-separated string of lemmas.

    Tokens flagged as stop words are skipped. Documents that keep two or
    fewer lemmas are rejected.

    Parameters
    ----------
    doc : iterable of tokens exposing ``lemma_`` and ``is_stop``.

    Returns
    -------
    str or None
        The joined lemmas, or None when at most two lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too little content left to be a useful training sentence.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Replace every run of characters other than ASCII letters and apostrophes with
# a single space, then lowercase. Built as a list rather than a generator so the
# cleaned texts can be consumed more than once; a generator would already be
# exhausted (and yield nothing) if the pipeline cell were re-run.
iterador = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df_final['clean']]

In [None]:
# Run the spaCy pipeline over the texts in batches and clean each parsed
# document. The n_threads argument was dropped: it is a deprecated no-op in
# spaCy 2.x and is not accepted by spaCy v3.
txt = [cleaning(doc) for doc in nlp.pipe(iterador, batch_size=5000)]

In [None]:
# Build the dataframe of cleaned texts, dropping missing entries (documents
# rejected by cleaning() come back as None) and exact duplicates.
df_clean = pd.DataFrame({'frase': txt}).dropna().drop_duplicates()
df_clean

Unnamed: 0,frase
0,ad astra near future time hope hardship drive ...
1,bloodshot wife murder marine ray garrison resu...
2,bad boy life marcus mike force confront new th...
3,ant man arm astonishing ability shrink scale i...
4,percy jackson sea monster quest confront ultim...
...,...
9995,cargo story cargo take place rusty space freig...
9996,good night gary musician trap unhappy relation...
9997,world escape life crime paris drug dealer take...
9998,grand seduction small fishing village procure ...


In [None]:
# Split every cleaned text on whitespace; the result is an array of token lists.
corpus = df_clean['frase'].map(str.split).values
corpus

array([list(['ad', 'astra', 'near', 'future', 'time', 'hope', 'hardship', 'drive', 'humanity', 'look', 'star', 'mysterious', 'phenomenon', 'menace', 'destroy', 'life', 'planet', 'earth', 'astronaut', 'roy', 'mcbride', 'undertake', 'mission', 'immensity', 'space', 'peril', 'uncover', 'truth', 'lost', 'expedition', 'decade', 'boldly', 'face', 'emptiness', 'silence', 'search', 'unknown']),
       list(['bloodshot', 'wife', 'murder', 'marine', 'ray', 'garrison', 'resurrect', 'team', 'scientist', 'enhance', 'nanotechnology', 'superhuman', 'biotech', 'kill', 'machine', "'", 'bloodshot', "'", 'ray', 'train', 'fellow', 'super', 'soldier', 'recall', 'life', 'memory', 'flood', 'remember', 'man', 'kill', 'wife', 'break', 'facility', 'revenge', 'discover', 'conspiracy', 'think']),
       list(['bad', 'boy', 'life', 'marcus', 'mike', 'force', 'confront', 'new', 'threat', 'career', 'change', 'midlife', 'crisis', 'join', 'newly', 'create', 'elite', 'team', 'ammo', 'miami', 'police', 'department', 'ru

### 2 - Crear los vectores (word2vec)

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# The Phrases module detects multi-word phrases in the tokenised corpus.
# NOTE(review): min_count=30 presumably sets the minimum frequency for a
# candidate to be considered — confirm against the Gensim docs.
phrases = Phrases(corpus, min_count=30, progress_per=10000)

In [None]:
sentences = phrases[corpus]



In [None]:
from gensim.models.callbacks import CallbackAny2Vec
# Durante el entrenamiento gensim por defecto no informa el "loss" en cada época
# Sobracargamos el callback para poder tener esta información
class callback(CallbackAny2Vec):
    """
    Callback that prints the training loss of each epoch.

    ``model.get_latest_training_loss()`` returns a running total. The total is
    reset to zero at the end of every epoch, so the value printed is the loss
    of that epoch alone. Differencing an ever-growing total instead loses
    precision once the total gets large and produces spurious drops in the
    reported loss.
    NOTE(review): resetting ``running_training_loss`` is the commonly used
    workaround; confirm the attribute name against the installed Gensim.

    Attributes
    ----------
    epoch : int
        Number of epochs completed so far.
    loss_previous_step : float or None
        Loss of the most recently completed epoch (None before the first).
    """
    def __init__(self):
        self.epoch = 0
        # Initialised here so the attribute exists before the first epoch ends.
        self.loss_previous_step = None

    def on_epoch_end(self, model):
        """Print this epoch's loss, then reset the model's running total."""
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss))
        # Start the next epoch from zero so the tally stays small.
        model.running_training_loss = 0.0
        self.epoch += 1
        self.loss_previous_step = loss

In [None]:
#Verifico el número de cores de colab
multiprocessing.cpu_count()

2

In [None]:
# Create the model that will generate the word vectors.
# The skip-gram architecture is used here (sg=1).
# NOTE(review): `size` is the Gensim 3.x keyword name; Gensim 4 calls it
# `vector_size` — confirm against the installed version.
w2v_model = Word2Vec(min_count=10,     # minimum frequency for a word to enter the vocabulary
                     window=8,         # number of words before and after the predicted one
                     size=300,         # dimensionality of the vectors
                     negative=20,      # number of negative samples (0 disables negative sampling)
                     workers=2,       # can be raised on machines with more cores
                     sg=1  )           # architecture: 0 = CBOW, 1 = skip-gram

In [None]:
# Buildear el vocabularui con los tokens
w2v_model.build_vocab(sentences)



In [None]:
# Cantidad de filas/docs encontradas en el corpus
print("Cantidad de docs en el corpus:", w2v_model.corpus_count)

Cantidad de docs en el corpus: 9992


In [None]:
# Cantidad de words encontradas en el corpus
print("Cantidad de words distintas en el corpus:", len(w2v_model.wv.vocab))

Cantidad de words distintas en el corpus: 4355


### 3 - Entrenar el modelo generador

In [None]:
# Train the vector model on the phrase-merged sentences for 100 epochs.
# compute_loss=True makes the loss available to our callback, which prints it
# after every epoch.
w2v_model.train(sentences,
                 total_examples=w2v_model.corpus_count,
                 epochs=100,
                 compute_loss = True,
                 callbacks=[callback()]
                 )



Loss after epoch 0: 3256977.25
Loss after epoch 1: 2514079.75
Loss after epoch 2: 2427739.5
Loss after epoch 3: 2289603.5
Loss after epoch 4: 2315224.0
Loss after epoch 5: 2240345.0
Loss after epoch 6: 2206877.0
Loss after epoch 7: 2200050.0
Loss after epoch 8: 2174940.0
Loss after epoch 9: 2144230.0
Loss after epoch 10: 2119562.0
Loss after epoch 11: 2055614.0
Loss after epoch 12: 2078168.0
Loss after epoch 13: 2059218.0
Loss after epoch 14: 2080956.0
Loss after epoch 15: 2131164.0
Loss after epoch 16: 2211400.0
Loss after epoch 17: 2093340.0
Loss after epoch 18: 2073832.0
Loss after epoch 19: 2072068.0
Loss after epoch 20: 2007616.0
Loss after epoch 21: 2040344.0
Loss after epoch 22: 2046452.0
Loss after epoch 23: 2032580.0
Loss after epoch 24: 1977612.0
Loss after epoch 25: 1973876.0
Loss after epoch 26: 1974248.0
Loss after epoch 27: 2010376.0
Loss after epoch 28: 1961636.0
Loss after epoch 29: 2014232.0
Loss after epoch 30: 2001412.0
Loss after epoch 31: 665452.0
Loss after epoch 

(22049444, 27517600)

In [None]:
w2v_model.wv.save_word2vec_format("word2vec.model")

In [None]:
!python3 -m gensim.scripts.word2vec2tensor -i word2vec.model -o /my_model

2022-04-06 01:11:32,548 - word2vec2tensor - INFO - running /usr/local/lib/python3.7/dist-packages/gensim/scripts/word2vec2tensor.py -i word2vec.model -o /my_model
2022-04-06 01:11:32,549 - utils_any2vec - INFO - loading projection weights from word2vec.model
2022-04-06 01:11:34,242 - utils_any2vec - INFO - loaded (4355, 300) matrix from word2vec.model
2022-04-06 01:11:35,202 - word2vec2tensor - INFO - 2D tensor file saved to /my_model_tensor.tsv
2022-04-06 01:11:35,203 - word2vec2tensor - INFO - Tensor metadata file saved to /my_model_metadata.tsv
2022-04-06 01:11:35,203 - word2vec2tensor - INFO - finished running word2vec2tensor.py


In [None]:
!ls my_model

my_model_metadata.tsv  my_model_tensor.tsv


In [None]:
# NOTE: `!cd` runs in a temporary subshell, so the notebook's working directory
# is NOT changed by this line (the `!ls` that follows still lists the parent
# folder). Use `%cd my_model/` if the change should persist.
!cd my_model/

In [None]:
!ls

 desafio_2ipynb.ipynb			        TMDb_updated.CSV
'Desafio_3_Custom embedding con Gensim.ipynb'   word2vec.ipynb
 my_model				        word2vec.model


### 4 - Ensayar

In [None]:
# Palabras que MÁS se relacionan con...:
w2v_model.wv.most_similar(positive=["police"], topn=10)

[('detective', 0.3753027319908142),
 ('department', 0.33261585235595703),
 ('cop', 0.3245007395744324),
 ('police_officer', 0.2976311445236206),
 ('homicide', 0.29239848256111145),
 ('robbery', 0.2861417531967163),
 ('inspector', 0.2732832431793213),
 ('rookie', 0.2691398859024048),
 ('killer', 0.2626540958881378),
 ('foul', 0.2598835229873657)]

### Se puede ver que la palabra police se encuentra muy relacionada con palabras que hacen referencia al ámbito policial, por ejemplo, department o detective.

In [None]:
# Palabras que MÁS se relacionan con...:
w2v_model.wv.most_similar(positive=["crime"], topn=10)

[('murder', 0.36579328775405884),
 ('commit', 0.36166104674339294),
 ('syndicate', 0.32888126373291016),
 ('cop', 0.31186699867248535),
 ('frame', 0.30485498905181885),
 ('gotham', 0.2971848249435425),
 ('criminal', 0.2860395908355713),
 ('boss', 0.276999831199646),
 ('heist', 0.2709220349788666),
 ('batman', 0.2661207318305969)]

### Se observa que la palabra crime está muy relacionada con murder, lo cual tiene sentido. También se observa una relación con la palabra gotham, la cual hace referencia a una serie de televisión basada en los personajes de Batman.

In [None]:
# Palabras que MÁS se relacionan con...:
w2v_model.wv.most_similar(positive=["gotham"], topn=10)

[('batman', 0.6556724309921265),
 ('joker', 0.49717873334884644),
 ('city', 0.34742623567581177),
 ('superman', 0.326465904712677),
 ('vigilante', 0.31686848402023315),
 ('wayne', 0.306171178817749),
 ('penguin', 0.29732394218444824),
 ('crime', 0.2971847653388977),
 ('knight', 0.2820934057235718),
 ('bruce', 0.2772892117500305)]

### Se observa que la palabra "gotham" está muy relacionada con varios de los personajes de Batman, lo cual tiene sentido.

In [None]:
# Palabras que MÁS se relacionan con...:
w2v_model.wv.most_similar(positive=["batman"], topn=10)

[('gotham', 0.6556724309921265),
 ('joker', 0.4767673909664154),
 ('superman', 0.46002453565597534),
 ('robin', 0.37576743960380554),
 ('lego', 0.3550025522708893),
 ('wayne', 0.3524987995624542),
 ('lantern', 0.3328215479850769),
 ('masked', 0.3229440450668335),
 ('counterpart', 0.3016071319580078),
 ('knight', 0.3014240264892578)]

### Se observa que la palabra "batman" se encuentra muy relacionada con diferentes personajes de la película, así como con otros superhéroes como superman.

In [None]:
w2v_model.wv.most_similar(positive=["superhero"], topn=10)

[('superpower', 0.3013066053390503),
 ('titan', 0.30071550607681274),
 ('superman', 0.28203892707824707),
 ('batman', 0.2797221839427948),
 ('crimson', 0.27679550647735596),
 ('invitation', 0.26957833766937256),
 ('blade', 0.26519566774368286),
 ('genetically', 0.26397833228111267),
 ('futuristic', 0.26139533519744873),
 ('spider', 0.2568950653076172)]

In [None]:
w2v_model.wv.most_similar(positive=["spider"], topn=10)

[('parker', 0.39420950412750244),
 ('graduation', 0.2892509400844574),
 ('shape', 0.28842002153396606),
 ('peter', 0.26810330152511597),
 ('genetically', 0.2678210139274597),
 ('july', 0.26629865169525146),
 ('mutate', 0.26561713218688965),
 ('ego', 0.2629801034927368),
 ('superhero', 0.2568950653076172),
 ('stockholm', 0.2546631395816803)]

In [None]:
# Palabras que MÁS se relacionan con...:
w2v_model.wv.most_similar(positive=["money"], topn=10)

[('million', 0.3378453254699707),
 ('cash', 0.29531335830688477),
 ('stash', 0.2948591709136963),
 ('job', 0.29136571288108826),
 ('earn', 0.2854808568954468),
 ('prostitute', 0.2797958254814148),
 ('decide', 0.2781282365322113),
 ('casino', 0.2642675042152405),
 ('crook', 0.2629411816596985),
 ('drug_lord', 0.2618781328201294)]

In [None]:
# Comparamos tres palabras para ver cual no se corresponde
w2v_model.wv.doesnt_match(['drug', 'police', 'love'])


arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.



'love'

### Entre las tres palabras la menos relacionada es love.

### 5 - Visualizar agrupación de vectores

In [None]:
from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE                   
import numpy as np                                  

def reduce_dimensions(model):
    """
    Project the word vectors of a trained model onto two dimensions with t-SNE.

    Parameters
    ----------
    model : object exposing ``model.wv`` with a ``vectors`` matrix and the
        vocabulary words under ``index_to_key`` (Gensim 4) or ``index2word``
        (Gensim 3).

    Returns
    -------
    x_vals : list
        First t-SNE coordinate of every word.
    y_vals : list
        Second t-SNE coordinate of every word.
    labels : numpy.ndarray
        The words, in the same order as the coordinates.
    """
    num_dimensions = 2

    keyed_vectors = model.wv
    vectors = np.asarray(keyed_vectors.vectors)

    # Gensim 4 renamed `index2word` to `index_to_key`; accept either so the
    # function works with models from both major versions.
    words = getattr(keyed_vectors, 'index_to_key', None)
    if words is None:
        words = keyed_vectors.index2word
    labels = np.asarray(words)

    # init and learning_rate are pinned to the values this notebook has been
    # using implicitly. scikit-learn warns that both defaults will change in
    # 1.2, which would otherwise silently alter the plot on upgrade.
    tsne = TSNE(n_components=num_dimensions, init='random',
                learning_rate=200.0, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

In [None]:
# Plot the embeddings in 2D
import plotly.graph_objects as go
import plotly.express as px

# t-SNE is fitted on the whole vocabulary.
x_vals, y_vals, labels = reduce_dimensions(w2v_model)

# Only the first MAX_WORDS entries are drawn, to keep the labels readable.
MAX_WORDS=200
fig = px.scatter(x=x_vals[:MAX_WORDS], y=y_vals[:MAX_WORDS], text=labels[:MAX_WORDS])
fig.show(renderer="colab") # renderer needed for plotly in Colab


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

