## Word Embeddings 
### Word2Vec y FastText

Vamos a cargar las librerias necesarias

In [68]:
from gensim.models import Word2Vec
from gensim.models import FastText

import numpy as np
import pandas as pd

Cargamos la fuente de datos de prueba

In [2]:
# Load the confessions DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
df = pd.read_pickle('data/confesiones.df')
# Keep only the three columns used below: the raw text, the token list without
# stopwords, and the same tokens joined into a single string.
df = df[['Text', 'no_stopwords', 'token_sentence']]

In [3]:
df.sample(5)

Unnamed: 0,Text,no_stopwords,token_sentence
780,"Hola venado cachudo, está es mi historia... vo...","[cachudo, historia, volvi, conversar, conoci, ...",cachudo historia volvi conversar conoci colegi...
6253,¡Hola tío confe! La verdad estoy muy indignado...,"[verdad, indignado, pasando, debido, caso, pla...",verdad indignado pasando debido caso plagio co...
2594,"5133 Tío confe , hola jajaa bien no sé cómo ra...","[rayos, empezar, paso, aqui, enamore, pata, in...",rayos empezar paso aqui enamore pata ingenieri...
6965,Hola! Estudio contabilidad en la Universidad d...,"[estudio, contabilidad, lima, pienso, cambiarm...",estudio contabilidad lima pienso cambiarme sig...
4538,holi perrovaquita xd Waoooo no hago un confe d...,"[perrovaquita, wao, hago, años, extraño, softw...",perrovaquita wao hago años extraño software es...


Definimos nuestro corpus

In [4]:
corpus = df.no_stopwords.to_list()

In [5]:
corpus[5:7]

[['gustan',
  'monedas',
  'plata',
  'conseguir',
  'catolica',
  'vendan',
  'monedas',
  'denle',
  'encorazona',
  'igual'],
 ['consulta',
  'ver',
  'publicas',
  'uu',
  'porfis',
  'posibilidad',
  'ir',
  'intercambio',
  'sali',
  'aceptada',
  'hablando',
  'padres',
  'podrian',
  'costearlo',
  'cuestion',
  'demasiado',
  'dinero',
  'vale',
  'pena',
  'experiencia',
  'campo',
  'laboral',
  'regresas',
  'posiciona',
  'pais',
  'iria',
  'europa',
  'habla',
  'español',
  'ingles',
  'sume',
  'espero',
  'pasen',
  'buenas',
  'fiestas',
  'agradezco',
  'respuestas']]

**Word2Vec Skipgram con Negative Sampling**

Creamos una instancia de word2vec

In [6]:
# Create a skip-gram Word2Vec model with negative sampling. No corpus is passed
# here, so the instance starts without vocabulary or trained weights.
# NOTE(review): per the gensim docs, training with workers > 1 is not fully
# deterministic between runs — confirm if exact reproducibility is required.
word2vec = Word2Vec(vector_size=100, # embedding dimension
                    window=5,        # max distance between the target word and a context word (per side)
                    sg=1,            # 1 selects skip-gram
                    negative=5,      # number of negative samples
                    min_count=1,     # keep every word, even those that appear only once
                    workers=4)       # number of worker threads (parallelism)

Construimos el vocabulario para el modelo

In [7]:
word2vec.build_vocab(corpus)

Entrenamos el modelo con el corpus (el vocabulario ya fue construido en el paso anterior)

In [8]:
word2vec.train(corpus, total_examples=word2vec.corpus_count, epochs=word2vec.epochs)

(1398545, 1405445)

Cuando terminemos de entrenar el modelo, lo ideal es eliminarlo para que no ocupe espacio innecesario en la RAM; solo nos quedamos con la instancia `KeyedVectors` (`word2vec.wv`), que funciona como un diccionario {palabra: embedding}

In [9]:
key_model = word2vec.wv

In [10]:
del word2vec

Verificamos las palabras fuera de vocabulario

In [12]:
def verificar_OOV(palabra, embedding):
    """Print whether a word is present in an embedding container.

    Args:
        palabra: The word to look up.
        embedding: Any object supporting the ``in`` operator for words.

    Returns:
        None. Prints "existe" when ``palabra in embedding`` is true and
        "OOV" (out of vocabulary) otherwise.

    Note:
        The result is exactly what the container's membership test reports.
        A container that can synthesize vectors for unseen words may
        therefore answer "existe" for a word it was never trained on.
    """
    estado = "existe" if palabra in embedding else "OOV"
    print(estado)

In [19]:
verificar_OOV("humano", key_model)

existe


In [16]:
verificar_OOV("abcde", key_model)

OOV


Representación vectorial para la palabra "pandemia"

In [34]:
pandemia_vector = key_model["pandemia"]
print(pandemia_vector)

[-0.2390047   0.29652548 -0.03869939  0.0491403   0.11378825 -0.49919015
  0.07133853  0.5687842  -0.21554929 -0.10956714 -0.2092009  -0.42416403
  0.11806712  0.40812877  0.32723984 -0.25870752 -0.04779321 -0.14878425
  0.1385481  -0.45617947  0.12299144 -0.06781757 -0.01403688 -0.11016196
 -0.13268118  0.2373916  -0.2649964   0.09688468 -0.07709549 -0.09410274
  0.14331028  0.07923035  0.09977055 -0.05100177 -0.08609331  0.27598578
  0.0058547   0.04817837 -0.3181799  -0.10952939  0.176932   -0.5109165
 -0.25085354  0.30111203  0.10216925 -0.2056061  -0.03133236 -0.1996022
 -0.00935877 -0.04443126  0.1673057  -0.21359764 -0.00456343  0.08677194
  0.1798133   0.27755877 -0.00388626  0.14213505 -0.37570733  0.09470231
  0.03128951 -0.04911314 -0.13228008 -0.08324295 -0.2469396   0.24066824
  0.02542163  0.22501263 -0.20171563  0.3939298  -0.4522092  -0.00246767
  0.15539712 -0.24458495  0.34821635  0.2877434  -0.17087355 -0.01864116
 -0.21716693  0.06055283 -0.3304058   0.1009806  -0.3

Norma euclidiana (longitud) del vector "pandemia"

In [43]:
sum(pandemia_vector*pandemia_vector)**0.5

2.295185530738878

In [35]:
pandemia_vector.shape

(100,)

In [36]:
key_model.most_similar(["pandemia"])

[('debido', 0.9701520800590515),
 ('crisis', 0.9429941773414612),
 ('economicos', 0.9338640570640564),
 ('covid', 0.9330642223358154),
 ('trabajando', 0.9285545349121094),
 ('iniciar', 0.9205121994018555),
 ('motivos', 0.9191024899482727),
 ('comunicacion', 0.9181717038154602),
 ('vacaciones', 0.9164676070213318),
 ('familiares', 0.9151014089584351)]

Obtener la representacion vectorial de un documento

In [46]:
# A tokenized document can be represented as a matrix of embeddings
# (one row per token); here, the document "covid pandemia".
key_model[['covid', 'pandemia']]

array([[-0.17904632,  0.19197476,  0.02264021, -0.01356796,  0.15642785,
        -0.38012156,  0.00779457,  0.47159195, -0.09319825, -0.00105666,
        -0.16929497, -0.32610455, -0.00511496,  0.29634   ,  0.24495059,
        -0.18867934, -0.08726405, -0.21613845,  0.07010417, -0.42640716,
         0.09098908, -0.03896779,  0.11095868, -0.12291662, -0.06855084,
         0.1492431 , -0.08804021,  0.01088107, -0.14334513, -0.07146288,
         0.23652881,  0.06765001,  0.07744748, -0.17819698, -0.0790666 ,
         0.14328305, -0.04333408, -0.11646359, -0.1688872 , -0.20693003,
        -0.01875087, -0.36640525, -0.24572599,  0.23690863,  0.1995254 ,
        -0.20372383, -0.06515097, -0.14332515, -0.00433778,  0.01963469,
         0.0967778 , -0.33601883, -0.14257069,  0.0955412 ,  0.03433375,
         0.238643  ,  0.06594518,  0.13286377, -0.38755047,  0.08264332,
        -0.0117006 ,  0.00959196, -0.1851691 ,  0.01218765, -0.22720027,
         0.16809678,  0.01827211,  0.02882191, -0.1

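Sin embargo, esto no es óptimo para la mayoría de los modelos clásicos de machine learning, ya que esperan como entrada un vector de tamaño fijo por documento y **no** una matriz cuyo número de filas varía con la longitud del documento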

Es posible promediar todos los embeddings que representan a un solo documento, de esta forma obtenemos una representacion vectorial del documento. Vamos a crear una funcion para ello:

In [48]:
def get_sentence_vector(word_vectors, tokenized_document):
    """Represent one tokenized document as the mean of its word embeddings.

    Tokens for which ``token in word_vectors`` is false are ignored. The
    remaining embeddings are averaged as-is (no normalization is applied
    before or after averaging).

    Args:
        word_vectors: Embedding container supporting ``in`` for a single
            word and indexing with a list of words.
        tokenized_document: Iterable of tokens (strings).

    Returns:
        The element-wise mean over the known tokens' embeddings, or
        ``None`` when the document has no known token.
    """
    known_tokens = []
    for token in tokenized_document:
        if token in word_vectors:
            known_tokens.append(token)

    # Nothing to average: signal it explicitly instead of returning NaNs.
    if not known_tokens:
        return None

    token_matrix = word_vectors[known_tokens]
    return np.mean(token_matrix, axis=0)

In [59]:
print(corpus[125])

['puedes', 'preguntar', 'fans', 'probabilidad', 'halla', 'clases', 'presenciales', 'proximo', 'ciclo', 'posible', 'llevar', 'presenciales', 'virtuales', 'cierto', 'real', 'verdad', 'manejando', 'tipo', 'paginas']


In [60]:
get_sentence_vector(key_model, corpus[125])

array([-0.09966843,  0.2983563 , -0.07314879,  0.06365222,  0.14246635,
       -0.37941924,  0.17477052,  0.6202089 , -0.17701139, -0.13235344,
       -0.17335267, -0.4220199 ,  0.09172118,  0.22594891,  0.24485126,
       -0.20388265,  0.14098224, -0.26175854,  0.13703208, -0.54886067,
        0.2382588 ,  0.03613375,  0.07093573, -0.2529772 , -0.20025186,
        0.08944787, -0.22051959, -0.1434103 , -0.11874307,  0.07474812,
        0.1711025 , -0.08759552,  0.03388444, -0.15765117, -0.11169514,
        0.33900627,  0.03338052,  0.09719645, -0.13137884, -0.27516124,
        0.04333494, -0.48839444, -0.1546015 ,  0.19651513,  0.1503721 ,
       -0.04194989, -0.11110803, -0.20310472,  0.09437621,  0.18277337,
        0.13614722, -0.2061509 , -0.0334821 ,  0.09659825,  0.04367841,
        0.13911146,  0.14986786,  0.07076327, -0.44449526,  0.13459106,
        0.00289829,  0.08915915, -0.01121171, -0.05578618, -0.24106008,
        0.23559548,  0.06822045,  0.21339028, -0.12098318,  0.23

Si queremos obtener la matriz de datos de entrada (donde cada fila representa a un documento) para poder entrenar un modelo de Machine Learning, usamos la siguiente funcion

In [61]:
def get_matrix_features(tokenized_documents, word_vectors):
    """Build the feature matrix for a collection of tokenized documents.

    Each document is converted with ``get_sentence_vector``; documents for
    which it returns ``None`` are skipped.

    Args:
        tokenized_documents: Iterable of tokenized documents.
        word_vectors: Embedding container forwarded to
            ``get_sentence_vector``.

    Returns:
        ``np.asarray`` of the collected document vectors, one row per
        document that produced a vector. Because skipped documents leave
        no row, the result can have fewer rows than the input and row
        ``i`` is not guaranteed to correspond to input document ``i``.
    """
    rows = []
    for document in tokenized_documents:
        doc_vector = get_sentence_vector(word_vectors, document)
        if doc_vector is None:
            continue
        rows.append(doc_vector)
    return np.asarray(rows)

In [63]:
X = get_matrix_features(corpus, key_model)
X

array([[-0.03438928,  0.17347874, -0.09558075, ..., -0.13287136,
         0.08594537,  0.06277625],
       [-0.05179014,  0.16660129, -0.01314778, ..., -0.11870394,
         0.07152622, -0.008335  ],
       [-0.01738044,  0.18657194, -0.15941274, ..., -0.20389093,
         0.03129928, -0.00515905],
       ...,
       [-0.05397575,  0.10478994, -0.14192767, ..., -0.04172281,
         0.20646419,  0.11925217],
       [-0.06310783,  0.16798544, -0.03263178, ..., -0.10025916,
         0.07750864,  0.06027355],
       [-0.14128111,  0.25246173, -0.06332291, ..., -0.03357408,
         0.17202091,  0.02856389]], dtype=float32)

In [64]:
X.shape

(7622, 100)

In [66]:
len(X)  # cantidad de documentos en el corpus

7622

----

**FastText Skipgram con Negative Sampling**

Creamos una instancia de FastText

In [69]:
# Create a skip-gram FastText model with negative sampling. No corpus is passed
# here, so the instance starts without vocabulary or trained weights.
# NOTE(review): per the gensim docs, training with workers > 1 is not fully
# deterministic between runs — confirm if exact reproducibility is required.
fasttext = FastText(vector_size=100, # embedding dimension
                    window=5,        # max distance between the target word and a context word (per side)
                    sg=1,            # 1 selects skip-gram
                    negative=5,      # number of negative samples
                    min_count=1,     # keep every word, even those that appear only once
                    workers=4)       # number of worker threads (parallelism)

Construimos el vocabulario para el modelo

In [70]:
fasttext.build_vocab(corpus)

Entrenamos el modelo con el corpus (el vocabulario ya fue construido en el paso anterior)

In [72]:
fasttext.train(corpus, total_examples=fasttext.corpus_count, epochs=fasttext.epochs)

(1398465, 1405445)

Cuando terminemos de entrenar el modelo, lo ideal es eliminarlo para que no ocupe espacio innecesario en la RAM; solo nos quedamos con la instancia `KeyedVectors` (`fasttext.wv`), que funciona como un diccionario {palabra: embedding}

In [73]:
key_model = fasttext.wv

In [74]:
del fasttext

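Verificamos las palabras fuera de vocabulario. A diferencia de Word2Vec, FastText construye vectores a partir de n-gramas de caracteres, por lo que `palabra in key_model` devuelve `True` incluso para palabras que no aparecieron en el entrenamiento; para saber si una palabra pertenece realmente al vocabulario se puede usar `palabra in key_model.key_to_index`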

In [75]:
def verificar_OOV(palabra, embedding):
    """Print whether a word is present in an embedding container.

    Args:
        palabra: The word to look up.
        embedding: Any object supporting the ``in`` operator for words.

    Returns:
        None. Prints "existe" when ``palabra in embedding`` is true and
        "OOV" (out of vocabulary) otherwise.

    Note:
        The result is exactly what the container's membership test reports.
        A container that can synthesize vectors for unseen words may
        therefore answer "existe" for a word it was never trained on.
    """
    estado = "existe" if palabra in embedding else "OOV"
    print(estado)

In [76]:
verificar_OOV("humano", key_model)

existe


In [77]:
verificar_OOV("abcde", key_model)

existe


Representación vectorial para la palabra "pandemia"

In [78]:
pandemia_vector = key_model["pandemia"]
print(pandemia_vector)

[-0.47535917  0.4469425  -0.08889992  0.4525141   0.47795844 -0.24794318
  0.16550289 -0.03592255  0.25135943 -0.43389696  0.27088374 -0.281261
 -0.20038722  0.22962731 -0.6397624   0.06929249  0.06661253  0.2604368
 -0.6017789  -0.4097073  -0.342086   -0.01715344 -0.46508548 -0.14563188
 -0.11072055 -0.1546572  -0.13298689 -0.21401045 -0.03844127  0.18706389
  0.10245904 -0.06149973  0.12074273  0.07830574 -0.32698956  0.07109768
  0.19077575 -0.17564686 -0.06366107 -0.65924186  0.36575025 -0.28583434
  0.02122089  0.24287769 -0.22995834 -0.20991999  0.0438101  -0.5178599
  0.08785778 -0.06692299 -0.28998357 -0.16178887 -0.18913785 -0.10287409
 -0.10026503 -0.04862829 -0.23693305 -0.0455689   0.11870112  0.11238942
  0.10105834 -0.4771492  -0.10106739 -0.06594379  0.72586024  0.61520654
  0.04679266 -0.03268689  0.56103283  0.49196678 -0.1792612   0.32822493
 -0.04159755 -0.56236637 -0.06698496 -0.0542876  -0.2246779  -0.1031053
 -0.26109385  0.49571064 -0.57822174 -0.37344995  0.1214

Norma euclidiana (longitud) del vector "pandemia"

In [79]:
sum(pandemia_vector*pandemia_vector)**0.5

3.0380428421624783

In [80]:
pandemia_vector.shape

(100,)

In [81]:
key_model.most_similar(["pandemia"])

[('epidemia', 0.9771687388420105),
 ('ingresaba', 0.9761853218078613),
 ('acamdemia', 0.9759626388549805),
 ('aproximadamente', 0.9759359359741211),
 ('pandemic', 0.9755813479423523),
 ('colegio', 0.9753250479698181),
 ('egreso', 0.974170982837677),
 ('ci', 0.9729998111724854),
 ('aproxima', 0.9727729558944702),
 ('covid', 0.9725071787834167)]

Obtener la representacion vectorial de un documento

In [82]:
# A tokenized document can be represented as a matrix of embeddings
# (one row per token); here, the document "covid pandemia".
key_model[['covid', 'pandemia']]

array([[-0.34878814,  0.38450465, -0.04177864,  0.40855873,  0.3813158 ,
        -0.33506197,  0.14639966, -0.1404888 ,  0.2085058 , -0.48305133,
         0.27145165, -0.2520652 , -0.17788094,  0.12681164, -0.65341824,
         0.04566998,  0.02866227,  0.27733248, -0.62406564, -0.4059716 ,
        -0.30960342,  0.04792475, -0.39524353, -0.12456547, -0.24178675,
        -0.13891889, -0.18745337, -0.12828144, -0.05519076,  0.12175844,
        -0.05612132,  0.02216432,  0.0410179 ,  0.03403662, -0.2761785 ,
         0.18147764,  0.23162311, -0.15011552, -0.12374708, -0.42788944,
         0.326867  , -0.26238033,  0.00315759,  0.22131827, -0.34013656,
        -0.1926434 ,  0.14900869, -0.47694784,  0.03450155, -0.12294238,
        -0.18776199, -0.13273199, -0.16079721, -0.0664361 , -0.12895097,
        -0.0067935 , -0.25893992, -0.08077059,  0.25232208,  0.17014582,
         0.23108448, -0.43817705, -0.10647551, -0.09223165,  0.5459727 ,
         0.4879014 , -0.02596772, -0.13827577,  0.5

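Sin embargo, esto no es óptimo para la mayoría de los modelos clásicos de machine learning, ya que esperan como entrada un vector de tamaño fijo por documento y **no** una matriz cuyo número de filas varía con la longitud del documento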

Es posible promediar todos los embeddings que representan a un solo documento, de esta forma obtenemos una representacion vectorial del documento. Vamos a crear una funcion para ello:

In [83]:
def get_sentence_vector(word_vectors, tokenized_document):
    """Represent one tokenized document as the mean of its word embeddings.

    Tokens for which ``token in word_vectors`` is false are ignored. The
    remaining embeddings are averaged as-is (no normalization is applied
    before or after averaging).

    Args:
        word_vectors: Embedding container supporting ``in`` for a single
            word and indexing with a list of words.
        tokenized_document: Iterable of tokens (strings).

    Returns:
        The element-wise mean over the known tokens' embeddings, or
        ``None`` when the document has no known token.
    """
    known_tokens = []
    for token in tokenized_document:
        if token in word_vectors:
            known_tokens.append(token)

    # Nothing to average: signal it explicitly instead of returning NaNs.
    if not known_tokens:
        return None

    token_matrix = word_vectors[known_tokens]
    return np.mean(token_matrix, axis=0)

In [84]:
print(corpus[125])

['puedes', 'preguntar', 'fans', 'probabilidad', 'halla', 'clases', 'presenciales', 'proximo', 'ciclo', 'posible', 'llevar', 'presenciales', 'virtuales', 'cierto', 'real', 'verdad', 'manejando', 'tipo', 'paginas']


In [85]:
get_sentence_vector(key_model, corpus[125])

array([-0.46570483,  0.35979843,  0.2460997 ,  0.6164818 ,  0.4996098 ,
       -0.46645707,  0.2920175 , -0.33821374,  0.24981129, -0.5223775 ,
        0.2867807 , -0.16843736, -0.28045735,  0.01047138, -0.71035475,
        0.01722758,  0.14963378,  0.46548286, -0.5452751 , -0.34808767,
       -0.22101271, -0.17935948, -0.5232851 , -0.10239992, -0.40110222,
       -0.09259734, -0.1502019 , -0.24519598, -0.13400097,  0.26709017,
       -0.15264359,  0.10928688,  0.06633197,  0.04208242, -0.4050258 ,
        0.07371589,  0.3516671 , -0.14144397, -0.05986004, -0.49273014,
        0.39993286, -0.22230387,  0.07462651,  0.02176721, -0.3846363 ,
       -0.05140441, -0.01352565, -0.35135728,  0.12472791,  0.04670028,
       -0.2748418 , -0.03172313, -0.1886201 , -0.12579818, -0.22141258,
        0.14300469, -0.3336247 ,  0.11218429,  0.27585265,  0.13761899,
        0.03280386, -0.27869672, -0.09019964, -0.02636855,  0.396761  ,
        0.59416026,  0.16121282, -0.05153586,  0.60634404,  0.49

Si queremos obtener la matriz de datos de entrada (donde cada fila representa a un documento) para poder entrenar un modelo de Machine Learning, usamos la siguiente funcion

In [86]:
def get_matrix_features(tokenized_documents, word_vectors):
    """Build the feature matrix for a collection of tokenized documents.

    Each document is converted with ``get_sentence_vector``; documents for
    which it returns ``None`` are skipped.

    Args:
        tokenized_documents: Iterable of tokenized documents.
        word_vectors: Embedding container forwarded to
            ``get_sentence_vector``.

    Returns:
        ``np.asarray`` of the collected document vectors, one row per
        document that produced a vector. Because skipped documents leave
        no row, the result can have fewer rows than the input and row
        ``i`` is not guaranteed to correspond to input document ``i``.
    """
    rows = []
    for document in tokenized_documents:
        doc_vector = get_sentence_vector(word_vectors, document)
        if doc_vector is None:
            continue
        rows.append(doc_vector)
    return np.asarray(rows)

In [87]:
X = get_matrix_features(corpus, key_model)
X

array([[-0.16492157,  0.39495015,  0.15380646, ..., -0.04176918,
        -0.06285821,  0.02810989],
       [-0.25051168,  0.29933652,  0.08941238, ..., -0.10894046,
        -0.12815644, -0.01799944],
       [-0.2372306 ,  0.40630752,  0.17936037, ..., -0.02705403,
        -0.08114426,  0.00043439],
       ...,
       [ 0.0510092 ,  0.38551578, -0.01416106, ...,  0.04950898,
         0.08275305,  0.15799391],
       [-0.1733005 ,  0.36648002,  0.07859138, ..., -0.06605142,
        -0.06963263,  0.05051989],
       [-0.18174268,  0.4082288 ,  0.06932551, ..., -0.05257936,
        -0.0252078 ,  0.04627145]], dtype=float32)

In [88]:
X.shape

(7622, 100)

In [89]:
len(X)  # cantidad de documentos en el corpus

7622

----

## Transfer Learning - Embeddings preentrenados

Vamos a cargar los modelos: 
- Word2Vec Skipgram: Spanish Billion Corpus - https://crscardellino.ar/SBWCE/
- FastText: Spanish Unannotated Corpora - https://github.com/dccuchile/spanish-word-embeddings#fasttext-embeddings-from-suc

Aqui pueden encontrar más modelos basados en embeddings especificos al idioma español:
- https://github.com/dccuchile/spanish-word-embeddings

In [90]:
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_vectors, load_facebook_model

Para cargar un embedding preentrenado Word2Vec:

In [None]:
w2v_pretrained = KeyedVectors.load_word2vec_format('embeddings/word2vec/sbw_vectors.bin', binary=True)

Para cargar un embedding preentrenado FastText:

In [None]:
ft_pretrained = load_facebook_vectors('embeddings/fasttext(vec)/model.bin')

Utilizando la clase EmbeddingLoader

In [94]:
from EmbeddingLoader import EmbeddingLoader

loader = EmbeddingLoader('word2vec', embedding_path='embeddings/word2vec/sbw_vectors.bin')
loader.load_embedding_model()

model = loader.embedding_object

In [96]:
type(model)

gensim.models.keyedvectors.KeyedVectors

In [97]:
model['pandemia']

array([ 0.42914522,  0.18478541,  0.3483141 , -0.19908416,  0.48047414,
        0.38556612,  0.03594365, -0.3349033 , -0.36773038,  0.5021457 ,
       -0.0018208 , -0.4809572 ,  0.16318682, -0.11275186,  0.17996278,
        0.24656822, -0.05681027,  0.09116678, -0.19431195,  0.12161419,
        0.21356428, -0.1371043 ,  0.02093909,  0.08580279,  0.06824893,
       -0.19798575,  0.28420013,  0.06831953, -0.10925457,  0.5925825 ,
       -0.3001714 ,  0.09723116, -0.05311775, -0.11275718,  0.03702854,
        0.2543165 , -0.04919711, -0.15395342, -0.08209398,  0.26236358,
        0.4235284 ,  0.09896774, -0.02899392, -0.22137518,  0.18428046,
       -0.4977927 ,  0.04356395,  0.06938778, -0.5183308 , -0.08295436,
        0.51685405,  0.5224837 , -0.22405407,  0.08846341,  0.01646345,
        0.0665799 , -0.27243853, -0.04766185, -0.01019799,  0.31840512,
        0.16162357, -0.29562205,  0.02779709,  0.07293418, -0.19344994,
        0.2775112 , -0.35663685,  0.10577558, -0.6789721 , -0.41

In [98]:
model['pandemia'].shape

(300,)