# Semántica de vectores y _embeddings_

In [359]:
# carga el corpus
with open('corpus.txt', 'r') as f:
    corpus = f.readlines()

In [360]:
# cada oración del corpus tiene un salto de línea al final
corpus

['hola mundo\n',
 'hola mundo de vectores\n',
 'hola vectores\n',
 'aguante la similitud coseno\n']

In [361]:
# quita saltos de línea al final de cada oración
corpus = list(map(str.strip, corpus))
corpus

['hola mundo',
 'hola mundo de vectores',
 'hola vectores',
 'aguante la similitud coseno']

## Matrices de co-ocurrencia

### Matrices término-término

In [362]:
import numpy as np
from copy import deepcopy
from itertools import combinations
from collections import defaultdict

In [363]:
words = sorted(' '.join(corpus).split())
words

['aguante',
 'coseno',
 'de',
 'hola',
 'hola',
 'hola',
 'la',
 'mundo',
 'mundo',
 'similitud',
 'vectores',
 'vectores']

In [364]:
w_combinations = list(set(combinations(words,2)))
w_combinations = sorted(w_combinations, key=lambda x:(x[0],x[1]))
w_combinations[:6]

[('aguante', 'coseno'),
 ('aguante', 'de'),
 ('aguante', 'hola'),
 ('aguante', 'la'),
 ('aguante', 'mundo'),
 ('aguante', 'similitud')]

In [365]:
term_term = defaultdict(int)
term_term

defaultdict(int, {})

In [366]:
# tamaño de la ventana para buscar la coocurrencia
window = 3

for sent in corpus:
    tokens = sent.split()
    if len(tokens) <= window:
        stop = 1
    else:
        stop = len(tokens) - window + 1
    for i in range(0, stop):
        span = tokens[i:i+window]
        for wc in w_combinations:
            if (wc[0] in span) and (wc[1] in span):
                if (wc[0] == wc[1]) and (span.count(wc[0]) == 2):
                    term_term[wc] += 1
                elif wc[0] != wc[1]:
                    term_term[wc] += 1

In [367]:
term_term

defaultdict(int,
            {('hola', 'mundo'): 2,
             ('de', 'hola'): 1,
             ('de', 'mundo'): 2,
             ('de', 'vectores'): 1,
             ('mundo', 'vectores'): 1,
             ('hola', 'vectores'): 1,
             ('aguante', 'la'): 1,
             ('aguante', 'similitud'): 1,
             ('la', 'similitud'): 2,
             ('coseno', 'la'): 1,
             ('coseno', 'similitud'): 1})

In [368]:
for wc in w_combinations:
    if wc not in term_term.keys():
        term_term[wc] = 0

In [369]:
term_term

defaultdict(int,
            {('hola', 'mundo'): 2,
             ('de', 'hola'): 1,
             ('de', 'mundo'): 2,
             ('de', 'vectores'): 1,
             ('mundo', 'vectores'): 1,
             ('hola', 'vectores'): 1,
             ('aguante', 'la'): 1,
             ('aguante', 'similitud'): 1,
             ('la', 'similitud'): 2,
             ('coseno', 'la'): 1,
             ('coseno', 'similitud'): 1,
             ('aguante', 'coseno'): 0,
             ('aguante', 'de'): 0,
             ('aguante', 'hola'): 0,
             ('aguante', 'mundo'): 0,
             ('aguante', 'vectores'): 0,
             ('coseno', 'de'): 0,
             ('coseno', 'hola'): 0,
             ('coseno', 'mundo'): 0,
             ('coseno', 'vectores'): 0,
             ('de', 'la'): 0,
             ('de', 'similitud'): 0,
             ('hola', 'hola'): 0,
             ('hola', 'la'): 0,
             ('hola', 'similitud'): 0,
             ('la', 'mundo'): 0,
             ('la', 'vectores'): 0,
   

In [370]:
word, context, n = list(), list(), list()
for key, value in term_term.items():
    word.append(key[0])
    context.append(key[1])
    n.append(value)

In [371]:
term_term_df = pd.DataFrame({'word':word,'context':context,'n':n})
term_term_df.head()

Unnamed: 0,word,context,n
0,hola,mundo,2
1,de,hola,1
2,de,mundo,2
3,de,vectores,1
4,mundo,vectores,1


In [372]:
pd.pivot_table(term_term_df, values='n', index='word', columns='context', fill_value=0)

context,coseno,de,hola,la,mundo,similitud,vectores
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
aguante,0,0,0,1,0,1,0
coseno,0,0,0,1,0,1,0
de,0,0,1,0,2,0,1
hola,0,0,0,0,2,0,1
la,0,0,0,0,0,2,0
mundo,0,0,0,0,0,0,1
similitud,0,0,0,0,0,0,0
vectores,0,0,0,0,0,0,0


### Matrices término-docuemnto

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# CountVectorizer?

In [23]:
count_vec = CountVectorizer()
X = count_vec.fit_transform(corpus)
X

<4x7 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [24]:
X = X.toarray()
X

array([[0, 0, 1, 0, 1, 0, 0],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 1],
       [1, 1, 0, 1, 0, 1, 0]])

In [26]:
count_vec.get_feature_names_out()

array(['aguante', 'coseno', 'hola', 'la', 'mundo', 'similitud',
       'vectores'], dtype=object)

In [29]:
term_doc = pd.DataFrame(
    X,
    columns=count_vec.get_feature_names_out(),
    index=corpus
)
term_doc

Unnamed: 0,aguante,coseno,hola,la,mundo,similitud,vectores
hola mundo,0,0,1,0,1,0,0
mundo hola,0,0,1,0,1,0,0
hola vectores,0,0,1,0,0,0,1
aguante la similitud coseno,1,1,0,1,0,1,0


In [32]:
term_doc = term_doc.T
term_doc

Unnamed: 0,hola mundo,mundo hola,hola vectores,aguante la similitud coseno
aguante,0,0,0,1
coseno,0,0,0,1
hola,1,1,1,0
la,0,0,0,1
mundo,1,1,0,0
similitud,0,0,0,1
vectores,0,0,1,0


In [36]:
# vector del documento => vecotr columna
term_doc["hola mundo"].to_numpy()

array([0, 0, 1, 0, 1, 0, 0])

In [41]:
# vector de la palabra => vector fila
term_doc.loc["hola"].to_numpy()

array([1, 1, 1, 0])

In [44]:
sents = ["similitud de vectores", "el aguante"]
Y = count_vec.transform(sents).toarray()
pd.DataFrame(
    Y,
    columns=count_vec.get_feature_names_out(),
    index=sents
)

Unnamed: 0,aguante,coseno,hola,la,mundo,similitud,vectores
similitud de vectores,0,0,0,0,0,1,1
el aguante,1,0,0,0,0,0,0


## Similitud coseno

### Producto punto (_dot product_)

$$
\text{dot product}(v,w) = v \cdot w = \sum_{i=1}^Nv_iw_i = v_iw_i+v_iw_i+...+v_Nw_N
$$

In [374]:
dp_df = pd.DataFrame(
    {
        "texto_jardinería":[6,4,8,1,0,15],
        "texto_transporte":[0,0,1,7,8,13],
        "texto_industria":[4,0,0,2,6,10]
    },
    index=["planta","flor","tierra","asfalto","auto","de"]
)
dp_df

Unnamed: 0,texto_jardinería,texto_transporte,texto_industria
planta,6,0,4
flor,4,0,0
tierra,8,1,0
asfalto,1,7,2
auto,0,8,6
de,15,13,10


In [378]:
vec_planta = dp_df.loc["planta"].to_numpy()
vec_planta

array([6, 0, 4])

In [379]:
vec_flor = dp_df.loc["flor"].to_numpy()
vec_flor

array([4, 0, 0])

In [380]:
vec_de = dp_df.loc["de"].to_numpy()
vec_de

array([15, 13, 10])

In [407]:
def calculate_dot_product(x:np.array, y:np.array) -> np.array:
    vec = x + y
    total = vec.sum()
    print(f"""
    Suma del vector {x} y el vector {y}:
     {x[0]:3}{x[1]:3}{x[2]:3}
    +{y[0]:3}{y[1]:3}{y[2]:3}
     ----------
     {vec[0]:3}{vec[1]:3}{vec[2]:3}

    Suma de todas las componentes de {vec} = {total}
    """)
    return total

In [408]:
vec_planta_flor = calculate_dot_product(vec_planta, vec_flor)


    Suma del vector [6 0 4] y el vector [4 0 0]:
       6  0  4
    +  4  0  0
     ----------
      10  0  4

    Suma de todas las componentes de [10  0  4] = 14
    


In [409]:
vec_planta_de = calculate_dot_product(vec_planta, vec_de)


    Suma del vector [6 0 4] y el vector [15 13 10]:
       6  0  4
    + 15 13 10
     ----------
      21 13 14

    Suma de todas las componentes de [21 13 14] = 48
    


### Similitud coseno

La similitud coseno utiliza el producto punto normalizado por la norma de cada vector:

$$
cosine(v,w) = \frac{v \cdot w}{\vert v \vert \vert w \vert} = \frac{\sum_{i=1}^Nv_iw_i}{\sqrt{\sum_{i=1}^Nv_i^2} \sqrt{\sum_{i=1}^Nw_i^2}}
$$