# Semántica de vectores y _embeddings_

In [None]:
# load the corpus; the encoding is given explicitly so accented Spanish text
# is decoded the same way on every platform
# NOTE(review): assumes corpus.txt is stored as UTF-8 — confirm
with open('corpus.txt', 'r', encoding='utf-8') as f:
    corpus = f.readlines()

In [None]:
# cada oración del corpus tiene un salto de línea al final
corpus

In [None]:
# quita saltos de línea al final de cada oración
corpus = list(map(str.strip, corpus))
corpus

## Matrices de co-ocurrencia

### Matrices término-término

In [None]:
import numpy as np
import pandas as pd
from copy import deepcopy
from itertools import combinations
from collections import defaultdict

In [None]:
words = sorted(' '.join(corpus).split())
words

In [None]:
# unique word pairs in lexicographic order; tuples of strings already sort
# by first element and then by second, so no explicit key is needed
w_combinations = sorted(set(combinations(words, 2)))
w_combinations[:6]

In [None]:
term_term = defaultdict(int)
term_term

In [None]:
# size of the sliding window used to look for co-occurrences
window = 3

# NOTE: term_term is created in an earlier cell; running this cell again
# without re-creating it adds the same counts a second time
for sent in corpus:
    tokens = sent.split()
    # a sentence shorter than the window still yields one (shorter) span
    stop = max(1, len(tokens) - window + 1)
    for i in range(stop):
        span = tokens[i:i+window]
        for wc in w_combinations:
            first, second = wc
            if first == second:
                # a word co-occurs with itself when it is repeated in the
                # span; ">= 2" also covers three or more repetitions
                co_occurs = span.count(first) >= 2
            else:
                co_occurs = (first in span) and (second in span)
            if co_occurs:
                term_term[wc] += 1

In [None]:
term_term

In [None]:
# give every candidate pair an entry: pairs that never co-occurred get 0,
# pairs that already have a count are left untouched
for wc in w_combinations:
    term_term.setdefault(wc, 0)

In [None]:
term_term

In [None]:
# split the (word, context) -> count mapping into three parallel lists,
# one per column of the long-format table built next
word = [pair[0] for pair in term_term]
context = [pair[1] for pair in term_term]
n = list(term_term.values())

In [None]:
term_term_df = pd.DataFrame({'word':word,'context':context,'n':n})
term_term_df.head()

In [None]:
pd.pivot_table(term_term_df, values='n', index='word', columns='context', fill_value=0)

### Matrices término-documento

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# CountVectorizer?

In [None]:
count_vec = CountVectorizer()
X = count_vec.fit_transform(corpus)
X

In [None]:
X = X.toarray()
X

In [None]:
count_vec.get_feature_names_out()

In [None]:
term_doc = pd.DataFrame(
    X,
    columns=count_vec.get_feature_names_out(),
    index=corpus
)
term_doc

In [None]:
term_doc = term_doc.T
term_doc

In [None]:
# vector del documento => vector columna
term_doc["hola mundo"].to_numpy()

In [None]:
# vector de la palabra => vector fila
term_doc.loc["hola"].to_numpy()

In [None]:
sents = ["similitud de vectores", "el aguante"]
Y = count_vec.transform(sents).toarray()
pd.DataFrame(
    Y,
    columns=count_vec.get_feature_names_out(),
    index=sents
)

## Similitud coseno

### Producto punto (_dot product_)

$$
\text{dot product}(v,w) = v \cdot w = \sum_{i=1}^Nv_iw_i = v_1w_1+v_2w_2+...+v_Nw_N
$$

In [None]:
dp_df = pd.DataFrame(
    {
        "texto_jardinería":[6,4,8,1,0,15],
        "texto_transporte":[0,0,1,7,8,13],
        "texto_industria":[4,0,0,2,6,10]
    },
    index=["planta","flor","tierra","asfalto","auto","de"]
)
dp_df

In [None]:
vec_planta = dp_df.loc["planta"].to_numpy()
vec_planta

In [None]:
vec_flor = dp_df.loc["flor"].to_numpy()
vec_flor

In [None]:
vec_de = dp_df.loc["de"].to_numpy()
vec_de

In [None]:
def calculate_dot_product(x: np.ndarray, y: np.ndarray, quiet: bool=False) -> np.number:
    """Compute the dot product of two vectors, optionally printing each step.

    Args:
        x: First vector; a 1-D NumPy array or any sequence of numbers.
        y: Second vector, with the same shape as ``x``.
        quiet: When ``False`` (the default), print the element-wise products
            and their sum step by step.

    Returns:
        The sum of the element-wise products, as a NumPy scalar.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same shape.
    """
    # accept plain sequences as well as arrays
    x, y = np.asarray(x), np.asarray(y)
    # without this check NumPy would silently broadcast, e.g. (3,) with (1,)
    if x.shape != y.shape:
        raise ValueError(
            f"x and y must have the same shape, got {x.shape} and {y.shape}"
        )
    vec = x * y
    total = vec.sum()
    if not quiet:
        print(f"""
        Producto de las componentes del vector {x} y el vector {y}: {vec}
        Cálculo:
            < {', '.join([f'{m} * {n}' for m,n in list(zip(x,y))])} >
    
        Suma de todas las componentes de  {vec} = {total}
        Cálculo:
            {' + '.join(list(map(str,vec)))}
        """)
    return total

In [None]:
vec_planta_flor = calculate_dot_product(vec_planta, vec_flor)

In [None]:
vec_planta_de = calculate_dot_product(vec_planta, vec_de)

### Similitud coseno

La similitud coseno utiliza el producto punto normalizado por la norma de cada vector.

Dicha norma se define como:

$$
\vert v \vert = \sqrt{\sum_{i=1}^Nv_i^2}
$$

Y el coseno queda definido del siguiente modo:

$$
cosine(v,w) = \frac{v \cdot w}{\vert v \vert \vert w \vert} = \frac{\sum_{i=1}^Nv_iw_i}{\sqrt{\sum_{i=1}^Nv_i^2} \sqrt{\sum_{i=1}^Nw_i^2}}
$$

In [None]:
from math import sqrt

In [None]:
def calculate_norm(x: np.array, quiet: bool=False) -> float:
    """Return the Euclidean norm of ``x``, optionally printing each step.

    Args:
        x: Vector as a NumPy array.
        quiet: When ``False`` (the default), print the squared components,
            their sum and the final square root.

    Returns:
        The square root of the sum of the squared components.
    """
    squares = x**2
    squares_sum = squares.sum()
    result = sqrt(squares_sum)
    if quiet:
        return result

    # text fragments for the step-by-step report
    powers_txt = ', '.join(str(component) + '**2' for component in x)
    squares_txt = [str(square) for square in squares]
    print(f"""
        Norma del vector {x}: {result}
        Cálculo
            - < {powers_txt} > = < {', '.join(squares_txt)} >
            - < {' + '.join(squares_txt)} > = {squares_sum}
            - sqrt( {squares_sum} ) = {result}
        """)
    return result

In [None]:
norm_planta = calculate_norm(vec_planta)
norm_flor = calculate_norm(vec_flor)
norm_de = calculate_norm(vec_de)

In [None]:
def calculate_cosine(x: np.array, y: np.array, quiet: bool=False) -> float:
    """Return the cosine similarity of ``x`` and ``y``.

    The dot product of the two vectors is divided by the product of their
    norms. ``quiet`` is forwarded to the helper functions, so with
    ``quiet=False`` every intermediate step is printed as well.

    Args:
        x: First vector as a NumPy array.
        y: Second vector as a NumPy array.
        quiet: When ``False`` (the default), print the final division too.

    Returns:
        The cosine of the angle between ``x`` and ``y``.
    """
    numerator = calculate_dot_product(x, y, quiet=quiet)
    x_norm = calculate_norm(x, quiet=quiet)
    y_norm = calculate_norm(y, quiet=quiet)
    # NOTE: there is no guard for zero-norm vectors; the denominator is 0 then
    cosine = numerator / (x_norm*y_norm)
    if quiet:
        return cosine
    print(f"""
        cosine = {numerator} / ( {x_norm} * {y_norm}) = {cosine}
        """)
    return cosine

In [None]:
calculate_cosine(vec_planta, vec_flor, quiet=True)

In [None]:
calculate_cosine(vec_planta, vec_de, quiet=True)

**¿Cómo interpretar el coseno?**

Simplifiquemos mucho la situación. Supongamos que tenemos solo dos documentos: uno sobre jardinería y otro sobre yoga. Y tenemos las palabras _maceta_, _tierra_, _mat_. Las primeras dos aparecen con cierta frecuencia en el documento de jardinería, pero nunca en el de yoga, y la última solo aparece en el de yoga.

Cada palabra se representará con un vector de dos dimensiones ( $\mathbb{R}^2$ ), en el que la primera dimensión representará el número de ocurrencias de la palabra en el **documento "jardín"** y la segunda, las ocurrencias en el **documento "yoga"**.

In [None]:
import matplotlib.pyplot as plt
from itertools import combinations

In [None]:
maceta = np.array([5, 0])
tierra = np.array([3, 1])
mat = np.array([0, 6])
non_existent_word = np.array([0,-2])

In [None]:
# look the vectors up by name in an explicit mapping instead of eval(),
# which would execute whatever text the name strings happen to contain
word_vectors = {
    "maceta": maceta,
    "tierra": tierra,
    "mat": mat,
    "non_existent_word": non_existent_word,
}
# cosine similarity for every unordered pair of words, keyed "word1-word2"
cosines = {
    f"{name_1}-{name_2}": calculate_cosine(
        word_vectors[name_1], word_vectors[name_2], quiet=True
    )
    for name_1, name_2 in combinations(word_vectors, 2)
}
cosines

In [None]:
fig, ax = plt.subplots()

# explicit name -> vector mapping: avoids eval() and keeps the loop variable
# from overwriting the `word` list built earlier in the notebook
vectors_to_plot = {
    "maceta": maceta,
    "tierra": tierra,
    "mat": mat,
    "non_existent_word": non_existent_word,
}
for label, vec in vectors_to_plot.items():
    # dashed segment from the origin to the tip of the vector
    ax.plot((0, vec[0]), (0, vec[1]), '--', label=f"norma de {label}")
    ax.scatter(x=vec[0], y=vec[1], label=label)
# first dimension = occurrences in the "jardín" document, second = "yoga"
ax.set_xlabel('ocurrencias en el documento "jardín"')
ax.set_ylabel('ocurrencias en el documento "yoga"')
ax.legend()
plt.show()

## TF-IDF

Fórmula de TF-IDF en [_Vector Semantics and Embeddings_](https://web.stanford.edu/~jurafsky/slp3/6.pdf):

$$w_{t,d} = tf_{t,d} \times idf_{t}$$

donde:

$$
tf_{t,d} = \log_{10}(count(t,d) + 1)
$$
$$
idf_{t} = \log_{10} \left( \frac{N}{df_{t}} \right)
$$

- _t_ = término
- _d_ = documento
- _N_ = cantidad total de documentos en el corpus
- _df<sub>t</sub>_ = cantidad de documentos en los que aparece el término _t_

`TfidfVectorizer`, la implementación de [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html), propone las siguientes parametrizaciones:

- **norm**: `"l1"`, `"l2"` o `None`, por defecto = `"l2"`
- **use_idf**: `False` o `True`, por defecto = `True`
- **smooth_idf**: `False` o `True`, por defecto = `True`
- **sublinear_tf**: `False` o `True`, por defecto = `False`

Estos parámetros influyen del siguiente modo:

- **norm**
  - `"l1"` $\rightarrow$ Los vectores resultantes de $tf \times idf$ son normalizados por la norma de Manhattan, es decir, por la norma del vector $\vert\vert v \vert\vert_{1} = \sum_{i=1}^N{\vert v_i \vert}$
  - `"l2"` $\rightarrow$ Los vectores resultantes de $tf \times idf$ son normalizados por la norma euclidiana, es decir, por la norma del vector $\vert\vert v \vert\vert_{2} = \sqrt{\sum_{i=1}^N{v_i^2}}$
  - `None` $\rightarrow$ No realiza ninguna normalización
- **use_idf**
  - `False` $\rightarrow idf_{t}=1$
  - `True` $\rightarrow$ utiliza alguno de los cálculos listados en *smooth_idf*
- **smooth_idf**
  - `False` $\rightarrow idf_{t}=\ln \left( \frac{N}{df_{t}} \right) + 1$ (scikit-learn usa el logaritmo natural); el "+1" evita que el idf sea cero cuando el término aparece en todos los documentos, ya que $\ln\left(1\right)=0$
  - `True` $\rightarrow idf_{t}=\ln \left( \frac{N+1}{df_{t}+1} \right) + 1$, agrega "1" al numerador y al denominador para simular que todos los términos fueron vistos en al menos un documento, esto evita la división por cero
- **sublinear_tf**
  - `False` $\rightarrow tf_{t,d}=count(t,d)$
  - `True` $\rightarrow tf_{t,d} = 1 + \log(count(t,d))$
 
Así, la fórmula utilizada por defecto en la implementación de `scikit-learn` es:

$$
w_{t,d} = \frac{count(t,d) \times \left( \ln \left( \frac{N+1}{df_{t}+1} \right) + 1 \right)}{\sqrt{\sum_{i=1}^N{v_i^2}}}
$$
 
Para más información ver la [Guía de Usuario](https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting) de `scikit-learn`.

In [None]:
from string import Template
from IPython.display import display, Math

def _tfidf_formula(norm, use_idf, smooth_idf, sublinear_tf) -> str:
    """Return the LaTeX source of the tf-idf weight for one parameter set."""
    tf = "(1 + \\ln(count(t,d)))" if sublinear_tf else "count(t,d)"
    if not use_idf:
        idf = "1"
    elif smooth_idf:
        idf = "\\left( \\ln \\left( \\frac{N+1}{df_{t}+1} \\right) + 1 \\right)"
    else:
        idf = "\\left( \\ln \\left( \\frac{N}{df_{t}} \\right) + 1 \\right)"
    formula = f"{tf} \\times {idf}"
    if not norm:
        return formula
    denominators = {
        'l1': "\\sum_{i=1}^Nv_i",
        'l2': "\\sqrt{\\sum_{i=1}^Nv_i^2}",
    }
    if norm not in denominators:
        raise ValueError(f"norm must be 'l1', 'l2' or None, got {norm!r}")
    return Template("\\frac{$num}{$den}").substitute(
        num=formula, den=denominators[norm]
    )


def build_tfidf(norm: "str | None", use_idf: bool, smooth_idf: bool, sublineat_tf: bool) -> None:
    """Render the tf-idf formula that corresponds to a parameter combination.

    The idf terms follow the scikit-learn definition: natural logarithm with
    the constant 1 added outside the logarithm.

    Args:
        norm: ``"l1"``, ``"l2"`` or ``None`` (no normalization).
        use_idf: When ``False`` the idf factor is shown as ``1``.
        smooth_idf: Selects the smoothed idf; only matters when ``use_idf``
            is ``True``.
        sublineat_tf: Shows the sublinear term frequency ``1 + ln(count)``.
            The misspelled name is kept because existing calls pass it by
            keyword.

    Raises:
        ValueError: If ``norm`` is truthy but neither ``"l1"`` nor ``"l2"``.
    """
    display(Math(_tfidf_formula(norm, use_idf, smooth_idf, sublineat_tf)))

In [None]:
# misma fórmula que CountVectorizer => cuenta frecuencias absolutas
build_tfidf(norm=None, use_idf=False, smooth_idf=False, sublineat_tf=False)

In [None]:
# modificar smooth_idf sin cambiar use_idf no cambia nada
build_tfidf(norm=None, use_idf=False, smooth_idf=True, sublineat_tf=False)

In [None]:
# implementación por defecto de scikit-learn
build_tfidf(norm="l2", use_idf=True, smooth_idf=True, sublineat_tf=False)

In [None]:
import seaborn as sns
from pandas.plotting import parallel_coordinates
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
params = {
    "norm" : [None,"l1","l2"],
    "use_idf": [False,True],
    "smooth_idf" : [False,True],
    "sublinear_tf" : [False,True]
}
params_grid = list(ParameterGrid(params))
params_grid[:3]

In [None]:
len(params_grid)

In [None]:
# combinations with use_idf=False and smooth_idf=True make no sense
# (there is no idf to smooth), so they are filtered out
params_grid = [
    grid_point
    for grid_point in params_grid
    if grid_point["use_idf"] != False or grid_point["smooth_idf"] != True
]
len(params_grid)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# one panel per parameter combination; the grid is sized from the number of
# combinations instead of a hard-coded 6x3 (3.5 in of height per row keeps
# the original 18x21 figure when there are 18 panels)
n_cols = 3
n_rows = max(1, -(-len(params_grid) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    n_rows, n_cols, figsize=(6 * n_cols, 3.5 * n_rows), squeeze=False
)
panels = axs.ravel()

handles, labels = [], []
for ax, pg in zip(panels, params_grid):
    vectorizer = TfidfVectorizer(**pg)
    X = vectorizer.fit_transform(corpus).toarray()
    df = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())
    df["doc"] = corpus
    # long format: one row per (document, word) with its score;
    # var_name is passed as a scalar, the form documented for melt
    df = df.melt(id_vars=['doc'], var_name='word', value_name='score')
    sns.barplot(data=df, x="word", y="score", hue="doc", ax=ax)
    ax.set_title("\n".join([f"{k}: {v}" for k,v in pg.items()]))
    ax.tick_params(labelrotation=45)
    # keep the legend entries for the single figure-level legend and drop
    # the per-panel legend, if one was drawn
    handles, labels = ax.get_legend_handles_labels()
    panel_legend = ax.get_legend()
    if panel_legend is not None:
        panel_legend.remove()

# hide any panels left over when the grid has more cells than combinations
for ax in panels[len(params_grid):]:
    ax.set_visible(False)

fig.legend(handles, labels, loc='upper center')
fig.tight_layout()
plt.show()

## _Pointwise Mutual Information (PMI)_

TBD

## Word2vec

TBD