<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Word2vect


In [1]:
import numpy as np
import pandas as pd

In [2]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

### Datos

In [3]:
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
def obtener_vacabulario(corpus):
  voc = []  
  for document in corpus:
    voc = voc + document.split()
  return list(set(np.char.lower(voc)))

In [5]:
obtener_vacabulario(corpus)

['martes', 'de', 'muchas', 'hoy', 'que', 'es', 'gracias', 'el', 'dia']

### 2- OneHot encoding
Data una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [6]:
def one_hot_encoding_text(corpus):
  vocabulario = obtener_vacabulario(corpus)
  matriz = pd.DataFrame({key: np.zeros([len(corpus)]) for key in vocabulario})
  for i, documento in enumerate(corpus):
    for termino in vocabulario:
      matriz[termino][i] = (np.char.lower(documento).tolist().split().count(termino) > 0) * 1
  return matriz

In [7]:
one_hot_encoding_text(corpus)

Unnamed: 0,martes,de,muchas,hoy,que,es,gracias,el,dia
0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


### 3- Vectores de frecuencia
Data una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [8]:
def frecuencia_encoding_text(corpus):
  vocabulario = obtener_vacabulario(corpus)
  matriz = pd.DataFrame({key: np.zeros([len(corpus)]) for key in vocabulario})
  for i, documento in enumerate(corpus):
    for termino in vocabulario:
      matriz[termino][i] = np.char.lower(documento).tolist().split().count(termino)
  return matriz

In [9]:
frecuencia_encoding_text(corpus)

Unnamed: 0,martes,de,muchas,hoy,que,es,gracias,el,dia
0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
1,2.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
vocabulario = obtener_vacabulario(corpus)

matriz = one_hot_encoding_text(corpus)
idf = np.log(len(corpus) / np.sum(matriz, axis=0).values)
frecuencia = frecuencia_encoding_text(corpus)
for i, documento in enumerate(frecuencia.values):
  matriz.iloc[i] = documento * idf

### 4- TF-IDF
Data una lista de textos, devolver una matriz con la representacion TFIDF

In [11]:
def tfidf_encoding_text(corpus):
  n = len(corpus)
  matriz = one_hot_encoding_text(corpus)
  idf = np.log(len(corpus) / np.sum(matriz, axis=0).values)
  frecuencia = frecuencia_encoding_text(corpus)
  for i, documento in enumerate(frecuencia.values):
    matriz.iloc[i] = documento * idf
  return matriz

In [12]:
tfidf_encoding_text(corpus)

Unnamed: 0,martes,de,muchas,hoy,que,es,gracias,el,dia
0,0.0,0.0,0.0,0.405465,1.098612,0.405465,0.0,0.0,0.405465
1,0.81093,1.098612,0.0,0.405465,0.0,0.405465,0.0,1.098612,0.405465
2,0.405465,0.0,1.098612,0.0,0.0,0.0,1.098612,0.0,0.0


### 5 - Comparación de documentos
Realizar una funcion que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [50]:
def orden_coseno_vector(corpus, indice):
  df = pd.DataFrame()
  df.index = corpus
  tfidf = tfidf_encoding_text(corpus)
  b = tfidf.iloc[indice]
  for i in range(len(tfidf)):
    a = tfidf.iloc[i]
    df.loc[corpus[i], "coseno"]= cosine_similarity(a, b)
  return df.sort_values(by="coseno", ascending=False)

In [51]:
orden_coseno_vector(corpus, 0)


Unnamed: 0,coseno
que dia es hoy,1.0
martes el dia de hoy es martes,0.200342
martes muchas gracias,0.0


In [52]:
orden_coseno_vector(corpus, 1)

Unnamed: 0,coseno
martes el dia de hoy es martes,1.0
que dia es hoy,0.200342
martes muchas gracias,0.108457


In [53]:
orden_coseno_vector(corpus, 2)


Unnamed: 0,coseno
martes muchas gracias,1.0
martes el dia de hoy es martes,0.108457
que dia es hoy,0.0
