In [1]:
import numpy as np
import pandas as pd

In [2]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Vectors of the same length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        direction is undefined; 0.0 is returned instead of ``nan`` so callers
        can rank results without special-casing.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 (nan + RuntimeWarning): a zero vector is treated as
        # having no similarity with anything.
        return 0.0
    return np.dot(a, b) / denominator

In [112]:
# Toy corpus: one short sentence per document.
# Accented and unaccented spellings ('que'/'qué', 'dia'/'día') are distinct strings.
corpus = np.array([
  'que dia es hoy',
  'martes el dia de hoy es martes',
  'martes muchas gracias',
  'qué dia es mañana',
  'que día es',
])

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [113]:
# Turn every document into its list of terms (split on single spaces).
docs = [document.split(" ") for document in corpus]

In [114]:
# Build a vector with the distinct terms found across all documents.
split_corpus = np.char.split(corpus, sep=' ')
# Flatten the per-document token lists into a single list of tokens.
all_tokens = [token for tokens in split_corpus for token in tokens]
unique_tokens = np.unique(all_tokens)

In [115]:
# Build the term -> column index dictionary used by the following cells.
vocab = {}
for document in docs:
  for word in document:
    # A term seen for the first time gets the next free index.
    vocab.setdefault(word, len(vocab))
index = len(vocab)  # next free index (equals the vocabulary size)

In [116]:
vocab

{'que': 0,
 'dia': 1,
 'es': 2,
 'hoy': 3,
 'martes': 4,
 'el': 5,
 'de': 6,
 'muchas': 7,
 'gracias': 8,
 'qué': 9,
 'mañana': 10,
 'día': 11}

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [117]:
# Identity matrix: row i is the one-hot vector of the term whose index is i.
onehot_token = np.eye(len(vocab))

# One matrix per document, holding one one-hot row per token in order of appearance.
onehot_matrix = [
  onehot_token[[vocab[term] for term in document]]
  for document in docs
]

In [118]:
onehot_matrix

[array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [135]:
# Collapse each document's one-hot rows into a vector of per-term counts.
count_freq = [document.sum(axis=0) for document in onehot_matrix]

# Stack the count vectors: one row per document, one column per term.
freq_dist = np.array(count_freq)

In [136]:
freq_dist

array([[1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 2., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF

In [123]:
# Document frequency: for each term, the number of documents that contain it.
df_vocab = np.zeros(len(vocab))
for token, column in vocab.items():
  df_vocab[column] = sum(token in document for document in docs)

In [124]:
df_vocab

array([2., 3., 4., 2., 2., 1., 1., 1., 1., 1., 1., 1.])

In [141]:
# Number of documents in the corpus (the numerator of the IDF ratio below).
len_corpus = len(corpus)

In [142]:
# Inverse document frequency: log10(number of documents / document frequency), per term.
idf = np.log10(len_corpus/df_vocab)

In [143]:
# TF-IDF: scale every document's term counts by the per-term IDF
# (idf is broadcast across the rows of freq_dist).
tf_idf = freq_dist * idf

tf_idf

array([[0.39794001, 0.22184875, 0.09691001, 0.39794001, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.22184875, 0.09691001, 0.39794001, 0.79588002,
        0.69897   , 0.69897   , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.39794001,
        0.        , 0.        , 0.69897   , 0.69897   , 0.        ,
        0.        , 0.        ],
       [0.        , 0.22184875, 0.09691001, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.69897   ,
        0.69897   , 0.        ],
       [0.39794001, 0.        , 0.09691001, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.69897   ]])

### Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [144]:
def _term_counts(docs):
  """Return the (n_docs, n_terms) matrix of raw term counts for tokenised documents.

  Vocabulary columns are assigned in order of first appearance.
  """
  vocab = {}
  for document in docs:
    for word in document:
      vocab.setdefault(word, len(vocab))

  counts = np.zeros((len(docs), len(vocab)))
  for row, document in enumerate(docs):
    for word in document:
      counts[row, vocab[word]] += 1
  return counts


def _tf_idf(counts):
  """Weight raw term counts by log10(n_docs / document_frequency)."""
  # Number of documents in which each term occurs at least once.
  doc_freq = np.count_nonzero(counts, axis=0)
  # Every vocabulary column comes from some document, so doc_freq >= 1 here.
  idf = np.log10(counts.shape[0] / doc_freq)
  return counts * idf


def _cosine_to_row(matrix, idx):
  """Cosine similarity of every row of ``matrix`` against row ``idx``.

  Rows with zero norm (or a zero-norm reference row) get similarity 0.0
  instead of nan.
  """
  query = matrix[idx]
  dots = np.dot(matrix, query)
  denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
  # Only divide where the denominator is non-zero; other entries stay at 0.0.
  return np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)


def doc_comparison(corpus, idx):
  """Rank the documents of ``corpus`` by TF-IDF cosine similarity to document ``idx``.

  Parameters
  ----------
  corpus : sequence of str (list or 1-D numpy array)
      The documents. Each one is tokenised on whitespace; runs of spaces
      do not produce empty tokens.
  idx : int
      Position in ``corpus`` of the reference document.

  Returns
  -------
  numpy.ndarray
      The documents of ``corpus`` ordered from most to least similar to
      ``corpus[idx]``. Ties keep their original corpus order. An empty
      corpus yields an empty array.

  Raises
  ------
  IndexError
      If ``idx`` is out of range for a non-empty corpus.
  """
  # Accept plain lists as well as arrays; fancy indexing below needs an ndarray.
  corpus = np.asarray(corpus)
  if len(corpus) == 0:
    return corpus

  # Split each document into its list of terms.
  docs = [document.split() for document in corpus]

  # Term counts -> TF-IDF weights -> similarity against the reference row.
  tf_idf = _tf_idf(_term_counts(docs))
  similarity = _cosine_to_row(tf_idf, idx)

  # Descending similarity; a stable sort makes tie ordering deterministic.
  return corpus[np.argsort(-similarity, kind="stable")]

In [145]:
doc_comparison(corpus, 1)

array(['martes el dia de hoy es martes', 'que dia es hoy',
       'martes muchas gracias', 'qué dia es mañana', 'que día es'],
      dtype='<U30')