<a href="https://www.inove.com.ar"><img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center"></a>


# Procesamiento de lenguaje natural
## Word2vect


In [None]:
import numpy as np

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity between the vectors ``a`` and ``b``.

    Args:
        a: array-like of numbers (intended for 1-D vectors).
        b: array-like of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero norm
        the angle is undefined, so ``0.0`` is returned instead of ``nan``.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector (e.g. a document without any counted term) would
        # otherwise evaluate 0/0 -> nan and emit a RuntimeWarning.
        return 0.0
    return np.dot(a, b) / denominator

### Datos

In [None]:
# Toy corpus: each array element is one document (a space-separated sentence).
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])
corpus

array(['que dia es hoy', 'martes el dia de hoy es martes',
       'martes muchas gracias'], dtype='<U30')

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [None]:
def vectorize_text_list(text_list):
  """Split every text into terms and return them as one flat array.

  Args:
    text_list: iterable of strings (one string per document).

  Returns:
    1-D NumPy string array with all the terms in reading order; repeated
    terms are kept.
  """
  words = []
  for text in text_list:
    # split() without an argument collapses runs of whitespace, so repeated,
    # leading or trailing spaces do not produce empty '' terms. extend()
    # appends in place instead of rebuilding the list for every document.
    words.extend(text.split())
  # An explicit string dtype keeps an empty input from becoming float64.
  return np.asarray(words, dtype=str)


In [None]:
# Flatten the whole corpus into a single array of terms (repeated terms are kept).
vectorize_text_list(corpus)

array(['que', 'dia', 'es', 'hoy', 'martes', 'el', 'dia', 'de', 'hoy',
       'es', 'martes', 'martes', 'muchas', 'gracias'], dtype='<U7')

In [None]:
# np.unique keeps one copy of each term, sorted: this is the corpus vocabulary.
vocabulario = np.unique(vectorize_text_list(corpus))
vocabulario

array(['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas',
       'que'], dtype='<U7')

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación one-hot encoding de estos

In [None]:
class DocumentEncoder:
  """One-hot encoder for the terms of a fixed vocabulary.

  Every vocabulary term is mapped to a vector of length ``len(vocabulary)``
  that holds 1.0 at the term's position in the vocabulary and 0.0 elsewhere.

  Attributes:
    vocabulary: the sequence of terms received by ``__init__``.
    vec_length: number of terms, i.e. the length of every one-hot vector.
    encoded_vocabulary: dict mapping each term to its one-hot vector.
  """

  def __init__(self, vocabulary):
    """Store the vocabulary and precompute the one-hot vector of each term.

    Args:
      vocabulary: sized iterable of terms; the term at position ``i`` gets
        the vector with a 1.0 at index ``i``.
    """
    self.vocabulary = vocabulary
    self.vec_length = len(vocabulary)
    self.encoded_vocabulary = self.encode_vocabulary(vocabulary)

  def __getitem__(self, key):
    """Return the one-hot vector of ``key``, or ``None`` for unknown terms."""
    return self.encoded_vocabulary.get(key)

  def int_to_binary_sparse(self, integer_number):
    """Return a float vector of ``vec_length`` zeros with 1.0 at the index.

    Args:
      integer_number: position to set to 1.0.
    """
    arr = np.zeros(self.vec_length, dtype='float')
    arr[integer_number] = 1.0
    return arr

  def encode_vocabulary(self, vocabulary):
    """Build the ``{term: one-hot vector}`` dict for ``vocabulary``.

    If a term is repeated, the vector of its last position is kept.
    """
    return {
        word: self.int_to_binary_sparse(index)
        for index, word in enumerate(vocabulary)
    }

  def vectorize_text_list(self, text_list):
    """Split every text into terms and return them as one flat array.

    Args:
      text_list: iterable of strings (one string per document).

    Returns:
      1-D NumPy string array with all the terms in reading order.
    """
    words = []
    for text in text_list:
      # split() without an argument collapses runs of whitespace, so repeated
      # or edge spaces do not yield empty '' terms; extend() avoids rebuilding
      # the list for every document.
      words.extend(text.split())
    # An explicit string dtype keeps an empty input from becoming float64.
    return np.asarray(words, dtype=str)

  def encode_document(self, documents):
    """One-hot encode every term found in ``documents``.

    Args:
      documents: iterable of strings.

    Returns:
      Float matrix of shape ``(number_of_terms, vec_length)`` with one row per
      term, in reading order. Terms that are not in the vocabulary are encoded
      as an all-zero row.
    """
    words = self.vectorize_text_list(documents)
    # Start from zeros so out-of-vocabulary terms stay as zero rows; assigning
    # the ``None`` returned by dict.get() to a float row would store NaN.
    encoded_matrix = np.zeros((len(words), self.vec_length))
    for i, word in enumerate(words):
      encoded_term = self.encoded_vocabulary.get(word)
      if encoded_term is not None:
        encoded_matrix[i] = encoded_term
    return encoded_matrix

In [None]:
# Build the encoder before inspecting it so this cell also works when the
# notebook is executed from top to bottom.
doc_encoder = DocumentEncoder(vocabulario)
doc_encoder.encoded_vocabulary

{'de': array([1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'dia': array([0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'el': array([0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 'es': array([0., 0., 0., 1., 0., 0., 0., 0., 0.]),
 'gracias': array([0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 'hoy': array([0., 0., 0., 0., 0., 1., 0., 0., 0.]),
 'martes': array([0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 'muchas': array([0., 0., 0., 0., 0., 0., 0., 1., 0.]),
 'que': array([0., 0., 0., 0., 0., 0., 0., 0., 1.])}

In [None]:
# One-hot encoder over the vocabulary extracted from the corpus.
doc_encoder = DocumentEncoder(vocabulario)

In [None]:
# Look up the one-hot vector of a single term through DocumentEncoder.__getitem__.
doc_encoder['martes']

array([0., 0., 0., 0., 0., 0., 1., 0., 0.])

In [None]:
# The vocabulary was generated beforehand and is already held by doc_encoder;
# here a new list of texts is encoded with it.
text_list = np.array(['martes es el dia de hoy', 'que es hoy martes gracias dia martes martes martes', 'el dia muchas gracias', 'gracias muchas gracias dia'])

# One one-hot row per term, in reading order across all the texts.
doc_encoder.encode_document(text_list)

array([[0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0.

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [None]:
class Frequency:
  """Term-frequency and TF-IDF matrices computed through a document encoder.

  The encoder must expose ``vec_length`` (length of its term vectors) and
  ``encode_document(list_of_texts)`` returning one encoded row per term.

  Attributes:
    encoder: the encoder received by ``__init__`` (also reachable through the
      legacy misspelled name ``econder``).
    encoded_matrix: initialised to ``None``; no method of this class sets it.
    idf: IDF vector used by the most recent ``tfidf`` call, ``None`` before.
  """

  def __init__(self, encoder):
    self.encoder = encoder
    self.encoded_matrix = None
    self.idf = None

  @property
  def econder(self):
    """Alias of ``encoder`` kept for code that uses the old misspelling."""
    return self.encoder

  @econder.setter
  def econder(self, value):
    self.encoder = value

  def frequency_matrix(self, text_list):
    """Count how many times each vocabulary term occurs in each text.

    Args:
      text_list: sized iterable of strings (one string per document).

    Returns:
      Float matrix of shape ``(len(text_list), encoder.vec_length)``; entry
      ``[i, j]`` is the sum of column ``j`` over the encoded rows of text ``i``.
    """
    frequency_matrix = np.zeros((len(text_list), self.encoder.vec_length))
    for i, text in enumerate(text_list):
      # Encode one document at a time and add up its one-hot rows.
      encoded_document = self.encoder.encode_document([text])
      frequency_matrix[i] = np.sum(encoded_document, axis=0)
    return frequency_matrix

  def tfidf(self, text_list):
    """Return the TF-IDF matrix of ``text_list``.

    TF is the raw count from ``frequency_matrix`` and IDF is
    ``log10(number_of_documents / document_frequency)``. The IDF vector is
    stored in ``self.idf`` and printed.

    Args:
      text_list: sized iterable of strings (one string per document).

    Returns:
      Float matrix with the same shape as ``frequency_matrix(text_list)``.
    """
    docs_num = len(text_list)
    frequency_matrix = self.frequency_matrix(text_list)
    # Document frequency: in how many documents each term appears.
    df = np.count_nonzero(frequency_matrix, axis=0)
    # Terms absent from every document keep idf = 0; dividing by df == 0
    # would give inf and then 0 * inf = NaN in the product below.
    idf = np.zeros(df.shape, dtype=float)
    present = df > 0
    idf[present] = np.log10(docs_num / df[present])
    self.idf = idf
    print(idf)
    # Broadcasting multiplies every document row by the idf vector.
    return frequency_matrix * idf
  


In [None]:
# Term-frequency matrix: one row per text in text_list, one column per vocabulary term.
f = Frequency(doc_encoder)
fvec = f.frequency_matrix(text_list)
fvec

array([[1., 1., 1., 1., 0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 1., 1., 4., 0., 1.],
       [0., 1., 1., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 2., 0., 0., 1., 0.]])

### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF

In [None]:

# TF-IDF matrix of text_list; tfidf() also prints the IDF vector it computed.
f.tfidf(text_list)


[0.602 0.    0.301 0.301 0.125 0.301 0.301 0.301 0.602]


array([[0.602, 0.   , 0.301, 0.301, 0.   , 0.301, 0.301, 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.301, 0.125, 0.301, 1.204, 0.   , 0.602],
       [0.   , 0.   , 0.301, 0.   , 0.125, 0.   , 0.   , 0.301, 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.25 , 0.   , 0.   , 0.301, 0.   ]])

### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [None]:
def order_by_similarity(documents, idx, encoder):
  """Rank ``documents`` by cosine similarity to ``documents[idx]``.

  The documents are compared through their term-frequency vectors. The
  similarity of every document is printed in the original document order.

  Args:
    documents: sized sequence of strings.
    idx: position of the reference document inside ``documents``.
    encoder: encoder handed to ``Frequency`` to build the frequency matrix.

  Returns:
    NumPy array with the documents sorted from most to least similar to the
    reference document; the reference document is part of the ranking.
  """
  f = Frequency(encoder)
  fmatrix = f.frequency_matrix(documents)
  text = fmatrix[idx]
  similarities = np.array([cosine_similarity(text, row) for row in fmatrix])
  for cs in similarities:
    print(cs)
  # Negate to sort in descending order; a stable sort keeps the original
  # document order between ties.
  order = np.argsort(-similarities, kind='stable')
  return np.asarray(documents)[order]

In [None]:
# Prints the cosine similarity between document 2 of the corpus and every document (itself included).
order_by_similarity(corpus, 2, doc_encoder)

0.0
0.3849001794597505
1.0000000000000002
