<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Word2vect


In [1]:
import numpy as np

In [2]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: One-dimensional array-like of numbers.
        b: One-dimensional array-like of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        cosine is undefined; ``0.0`` is returned in that case instead of
        dividing by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction; treat it as sharing nothing with
        # any other vector rather than producing NaN and a RuntimeWarning.
        return 0.0
    return np.dot(a, b) / norm_product

### Datos

In [3]:
# Toy corpus: one string per document, words separated by single spaces.
corpus = np.array(
    [
        'que dia es hoy',
        'martes el dia de hoy es martes',
        'martes muchas gracias',
    ]
)

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
# Tokenize each of the three documents on single spaces.
a, b, c = [documento.split(" ") for documento in corpus[:3]]
# np.unique returns the distinct terms already sorted, so no extra sort is needed.
unicos = np.unique(np.array(a + b + c))
unicos

array(['de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas',
       'que'], dtype='<U7')

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación one-hot encoding de estos

In [5]:
# One row per document and one column per vocabulary term, initially all zeros.
# The row count follows the corpus size instead of a hard-coded 3.
indices = np.zeros((len(corpus), len(unicos)))

In [6]:
# For each document, the positions in `unicos` of the terms it contains
# (element [1] of the intersect1d result holds the indices into the first array).
indices_a, indices_b, indices_c = (
    np.intersect1d(unicos, tokens, return_indices=True)[1]
    for tokens in (a, b, c)
)

In [7]:
# Mark with 1 the vocabulary columns present in each document's row.
for row, cols in enumerate((indices_a, indices_b, indices_c)):
    indices[row, cols] = 1

In [8]:
# Show the one-hot matrix: row i flags which vocabulary terms occur in document i.
indices

array([[0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [1., 1., 1., 1., 0., 1., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 1., 1., 0.]])

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [9]:
# Start the frequency matrix as an independent copy of the one-hot matrix, so
# scaling the counts later does not modify `indices`.
frequency = indices.copy()

In [10]:
# Distinct terms of document `a` (sorted) and how many times each occurs.
a0, a2 = np.unique(a, return_counts=True)
# Terms occurring more than once; np.argwhere yields a column of positions,
# so `repetidas_a` has shape (k, 1).
repetidas_a = a0[np.argwhere(a2 > 1)]
# Columns of `unicos` that hold those repeated terms.
indice_repetidas_a = np.intersect1d(unicos, repetidas_a, return_indices=True)[1]

In [11]:
# Distinct terms of document `b` (sorted) and how many times each occurs.
b0, b2 = np.unique(b, return_counts=True)
# Terms occurring more than once; np.argwhere yields a column of positions,
# so `repetidas_b` has shape (k, 1).
repetidas_b = b0[np.argwhere(b2 > 1)]
# Columns of `unicos` that hold those repeated terms.
indice_repetidas_b = np.intersect1d(unicos, repetidas_b, return_indices=True)[1]

In [12]:
# Distinct terms of document `c` (sorted) and how many times each occurs.
c0, c2 = np.unique(c, return_counts=True)
# Terms occurring more than once; np.argwhere yields a column of positions,
# so `repetidas_c` has shape (k, 1).
repetidas_c = c0[np.argwhere(c2 > 1)]
# Columns of `unicos` that hold those repeated terms.
indice_repetidas_c = np.intersect1d(unicos, repetidas_c, return_indices=True)[1]

In [13]:
# Scale the entries of repeated terms by their occurrence count.
# A boolean mask keeps the counts one-dimensional so they line up element-wise
# with the column indices. Indexing with np.argwhere would give a (k, 1)
# column, which cannot be assigned back once a document has two or more
# distinct repeated terms.
frequency[0, indice_repetidas_a] *= a2[a2 > 1]
frequency[1, indice_repetidas_b] *= b2[b2 > 1]
frequency[2, indice_repetidas_c] *= c2[c2 > 1]

In [14]:
# Show the term-frequency matrix: entry (i, j) is how often term j occurs in document i.
frequency

array([[0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [1., 1., 1., 1., 0., 1., 2., 0., 0.],
       [0., 0., 0., 0., 1., 0., 1., 1., 0.]])

### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF

In [16]:
# Column sums of the one-hot matrix: the number of documents containing each term.
# Despite its name this is the document frequency; the logarithm is applied later.
idf = indices.sum(axis=0)
idf

array([1., 2., 1., 2., 1., 2., 2., 1., 1.])

In [17]:
# Total number of documents, repeated once per vocabulary term (the numerator
# of the IDF ratio). Taken from the corpus size instead of a hard-coded 3.
idf_aux = np.full(len(unicos), len(corpus))
idf_aux

array([3, 3, 3, 3, 3, 3, 3, 3, 3])

In [18]:
# Inverse document frequency: log10(total documents / documents containing the term).
idf_final = np.log10(idf_aux / idf)
idf_final

array([0.47712125, 0.17609126, 0.47712125, 0.17609126, 0.47712125,
       0.17609126, 0.17609126, 0.47712125, 0.47712125])

In [20]:
# Weight every term count by its inverse document frequency; the 1-D IDF
# vector is broadcast across the document rows.
tf_idf = np.multiply(frequency, idf_final)
print(tf_idf)

[[0.         0.17609126 0.         0.17609126 0.         0.17609126
  0.         0.         0.47712125]
 [0.47712125 0.17609126 0.47712125 0.17609126 0.         0.17609126
  0.35218252 0.         0.        ]
 [0.         0.         0.         0.         0.47712125 0.
  0.17609126 0.47712125 0.        ]]


### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [21]:
## Asumo que la función recibe el corpus del comienzo y que el índice corresponde al documento cuya similitud con el resto de los documentos se quiere evaluar.

In [4]:
def document_comparison(corpus, doc_idx):
    """Compare one document against every document of the corpus using TF-IDF.

    Each document is split on single spaces and turned into a raw term-count
    vector over the sorted corpus vocabulary. Counts are weighted by
    ``log10(N / document_frequency)`` and the resulting rows are compared with
    ``cosine_similarity``.

    Args:
        corpus: Sequence of document strings.
        doc_idx: Position in ``corpus`` of the reference document.

    Returns:
        A list with one cosine similarity per document, in corpus order
        (entry ``doc_idx`` is the reference compared with itself). The list
        is not sorted; rank the documents with e.g.
        ``np.argsort(result)[::-1]``.
    """
    documents_list = [doc.split(" ") for doc in corpus]
    n_docs = len(documents_list)

    # np.unique returns the vocabulary already sorted and de-duplicated.
    unicos = np.unique(np.concatenate(documents_list))
    column_of = {term: col for col, term in enumerate(unicos.tolist())}

    # Count every occurrence directly, so documents with several distinct
    # repeated terms are handled without any shape juggling.
    frequency = np.zeros((n_docs, len(unicos)))
    for row, terms in enumerate(documents_list):
        for term in terms:
            frequency[row, column_of[term]] += 1

    # Number of documents in which each term appears at least once.
    document_frequency = np.count_nonzero(frequency, axis=0)
    idf = np.log10(n_docs / document_frequency)
    tf_idf = frequency * idf

    reference = tf_idf[doc_idx]
    return [cosine_similarity(reference, row) for row in tf_idf]


In [5]:
# Cosine similarity of document 0 against every document of the corpus (itself included).
document_comparison(corpus, 0)

[0.9999999999999998, 0.20034190268098703, 0.0]