In [1]:
import pandas as pd
import numpy as np


from collections import Counter
from functools import reduce

from math import log2

from IPython.display import HTML as html_print, display

In [2]:
def tag_print(tag, str1, str2):
    """Render ``str1`` inside an HTML ``<tag>`` element, then display ``str2``.

    Args:
        tag: HTML element name (e.g. ``"h3"``) used for the opening and
            closing tags.
        str1: Text placed between the tags; it is inserted as-is, with no
            HTML escaping.
        str2: Any object to show right after the heading; it is handed to
            ``display`` unchanged.
    """
    markup = "<{name:s}>{body:s}</{name:}>".format(name=tag, body=str1)
    heading = html_print(markup)
    display(heading, str2)
    
def h1_print(str1, str2):
    """Display ``str1`` as an ``<h1>`` heading followed by the object ``str2``."""
    tag_print(tag="h1", str1=str1, str2=str2)
    
def h2_print(str1, str2):
    """Display ``str1`` as an ``<h2>`` heading followed by the object ``str2``."""
    tag_print(tag="h2", str1=str1, str2=str2)

def h3_print(str1, str2):
    """Display ``str1`` as an ``<h3>`` heading followed by the object ``str2``."""
    tag_print(tag="h3", str1=str1, str2=str2)
    
def h4_print(str1, str2):
    """Display ``str1`` as an ``<h4>`` heading followed by the object ``str2``."""
    tag_print(tag="h4", str1=str1, str2=str2)

# Modelo TF-IDF

### Exercício 1. Considere os documentos abaixo e calcule a representação lógica desses documentos no modelo TF-IDF.
<pre>
    D1 = A A A B
    D2 = A A C
    D3 = A A
    D4 = B B
</pre>

## Solução

### 1. Calculando $f_{i,j}$ (frequência bruta do termo $i$ no documento $j$)

In [3]:
# Corpus: one document per string, terms separated by a single space.
data = ["A A A B",
        "A A C",
        "A A",
        "B B"]

# Token matrix: one list of terms per document.
m = [z.split(" ") for z in data]

# Vocabulary: the sorted set of distinct terms over all documents.
# set().union(*m) is linear in the corpus size and yields an empty
# vocabulary (instead of raising) when there are no documents.
words = sorted(set().union(*m))

# Count the terms of each document exactly once; a Counter answers 0 for
# terms it has not seen, so no separate membership test is needed below.
doc_counts = [Counter(row) for row in m]

# Binary incidence matrix: 1 when the term occurs in the document, else 0.
mm = [[1 if x in counts else 0 for x in words] for counts in doc_counts]

# Count matrix: raw frequency f_ij of term i (column) in document j (row).
mmc = [[counts[x] for x in words] for counts in doc_counts]

# Build the frame in one step with the document labels d_0, d_1, ... as index,
# so re-running the cell never re-labels an already labelled frame.
df = pd.DataFrame(data=mmc,
                  columns=words,
                  index=['d_' + str(j) for j in range(len(mmc))])

h3_print("Words:",words)
h3_print("Words counts:",mmc)

['A', 'B', 'C']

[[3, 1, 0], [2, 0, 1], [2, 0, 0], [0, 2, 0]]

#### Exibindo $f_{i,j}$ 

In [4]:
# Raw frequency table f_ij: an independent (deep) copy of the count frame,
# so later steps never modify `df` itself.
f_ij = df.copy(deep=True)

h3_print("Exibindo $f_{ij}$:", f_ij)

Unnamed: 0,A,B,C
d_0,3,1,0
d_1,2,0,1
d_2,2,0,0
d_3,0,2,0


### 2. Calculando $\max_i f_{i,j}$ (frequência do termo $i$ mais frequente no documento $j$)

In [5]:
# For every document (row), the count of its most frequent term.
max_if_ij = f_ij.max(axis="columns")

h3_print("Exibindo $max_i f_{i,j}$:", max_if_ij)

d_0    3
d_1    2
d_2    2
d_3    2
dtype: int64

### 3. Calculando $tf_{i,j} = \frac{f_{i,j}}{\max_i f_{i,j}}$ (frequência normalizada)

In [6]:
# Max-normalised term frequency tf_ij: every row of raw counts is divided by
# that row's largest count, so the most frequent term of a document gets 1.0.
# (To restrict the normalisation to some columns, select them before dividing,
# e.g. f_ij[['A', 'B', 'C']].)
f_ij_nom = f_ij.divide(f_ij.max(axis="columns"), axis="index")

h3_print(
    "Exibindo $tf_{i,j} = \\frac{f_{i,j}}{ max_i f_{i,j}}$:",
    f_ij_nom.round(decimals=2),
)

Unnamed: 0,A,B,C
d_0,1.0,0.33,0.0
d_1,1.0,0.0,0.5
d_2,1.0,0.0,0.0
d_3,0.0,1.0,0.0


In [7]:
# Disabled draft (nothing in this cell executes): it would have stored the
# per-term document count and the idf as two extra rows, 'n_i' and 'idf', of df.
#df.loc['n_i'] = df[df>0].count()
#df.loc['idf'] = np.log2(D/df.loc['n_i'])
#df.round(2)

### 4. Calculando $idf_i = \log_2\left( \frac{D}{n_i} \right)$, a frequência inversa de documentos do termo $i$ (<i>inverse document frequency</i>)

Encontrando $n_i$ (quantidade de documentos $j$ em que o termo $i$ ocorre) e;
<br>
$D$ (quantidade total de documentos).

In [8]:
# n_i: number of documents in which each term occurs at least once.
# The mask is taken from f_ij_nom itself (a term occurs in a document exactly
# when its normalised frequency is positive) rather than from the separate
# frame `df`, so the count cannot silently depend on another frame's labels.
n_i = (f_ij_nom > 0).sum()
h4_print("Exibindo $n_i$:",n_i)


# D: total number of documents (one row per document).
D = f_ij_nom.shape[0]
h4_print("Exibindo $D$:",D)

A    3
B    2
C    1
dtype: int64

4

Calculando $idf_i$ (<i>inverse document frequency</i> para cada termo $i$).

In [9]:
# Inverse document frequency per term: log2 of (total documents / documents
# containing the term). n_i.rdiv(D) evaluates D / n_i element-wise.
idf_i = np.log2(n_i.rdiv(D))

h4_print("Exibindo $idf_i$:", idf_i.round(decimals=2))

A    0.42
B    1.00
C    2.00
dtype: float64

### 5. Calculando pesos finais $w_{i,j} = tf_{i,j} \times idf_i$

In [10]:
# Final TF-IDF weights: each term column of the normalised frequencies is
# scaled by that term's idf (the Series is aligned on the column labels).
w_ij = f_ij_nom.mul(idf_i, axis="columns")

h3_print(
    "Exibindo $w_{i,j} = tf_{i,j} \\times idf_i$:",
    w_ij.round(decimals=2),
)

Unnamed: 0,A,B,C
d_0,0.42,0.33,0.0
d_1,0.42,0.0,1.0
d_2,0.42,0.0,0.0
d_3,0.0,1.0,0.0


### Exercício 2. Considere a query $q$ = "$A B$" para os documentos acima.
>a. Calcule o vetor de q no modelo TF-IDF.
<br>
>b. Considerando a distância euclidiana, encontre o documento mais próximo de $q$.

a) Calculando o vetor de q no modelo TF-IDF.

In [11]:
# The query, tokenised the same way as the documents (single query expected).
q = ['A B']
qm = [z.split(" ") for z in q]

# Raw count of every vocabulary term in the query; a Counter returns 0 for
# vocabulary terms the query does not contain, and query terms outside the
# vocabulary are simply never looked up.
q_counts = Counter(qm[0])
q_f = [q_counts[x] for x in words]

print("q tf_i:", q_f )

# Max-normalised term frequency of the query, labelled by term so that the
# product with idf_i below is aligned by label rather than by position.
q_tf = pd.Series(q_f, index=words, dtype=float)
q_peak = q_tf.max()
if q_peak > 0:
    # Skip the division when the query shares no term with the vocabulary;
    # the vector then stays all-zero instead of becoming NaN.
    q_tf = q_tf / q_peak

# Query weights: reuse the corpus idf computed earlier instead of
# recomputing it from the raw count frame.
qmmc = q_tf * idf_i

print()
print("q w_i:")
qmmc.round(2)

q tf_i: [1, 1, 0]

q w_i:


A    0.42
B    1.00
C    0.00
dtype: float64

b) Encontrando a <b>distância euclidiana</b> entre $q$ e cada documento ($d_j$). $$d(\vec{q}, \vec{d_j})=\sqrt{\sum_{i=1}^{n} (w_{i,q} - w_{i,j})^2}$$
onde $i \in \{1,2,\ldots,n\}$ e $w_{i,j} = tf_{i,j} \times idf_i$

In [12]:
# Euclidean distance between the query vector and every document row of w_ij:
# subtract the query weights position-wise from each row, square, sum across
# the term columns, then take the square root.
(np.asarray(qmmc) - w_ij).pow(2).sum(axis="columns").pow(0.5)

d_0    0.666667
d_1    1.414214
d_2    1.000000
d_3    0.415037
dtype: float64

In [13]:
# Most similar document under the Euclidean distance.
# idxmin() returns the index label (e.g. 'd_3') of the smallest distance;
# argmin() would return its integer position in current pandas versions.
((np.sum((np.array(qmmc) - w_ij)**2,axis=1)**.5)).idxmin()

'd_3'

c) Considerando a <b>distância do cosseno</b>, encontre o documento mais próximo de $q$. $$d(\vec{q}, \vec{d_j}) = 1 - \frac{\vec{q} \cdot \vec{d_j}}{\lVert\vec{q}\rVert \, \lVert\vec{d_j}\rVert}$$