http://www.minerazzi.com/tutorials/

In [32]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics.pairwise import cosine_similarity as cossine

import nltk
from time import time


from collections import Counter
from functools import reduce

In [2]:
def jaccard_similarity(l1, l2):
    """
        Jaccard similarity between two collections, treated as sets.

        Params: l1 and l2 array-likes (lists or Numpy arrays); duplicates are
                ignored because only the unique values take part.
        Return: |l1 ∩ l2| / |l1 ∪ l2| as a float between 0 and 1.
                Two empty inputs are identical sets, so they score 1.0
                (previously this case raised ZeroDivisionError).

        Note: pass the items themselves (e.g. tokens), not 0/1 indicator
              vectors, since only the distinct values are compared.
    """
    len_union = len(np.union1d(l1, l2))
    if len_union == 0:
        # Both inputs are empty: avoid 0/0 and report them as identical.
        return 1.0
    len_intersect = len(np.intersect1d(l1, l2))
    return len_intersect / len_union

def cossine_similarity(l1, l2):
    """
        Cosine similarity between l1 and the vector(s) in l2.

        Params: l1 - a 1-D vector of length n, or a 2-D array with one
                     vector per row (shape (k, n)).
                l2 - a 1-D vector of length n, or a 2-D array with one
                     vector per COLUMN (shape (n, m)), i.e. the transpose of
                     a row-per-document matrix.
        Return: dot(l1, l2) divided by the product of the vector norms.
                The result has the shape of np.dot(l1, l2): a scalar for two
                1-D inputs, otherwise an array.
                A zero vector has no direction; its similarity is reported
                as 0.0 instead of nan.
    """
    l1 = np.asarray(l1, dtype=float)
    l2 = np.asarray(l2, dtype=float)

    dot_ = np.dot(l1, l2)
    # Norm of each l1 vector (per row when l1 is 2-D, not one Frobenius norm
    # for the whole matrix).
    n_l1 = np.linalg.norm(l1, axis=-1)
    if l1.ndim > 1 and l2.ndim > 1:
        # Make the row norms a column so they broadcast against the
        # per-column norms of l2 and match the (k, m) shape of dot_.
        n_l1 = n_l1[..., np.newaxis]
    n_l2 = np.linalg.norm(l2, axis=0)

    denom = n_l1 * n_l2
    # A zero denominator means one of the vectors is all zeros, so the dot
    # product is 0 as well; dividing by 1 there yields 0.0 without warnings.
    safe_denom = np.where(denom == 0, 1.0, denom)
    return dot_ / safe_denom


In [3]:
# Toy corpus: four short Portuguese sentences.
documents = [
    'iago a aula de luciano',
    'aula 1: iuri a aula de luciano',
    'luciano ministrou a aula',
    'a aula foi muito boa',
]

# Term-count vectorizer that drops NLTK's Portuguese stop words.
# HashingVectorizer and TfidfVectorizer (both imported above) accept the same
# stop_words argument and can be swapped in here to compare weightings.
vec = CountVectorizer(
    stop_words=nltk.corpus.stopwords.words('portuguese'),
)

In [567]:
# Learn the vocabulary from the corpus and vectorize it in one step.
# (The two query sentences defined further down are deliberately left out of
# the fit; they are only transformed against this vocabulary.)
text_vec = vec.fit_transform(documents)

In [568]:
# Document-term table: one row per document, one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

df = pd.DataFrame(data=text_vec.toarray(), columns=feature_names)
# Label the rows d_0, d_1, ... after their position in the corpus.
df = df.set_index('d_'+df.index.astype(str))

df

Unnamed: 0,aula,boa,iago,iuri,luciano,ministrou
d_0,1,0,1,0,1,0
d_1,2,0,0,1,1,0
d_2,1,0,0,0,1,1
d_3,1,1,0,0,0,0


In [579]:
# Time the hand-written cosine similarity of the first row of df against
# every row of df. Only the value from the final iteration survives the loop,
# so the printed similarity is the one for the last row.
t0 = time()
reference_row = df.values[0]
for row in df.values:
    cos = cossine_similarity(reference_row, row)
print("Elapsed time:",time()-t0,"- My cossine_similarity:",cos)

Elapsed time: 0.0019998550415039062 - My cossine_similarity: 0.6666666666666667


In [570]:
# Two query sentences, projected onto the vocabulary learned from the corpus
# (transform only: no re-fitting of the vectorizer).
q1 = ['iago estudou conceitos básicos de IA hoje com luciano']
q2 = ['iuri nao gostou da aula de luciano']
qv1, qv2 = vec.transform(q1), vec.transform(q2)

In [571]:
# Earlier attempt, kept for reference: build each query row as a term -> count dict.
#row1 = {x:y for x,y in zip(['aula', 'boa', 'iago', 'iuri', 'luciano', 'ministrou'], qv1.toarray()[0])}
#row2 = {x:y for x,y in zip(['aula', 'boa', 'iago', 'iuri', 'luciano', 'ministrou'], qv2.toarray()[0])}

# Append the two query vectors to the document-term table as rows 'qv1' and 'qv2'.
# NOTE: this mutates df in place, so every cell that loops over df after this
# point also sees the query rows (re-running the cell just overwrites them).
df.loc['qv1'] = qv1.toarray()[0]
df.loc['qv2'] = qv2.toarray()[0]

df

Unnamed: 0,aula,boa,iago,iuri,luciano,ministrou
d_0,1,0,1,0,1,0
d_1,2,0,0,1,1,0
d_2,1,0,0,0,1,1
d_3,1,1,0,0,0,0
qv1,0,0,1,0,1,0
qv2,1,0,0,1,1,0


In [572]:
# Show the dense term-count vectors of both queries (one row each).
print("q1",qv1.toarray())
print("q2",qv2.toarray())

q1 [[0 0 1 0 1 0]]
q2 [[1 0 0 1 1 0]]


In [590]:
# Cosine similarity between query qv1 and every row of df, using the
# hand-written cossine_similarity, with the total elapsed time.
t0 = time()
qv1_dense = qv1.toarray()[0]
for label, row in zip(df.index, df.values):
    similarity = cossine_similarity(qv1_dense, row)
    print("{} {} {}".format("my cossine_similarity:", label, similarity))
print("Elapsed time:",time()-t0)

my cossine_similarity: d_0 0.8164965809277259
my cossine_similarity: d_1 0.2886751345948129
my cossine_similarity: d_2 0.40824829046386296
my cossine_similarity: d_3 0.0
my cossine_similarity: qv1 0.9999999999999998
my cossine_similarity: qv2 0.40824829046386296
Elapsed time: 0.003000497817993164


In [574]:
# Cosine similarity between query qv2 and every row of df, this time using
# scikit-learn's implementation (imported as `cossine`), with elapsed time.
t0 = time()
qv2_dense = qv2.toarray()
for row in df.values:
    # sklearn expects 2-D inputs and returns a 1x1 matrix here.
    print("sk cossine_similarity:",cossine(qv2_dense,[row])[0][0])
print("Elapsed time:",time()-t0)

sk cossine_similarity: 0.6666666666666669
sk cossine_similarity: 0.9428090415820636
sk cossine_similarity: 0.6666666666666669
sk cossine_similarity: 0.408248290463863
sk cossine_similarity: 0.408248290463863
sk cossine_similarity: 1.0000000000000002
Elapsed time: 0.002000093460083008


In [575]:
# Same comparison without a Python loop: all rows of df at once.
# sklearn takes one vector per row; the hand-written version takes one vector
# per column, hence the transpose. Both lines should print the same values.
print(cossine(qv2.toarray(),df.values))
print(cossine_similarity(qv2.toarray(),df.values.T))

[[0.66666667 0.94280904 0.66666667 0.40824829 0.40824829 1.        ]]
[[0.66666667 0.94280904 0.66666667 0.40824829 0.40824829 1.        ]]


In [576]:
# Copy of the term table with two extra columns: the cosine similarity of
# every row to query qv1 and to query qv2 (df itself is left untouched).
df2 = df.assign(
    sim_qv1=cossine_similarity(qv1.toarray(), df.values.T).T,
    sim_qv2=cossine_similarity(qv2.toarray(), df.values.T).T,
)
df2

Unnamed: 0,aula,boa,iago,iuri,luciano,ministrou,sim_qv1,sim_qv2
d_0,1,0,1,0,1,0,0.816497,0.666667
d_1,2,0,0,1,1,0,0.288675,0.942809
d_2,1,0,0,0,1,1,0.408248,0.666667
d_3,1,1,0,0,0,0,0.0,0.408248
qv1,0,0,1,0,1,0,1.0,0.408248
qv2,1,0,0,1,1,0,0.408248,1.0


In [566]:
# Sanity check: with axis=1, norm returns one Euclidean norm per row.
np.linalg.norm([[1,2],[0,1]],axis=1)

array([2.23606798, 1.        ])

## My Vector Space Model

In [144]:
nomes = [
    'Luciano Vilas Vilas Boas Espiridião',
    'Fabiano Vilas Boas Espiridião',
    'Isaac Espiridião',
    'Rosa Maria Vilas Boas Espiridião',
    'Bernardo Moreira Vilas Boas Espiridião',
    'Isabella Moreira Vilas Boas Espiridião',
    'Camila Adriana Moreira da Silva Silva Silva Souza']

# Token matrix: one list of space-separated tokens per name.
m = [z.split(" ") for z in nomes]

# Every token of every name, in order of appearance.
all_tokens = [token for row in m for token in row]

# Vocabulary: sorted unique tokens. It has to exist BEFORE the matrices below
# are built, because each matrix column corresponds to one vocabulary term
# (building it afterwards only worked with leftover kernel state).
words = sorted(set(all_tokens))

# Overall count of each term across all names.
words_count = Counter(all_tokens)

# Binary matrix: 1 when the term occurs in the name, 0 otherwise.
mm = [[1 if x in row_terms else 0 for x in words] for row_terms in map(set, m)]

# Count matrix: how many times the term occurs in the name. A Counter yields
# 0 for absent terms, and is built once per name rather than once per term.
mmc = [[row_counts[x] for x in words] for row_counts in map(Counter, m)]

print(words)
print()
print(words_count)
print('total words:', len(words))

mmc

['Adriana', 'Bernardo', 'Boas', 'Camila', 'Espiridião', 'Fabiano', 'Isaac', 'Isabella', 'Luciano', 'Maria', 'Moreira', 'Rosa', 'Silva', 'Souza', 'Vilas', 'da']

Counter({'Vilas': 6, 'Espiridião': 6, 'Boas': 5, 'Moreira': 3, 'Silva': 3, 'Luciano': 1, 'Fabiano': 1, 'Isaac': 1, 'Rosa': 1, 'Maria': 1, 'Bernardo': 1, 'Isabella': 1, 'Camila': 1, 'Adriana': 1, 'da': 1, 'Souza': 1})
total words: 16


[[0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0],
 [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0],
 [0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
 [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0],
 [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 0, 1]]

In [138]:
# Inspect the token matrix (one token list per name).
m

[['Luciano', 'Vilas', 'Vilas', 'Boas', 'Espiridião'],
 ['Fabiano', 'Vilas', 'Boas', 'Espiridião'],
 ['Isaac', 'Espiridião'],
 ['Rosa', 'Maria', 'Vilas', 'Boas', 'Espiridião'],
 ['Bernardo', 'Moreira', 'Vilas', 'Boas', 'Espiridião'],
 ['Isabella', 'Moreira', 'Vilas', 'Boas', 'Espiridião'],
 ['Camila', 'Adriana', 'Moreira', 'da', 'Silva', 'Silva', 'Silva', 'Souza']]

In [135]:
#words_indices = dict((w, i) for i, w in enumerate(words))
#indices_words = dict((i, w) for i, w in enumerate(words))
#words_indices

In [154]:
# Term-count table for the names: one row per name, one column per vocabulary
# term. NOTE: this rebinds df2, which earlier held the query-similarity table.
df2 = pd.DataFrame(mmc, columns=words)

In [155]:
# Display the term-count table of the names.
df2

Unnamed: 0,Adriana,Bernardo,Boas,Camila,Espiridião,Fabiano,Isaac,Isabella,Luciano,Maria,Moreira,Rosa,Silva,Souza,Vilas,da
0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,2,0
1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0
4,0,1,1,0,1,0,0,0,0,0,1,0,0,0,1,0
5,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0
6,1,0,0,1,0,0,0,0,0,0,1,0,3,1,0,1


In [158]:
# Document frequency: in how many documents (d_i) does each term appear?
(df2 > 0).sum()

Adriana       1
Bernardo      1
Boas          5
Camila        1
Espiridião    6
Fabiano       1
Isaac         1
Isabella      1
Luciano       1
Maria         1
Moreira       3
Rosa          1
Silva         1
Souza         1
Vilas         5
da            1
dtype: int64

In [161]:
# Total number of occurrences of each term across all D documents.
df2.sum()

Adriana       1
Bernardo      1
Boas          5
Camila        1
Espiridião    6
Fabiano       1
Isaac         1
Isabella      1
Luciano       1
Maria         1
Moreira       3
Rosa          1
Silva         3
Souza         1
Vilas         6
da            1
dtype: int64

In [166]:
# Ratio of document frequency to total occurrences for each term: the number
# of documents containing the term divided by how many times it occurs overall.
# It is 1.0 when the term never repeats inside a document and drops below 1.0
# when it does (e.g. 'Silva': 1 document / 3 occurrences).
# NOTE(review): the original comment called this the probability of each term
# occurring in each document; dividing by the number of documents would give
# the fraction of documents containing the term - confirm which was intended.
df2[df2>0].count() / df2.sum()

Adriana       1.000000
Bernardo      1.000000
Boas          1.000000
Camila        1.000000
Espiridião    1.000000
Fabiano       1.000000
Isaac         1.000000
Isabella      1.000000
Luciano       1.000000
Maria         1.000000
Moreira       1.000000
Rosa          1.000000
Silva         0.333333
Souza         1.000000
Vilas         0.833333
da            1.000000
dtype: float64

In [123]:
# Unique values shared by two rows of the binary matrix. Because the rows
# only hold 0s and 1s, this compares the values {0, 1}, not the terms.
np.intersect1d(mm[0],mm[2])

array([0, 1])

In [124]:
# Unique values present in either binary row - again just the values {0, 1}.
np.union1d(mm[0],mm[2])

array([0, 1])

In [125]:
# NOTE(review): on 0/1 indicator rows the function compares the sets of
# distinct values, so any two rows that both contain a 0 and a 1 score 1.0
# regardless of which terms they share. The token lists (m[0], m[2]) are
# presumably the intended inputs - confirm.
jaccard_similarity(mm[0],mm[2])

1.0

In [126]:
# Jaccard on token lists: 1 shared token out of 3 distinct tokens.
jaccard_similarity(["luciano","boas"],["luciano","vilas"])

0.3333333333333333

In [142]:
# Copy of the name table plus one column holding the cosine similarity of
# row 6 to every row. The hand-written function expects the compared vectors
# as columns, hence the transpose of the value matrix.
df3 = df2.assign(
    **{'df2.values[6]': cossine_similarity(df2.values[6], df2.values.T).T}
)
df3

Unnamed: 0,Adriana,Bernardo,Boas,Camila,Espiridião,Fabiano,Isaac,Isabella,Luciano,Maria,Moreira,Rosa,Silva,Souza,Vilas,da,df2.values[6]
0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,2,0,0.0
1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0.0
2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.0
3,0,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,0.0
4,0,1,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0.119523
5,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0.119523
6,1,0,0,1,0,0,0,0,0,0,1,0,3,1,0,1,1.0


In [141]:
# Show the raw names again, for reference next to the similarity column.
nomes

['Luciano Vilas Vilas Boas Espiridião',
 'Fabiano Vilas Boas Espiridião',
 'Isaac Espiridião',
 'Rosa Maria Vilas Boas Espiridião',
 'Bernardo Moreira Vilas Boas Espiridião',
 'Isabella Moreira Vilas Boas Espiridião',
 'Camila Adriana Moreira da Silva Silva Silva Souza']

In [17]:
# Right-justify to width 11: "Luciano" has 7 characters, so 4 dashes go on the left.
s = "Luciano".rjust(11,"-")

In [20]:
# Left-justify the already padded string to width 15: 4 dashes go on the right.
s.ljust(15,"-")

'----Luciano----'

In [9]:
# zfill pads on the left with zeros up to the given total width.
"21".zfill(15)

'000000000000021'

In [38]:
def fill(s,pad,char='-'):
    """
        Surround s with `pad` copies of `char` on each side.

        Params: s    - string to pad
                pad  - number of fill characters added on the left AND on the
                       right; a value <= 0 leaves s unchanged
                char - single fill character (default '-')
        Return: the padded string, of length len(s) + 2*pad
    """
    # The extra width is always even, so center() splits it equally.
    return s.center(len(s) + 2 * pad, char)

In [46]:
# Example: three '~' characters added on each side.
fill("luciano",3,"~")

'~~~luciano~~~'