# Distância Cosseno

- Referência: https://www.machinelearningplus.com/cosine-similarity/

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [12]:
# Toy corpus: four short sentences used as the documents to compare.
docs = [
    'I like football',
    'John likes football',
    'Mike likes basketball',
    'My house is beautiful',
]
docs

['I like football',
 'John likes football',
 'Mike likes basketball',
 'My house is beautiful']

# Vetorizador de contagem
- Converte uma coleção de documentos de texto em uma matriz de contagens de tokens.

In [13]:
# Bag-of-words model: learn the vocabulary from `docs` (English stop words
# removed) and encode every document as a row of token counts.
count_vectorizer = CountVectorizer(stop_words="english")
sparse_matrix = count_vectorizer.fit_transform(raw_documents=docs)

In [14]:
# Vocabulary learned by the count vectorizer; position i names column i of
# the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    count_feature_names = count_vectorizer.get_feature_names_out()
else:
    count_feature_names = count_vectorizer.get_feature_names()
# list(...) keeps the printed form a plain Python list on every version.
print(list(count_feature_names))

['basketball', 'beautiful', 'football', 'house', 'john', 'like', 'likes', 'mike']


In [15]:
# Expand the sparse count matrix into a regular array so every document's
# term counts can be displayed in full (one row per document, one column
# per vocabulary term).
dense_matrix = sparse_matrix.toarray()
dense_matrix

array([[0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 1, 0, 0, 0, 0]], dtype=int64)

# Distância Cosseno

In [17]:
# Pairwise cosine distance between all documents (count vectors).
# With a single argument the rows are compared against themselves.
cosine_distances(dense_matrix)

array([[0.        , 0.59175171, 1.        , 1.        ],
       [0.59175171, 0.        , 0.66666667, 1.        ],
       [1.        , 0.66666667, 0.        , 1.        ],
       [1.        , 1.        , 1.        , 0.        ]])

# Similaridade Cosseno

In [19]:
# Pairwise cosine similarity between all documents (count vectors).
# With a single argument the rows are compared against themselves.
cosine_similarity(dense_matrix)

array([[1.        , 0.40824829, 0.        , 0.        ],
       [0.40824829, 1.        , 0.33333333, 0.        ],
       [0.        , 0.33333333, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

# TF-IDF - Frequência do Termo / Frequência Inversa de Documento
- Converte uma coleção de documentos brutos em uma matriz de atributos TF-IDF.
- $ \text{tfidf}_{t, d, D} = \text{tf}_{t, d} \times \text{idf}_{t, D} $
- Onde t denota os termos; d denota cada documento; D denota a coleção de documentos.
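- $ \text{tfidf}_{t, d, D} = (1 + \log \text{tf}_{t, d}) \cdot \log \frac{\vert D \vert}{\text{df}_{t}} $, em que $\text{df}_{t}$ é o número de documentos de $D$ que contêm o termo $t$.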

In [20]:
# TF-IDF model: same corpus and stop-word filtering as the count model, but
# each document is encoded with TF-IDF weights instead of raw counts.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_sparse_matrix = tfidf_vectorizer.fit_transform(raw_documents=docs)

In [21]:
# Dimensions of the TF-IDF matrix: (number of documents, vocabulary size).
tfidf_sparse_matrix.shape

(4, 8)

In [22]:
# Vocabulary learned by the TF-IDF vectorizer; position i names column i of
# the TF-IDF matrix. This cell previously printed the vocabulary of
# `count_vectorizer`, which is a different model from the one fitted in this
# section.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# list(...) keeps the printed form a plain Python list on every version.
print(list(tfidf_feature_names))

['basketball', 'beautiful', 'football', 'house', 'john', 'like', 'likes', 'mike']


In [24]:
# Densify the TF-IDF weights. Rounding applies only to the displayed copy;
# `tfidf_dense_matrix` itself keeps full precision for later cells.
tfidf_dense_matrix = tfidf_sparse_matrix.toarray()
tfidf_dense_matrix.round(decimals=4)

array([[0.    , 0.    , 0.6191, 0.    , 0.    , 0.7853, 0.    , 0.    ],
       [0.    , 0.    , 0.5264, 0.    , 0.6677, 0.    , 0.5264, 0.    ],
       [0.6176, 0.    , 0.    , 0.    , 0.    , 0.    , 0.4869, 0.6176],
       [0.    , 0.7071, 0.    , 0.7071, 0.    , 0.    , 0.    , 0.    ]])

In [25]:
# Pairwise cosine distance between all documents (TF-IDF vectors).
# With a single argument the rows are compared against themselves.
cosine_distances(tfidf_dense_matrix)

array([[0.        , 0.67408645, 1.        , 1.        ],
       [0.67408645, 0.        , 0.74367516, 1.        ],
       [1.        , 0.74367516, 0.        , 1.        ],
       [1.        , 1.        , 1.        , 0.        ]])