In [8]:
import importlib

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.pipeline import make_pipeline
from sklearn.utils.estimator_checks import check_estimator

import vector_similarity

importlib.reload(vector_similarity)
from vector_similarity import VectorSimilarity

In [9]:
# Smoke test for VectorSimilarity on a tiny hand-built dataset.
#
# check_estimator is intentionally left disabled: predict()'s output is governed by
# the n_best attribute rather than by the prediction input, so VectorSimilarity
# will never pass it.
# check_estimator(VectorSimilarity())

# Three 2-D training vectors, one label each.
X = np.array([[0, 1], [1, 0], [-1, 0]])
y = np.array(list('abc'))

# Keep whatever fit() returns as the estimator, then query it with a single (1, 2) sample.
estimator = VectorSimilarity().fit(X, y)
estimator.predict(np.array([[1, 2]]))

array([['a', 'b', 'c']], dtype='<U1')

In [10]:
# Toy corpus: three overlapping sentences about bees/bears/honey and one
# sentence that shares no vocabulary with the others.
corpus = [
    "Bees like to make honey",
    "Bears like to eat honey",
    "Bees don't like bears",
    "Humans are walking around the park",
]

In [11]:
# Fit TF-IDF on the toy corpus and inspect the vocabulary, the document
# vectors, and the pairwise similarities between documents.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases, so the
# cell survives Restart & Run All on either side of that change. tolist()
# turns the returned ndarray into plain Python strings, keeping the printed
# vocabulary identical to the old list output.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

print(X.toarray())

# linear_kernel is a plain dot product; TfidfVectorizer L2-normalises rows by
# default (norm='l2'), which is why the diagonal of this matrix is 1.
print(linear_kernel(X))

# A sentence with no words from the fitted vocabulary maps to an all-zero vector.
print(vectorizer.transform(['This sentence has different words']).toarray())

['are', 'around', 'bears', 'bees', 'don', 'eat', 'honey', 'humans', 'like', 'make', 'park', 'the', 'to', 'walking']
[[0.         0.         0.         0.43584673 0.         0.
  0.43584673 0.         0.35285549 0.55281632 0.         0.
  0.43584673 0.        ]
 [0.         0.         0.43584673 0.         0.         0.55281632
  0.43584673 0.         0.35285549 0.         0.         0.
  0.43584673 0.        ]
 [0.         0.         0.4842629  0.4842629  0.61422608 0.
  0.         0.         0.39205255 0.         0.         0.
  0.         0.        ]
 [0.40824829 0.40824829 0.         0.         0.         0.
  0.         0.40824829 0.         0.         0.40824829 0.40824829
  0.         0.40824829]]
[[1.         0.50443175 0.3494023  0.        ]
 [0.50443175 1.         0.3494023  0.        ]
 [0.3494023  0.3494023  1.         0.        ]
 [0.         0.         0.         1.        ]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [12]:
# End-to-end check: TF-IDF features feeding VectorSimilarity, fitted on the
# corpus with one label per sentence and then queried with the same sentences.
# Pipeline.fit returns the pipeline itself, so the fit can be chained onto construction.
pipe = make_pipeline(
    TfidfVectorizer(),
    VectorSimilarity(),
).fit(corpus, list('abcd'))
pipe.predict(corpus)

array([['a', 'b', 'c', 'd'],
       ['b', 'a', 'c', 'd'],
       ['c', 'b', 'a', 'd'],
       ['d', 'c', 'b', 'a']], dtype='<U1')