In [1]:
from sentence_transformers import SentenceTransformer

In [2]:
model_similarity = SentenceTransformer('bert-base-nli-mean-tokens')

In [3]:
import wikipedia

In [80]:
question = "What is the population of Columbus, Ohio?"

q_search = wikipedia.search(question)
page = wikipedia.page(q_search[0])
context = page.content

In [81]:
q_search

['Columbus, Ohio',
 'Christopher Columbus',
 'Ohio',
 'List of United States cities by population',
 'Dublin, Ohio',
 'Ohio Penitentiary',
 'Neighborhoods in Columbus, Ohio',
 'Columbus City Center',
 "Ohio's 4th congressional district",
 'List of counties in Ohio']

In [75]:
import scipy

In [76]:
cos_similarity = []

for search_rez in q_search:
    tmp_duo = [question, search_rez]
    result = model_similarity.encode(tmp_duo)
    
    embeddings = []
    for sentence, embedding in zip(tmp_duo, result):
        embeddings.append(embedding)
        
    cos_similarity.append(scipy.spatial.distance.cosine(embeddings[0], embeddings[1]))
    

In [77]:
cos_similarity

[0.42663317918777466,
 0.40918219089508057,
 0.3956661820411682,
 0.4411410093307495,
 0.2773272395133972,
 0.7016679644584656,
 0.8098232299089432,
 0.5341960787773132,
 0.4940072298049927,
 0.47312045097351074]

In [24]:
sentences = [question, q_search[0]]
sentence_embeddings = model_similarity.encode(sentences)

In [25]:
embeddings = []
for sentence, embedding in zip(sentences, sentence_embeddings):
    embeddings.append(embedding)
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: How tall is Eiffel Tower?
Embedding: [ 4.27328289e-01  1.61710992e-01  1.27966061e-01  4.10898387e-01
  7.74664357e-02  8.87552463e-03 -1.85335398e-01  7.63639927e-01
 -4.45792049e-01  3.44585836e-01  2.87298977e-01 -1.39294758e-01
  3.57429713e-01  9.33012366e-01  1.74043521e-01 -4.91121382e-01
 -1.09455720e-01  1.85731769e-01  6.39762580e-02 -4.20719713e-01
 -3.90647233e-01 -3.45338374e-01  4.55304563e-01 -6.80087745e-01
  1.22082639e+00 -5.02514482e-01 -6.10837102e-01 -7.70603776e-01
 -7.78433606e-02  7.63470352e-01  8.70445967e-02 -3.96604836e-01
 -2.58041322e-01 -7.50160515e-01 -2.73522586e-01 -1.88927382e-01
 -2.12925728e-02  3.18380028e-01  2.27487087e-01 -1.02959871e+00
  9.95727479e-02 -7.61179566e-01  5.70626795e-01 -4.01428305e-02
 -1.73337519e+00 -4.63050425e-01  4.94133793e-02  4.08111811e-01
 -3.24600697e-01 -1.21819913e-01 -4.70157236e-01  2.75452644e-01
 -1.39202565e-01 -9.17001963e-02  2.62565523e-01  6.77330017e-01
  5.50426953e-02 -4.23624367e-01 -1.5339617

In [17]:
from scipy import spatial

In [18]:
spatial.distance.cosine(embeddings[0], embeddings[1])

0.3526206612586975

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform([question, q_search[0]])

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names())

In [30]:
df

Unnamed: 0,eiffel,how,is,tall,tower
0,1,1,1,1,1
1,1,0,0,0,1


In [37]:

cosine_similarity(df, df)

array([[1.        , 0.63245553],
       [0.63245553, 1.        ]])

In [82]:
documents = (
question,
q_search[0],
q_search[1],
q_search[2],
q_search[3],
q_search[4],
q_search[6],
q_search[7],
q_search[8],
q_search[9],    
)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)

array([[1.        , 0.34827861, 0.13744696, 0.22097887, 0.23614143,
        0.09682685, 0.17593935, 0.10421911, 0.05986543, 0.19927071]])

In [86]:
print(tfidf_matrix)

  (0, 16)	0.2209788729929701
  (0, 6)	0.26919570355003813
  (0, 15)	0.33715350935919747
  (0, 17)	0.3853703399162655
  (0, 19)	0.4533281457254249
  (0, 12)	0.4533281457254249
  (0, 21)	0.4533281457254249
  (1, 16)	0.6344887903469154
  (1, 6)	0.7729320635891024
  (2, 3)	0.859828001022076
  (2, 6)	0.510583792005172
  (3, 16)	1.0
  (4, 1)	0.4083012304645545
  (4, 4)	0.4083012304645545
  (4, 18)	0.4083012304645545
  (4, 20)	0.4083012304645545
  (4, 13)	0.3470933482865148
  (4, 15)	0.3036656647614862
  (4, 17)	0.3470933482865148
  (5, 10)	0.8988909296735403
  (5, 16)	0.43817245069793953
  (6, 11)	0.5589689255813854
  (6, 14)	0.657539826772868
  (6, 16)	0.3205236896018122
  (6, 6)	0.3904608569958576
  (7, 2)	0.6519642954359074
  (7, 5)	0.6519642954359074
  (7, 6)	0.38714999025380564
  (8, 9)	0.5557600363852174
  (8, 7)	0.5557600363852174
  (8, 0)	0.5557600363852174
  (8, 16)	0.2709102173623312
  (9, 8)	0.5558936174337361
  (9, 11)	0.47256036124762185
  (9, 13)	0.47256036124762185
  (9, 16)	0

In [62]:
documents

('How tall is Eiffel Tower?',
 'Eiffel Tower',
 'Amazon Tall Tower Observatory',
 'List of tallest towers')

In [70]:
documents

('How many people live in Ohio?',
 'COVID-19 pandemic in Ohio',
 'Wyandot people',
 'Haley Bennett',
 'Ohio',
 'Jim Jordan (American politician)',
 'Jane Curtin',
 'Piper Kerman',
 'Black Is King',
 'Lynn Toler')

In [79]:
documents

('How many people live in Columbus?',
 'Christopher Columbus',
 'Columbus, Ohio',
 'Columbus Day',
 'Voyages of Christopher Columbus',
 'List of people from Columbus, Ohio',
 'Harry Potter (film series)',
 'Chris Columbus (filmmaker)',
 'Indigenous peoples of the Americas',
 "A People's History of the United States")

In [83]:
documents

('What is the population of Columbus, Ohio?',
 'Columbus, Ohio',
 'Christopher Columbus',
 'Ohio',
 'List of United States cities by population',
 'Dublin, Ohio',
 'Neighborhoods in Columbus, Ohio',
 'Columbus City Center',
 "Ohio's 4th congressional district",
 'List of counties in Ohio')