In [2]:
# Import generic libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
from tabulate import tabulate

# Import sentence transformer libraries
from sentence_transformers import SentenceTransformer,util

In [3]:
# Load data
file = './deeplearning_questions.csv'
df = pd.read_csv(file)

# Basic data cleansing
# dropna() returns a new frame rather than modifying in place, so the result
# has to be assigned back for the rows with missing values to actually go away.
df = df.dropna().reset_index(drop=True)

if df.empty:
    raise ValueError(f"No usable questions found in {file}")

# Random question selection (source question)
# This is the source question for which we will find similar questions.
# Pick a row position instead of a value so that, if the same question text
# appears more than once, the row removed below is exactly the row chosen.
idx = random.randrange(len(df))
src_q = df.loc[idx, 'DESCRIPTION']

# Remove the source question from the dataframe and renumber the rows
df = df.drop(idx).reset_index(drop=True)

print(f"The randomly selected question is: {src_q}")

The randomly selected question is:  What are the advantages and disadvantages of bag of words


In [6]:
# Load the pretrained all-MiniLM-L6-v2 sentence-embedding model
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Encode the source question once; the result is kept as a tensor
# (convert_to_tensor=True) for the similarity scoring done later.
embed_src = model.encode(src_q, convert_to_tensor=True)

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 293kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 27.2kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 964kB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 205kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 28.1kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 64.0kB/s]
Downloading: 100%|██████████| 90.9M/90.9M [00:10<00:00, 8.99MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 13.3kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 16.0kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 322kB/s]  
Downloading: 100%|██████████| 350/350 [00:00<00:00, 49.8kB/s]
Downloading: 100%|██████████| 13.2k/13.2k [00:00<00:00, 2.64MB/s]
Downloading: 100%|██████████| 232k/232k [00:04<00:00, 55.1kB/s] 
Downloading: 100%|██████████| 349/349 [00:00<00:00, 49.9kB/s]


In [7]:
# Score every remaining question against the source question.
# The whole column is encoded in a single call so the model can batch the
# work, instead of paying for one separate encode() call per question.
questions = df['DESCRIPTION'].tolist()

if questions:
    embed_trgt = model.encode(questions, convert_to_tensor=True)
    # Cosine similarity is the default scorer of util.semantic_search, which
    # the per-question version relied on; row 0 holds the source question's
    # similarity to each target question, in dataframe order.
    similarities = util.cos_sim(embed_src, embed_trgt)[0]
    # Keep four decimals, as before, without the string round trip.
    score = [round(value, 4) for value in similarities.tolist()]
else:
    # Nothing to compare against; keep `score` aligned with the empty frame.
    score = []

In [8]:
# Attach the similarity scores to the questions as a 'SCORE' column
df = df.assign(SCORE=score)

In [14]:
# Display the dataframe (bare last expression -> rendered as the cell output)
df

Unnamed: 0,ID,DESCRIPTION,SCORE
0,83,What is bag of words How we can use it for te...,0.6405
1,80,What are some advantages of using character e...,0.5278
2,82,Would you prefer gradient boosting trees mode...,0.5173
3,41,How large should be N for our bag of words wh...,0.5064
4,78,What are word embeddings Why are they useful,0.4982
...,...,...,...
105,8,What is the Computational Graph,-0.0328
106,55,How to handle exploding gradient problem,-0.0402
107,69,What is the range of activation functions,-0.0402
108,74,What is RNN and How does an RNN work,-0.0547


In [9]:
# Order the questions from most to least similar and renumber the rows
df = (
    df
    .sort_values(by='SCORE', ascending=False)
    .reset_index(drop=True)
)

In [10]:
# Take the ten highest-ranked questions (df is already sorted by SCORE)
top_matches = df[['DESCRIPTION']].head(10)
table_headers = ['#','SimilarQuestions']

print(f"Top 10 questions similar to >> {src_q}\n")
print(tabulate(top_matches, headers=table_headers, tablefmt='fancy_grid'))

Top 10 questions similar to >>  What are the advantages and disadvantages of bag of words

╒═════╤════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│   # │ SimilarQuestions                                                                                                       │
╞═════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│   0 │ What is bag of words How we can use it for text vectorization                                                          │
├─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│   1 │ What are some advantages of using character embeddings instead of word embeddings                                      │
├─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│   2 