In [1]:
import pandas as pd
import numpy as np
from re import sub
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../quora.tsv', sep='\t')

In [3]:
df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
df = df.drop(["id", "qid1", "qid2"], axis=1)
df.head(3)

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


In [5]:
df['is_duplicate'].value_counts(normalize=True)

0    0.630752
1    0.369248
Name: is_duplicate, dtype: float64

In [6]:
X = df[['question1', 'question2']]
Y = df['is_duplicate']

In [7]:
def text_to_word_list(text):
        ''' Pre process and convert texts to a list of words '''
        text = str(text)
        text = text.lower()

        # Clean the text
        text = sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = sub(r"what's", "what is ", text)
        text = sub(r"\'s", " ", text)
        text = sub(r"\'ve", " have ", text)
        text = sub(r"can't", "cannot ", text)
        text = sub(r"n't", " not ", text)
        text = sub(r"i'm", "i am ", text)
        text = sub(r"\'re", " are ", text)
        text = sub(r"\'d", " would ", text)
        text = sub(r"\'ll", " will ", text)
        text = sub(r",", " ", text)
        text = sub(r"\.", " ", text)
        text = sub(r"!", " ! ", text)
        text = sub(r"\/", " ", text)
        text = sub(r"\^", " ^ ", text)
        text = sub(r"\+", " + ", text)
        text = sub(r"\-", " - ", text)
        text = sub(r"\=", " = ", text)
        text = sub(r"'", " ", text)
        text = sub(r"(\d+)(k)", r"\g<1>000", text)
        text = sub(r":", " : ", text)
        text = sub(r" e g ", " eg ", text)
        text = sub(r" b g ", " bg ", text)
        text = sub(r" u s ", " american ", text)
        text = sub(r"\0s", "0", text)
        text = sub(r" 9 11 ", "911", text)
        text = sub(r"e - mail", "email", text)
        text = sub(r"j k", "jk", text)
        text = sub(r"\s{2,}", " ", text)

        return text

In [8]:
for index, row in df.iterrows():
    for sequence in ['question1', 'question2']:
        sent = text_to_word_list(row[sequence])
    df.at[index, sequence] = sent

In [9]:
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,what would happen if the indian government sto...,0
2,How can I increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,find the remainder when math 23 ^ 24 math is d...,0
4,"Which one dissolve in water quikly sugar, salt...",which fish would survive in salt water,0


In [10]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, random_state=42, stratify=Y)

In [11]:
x = train_x.values

In [12]:
x = x.tolist()
len(x)

303263

In [13]:
y = train_y.values
y = y.tolist()

In [14]:
from sentence_transformers import SentenceTransformer, util

In [15]:
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens', device='cuda')

100%|██████████| 245M/245M [01:03<00:00, 3.83MB/s] 


In [16]:
q1 = []
q2 = []
for i in range(len(x)):
    q1.append(x[i][0])
    q2.append(x[i][1])

In [17]:
for i in range(len(q1)):
    if type(q1[i]) is not str:
        q1[i] = " "
    
    if type(q2[i]) is not str:
        q2[i] = " "
print("Done")

Done


In [25]:
q3 = q1[1:11]
q4 = q2[1:11]

In [28]:
assert len(q3) == len(q4)

In [29]:
# Compute embeddings for the lists
embeddings1 = model.encode(q3, convert_to_tensor=True)
embeddings2 = model.encode(q4, convert_to_tensor=True)

In [30]:
cosines_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

In [33]:
#Output the pairs with their score
for i in range(len(q3)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(q3[i], q4[i], cosines_scores[i][i]))

Skills for freelance coding? 		 How long to use 3m 9000in face mask? 		 Score: 0.2702
How do I be a more reliable person? 		 How can I become a more reliable person? 		 Score: 0.9623
What are the requirements for selection into MIT? 		 How can I get admitted to MIT? 		 Score: 0.8194
What is the life in USA? 		 How is life in the USA? 		 Score: 0.9458
Camgirls do you live a lucrative lifestyle? 		 Do CamGirls live a lucrative lifestyle? 		 Score: 0.9836
How do I solve my problem? 		 Why do I avoid my crush? 		 Score: 0.2846
Anki: I would like to flip the language on the flashcards, from English prompts, Tagalog answers to Tagalog prompts, English answers. Is there a simple switch? 		 What are the best resources for learning Swahili? 		 Score: 0.2129
Why is Lord Karthik referred to as the 'Tamil God'? 		 Does God exist? 		 Score: 0.4484
What is the diffusion of solids, liquids, and gases? 		 What are the densities of solids, liquids, and gases? How is this determined? 		 Score: 0.8239
In

In [34]:
y[1:11]

[0, 1, 1, 1, 1, 0, 0, 0, 0, 0]