In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
%matplotlib inline

In [None]:
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Load the 'all-MiniLM-L6-v2' sentence-transformers model once at notebook level;
# calculate_cosine_similarity() reads this global to embed the question texts.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Read the cleaned training data and discard every row that has a missing value.
full_pd = pd.read_csv("fulltrain_cleaned.csv").dropna()

# Report the class balance of the loaded data.
duplicate_label = full_pd['is_duplicate']
print("total number of samples: ", len(full_pd))
print("number of negative samples: " , int((duplicate_label == 0).sum()))
print("number of positive samples: " , int((duplicate_label == 1).sum()))

total number of samples:  800037
number of negative samples:  254896
number of positive samples:  545141


In [None]:
# Rows whose id is above this threshold are treated as generated pairs;
# everything at or below it is treated as the original data.
LAST_ORIGINAL_ID = 404289
# Fixed seed so that re-running this cell reproduces the same balanced.csv.
SAMPLING_SEED = 42

generated_df = full_pd[full_pd["id"] > LAST_ORIGINAL_ID]
original_df = full_pd[full_pd["id"] <= LAST_ORIGINAL_ID]

# Surplus of negatives over positives in the original data, i.e. how many
# generated rows are added to even out the classes.
# NOTE(review): this assumes every generated row has is_duplicate == 1 - confirm.
number_to_keep = len(original_df[original_df['is_duplicate'] == 0]) - len(original_df[original_df['is_duplicate'] == 1])
print(number_to_keep)

# Draw distinct row positions. random.randint(0, n) is inclusive of n (one past
# the last valid iloc position) and samples with replacement, which could raise
# IndexError and put duplicated rows into the training data. random.sample over
# range(n) avoids both and raises ValueError if more rows are requested than exist.
rng = random.Random(SAMPLING_SEED)
indexes_to_keep = sorted(rng.sample(range(len(generated_df)), number_to_keep))
to_keep = generated_df.iloc[indexes_to_keep]
final_df = pd.concat([original_df, to_keep], axis=0, ignore_index=True)

final_df.to_csv("balanced.csv",index=False)

105637


In [None]:
# Replace the in-memory frame with the contents of balanced.csv, the file
# written by the balancing cell.
final_df = pd.read_csv("balanced.csv")

In [None]:
# Report the class balance of the balanced data set.
is_duplicate_flag = final_df['is_duplicate']
negative_count = int((is_duplicate_flag == 0).sum())
positive_count = int((is_duplicate_flag == 1).sum())
print("total number of samples: ", len(final_df))
print("number of negative samples: " , negative_count)
print("number of positive samples: " , positive_count)


total number of samples:  509792
number of negative samples:  254896
number of positive samples:  254896


In [None]:
# One frame per text-preprocessing variant. Each is an independent copy rather
# than a slice of final_df: later cells add columns to these frames, and doing
# that on a slice triggers pandas' SettingWithCopyWarning.
balanced_cleaned = final_df[["is_duplicate", "qid1", "qid2", "question1_cleaned", "question2_cleaned"]].copy()
balanced_stopwords = final_df[["is_duplicate", "qid1", "qid2", "question1_stopwords", "question2_stopwords"]].copy()
balanced_stopwords_lemmatize = final_df[["is_duplicate", "qid1", "qid2", "question1_stopwords_lemmatize", "question2_stopwords_lemmatize"]].copy()

In [None]:
def calculate_cosine_similarity(train_df, question1_name, question2_name, model=None, batch_size=256):
    """Return the cosine similarity between the embeddings of each question pair.

    Every distinct question id (``qid1`` / ``qid2``) is embedded exactly once,
    using the text from the first row in which that id appears (question 1 is
    visited before question 2 within a row). The embedding is then reused for
    every row that references the id.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``qid1``, ``qid2`` and the two question-text columns.
    question1_name, question2_name : str
        Names of the columns holding the text of question 1 and question 2.
    model : optional
        Object exposing a SentenceTransformer-style ``encode(sentences, ...)``
        method that returns one vector per sentence. Defaults to the
        notebook-level ``bert_model``.
    batch_size : int, optional
        Batch size forwarded to ``model.encode``.

    Returns
    -------
    list of float
        One similarity per row of ``train_df``, in row order. A pair involving
        an all-zero embedding gets similarity 0.0.
    """
    encoder = bert_model if model is None else model

    # First-seen text for every question id (dicts keep insertion order).
    id_to_sentence = {}
    columns = zip(train_df['qid1'], train_df['qid2'],
                  train_df[question1_name], train_df[question2_name])
    for qid1, qid2, text1, text2 in columns:
        id_to_sentence.setdefault(qid1, text1)
        id_to_sentence.setdefault(qid2, text2)

    if not id_to_sentence:
        return []

    # Encode all unique sentences in one batched call instead of one call per
    # sentence; the encoder's own progress bar replaces the printed counters.
    unique_ids = list(id_to_sentence)
    sentences = [id_to_sentence[qid] for qid in unique_ids]
    embeddings = np.asarray(
        encoder.encode(sentences, batch_size=batch_size, show_progress_bar=True)
    )

    # Normalise once so that each pair's cosine similarity is a plain dot
    # product; the clamp keeps all-zero vectors from dividing by zero.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit_embeddings = embeddings / np.maximum(norms, 1e-12)

    row_of_id = {qid: row for row, qid in enumerate(unique_ids)}

    return [
        float(np.dot(unit_embeddings[row_of_id[qid1]], unit_embeddings[row_of_id[qid2]]))
        for qid1, qid2 in zip(train_df['qid1'], train_df['qid2'])
    ]


In [None]:
def generate_features(train_clean, question1_name, question2_name):
    """Add hand-crafted pairwise text features to ``train_clean`` in place.

    The two question columns are stripped of surrounding whitespace and the
    following columns are added: ``q1_word_count``, ``q2_word_count``,
    ``q1char_count``, ``q2char_count``, ``freq_qid1``, ``freq_qid2``,
    ``total_unique_num_words``, ``common_words_count``, ``tot_words``,
    ``words_ratio``, ``Simple_Ratio``, ``Partial_Ratio``,
    ``Token_Sort_Ratio`` and ``Token_Set_Ratio``.

    All word-based features use the same whitespace tokenisation
    (``str.split()``), so empty strings and repeated spaces never produce an
    empty-string "word".

    Parameters
    ----------
    train_clean : pandas.DataFrame
        Frame containing ``qid1``, ``qid2`` and the two question columns.
        It is modified in place; pass an independent frame (e.g. a
        ``.copy()``) rather than a slice of another frame to avoid
        ``SettingWithCopyWarning``.
    question1_name, question2_name : str
        Names of the columns holding the text of question 1 and question 2.

    Returns
    -------
    None
    """
    # Trim leading/trailing whitespace
    train_clean[question1_name] = train_clean[question1_name].str.strip()
    train_clean[question2_name] = train_clean[question2_name].str.strip()

    q1_texts = train_clean[question1_name].tolist()
    q2_texts = train_clean[question2_name].tolist()

    # Tokenise once and reuse the tokens for every word-based feature.
    q1_tokens = [text.split() for text in q1_texts]
    q2_tokens = [text.split() for text in q2_texts]
    q1_vocab = [set(tokens) for tokens in q1_tokens]
    q2_vocab = [set(tokens) for tokens in q2_tokens]

    # Number of words in each question
    train_clean['q1_word_count'] = [len(tokens) for tokens in q1_tokens]
    train_clean['q2_word_count'] = [len(tokens) for tokens in q2_tokens]

    # Number of characters in each question (spaces included)
    train_clean['q1char_count'] = train_clean[question1_name].str.len()
    train_clean['q2char_count'] = train_clean[question2_name].str.len()

    # How many rows each question id appears in, per column
    train_clean['freq_qid1'] = train_clean.groupby('qid1')['qid1'].transform('count')
    train_clean['freq_qid2'] = train_clean.groupby('qid2')['qid2'].transform('count')

    # Number of distinct words across both questions (set union)
    train_clean['total_unique_num_words'] = [len(a | b) for a, b in zip(q1_vocab, q2_vocab)]

    # Number of distinct words the two questions share (set intersection)
    train_clean['common_words_count'] = [len(a & b) for a, b in zip(q1_vocab, q2_vocab)]

    # |union| + |intersection|, i.e. distinct words in q1 plus distinct words in q2
    train_clean['tot_words'] = train_clean['total_unique_num_words'] + train_clean['common_words_count']

    # Shared words as a fraction of tot_words; two empty questions give 0/0,
    # which is reported as 0.0 instead of NaN.
    words_ratio = train_clean['common_words_count'] / train_clean['tot_words']
    train_clean['words_ratio'] = words_ratio.fillna(0.0)

    # Fuzzy string-matching scores from fuzzywuzzy
    text_pairs = list(zip(q1_texts, q2_texts))
    train_clean['Simple_Ratio'] = [fuzz.ratio(a, b) for a, b in text_pairs]
    train_clean['Partial_Ratio'] = [fuzz.partial_ratio(a, b) for a, b in text_pairs]
    train_clean['Token_Sort_Ratio'] = [fuzz.token_sort_ratio(a, b) for a, b in text_pairs]
    train_clean['Token_Set_Ratio'] = [fuzz.token_set_ratio(a, b) for a, b in text_pairs]


In [None]:
# Embedding-based similarity for the "cleaned" text variant, stored as a new column.
cleaned_cosine_similarity = calculate_cosine_similarity(
    train_df=balanced_cleaned,
    question1_name="question1_cleaned",
    question2_name="question2_cleaned",
)
balanced_cleaned["cosine_similarity"] = cleaned_cosine_similarity

In [None]:
# Add the hand-crafted features in place, then write the enriched frame to disk.
generate_features(
    train_clean=balanced_cleaned,
    question1_name="question1_cleaned",
    question2_name="question2_cleaned",
)
balanced_cleaned.to_csv("cleaned_features.csv", index=False)

In [None]:
# Same pipeline for the stopword-removed text variant: similarity column first,
# then the hand-crafted features, then write the result to disk.
stopwords_cosine_similarity = calculate_cosine_similarity(
    train_df=balanced_stopwords,
    question1_name="question1_stopwords",
    question2_name="question2_stopwords",
)
balanced_stopwords["cosine_similarity"] = stopwords_cosine_similarity

generate_features(
    train_clean=balanced_stopwords,
    question1_name="question1_stopwords",
    question2_name="question2_stopwords",
)
balanced_stopwords.to_csv("stopwords_features.csv", index=False)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balanced_stopwords["cosine_similarity"] = stopwords_cosine_similarity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_clean[question1_name] = train_clean[question1_name].apply(lambda x: x.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_clean[question2_name] = train_clean[question

In [None]:
# Same pipeline for the stopword-removed and lemmatized text variant: similarity
# column first, then the hand-crafted features, then write the result to disk.
stopwords_lemmatize_cosine_similarity = calculate_cosine_similarity(
    train_df=balanced_stopwords_lemmatize,
    question1_name="question1_stopwords_lemmatize",
    question2_name="question2_stopwords_lemmatize",
)
balanced_stopwords_lemmatize["cosine_similarity"] = stopwords_lemmatize_cosine_similarity

generate_features(
    train_clean=balanced_stopwords_lemmatize,
    question1_name="question1_stopwords_lemmatize",
    question2_name="question2_stopwords_lemmatize",
)
balanced_stopwords_lemmatize.to_csv("stopwords_lemmatize_features.csv", index=False)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balanced_stopwords_lemmatize["cosine_similarity"] = stopwords_lemmatize_cosine_similarity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_clean[question1_name] = train_clean[question1_name].apply(lambda x: x.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_clean[question2_name] = 

In [None]:
# Display the "cleaned" variant with its cosine_similarity and feature columns.
balanced_cleaned

Unnamed: 0,is_duplicate,qid1,qid2,question1_cleaned,question2_cleaned,cosine_similarity,q1_word_count,q2_word_count,q1char_count,q2char_count,freq_qid1,freq_qid2,common_words_count,total_unique_num_words,tot_words,words_ratio,Simple_Ratio,Partial_Ratio,Token_Sort_Ratio,Token_Set_Ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0.891528,14,12,65,56,1,1,11,12,23,0.478261,93,100,93,100
1,0,3,4,what is the story of kohinoor koh i noor diamond,what would happen if the indian government sto...,0.667396,10,15,48,85,4,1,7,17,24,0.291667,65,73,63,86
2,0,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0.499441,14,10,72,58,1,1,4,20,24,0.166667,54,53,66,66
3,0,7,8,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,0.165055,11,13,48,59,1,1,0,20,20,0.000000,36,40,36,36
4,0,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0.211917,13,7,73,38,3,1,4,16,20,0.200000,45,55,47,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509787,1,235427,267265,should i join hcl tss leap program it is worth...,hcl tss best or not,0.472403,15,5,70,19,6,1,2,18,20,0.100000,36,63,29,54
509788,1,537762,132589,what is your favorite vodka drink and why,what is your favourite vodka,0.941730,8,5,41,28,2,3,4,9,13,0.307692,78,96,78,78
509789,1,537762,132589,what is your favorite vodka drink and why,what is your favourite vodka,0.941730,8,5,41,28,2,3,4,9,13,0.307692,78,96,78,78
509790,1,537894,187745,among bollywood stars which actor or actress d...,who are the over actors of bollywood,0.785160,12,7,70,36,3,2,2,17,19,0.105263,42,53,58,58


In [None]:
# Display the stopword-removed variant with its cosine_similarity and feature columns.
balanced_stopwords

Unnamed: 0,is_duplicate,qid1,qid2,question1_stopwords,question2_stopwords,cosine_similarity,q1_word_count,q2_word_count,q1char_count,q2char_count,freq_qid1,freq_qid2,total_unique_num_words,common_words_count,tot_words,words_ratio,Simple_Ratio,Partial_Ratio,Token_Sort_Ratio,Token_Set_Ratio
0,0,1,2,step step guide invest share market india,step step guide invest share market,0.915769,7,6,41,35,1,1,6,5,11,0.454545,92,100,92,100
1,0,3,4,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,0.631077,5,10,31,67,4,1,11,4,15,0.266667,59,94,59,89
2,0,5,6,increase speed internet connection using vpn,internet speed increased hacking dns,0.422253,6,5,44,36,1,1,9,2,11,0.181818,55,56,70,70
3,0,7,8,mentally lonely solve,find remainder math 23 24 math divided 24 23,0.236682,3,9,21,44,1,1,9,0,9,0.000000,22,24,22,22
4,0,9,10,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0.225030,10,5,60,29,3,1,13,2,15,0.133333,43,52,40,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509787,1,235427,267265,join hcl tss leap program worth giving 2 lac r...,hcl tss best,0.381733,10,3,50,12,6,1,11,2,13,0.153846,32,75,26,74
509788,1,537762,132589,favorite vodka drink,favourite vodka,0.951523,3,2,20,15,2,3,4,1,5,0.200000,80,93,80,80
509789,1,537762,132589,favorite vodka drink,favourite vodka,0.951523,3,2,20,15,2,3,4,1,5,0.200000,80,93,80,80
509790,1,537894,187745,among bollywood stars actor actress much acting,actors bollywood,0.733316,7,2,47,16,3,2,8,1,9,0.111111,38,75,51,72


In [None]:
# Display the stopword-removed, lemmatized variant with its cosine_similarity and feature columns.
balanced_stopwords_lemmatize

Unnamed: 0,is_duplicate,qid1,qid2,question1_stopwords_lemmatize,question2_stopwords_lemmatize,cosine_similarity,q1_word_count,q2_word_count,q1char_count,q2char_count,freq_qid1,freq_qid2,total_unique_num_words,common_words_count,tot_words,words_ratio,Simple_Ratio,Partial_Ratio,Token_Sort_Ratio,Token_Set_Ratio
0,0,1,2,step step guide invest share market india,step step guide invest share market,0.915769,7,6,41,35,1,1,6,5,11,0.454545,92,100,92,100
1,0,3,4,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,0.631077,5,10,31,67,4,1,11,4,15,0.266667,59,94,59,89
2,0,5,6,increase speed internet connection use vpn,internet speed increase hack dns,0.517595,6,5,42,32,1,1,8,3,11,0.272727,51,56,70,84
3,0,7,8,mentally lonely solve,find remainder math 23 24 math divide 24 23,0.241359,3,9,21,43,1,1,9,0,9,0.000000,22,24,22,23
4,0,9,10,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0.225030,10,5,60,29,3,1,13,2,15,0.133333,43,52,40,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509787,1,235427,267265,join hcl tss leap program worth give 2 lac rupee,hcl tss best,0.385390,10,3,48,12,6,1,11,2,13,0.153846,33,75,30,74
509788,1,537762,132589,favorite vodka drink,favourite vodka,0.951523,3,2,20,15,2,3,4,1,5,0.200000,80,93,80,80
509789,1,537762,132589,favorite vodka drink,favourite vodka,0.951523,3,2,20,15,2,3,4,1,5,0.200000,80,93,80,80
509790,1,537894,187745,among bollywood star actor actress much act,actor bollywood,0.664399,7,2,43,15,3,2,7,2,9,0.222222,41,80,52,100
