In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the question-pair training file from the working directory and print
# its column / null-count / dtype summary.
df = pd.read_csv('train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [3]:
# Draw a random 51,279-row subset. random_state pins the draw so a fresh
# Restart & Run All selects the same rows and reproduces the same metrics.
df = df.sample(51279, axis=0, random_state=42)

In [4]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
95724,95724,159576,159577,What are the effects of demonetization? Is it ...,"If demonetisation of 1946 and 1978 failed, why...",0
233453,233453,311945,198203,Where can I find beautiful and luxury hotels i...,Where can I find luxury hotels in Ranikhet?,1
305586,305586,429012,429013,Is it possible to write the JSX template for a...,Is it possible to extract the sidebar code of ...,0
354462,354462,483592,483593,What is the difference between entrepreneurs a...,What is the difference between entrepreneurshi...,1
214567,214567,49717,126069,How do I hack into someone WiFi network?,How to hack Wifi?,1


In [5]:
df.isnull().sum()


Unnamed: 0,0
id,0
qid1,0
qid2,0
question1,0
question2,0
is_duplicate,0


In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
print(df['is_duplicate'].value_counts())

is_duplicate
0    32250
1    19029
Name: count, dtype: int64


In [8]:
# Discard any pair in which either question text is missing.
df = df.dropna(subset=['question1', 'question2'])

In [9]:
df.isnull().sum()

Unnamed: 0,0
id,0
qid1,0
qid2,0
question1,0
question2,0
is_duplicate,0


In [10]:
df.shape

(51279, 6)

In [11]:
# Pool both id columns so a question is counted wherever it appears in a pair.
qid = pd.Series(list(df['qid1']) + list(df['qid2']))

print('Number of unique questions', len(np.unique(qid)))
# Boolean mask over distinct ids: True where the id occurs more than once.
x = qid.value_counts() > 1
print('Number of questions getting repeated', int(x.sum()))

Number of unique questions 90969
Number of questions getting repeated 7672


In [12]:
# Contraction -> expansion lookup, built once at definition time rather than
# on every call of preprocess().
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Any non-word character. Written as a raw string: '\W' inside a plain literal
# is an invalid escape sequence that newer Python versions warn about.
_NON_WORD_RE = re.compile(r'\W')


def preprocess(q):
    """Normalise one question string for feature extraction.

    Steps, in order: lower-case and strip; spell out a few symbols
    (%, $, ₹, €, @); drop the literal '[math]' tag; abbreviate
    thousands / millions / billions as k / m / b; expand English
    contractions; strip HTML markup; replace every non-word character with a
    space and strip the ends.

    Parameters
    ----------
    q : object
        The raw question. It is passed through ``str()`` first, so non-string
        values (e.g. NaN) are processed as their string form.

    Returns
    -------
    str
        The cleaned text. Runs of spaces are NOT collapsed, so consecutive
        punctuation leaves consecutive spaces behind.
    """
    q = str(q).lower().strip()

    # Spell out special characters.
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # Drop the literal '[math]' marker.
    q = q.replace('[math]', '')

    # Abbreviate large round numbers; the largest magnitude is handled first
    # so that a billion is not rewritten as thousands of thousands.
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Expand whole-token contractions via the lookup table ...
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())
    # ... then catch the common suffixes still attached to other tokens
    # (for example a contraction followed directly by punctuation).
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Strip HTML tags. The parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed.
    q = BeautifulSoup(q, "html.parser").get_text()

    # Replace punctuation and other non-word characters with spaces.
    q = _NON_WORD_RE.sub(' ', q).strip()

    return q


In [13]:
preprocess("I've made! the <b>project</b>?")

'i have made  the project'

In [14]:
# Clean both question columns with the same normalisation routine.
for col in ('question1', 'question2'):
    df[col] = df[col].apply(preprocess)

In [15]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1


In [16]:
# Character length of each cleaned question.
df['q1_len'] = df['question1'].apply(len)
df['q2_len'] = df['question2'].apply(len)

In [17]:
# Number of single-space-delimited pieces in each question (consecutive spaces
# therefore contribute empty pieces to the count).
df['q1_num_words'] = df['question1'].str.split(" ").str.len()
df['q2_num_words'] = df['question2'].str.split(" ").str.len()
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0,63,106,11,19
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1,56,42,10,8
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0,81,158,16,29
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1,65,64,8,8
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1,39,16,8,4


In [18]:
def common_words(row):
    """Count the distinct single-space-delimited tokens present in both questions of ``row``.

    Tokens are lower-cased and stripped before comparison; the empty token
    produced by consecutive spaces is counted like any other.
    """
    first = {token.lower().strip() for token in row['question1'].split(" ")}
    second = {token.lower().strip() for token in row['question2'].split(" ")}
    return len(first.intersection(second))

In [19]:
# Row-wise count of distinct tokens shared by the two questions.
df['word_common'] = df.apply(common_words, axis=1)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0,63,106,11,19,5
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1,56,42,10,8,8
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0,81,158,16,29,9
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1,65,64,8,8,7
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1,39,16,8,4,3


In [20]:
def total_words(row):
    """Return the number of distinct tokens in question1 plus the number in question2.

    Tokens are split on a single space, lower-cased and stripped; a token
    present in both questions is counted once for each question.
    """
    first = {token.lower().strip() for token in row['question1'].split(" ")}
    second = {token.lower().strip() for token in row['question2'].split(" ")}
    distinct_total = len(first) + len(second)
    return distinct_total

In [21]:
# Row-wise sum of the two questions' distinct-token counts.
df['word_total'] = df.apply(total_words, axis=1)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0,63,106,11,19,5,28
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1,56,42,10,8,8,18
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0,81,158,16,29,9,37
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1,65,64,8,8,7,16
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1,39,16,8,4,3,12


In [22]:
# Shared-token count relative to the combined distinct-token count, to two decimals.
df['word_share'] = (df['word_common'] / df['word_total']).round(2)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0,63,106,11,19,5,28,0.18
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1,56,42,10,8,8,18,0.44
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0,81,158,16,29,9,37,0.24
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1,65,64,8,8,7,16,0.44
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1,39,16,8,4,3,12,0.25


In [23]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Filled on first use so the NLTK stop-word corpus is read once rather than
# once per DataFrame row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English NLTK stop words as a frozenset, loading them only on first use."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def fetch_token_features(row):
    """Compute eight token-overlap features for one question pair.

    Parameters
    ----------
    row : mapping
        Must provide ``'question1'`` and ``'question2'`` string values.

    Returns
    -------
    list
        ``[cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
        last_word_eq, first_word_eq]``. The first six are ratios of common
        non-stop words / stop words / tokens to the smaller (min) or larger
        (max) count of the two questions; the last two are 0/1 flags.
        All zeros are returned when either question has no tokens.
    """
    q1 = row['question1']
    q2 = row['question2']

    # Small constant added to every denominator to avoid division by zero.
    SAFE_DIV = 0.0001

    token_features = [0.0]*8

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Set membership is O(1) per token, and the corpus is not re-read per row.
    STOP_WORDS = _english_stop_words()

    # Distinct non-stop words in each question.
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    # Distinct stop words in each question.
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    # Token ratios use the raw token counts (duplicates included) as denominators.
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # 1 when both questions end with the same token.
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # 1 when both questions start with the same token.
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [24]:
token_features = df.apply(fetch_token_features, axis=1)

# Each row's result is an 8-element list; fan the positions out into named columns.
token_feature_names = [
    "cwc_min", "cwc_max",
    "csc_min", "csc_max",
    "ctc_min", "ctc_max",
    "last_word_eq", "first_word_eq",
]
for position, column in enumerate(token_feature_names):
    df[column] = [features[position] for features in token_features]

In [25]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,word_total,word_share,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0,63,106,11,19,...,28,0.18,0.0,0.0,0.666656,0.44444,0.399996,0.235293,0.0,0.0
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1,56,42,10,8,...,18,0.44,0.999975,0.799984,0.999975,0.799984,0.999988,0.799992,1.0,1.0
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0,81,158,16,29,...,37,0.24,0.374995,0.272725,0.857131,0.599994,0.562496,0.333332,0.0,1.0
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1,65,64,8,8,...,16,0.44,0.666644,0.666644,0.99998,0.99998,0.874989,0.874989,0.0,1.0
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1,39,16,8,4,...,12,0.25,0.99995,0.499988,0.499975,0.249994,0.749981,0.374995,0.0,1.0


In [27]:
!pip install distance

import distance

def fetch_length_features(row):
    """Compute three length-based features for the question pair in ``row``.

    Returns ``[abs_len_diff, mean_len, longest_substr_ratio]``: the absolute
    difference and the mean of the two token counts, and the length of a
    longest common substring divided by (shorter question length + 1).
    All zeros are returned when either question has no tokens.
    """
    first, second = row['question1'], row['question2']

    n_first = len(first.split())
    n_second = len(second.split())

    # Nothing to compare when one side is empty.
    if not (n_first and n_second):
        return [0.0, 0.0, 0.0]

    shared = list(distance.lcsubstrings(first, second))
    # An empty result means the strings share no substring; fall back to 0.0.
    if shared:
        ratio = len(shared[0]) / (min(len(first), len(second)) + 1)
    else:
        ratio = 0.0

    return [abs(n_first - n_second), (n_first + n_second) / 2, ratio]


Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/180.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m174.1/180.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: distance
  Building wheel for distance (pyproject.toml) ... [?25l[?25hdone
  Created wheel for distance: filename=distance-0.1.3-py3-none-any.whl size=16321 sha256=514594730e0b5c9e210745e9914f06bd80d9f1b15eae9c6b85c8a095923a371e
  Stored in directory: /root/.cache/pip/wheels/fb/cd/9c/3ab5d666e3bcacc58900b10959edd3816cc9557c7337986322
Succe

In [28]:
length_features = df.apply(fetch_length_features, axis=1)

# Fan the 3-element per-row lists out into one column per feature.
for position, column in enumerate(('abs_len_diff', 'mean_len', 'longest_substr_ratio')):
    df[column] = list(map(lambda triple: triple[position], length_features))


In [29]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0,63,106,11,19,...,0.0,0.666656,0.44444,0.399996,0.235293,0.0,0.0,7.0,13.5,0.15625
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1,56,42,10,8,...,0.799984,0.999975,0.799984,0.999988,0.799992,1.0,1.0,2.0,9.0,0.651163
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0,81,158,16,29,...,0.272725,0.857131,0.599994,0.562496,0.333332,0.0,1.0,11.0,21.5,0.219512
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1,65,64,8,8,...,0.666644,0.99998,0.99998,0.874989,0.874989,0.0,1.0,0.0,8.0,0.676923
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1,39,16,8,4,...,0.499988,0.499975,0.249994,0.749981,0.374995,0.0,1.0,4.0,6.0,0.352941


In [30]:
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):
    """Return four fuzzywuzzy similarity scores for the question pair in ``row``.

    The list is ordered ``[QRatio, partial_ratio, token_sort_ratio,
    token_set_ratio]``.
    """
    first, second = row['question1'], row['question2']

    ratio = fuzz.QRatio(first, second)
    partial = fuzz.partial_ratio(first, second)
    token_sort = fuzz.token_sort_ratio(first, second)
    token_set = fuzz.token_set_ratio(first, second)

    return [ratio, partial, token_sort, token_set]

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [31]:
fuzzy_features = df.apply(fetch_fuzzy_features, axis=1)

# Fan the 4-element per-row score lists out into one column per score.
fuzzy_feature_names = ['fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio']
for position, column in enumerate(fuzzy_feature_names):
    df[column] = [scores[position] for scores in fuzzy_features]

In [32]:
print(df.shape)
df.head()

(51279, 28)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
95724,95724,159576,159577,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...,0,63,106,11,19,...,0.235293,0.0,0.0,7.0,13.5,0.15625,33,48,52,60
233453,233453,311945,198203,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet,1,56,42,10,8,...,0.799992,1.0,1.0,2.0,9.0,0.651163,86,74,86,100
305586,305586,429012,429013,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...,0,81,158,16,29,...,0.333332,0.0,1.0,11.0,21.5,0.219512,49,51,50,72
354462,354462,483592,483593,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...,1,65,64,8,8,...,0.874989,0.0,1.0,0.0,8.0,0.676923,95,95,99,99
214567,214567,49717,126069,how do i hack into someone wifi network,how to hack wifi,1,39,16,8,4,...,0.374995,0.0,1.0,4.0,6.0,0.352941,55,75,58,90


In [33]:
# Text columns only; used further down to build the bag-of-words features.
ques_df = df[['question1','question2']]
ques_df.head()

Unnamed: 0,question1,question2
95724,what are the effects of demonetization is it ...,if demonetisation of 1946 and 1978 failed why...
233453,where can i find beautiful and luxury hotels i...,where can i find luxury hotels in ranikhet
305586,is it possible to write the jsx template for a...,is it possible to extract the sidebar code of ...
354462,what is the difference between entrepreneurs a...,what is the difference between entrepreneurshi...
214567,how do i hack into someone wifi network,how to hack wifi


In [34]:
# Drop the identifiers and the raw text so that only the label and the
# engineered numeric features remain.
final_df = df.drop(columns=['id','qid1','qid2','question1','question2'])
print(final_df.shape)
final_df.head()

(51279, 23)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
95724,0,63,106,11,19,5,28,0.18,0.0,0.0,...,0.235293,0.0,0.0,7.0,13.5,0.15625,33,48,52,60
233453,1,56,42,10,8,8,18,0.44,0.999975,0.799984,...,0.799992,1.0,1.0,2.0,9.0,0.651163,86,74,86,100
305586,0,81,158,16,29,9,37,0.24,0.374995,0.272725,...,0.333332,0.0,1.0,11.0,21.5,0.219512,49,51,50,72
354462,1,65,64,8,8,7,16,0.44,0.666644,0.666644,...,0.874989,0.0,1.0,0.0,8.0,0.676923,95,95,99,99
214567,1,39,16,8,4,3,12,0.25,0.99995,0.499988,...,0.374995,0.0,1.0,4.0,6.0,0.352941,55,75,58,90


In [35]:
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
# Stack the question1 texts on top of the question2 texts so both halves are
# vectorised against one shared vocabulary.
questions = ques_df['question1'].tolist() + ques_df['question2'].tolist()

cv = CountVectorizer(max_features=3000)
bow_matrix = cv.fit_transform(questions).toarray()
# The first half of the rows belongs to question1, the second half to question2.
q1_arr, q2_arr = np.vsplit(bow_matrix, 2)

In [36]:
# Wrap each bag-of-words half in a frame that carries the original row index,
# then place the two halves side by side.
temp_df1, temp_df2 = (
    pd.DataFrame(counts, index=ques_df.index) for counts in (q1_arr, q2_arr)
)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(51279, 6000)

In [37]:
# Append the bag-of-words columns to the hand-crafted features; both frames
# carry the same row index, so rows line up.
final_df = pd.concat([final_df, temp_df], axis=1)
print(final_df.shape)
final_df.head()

(51279, 6023)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
95724,0,63,106,11,19,5,28,0.18,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
233453,1,56,42,10,8,8,18,0.44,0.999975,0.799984,...,0,0,0,0,0,0,0,0,0,0
305586,0,81,158,16,29,9,37,0.24,0.374995,0.272725,...,0,0,0,0,0,0,0,0,0,0
354462,1,65,64,8,8,7,16,0.44,0.666644,0.666644,...,0,0,0,0,0,0,0,0,0,0
214567,1,39,16,8,4,3,12,0.25,0.99995,0.499988,...,0,0,0,0,0,0,0,0,0,0


In [38]:
from sklearn.model_selection import train_test_split
# Column 0 is the is_duplicate label; every remaining column is a feature.
feature_matrix = final_df.iloc[:, 1:].to_numpy()
labels = final_df.iloc[:, 0].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=1
)

In [42]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# Fit an XGBoost classifier with library-default hyper-parameters on the
# training split, then report plain accuracy on the held-out split.
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred1 = xgb.predict(X_test)
accuracy_score(y_test,y_pred1)

0.7945592823712948

In [43]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [44]:

confusion_matrix(y_test,y_pred1)

array([[5421, 1004],
       [1103, 2728]])

In [45]:
def test_common_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
    return len(w1 & w2)

In [46]:
def test_total_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
    return (len(w1) + len(w2))

In [47]:
def test_fetch_token_features(q1,q2):

    SAFE_DIV = 0.0001

    STOP_WORDS = stopwords.words("english")

    token_features = [0.0]*8

    # Converting the Sentence into Tokens:
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))

    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))

    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))


    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features


In [48]:
def test_fetch_length_features(q1,q2):

    length_features = [0.0]*3

    # Converting the Sentence into Tokens:
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features

    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))

    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2

    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)

    return length_features

In [49]:
def test_fetch_fuzzy_features(q1,q2):

    fuzzy_features = [0.0]*4

    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features

In [50]:
def query_point_creator(q1,q2):
    """Build the single-row feature matrix the trained model expects for one question pair.

    Both questions are cleaned with ``preprocess``; the 22 hand-crafted
    features (7 basic, 8 token, 3 length, 4 fuzzy) are then computed in the
    same order as the training columns and followed by the bag-of-words
    vectors of q1 and q2 from the fitted ``cv``.

    Returns a 2-D numpy array with exactly one row.
    """
    q1, q2 = preprocess(q1), preprocess(q2)

    # Basic features: character lengths, word counts, shared/total/ratio.
    shared = test_common_words(q1, q2)
    total = test_total_words(q1, q2)
    input_query = [
        len(q1),
        len(q2),
        len(q1.split(" ")),
        len(q2.split(" ")),
        shared,
        total,
        round(shared / total, 2),
    ]

    # Token, length and fuzzy feature groups, appended in training order.
    input_query += test_fetch_token_features(q1, q2)
    input_query += test_fetch_length_features(q1, q2)
    input_query += test_fetch_fuzzy_features(q1, q2)

    handcrafted = np.array(input_query).reshape(1, 22)

    # Bag-of-words vectors from the already fitted vectorizer.
    q1_bow = cv.transform([q1]).toarray()
    q2_bow = cv.transform([q2]).toarray()

    return np.hstack((handcrafted, q1_bow, q2_bow))

In [51]:
q1 = 'Where is the capital of India?'
q2 = 'What is the current capital of Pakistan?'
q3 = 'Which city serves as the capital of India?'
q4 = 'What is the business capital of India?'

In [52]:
xgb.predict(query_point_creator(q1,q3))

array([0])

In [53]:
import pickle

# Persist the fitted model and vectorizer. The context managers close each
# file handle as soon as its dump finishes instead of leaving it open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(xgb, model_file)
with open('cv.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [None]:
# Also tried deep-learning models, but the results were poor.

In [41]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.3/322.3 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.27.5 xgboost-3.0.2
