In [115]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from xgboost import XGBClassifier

# Widen the notebook's cell container to 90% of the page so that wide
# DataFrame outputs are easier to read.
# `display` and `HTML` come from the public `IPython.display` module:
# importing `display` from `IPython.core.display` has been deprecated since
# IPython 7.14 and may fail on newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [116]:
# source: https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs

# Read the tab-separated question-pair file and discard the three
# identifier columns in a single chained expression.
data = pd.read_csv('quora_duplicate_questions.tsv', sep='\t').drop(
    columns=['id', 'qid1', 'qid2'])

In [117]:
# Basic length / overlap features.
# Every cell is passed through str() before being measured, so non-string
# values are measured by their text representation instead of raising.

def _n_chars(text):
    """Length of str(text) in characters."""
    return len(str(text))


def _n_distinct_chars(text):
    """Number of distinct characters in str(text), space characters removed."""
    return len(set(str(text).replace(' ', '')))


def _n_words(text):
    """Number of whitespace-separated tokens in str(text)."""
    return len(str(text).split())


def _n_shared_words(row):
    """Number of distinct lower-cased tokens that occur in both questions of a row."""
    words_q1 = set(str(row['question1']).lower().split())
    words_q2 = set(str(row['question2']).lower().split())
    return len(words_q1 & words_q2)


# total length of each question
data['len_q1'] = data['question1'].apply(_n_chars)
data['len_q2'] = data['question2'].apply(_n_chars)
# signed difference of the two lengths (question1 minus question2)
data['diff_len'] = data['len_q1'] - data['len_q2']

# number of distinct non-space characters per question
data['len_char_q1'] = data['question1'].apply(_n_distinct_chars)
data['len_char_q2'] = data['question2'].apply(_n_distinct_chars)

# number of words per question
data['len_word_q1'] = data['question1'].apply(_n_words)
data['len_word_q2'] = data['question2'].apply(_n_words)

# number of (case-insensitive) words the two questions have in common
data['common_words'] = data.apply(_n_shared_words, axis=1)

In [118]:
# Column names of the basic length / overlap feature set.
fs_1 = [
    'len_q1',
    'len_q2',
    'diff_len',
    'len_char_q1',
    'len_char_q2',
    'len_word_q1',
    'len_word_q2',
    'common_words',
]

In [119]:
# Fuzzy string-similarity features: one column per fuzzywuzzy scorer.
# Each scorer is applied row by row to str(question1) and str(question2);
# the columns are created in the order listed here.
_fuzz_scorers = [
    ('fuzz_QRatio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
]

for _column, _scorer in _fuzz_scorers:
    # The lambda is consumed inside the same iteration, so it always sees
    # the scorer that belongs to the column being filled.
    data[_column] = data.apply(
        lambda row: _scorer(str(row['question1']), str(row['question2'])),
        axis=1)

In [120]:
# Column names of the fuzzy string-similarity feature set.
fs_2 = [
    'fuzz_QRatio',
    'fuzz_WRatio',
    'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio',
    'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio',
    'fuzz_token_sort_ratio',
]

In [121]:
# Display five randomly drawn rows to eyeball the engineered columns.
data.sample(n=5)

Unnamed: 0,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzz_QRatio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio
345176,Can I make money online?,What are the easiest ways to make good money u...,1,24,64,-40,12,19,5,12,2,44,86,58,100,61,61,42
388401,What is 1?,What is √-1?,0,10,12,-2,8,10,3,3,2,90,95,80,100,100,100,100
53958,Which would be better for professional bloggin...,Which blog platform is better: WordPress or Bl...,0,95,52,43,28,21,13,8,2,46,86,56,100,54,67,58
16213,Is India still considered to be a third world ...,Is mexico a third world country?,0,54,32,22,18,19,10,6,5,67,86,78,100,65,87,62
244924,I know that the Milky Way is the Galaxy holds ...,Why didn't any of the other planets in the Sol...,0,109,151,-42,27,26,20,29,5,44,52,50,100,61,53,55


### Logistic Regression

In [122]:
# Evaluate logistic regression on all engineered features (fs_1 + fs_2)
# with k-fold cross validation and report per-fold and mean accuracy.
# KFold is used with its default settings (no shuffling).
n_splits = 10
kf = KFold(n_splits=n_splits)

# Features and labels do not change between folds, so they are converted
# to arrays once here instead of inside the loop.
features = np.array(data[fs_1 + fs_2])
labels = np.array(data['is_duplicate'])

cv_accuracies = []
for train, val in kf.split(features):
    # `train` / `val` are integer row positions for this fold
    train_X, train_y = features[train], labels[train]
    val_X, val_y = features[val], labels[val]

    # a fresh model per fold so no state carries over between folds
    lr_model = LogisticRegression(max_iter=1000, random_state=1)
    lr_model.fit(train_X, train_y)
    lr_prediction = lr_model.predict(val_X)

    cv_accuracies.append(metrics.accuracy_score(val_y, lr_prediction))

average = sum(cv_accuracies) / n_splits

print('Accuracies with ' + str(n_splits) + '-fold cross validation: ')
for cv_accuracy in cv_accuracies:
    print(cv_accuracy)

print('Average: ' + str(average))

Accuracies with 10-fold cross validation: 
0.6624700091518464
0.6595018427366495
0.6580424942491776
0.6603428232209553
0.6594276385762695
0.6588340052932301
0.6622721313908333
0.662841029953746
0.6626184174726063
0.6685052808627471
Average: 0.6614855672908061


### XGBoost

In [123]:
# Evaluate XGBoost on all engineered features (fs_1 + fs_2) with k-fold
# cross validation and report per-fold and mean accuracy.
# KFold is used with its default settings (no shuffling).
n_splits = 10
kf = KFold(n_splits=n_splits)

# Features and labels do not change between folds, so they are converted
# to arrays once here instead of inside the loop.
features = np.array(data[fs_1 + fs_2])
labels = np.array(data['is_duplicate'])

cv_accuracies = []
for train, val in kf.split(features):
    # `train` / `val` are integer row positions for this fold
    train_X, train_y = features[train], labels[train]
    val_X, val_y = features[val], labels[val]

    # a fresh model per fold so no state carries over between folds
    xgb_model = XGBClassifier(n_estimators=500, random_state=1)
    xgb_model.fit(train_X, train_y)
    xgb_prediction = xgb_model.predict(val_X)

    cv_accuracies.append(metrics.accuracy_score(val_y, xgb_prediction))

average = sum(cv_accuracies) / n_splits

print('Accuracies with ' + str(n_splits) + '-fold cross validation: ')
for cv_accuracy in cv_accuracies:
    print(cv_accuracy)

print('Average: ' + str(average))

Accuracies with 10-fold cross validation: 
0.729130079893146
0.7279675480471939
0.725469341314403
0.7314304088649237
0.7252961982735165
0.7267060773207351
0.7288085285314997
0.7283633035692201
0.7286106507704865
0.7275717925251676
Average: 0.7279353929110292


In [126]:
# Final model: fit XGBoost on every row of `data` (no hold-out split),
# using all engineered feature columns as inputs and `is_duplicate` as target.
all_X = data[fs_1 + fs_2]
all_y = data['is_duplicate']

xgb_model = XGBClassifier(n_estimators=500, random_state=1)
xgb_model.fit(all_X, all_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

### Detect if new questions are duplicates

In [129]:
def _pair_features(question1, question2):
    """Build the one-row feature frame for a single question pair.

    Produces the columns named in ``fs_1`` and ``fs_2`` (plus the two
    question texts), computed with the same expressions used for the
    training data, so the row can be indexed with ``fs_1 + fs_2`` and
    handed to the trained model.

    Parameters
    ----------
    question1, question2 : str
        The two sentences to compare; each is passed through ``str()``.

    Returns
    -------
    pandas.DataFrame
        A single row holding both questions and every feature column.
    """
    text_1, text_2 = str(question1), str(question2)
    words_1 = set(text_1.lower().split())
    words_2 = set(text_2.lower().split())

    row = {
        'question1': text_1,
        'question2': text_2,
        # length based features
        'len_q1': len(text_1),
        'len_q2': len(text_2),
        'diff_len': len(text_1) - len(text_2),
        # number of distinct non-space characters
        'len_char_q1': len(set(text_1.replace(' ', ''))),
        'len_char_q2': len(set(text_2.replace(' ', ''))),
        # number of words
        'len_word_q1': len(text_1.split()),
        'len_word_q2': len(text_2.split()),
        # words the two questions have in common (case-insensitive)
        'common_words': len(words_1 & words_2),
        # fuzzy string-similarity scores
        'fuzz_QRatio': fuzz.QRatio(text_1, text_2),
        'fuzz_WRatio': fuzz.WRatio(text_1, text_2),
        'fuzz_partial_ratio': fuzz.partial_ratio(text_1, text_2),
        'fuzz_partial_token_set_ratio': fuzz.partial_token_set_ratio(text_1, text_2),
        'fuzz_partial_token_sort_ratio': fuzz.partial_token_sort_ratio(text_1, text_2),
        'fuzz_token_set_ratio': fuzz.token_set_ratio(text_1, text_2),
        'fuzz_token_sort_ratio': fuzz.token_sort_ratio(text_1, text_2),
    }
    return pd.DataFrame([row])


# Ask the user for two sentences and classify the pair with the trained model.
q1 = input('Enter first sentence: ')
q2 = input('Enter second sentence: ')
new_data = _pair_features(q1, q2)

# predict() returns an array with one entry per input row; there is exactly
# one row here, so take that entry explicitly instead of comparing the whole
# array to a scalar.
xgb_prediction = xgb_model.predict(new_data[fs_1 + fs_2])
if xgb_prediction[0] == 1:
    print('The sentences you have entered are considered duplicate.')
else:
    print('The sentences you have entered are considered NOT duplicate.')

Enter first sentence: What is food in Spanish?
Enter second sentence: How to say food in Spanish?
The sentences you have entered are considered duplicate.
