In [33]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
import nltk
# nltk.download('punkt')
import chardet
import itertools
##########################################

In [34]:
##########################################
# Load the Quora training dataset
##########################################
# The CSV is read from a hard-coded absolute local path; edit the path below
# (or switch to the commented-out alternative) to run on another machine.
#data = pd.read_csv('/stfm/research5/m1pll00/quora/dataset/train.csv')
data = pd.read_csv('/Users/Priscilla/Desktop/QuoraDataset/train.csv')

# The column drop below is disabled, so every column read from the CSV stays in `data`.
#data = data.drop(['id', 'qid1', 'qid2'], axis=1)
# NOTE: df_train is a second name for the same DataFrame object, not a copy;
# columns added to `data` by later cells are visible through df_train as well.
df_train = data
##########################################

In [35]:
##########################################
# Feature Engineering Method 1:
# Length-based features for question1 and question2 (training data)
##########################################
# Every question is passed through str() first, so a missing value is measured
# as the literal text 'nan' rather than raising.
# Column creation order matters: later cells slice the feature block by label
# range starting at 'len_q1'.

# Feature: total number of characters in each question (whitespace included)
data['len_q1'] = data['question1'].map(str).map(len)
data['len_q2'] = data['question2'].map(str).map(len)

# Feature: signed length difference, len_q1 minus len_q2
data['len_diff'] = data['len_q1'] - data['len_q2']

# Feature: number of characters once space characters (' ') are removed.
# Only the plain space is excluded; tabs and newlines still count.
data['len_char_q1'] = data['question1'].map(str).map(lambda text: len(text) - text.count(' '))
data['len_char_q2'] = data['question2'].map(str).map(lambda text: len(text) - text.count(' '))

# Feature: number of whitespace-separated tokens in each question
data['len_word_q1'] = data['question1'].map(str).map(str.split).map(len)
data['len_word_q2'] = data['question2'].map(str).map(str.split).map(len)

# Feature: number of distinct lower-cased tokens present in both questions.
# Sets are used so a word repeated inside one question is counted once.
# axis=1 makes apply() hand the lambda one row at a time.
data['len_common_words'] = data.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())
    ),
    axis=1,
)
##########################################

In [5]:
# #Evaluate how length-based features impacts the detection of duplicate questions
# #Results: They all seem to return the same distribution (Predicts 'Duplicate' slightly more accurately than 'Not Duplicate')
# data_subset = data.len_word_q1
# plt.figure(figsize=(15, 5))
# plt.hist(data_subset[data['is_duplicate'] == 0], bins=20, normed=True, label='Not Duplicate')
# plt.hist(data_subset[data['is_duplicate'] == 1], bins=20, normed=True, alpha=0.7, label='Duplicate')
# plt.legend()
# plt.title('Label distribution over length-based features', fontsize=15)
# plt.xlabel('Length-based Features', fontsize=15)

In [36]:
####################################################################################
# Feature Engineering Method 2:
# Average share of (non-stop-word) vocabulary that question1 and question2 have in common
####################################################################################
# Build the English stop-word list once as a set so membership checks in
# shared_words() are constant-time.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed
# locally (nltk.download('stopwords')) - confirm on a fresh environment.
stop_words = set(stopwords.words("english"))

##########################################
#Function: shared_words
#Purpose: Finds the shared words between question1 and question2 (exclude stop words)
#Parameters: Dataframe row containing question1 and question2
##########################################
def shared_words(row):
    """Return the share of non-stop-word vocabulary common to both questions.

    Each question is converted with str(), lower-cased, split on whitespace,
    de-duplicated, and filtered against the module-level ``stop_words`` set.

    Parameters:
        row: object exposing ``question1`` and ``question2`` attributes
             (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        0 when either question has no words left after filtering; otherwise
        2 * |common words| / (|question1 words| + |question2 words|), a float
        between 0 and 1.
    """
    def content_words(text):
        # Distinct lower-cased tokens that are not stop words.
        return {token for token in str(text).lower().split() if token not in stop_words}

    q1_vocab = content_words(row.question1)
    q2_vocab = content_words(row.question2)

    # A question made up only of stop words (or empty) cannot share anything.
    if not (q1_vocab and q2_vocab):
        return 0

    # The overlap is counted once from each question's side, hence added twice.
    overlap = len(q1_vocab & q2_vocab)
    return (overlap + overlap) / (len(q1_vocab) + len(q2_vocab))
##########################################

#data['avg_words_shared'] = data.apply(shared_words, axis=1, raw=True)
####################################################################################

In [7]:
# #Evaluate how shared words feature impacts the detection of duplicate questions
# #Results: Good for detecting 'Not Duplicate' questions, but does not do as well to detect 'Duplicate' questions
# plt.figure(figsize=(15, 5))
# train_word_match = data.apply(shared_words, axis=1, raw=True)
# plt.hist(train_word_match[data['is_duplicate'] == 0], bins=20, normed=True, label='Not Duplicate')
# plt.hist(train_word_match[data['is_duplicate'] == 1], bins=20, normed=True, alpha=0.7, label='Duplicate')
# plt.legend()
# plt.title('Label distribution over shared_words', fontsize=15)
# plt.xlabel('shared_words', fontsize=15)

In [8]:
# print(data.question1.iloc[0])
# print(data.question2.iloc[0])
# shared_words(data.iloc[0])

In [37]:
##########################################
# Feature Engineering Method 3:
# Bag-of-words model with TF-IDF normalization
##########################################
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a single vocabulary / IDF weighting on every question from both columns so
# question1 and question2 are projected into the same feature space.
# astype('str') turns missing questions into the text 'nan'.
questions_combined = list(data.question1.values.astype('str')) + list(data.question2.values.astype('str'))
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(questions_combined)

# Vocabulary learned by the vectorizer (author's note: 86153 unique words for
# question1 & question2 combined).
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    complete_vocab = vectorizer.get_feature_names_out().tolist()
else:
    complete_vocab = vectorizer.get_feature_names()

# TF-IDF matrices for each question column, using the shared fitted vocabulary
tfidf_question1 = vectorizer.transform(data.question1.values.astype('str'))
tfidf_question2 = vectorizer.transform(data.question2.values.astype('str'))

# Element-wise difference of the two TF-IDF matrices: an entry is 0 when a term
# carries the same weight in both questions (including when it is absent from both).
diff_idf = tfidf_question1 - tfidf_question2
##########################################

In [10]:
##########################################
# Split the dataset into training and testing datasets
##########################################
#Loads the library required for splitting the dataset
from sklearn.model_selection import train_test_split

# Import the sparse subpackage explicitly: a bare `import scipy` does not
# guarantee that scipy.sparse is loaded.
import scipy.sparse

# The feature slice below ends at 'avg_words_shared', but the assignment of that
# column for the training frame is commented out in the shared_words cell.
# Compute it here if it is missing so the cell works on a fresh kernel.
# It is appended last, which keeps the same column order as the test frame.
if 'avg_words_shared' not in data.columns:
    data['avg_words_shared'] = data.apply(shared_words, axis=1)

#Merge all the features together
# DataFrame.ix was removed in pandas 1.0; .loc performs the same label-based,
# end-inclusive column slice.
data_features = data.loc[:, 'len_q1':'avg_words_shared']
merged_features = scipy.sparse.hstack(blocks=[diff_idf, scipy.sparse.csr_matrix(data_features)])

# Target label
y = data.loc[:, 'is_duplicate']

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(merged_features, y, test_size=0.2, random_state=1317)
##########################################

In [11]:
##########################################
# Train the XGBoost classifier
##########################################
import xgboost as xgb

# Booster settings: binary logistic objective scored with log loss,
# a small learning rate (eta) and shallow trees.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

# Wrap the train / hold-out splits in XGBoost's DMatrix container
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Both sets are evaluated every round; per the training log, the 'test' entry
# is the one used for early stopping.
watchlist = [(d_train, 'train'), (d_test, 'test')]

# Up to 400 boosting rounds, stopping once the monitored metric has not improved
# for 50 rounds; progress is printed every 10 rounds.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)
##########################################

[0]	train-logloss:0.685919	test-logloss:0.685964
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.626919	test-logloss:0.627328
[20]	train-logloss:0.585155	test-logloss:0.585831
[30]	train-logloss:0.55454	test-logloss:0.55543
[40]	train-logloss:0.531317	test-logloss:0.53238
[50]	train-logloss:0.513469	test-logloss:0.514721
[60]	train-logloss:0.499485	test-logloss:0.500857
[70]	train-logloss:0.488101	test-logloss:0.489552
[80]	train-logloss:0.479243	test-logloss:0.480765
[90]	train-logloss:0.472093	test-logloss:0.473666
[100]	train-logloss:0.466059	test-logloss:0.467695
[110]	train-logloss:0.461028	test-logloss:0.462726
[120]	train-logloss:0.456827	test-logloss:0.458625
[130]	train-logloss:0.45338	test-logloss:0.455242
[140]	train-logloss:0.450331	test-logloss:0.452287
[150]	train-logloss:0.447878	test-logloss:0.449935
[160]	train-logloss:0.445777	test-logloss:0.447907

In [12]:
#Saves Classifier
# sklearn.externals.joblib was removed in scikit-learn 0.23. Prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Serialize the trained booster to the current working directory
joblib.dump(bst, 'xgboost_model.pkl')

['xgboost_model.pkl']

In [38]:
#Loads Classifier
# sklearn.externals.joblib was removed in scikit-learn 0.23. Prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load a model file you created yourself or otherwise trust.
# Rebinds `bst` to the booster stored in the current working directory.
bst = joblib.load('xgboost_model.pkl')

In [39]:
##########################################
# Load the Quora test dataset
##########################################
# The CSV is read from a hard-coded absolute local path; edit the path below
# (or switch to the commented-out alternative) to run on another machine.
data_test = pd.read_csv('/Users/Priscilla/Desktop/QuoraDataset/test.csv')
#data_test = pd.read_csv('/stfm/research5/m1pll00/quora/dataset/test.csv')
##########################################

In [40]:
##########################################
# Feature Engineering Method 1:
# Length-based features for question1 and question2 (test data)
##########################################
# Every question is passed through str() first, so a missing value is measured
# as the literal text 'nan' rather than raising.
# Column creation order matters: a later cell slices the feature block by label
# range starting at 'len_q1'.

# Feature: total number of characters in each question (whitespace included)
data_test['len_q1'] = data_test['question1'].map(str).map(len)
data_test['len_q2'] = data_test['question2'].map(str).map(len)

# Feature: signed length difference, len_q1 minus len_q2
data_test['len_diff'] = data_test['len_q1'] - data_test['len_q2']

# Feature: number of characters once space characters (' ') are removed.
# Only the plain space is excluded; tabs and newlines still count.
data_test['len_char_q1'] = data_test['question1'].map(str).map(lambda text: len(text) - text.count(' '))
data_test['len_char_q2'] = data_test['question2'].map(str).map(lambda text: len(text) - text.count(' '))

# Feature: number of whitespace-separated tokens in each question
data_test['len_word_q1'] = data_test['question1'].map(str).map(str.split).map(len)
data_test['len_word_q2'] = data_test['question2'].map(str).map(str.split).map(len)

# Feature: number of distinct lower-cased tokens present in both questions.
# Sets are used so a word repeated inside one question is counted once.
# axis=1 makes apply() hand the lambda one row at a time.
data_test['len_common_words'] = data_test.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())
    ),
    axis=1,
)
##########################################

In [41]:
# Share of non-stop-word vocabulary the two questions have in common, per row.
# raw=True is deliberately not passed: on current pandas it hands the function a
# bare NumPy array, and shared_words() reads row.question1 / row.question2 by name.
data_test['avg_words_shared'] = data_test.apply(shared_words, axis=1)

In [42]:
question1_df = data_test['question1']
question2_df = data_test['question2']

# Project each test question column through the vectorizer fitted earlier in the
# notebook, so the test matrices share the training vocabulary.
# astype('str') turns missing questions into the text 'nan'.
tfidf_question1_test, tfidf_question2_test = (
    vectorizer.transform(column.values.astype('str'))
    for column in (question1_df, question2_df)
)

# Element-wise difference of the two TF-IDF matrices: an entry is 0 when a term
# carries the same weight in both questions (including when it is absent from both).
diff_idf_test = tfidf_question1_test - tfidf_question2_test

In [43]:
# Import the sparse subpackage explicitly: a bare `import scipy` does not
# guarantee that scipy.sparse is loaded.
import scipy.sparse

# DataFrame.ix was removed in pandas 1.0; .loc performs the same label-based,
# end-inclusive column slice over the engineered features.
data_features = data_test.loc[:, 'len_q1':'avg_words_shared']
# TF-IDF difference columns first, engineered features after them.
merged_features_test = scipy.sparse.hstack(blocks=[diff_idf_test, scipy.sparse.csr_matrix(data_features)])

In [44]:
import xgboost as xgb
# Wrap the merged test feature matrix in XGBoost's DMatrix container (no labels).
test = xgb.DMatrix(merged_features_test)
# Score every test pair with the booster currently bound to `bst`.
# NOTE(review): presumably these are duplicate probabilities, given the
# 'binary:logistic' objective set in the training cell - confirm for the loaded model.
test_prediction = bst.predict(test)

In [45]:
# Submission table: the test ids next to the model output, in that column order.
sub = data_test[['test_id']].assign(is_duplicate=test_prediction)
# Written to the current working directory without the index column.
sub.to_csv('simple_xgb.csv', index=False)