In [1]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import scipy
import xgboost as xgb
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pylev import levenshtein
import re
import nltk
# nltk.download('punkt')
import chardet
import itertools
from sklearn.base import BaseEstimator, TransformerMixin
##########################################

In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('train.csv')
data['question1'] = data['question1'].astype(str)
data['question2'] = data['question2'].astype(str)
y = data['is_duplicate']
#Drop irrelevant features
#data = data.drop(['id', 'qid1', 'qid2'], axis=1)
df_train = data
##########################################

In [3]:
#These are the functions for our transformers
stop_words = set(stopwords.words("english"))

def shared_words(q1,q2):
    question1_words = []
    question2_words = []

    for word in set(str(q1).lower().split()):
        if word not in stop_words:
            question1_words.append(word)

    for word in set(str(q2).lower().split()):
        if word not in stop_words:
            question2_words.append(word)

    #Question contains only stop words (or is an empty string)
    if len(question1_words) == 0 or len(question2_words) == 0:
        return 0

    question1_shared_words = [w for w in question1_words if w in question2_words]
    question2_shared_words = [w for w in question2_words if w in question1_words]

    avg_words_shared = (len(question1_shared_words) + len(question2_shared_words))/(len(question1_words) + len(question2_words))
    return avg_words_shared

class LevDistanceTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        
        lev_distance_strings = [[a,b] 
        for a,b in zip(q1_list, q2_list)]
        
        lev_dist_array = np.array([
    (float(levenshtein(pair[0], pair[1]))/
    (float(sum([x.count('') for x in pair[0]])) + 
    float(sum([x.count('') for x in pair[1]])))) 
    for pair in lev_distance_strings 
        ])
        
        return lev_dist_array.reshape(len(lev_dist_array),1)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class TfIdfDiffTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self, total_words):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        total_questions = q1_list + q2_list
        total_questions = [x for x in total_questions if type(x) != float]
        
        vectorizer = TfidfVectorizer(stop_words = 'english', vocabulary = total_words)
        vectorizer.fit(total_questions)
        tf_diff = vectorizer.transform(q1_list) - vectorizer.transform(q2_list)
        return tf_diff

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class CosineDistTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        total_questions = q1_list + q2_list
        total_questions = [x for x in total_questions if type(x) != float]
        
        vectorizer = TfidfVectorizer(stop_words = 'english')
        vectorizer.fit(total_questions)
        
        q1_tf = vectorizer.transform(q1_list) 
        q2_tf = vectorizer.transform(q2_list)
        cos_sim = []
        for i in range(0,len(q1_list)):
            cos_sim.append(cosine_similarity(q1_tf[i], q2_tf[i])[0][0])
            
        return np.array(cos_sim).reshape(len(cos_sim),1)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class AverageSharedWords(BaseEstimator, TransformerMixin):
	"""Takes in two lists of strings, extracts the lev distance between each string, returns list"""

	def __init__(self):
		pass

	def transform(self, question_list):
		q1_list = question_list[0]
		q2_list = question_list[1]
		avg_words = [shared_words(q1,q2) for q1, q2 in zip(q1_list, q2_list)]
			
		return np.array(avg_words).reshape(len(avg_words),1)

	def fit(self, question_list, y=None):
		"""Returns `self` unless something different happens in train and test"""
		return self

In [4]:
vectorizer = TfidfVectorizer(stop_words = 'english')
vectorizer.fit(data['question1'] + data['question2'])
total_words = list(set(vectorizer.get_feature_names()))

comb_features = FeatureUnion([('tf', TfIdfDiffTransformer(total_words)), 
                              ('cos_diff',CosineDistTransformer()), 
                              ('lev', LevDistanceTransformer()),
                              ('AvgWords', AverageSharedWords())
                             ])

In [7]:
##########################################
# Split the dataset into training and testing datasets
##########################################
y = data.ix[:,'is_duplicate'][0:5000]
all_features = comb_features.transform([data['question1'][0:5000], data['question2'][0:5000]])
X_train, X_test, y_train, y_test = train_test_split(all_features, y, test_size=0.2, random_state=1317)
##########################################

In [8]:
##########################################
# Running XGBoost
##########################################
# Set parameters for XGBoost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

watchlist = [(d_train, 'train'), (d_test, 'test')]

bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)
##########################################

[0]	train-logloss:0.68748	test-logloss:0.688019
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.640869	test-logloss:0.646349
[20]	train-logloss:0.606676	test-logloss:0.616444
[30]	train-logloss:0.580836	test-logloss:0.594589
[40]	train-logloss:0.561546	test-logloss:0.5785
[50]	train-logloss:0.546602	test-logloss:0.566265
[60]	train-logloss:0.534777	test-logloss:0.556676
[70]	train-logloss:0.52541	test-logloss:0.549253
[80]	train-logloss:0.517643	test-logloss:0.543283
[90]	train-logloss:0.511127	test-logloss:0.538703
[100]	train-logloss:0.505886	test-logloss:0.534731
[110]	train-logloss:0.501483	test-logloss:0.531589
[120]	train-logloss:0.497608	test-logloss:0.529153
[130]	train-logloss:0.494044	test-logloss:0.527198
[140]	train-logloss:0.490723	test-logloss:0.525586
[150]	train-logloss:0.487635	test-logloss:0.524478
[160]	train-logloss:0.484772	test-logloss:0.523315

In [12]:
#Saves Classifier
from sklearn.externals import joblib
joblib.dump(bst, 'xgboost_model.pkl')

['xgboost_model.pkl']

In [38]:
#Loads Classifier
from sklearn.externals import joblib
bst = joblib.load('xgboost_model.pkl')

In [39]:
##########################################
# Loads in Quora Test Dataset
##########################################
#Test Dataset
data_test = pd.read_csv('/Users/Priscilla/Desktop/QuoraDataset/test.csv')
#data_test = pd.read_csv('/stfm/research5/m1pll00/quora/dataset/test.csv')
##########################################

In [40]:
##########################################
# Create the test features using FeatureUnion
##########################################
test_features = comb_features.transform([data_test['question1'][0:5000], data_test['question2'][0:5000]])


In [44]:
import xgboost as xgb
test = xgb.DMatrix(test_features)
test_prediction = bst.predict(test)

In [45]:
sub = pd.DataFrame()
sub['test_id'] = data_test['test_id']
sub['is_duplicate'] = test_prediction
sub.to_csv('simple_xgb.csv', index=False)