In [1]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import scipy
import xgboost as xgb
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pylev import levenshtein
import re
import nltk
# nltk.download('punkt')
import chardet
import itertools
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import word2vec, KeyedVectors
from scipy.stats import kurtosis
from sklearn.externals import joblib
import xgboost as xgb
##########################################

In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('train.csv')

# Missing questions are read as NaN. Blank them out before casting to str:
# astype(str) on its own would turn NaN into the literal text 'nan', whereas the
# test set is cleaned by replacing NaN with ''. Both sets now use ''.
data['question1'] = data['question1'].fillna('').astype(str)
data['question2'] = data['question2'].fillna('').astype(str)

# Target label and the name the later cells use for the training frame.
y = data['is_duplicate']
df_train = data
##########################################

In [3]:
# Module-level resources shared by the feature functions and transformers below.
# English stop words from NLTK, held in a set for membership tests.
stop_words = set(stopwords.words('english'))
# Word-vector model loaded from disk; it is queried through `model.wv`.
model = KeyedVectors.load('300features_10minwords_5context')

def question_to_wordlist(text, remove_stopwords = False):
    """Split a question into lower-case, letters-only tokens.

    Every character that is not an ASCII letter is replaced by a space before
    the text is lower-cased and split on whitespace.

    Args:
        text: The question as a string.
        remove_stopwords: When True, NLTK English stop words are dropped.

    Returns:
        A list of tokens (possibly empty).
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

def makeDistributionalFeatures(q1,q2):
    """Summarise word2vec similarities between the words of each question pair.

    For every pair, the similarity of each (word from question 1, word from
    question 2) combination is looked up in the module-level `model`; the
    resulting values are reduced to mean, median, standard deviation and
    kurtosis.

    Questions are paired by position, so the inputs may carry any index
    (e.g. a filtered or shuffled frame) and may also be plain sequences.

    Args:
        q1: Iterable of first questions.
        q2: Iterable of second questions, same length and order as `q1`.

    Returns:
        A list with one `[mean, median, std, kurtosis]` entry per pair. Pairs
        with no usable word combination get `[-1, -1, -1, -1]`.
    """
    features = []

    for raw_question1, raw_question2 in zip(q1, q2):
        #Convert question1 and question2 into a list of words
        question1 = question_to_wordlist(str(raw_question1))
        question2 = question_to_wordlist(str(raw_question2))

        #Tracks word2vec similarity metric for every word combination
        values = []
        for word1, word2 in itertools.product(question1, question2):
            try:
                values.append(model.wv.similarity(word1, word2))
            except KeyError:
                # At least one word is missing from the model's vocabulary:
                # deliberately skip this combination and keep going.
                continue

        if values:
            features.append([np.mean(values), np.median(values), np.std(values), kurtosis(values)])
        else:
            #Rows are never dropped, so append [-1,-1,-1,-1] as stand in features when
            # 1 - Every combination contained a word our model does not contain
            # 2 - Question1 or Question2 or both were ""
            # 3 - We could not tokenize either Question1 or Question2
            features.append([-1,-1,-1,-1])
    return features

def word_lengths(q1,q2):
    """Compute length-based features for each question pair.

    Args:
        q1: pandas Series named `question1`.
        q2: pandas Series named `question2`, aligned with `q1` on the index.

    Returns:
        A DataFrame indexed like the inputs with the columns, in this order:
        `len_q1`, `len_q2`, `len_diff`, `len_char_q1`, `len_char_q2`,
        `len_word_q1`, `len_word_q2`, `len_common_words`.
    """
    data = pd.concat([q1, q2], axis=1)

    # Work on string versions so non-string cells (e.g. NaN) do not raise.
    q1_text = data.question1.map(str)
    q2_text = data.question2.map(str)

    # The result frame is built explicitly instead of slicing with the
    # removed `DataFrame.ix` indexer.
    features = pd.DataFrame(index=data.index)

    #Feature: Length of Question
    features['len_q1'] = q1_text.map(len)
    features['len_q2'] = q2_text.map(len)

    #Feature: Difference in length between the Questions
    features['len_diff'] = features['len_q1'] - features['len_q2']

    #Feature: Character count of Question (spaces excluded)
    features['len_char_q1'] = q1_text.map(lambda text: len(text.replace(' ', '')))
    features['len_char_q2'] = q2_text.map(lambda text: len(text.replace(' ', '')))

    #Feature: Word count of Question
    features['len_word_q1'] = q1_text.map(lambda text: len(text.split()))
    features['len_word_q2'] = q2_text.map(lambda text: len(text.split()))

    #Feature: Common words between the Questions (case-insensitive, distinct words)
    features['len_common_words'] = [
        len(set(first.lower().split()) & set(second.lower().split()))
        for first, second in zip(q1_text, q2_text)
    ]
    return features

def shared_words(q1,q2):
    """Return the share of non-stop words that two questions have in common.

    Each question is lower-cased, split on whitespace, de-duplicated and
    stripped of the module-level `stop_words`. The score is the number of
    shared words counted once per question, divided by the total number of
    remaining words in both questions.

    Returns:
        0 when either question has no non-stop words, otherwise the ratio
        described above.
    """
    q1_content = set(str(q1).lower().split()).difference(stop_words)
    q2_content = set(str(q2).lower().split()).difference(stop_words)

    #Question contains only stop words (or is an empty string)
    if not q1_content or not q2_content:
        return 0

    # A shared word appears once in each question's list, hence the factor 2.
    overlap = len(q1_content & q2_content)
    return (overlap + overlap)/(len(q1_content) + len(q2_content))

class LevDistanceTransformer(BaseEstimator, TransformerMixin):
    """Normalised Levenshtein distance between the two questions of each pair.

    `transform` takes `[q1_list, q2_list]` (two equally long sequences of
    strings) and returns a float array of shape (n_pairs, 1).
    """

    def __init__(self):
        pass

    def transform(self, question_list):
        """Return levenshtein(q1, q2) / (2 * (len(q1) + len(q2))) per pair.

        A pair of two empty strings yields 0.0 instead of dividing by zero.
        """
        q1_list = question_list[0]
        q2_list = question_list[1]

        lev_dist = []
        for first, second in zip(q1_list, q2_list):
            # Same value as the previous per-character `x.count('')` sums:
            # every single character contributes 2, i.e. 2 * len(string).
            normaliser = 2.0 * (len(first) + len(second))
            if normaliser == 0:
                # Both questions are empty: identical strings, distance 0.
                lev_dist.append(0.0)
            else:
                lev_dist.append(float(levenshtein(first, second)) / normaliser)

        return np.array(lev_dist, dtype=float).reshape(len(lev_dist), 1)

    def fit(self, question_list, y=None):
        """Nothing is learned from the data; returns `self`."""
        return self
    
class TfIdfDiffTransformer(BaseEstimator, TransformerMixin):
    """Difference between the TF-IDF vectors of the two questions of each pair.

    Args:
        total_words: Fixed vocabulary (iterable of terms) that defines the
            output columns.
    """

    def __init__(self, total_words):
        # Stored on the instance so the transformer no longer relies on a
        # global of the same name.
        self.total_words = total_words

    def transform(self, question_list):
        """Return the sparse matrix tfidf(q1) - tfidf(q2), one row per pair.

        The IDF weights are fitted on the questions passed to this call.
        """
        q1_list = question_list[0]
        q2_list = question_list[1]

        # list(...) forces real concatenation; `+` on two pandas Series would
        # instead glue each pair of questions into a single string.
        total_questions = [x for x in list(q1_list) + list(q2_list)
                           if not isinstance(x, float)]

        vectorizer = TfidfVectorizer(stop_words = 'english', vocabulary = self.total_words)
        vectorizer.fit(total_questions)
        tf_diff = vectorizer.transform(q1_list) - vectorizer.transform(q2_list)
        return tf_diff

    def fit(self, question_list, y=None):
        """Nothing is learned here; returns `self`."""
        return self
    
class CosineDistTransformer(BaseEstimator, TransformerMixin):
    """Cosine similarity between the TF-IDF vectors of each question pair.

    `transform` takes `[q1_list, q2_list]` and returns a float array of shape
    (n_pairs, 1).
    """

    def __init__(self):
        pass

    def transform(self, question_list):
        """Fit TF-IDF on the given questions and return per-pair similarity."""
        q1_list = question_list[0]
        q2_list = question_list[1]

        # list(...) forces real concatenation; `+` on two pandas Series would
        # instead glue each pair of questions into a single string.
        total_questions = [x for x in list(q1_list) + list(q2_list)
                           if not isinstance(x, float)]

        # norm='l2' (the default) is spelled out because the shortcut below
        # depends on every row having unit length.
        vectorizer = TfidfVectorizer(stop_words = 'english', norm = 'l2')
        vectorizer.fit(total_questions)

        q1_tf = vectorizer.transform(q1_list)
        q2_tf = vectorizer.transform(q2_list)

        # With unit-length rows the cosine similarity is the row-wise dot
        # product; all-zero rows give 0. One sparse operation replaces one
        # cosine_similarity call per pair.
        cos_sim = np.asarray(q1_tf.multiply(q2_tf).sum(axis=1), dtype=float)
        return cos_sim.reshape(q1_tf.shape[0], 1)

    def fit(self, question_list, y=None):
        """Nothing is learned here; returns `self`."""
        return self
    
class AverageSharedWords(BaseEstimator, TransformerMixin):
    """Scores each question pair with `shared_words`; returns a column vector."""

    def __init__(self):
        pass

    def fit(self, question_list, y=None):
        """Nothing is learned from the data; returns `self`."""
        return self

    def transform(self, question_list):
        """Return an array of shape (n_pairs, 1) with one score per pair."""
        first_questions, second_questions = question_list[0], question_list[1]

        scores = []
        for left, right in zip(first_questions, second_questions):
            scores.append(shared_words(left, right))

        return np.array(scores).reshape(len(scores), 1)

class WordLengths(BaseEstimator, TransformerMixin):
    """Wraps `word_lengths` so its features can be used inside a FeatureUnion."""

    def __init__(self):
        pass

    def fit(self, question_list, y=None):
        """Nothing is learned from the data; returns `self`."""
        return self

    def transform(self, question_list):
        """Return the `word_lengths` features of the pairs as a NumPy array."""
        return np.array(word_lengths(question_list[0], question_list[1]))
    
class Word2VecStats(BaseEstimator, TransformerMixin):
    """Wraps `makeDistributionalFeatures` for use inside a FeatureUnion."""

    def __init__(self):
        pass

    def fit(self, question_list, y=None):
        """Nothing is learned from the data; returns `self`."""
        return self

    def transform(self, question_list):
        """Return the per-pair similarity statistics as a NumPy array."""
        return np.array(makeDistributionalFeatures(question_list[0], question_list[1]))

In [None]:
##########################################
# Combining all the features using FeatureUnion
##########################################
# Build the shared TF-IDF vocabulary from every training question. The two
# columns are stacked with pd.concat so each question is its own document;
# `question1 + question2` on Series glues each pair into one string with no
# separator, which can fuse the words at the boundary into a bogus token.
# (For a quick dry run, stack a slice such as [0:5000] of each column instead.)
vectorizer = TfidfVectorizer(stop_words = 'english')
vectorizer.fit(pd.concat([df_train['question1'], df_train['question2']]))

# Sorted so the term -> column mapping is identical on every run; the iteration
# order of a set of strings can change between Python processes.
total_words = sorted(set(vectorizer.get_feature_names()))

comb_features = FeatureUnion([('tf', TfIdfDiffTransformer(total_words)), 
                              ('cos_diff',CosineDistTransformer()), 
                              ('lev', LevDistanceTransformer()),
                              ('AvgWords', AverageSharedWords()),
                              ('WordLengths', WordLengths()),
                              ('Word2VecStats', Word2VecStats())
                             ])
##########################################

In [None]:
##########################################
# Split the dataset into training and testing datasets
##########################################
# (For a quick dry run, slice the label and both question columns with [0:5000].)
# Plain column selection replaces the removed `DataFrame.ix` indexer.
y = df_train['is_duplicate']
all_features = comb_features.transform([df_train['question1'], df_train['question2']])

# Hold out 20% of the pairs for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(all_features, y, test_size=0.2, random_state=1317)
##########################################

In [None]:
##########################################
# Running XGBoost
##########################################
# Booster settings: binary classifier evaluated with log loss.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 8,
}

# Wrap the train / hold-out splits in XGBoost's matrix type.
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Both sets are evaluated every round.
watchlist = [(d_train, 'train'), (d_test, 'test')]

# Up to 400 boosting rounds with 50 early-stopping rounds; progress is printed
# every 10 rounds.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10
)

# Persist the trained booster.
joblib.dump(bst, 'xgboost_model_400iterations_8depth.pkl')
##########################################

In [None]:
##########################################
# Loads in Quora Test Dataset
##########################################
# Read the test set and turn every missing value (np.nan) into ''.
df_test = pd.read_csv('test.csv').replace(np.nan, '', regex=True)

# Write the cleaned frame back to disk.
df_test.to_csv('cleaned_test.csv')
##########################################

In [None]:
##########################################
# Create the test features using FeatureUnion
##########################################
# Same feature pipeline as for training, applied to the test question columns.
# (For a quick dry run, slice both columns with [0:5000].)
test_features = comb_features.transform(
    [df_test['question1'], df_test['question2']]
)

# Keep a copy on disk.
joblib.dump(test_features, 'test_features.pkl')
##########################################

In [None]:
#Ensure Train and Test Features are the same size
# Fail loudly if the column counts differ instead of relying on a visual check.
assert all_features.shape[1] == test_features.shape[1], (
    "train and test feature matrices have different numbers of columns: "
    "%d vs %d" % (all_features.shape[1], test_features.shape[1])
)
all_features

In [None]:
# Display the test feature matrix so it can be compared with `all_features`.
test_features

In [None]:
##########################################
# Predicting using XGBoost
##########################################
test = xgb.DMatrix(test_features)

# Training used early stopping. On XGBoost versions that expose
# `best_ntree_limit`, a plain predict() uses the model from the last round, so
# the best round is requested explicitly. Where the attribute does not exist,
# fall back to a plain predict().
best_ntree_limit = getattr(bst, 'best_ntree_limit', None)
if best_ntree_limit:
    test_prediction = bst.predict(test, ntree_limit=best_ntree_limit)
else:
    test_prediction = bst.predict(test)
##########################################

In [None]:
##########################################
# Creating Submission File
##########################################
# One row per test pair: its id and the predicted duplicate probability.
# (For a quick dry run, slice df_test['test_id'] with [0:5000].)
sub = pd.DataFrame(
    {'test_id': df_test['test_id'], 'is_duplicate': test_prediction},
    columns=['test_id', 'is_duplicate']
)
sub.to_csv('simple_xgb.csv', index=False)
##########################################

In [None]:
#Check Submission File Length
# `sub` takes its rows from df_test['test_id'], so this should equal len(df_test).
len(sub)