In [1]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import scipy
import xgboost as xgb
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pylev import levenshtein
import re
import nltk
# nltk.download('punkt')
import chardet
import itertools
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import word2vec, KeyedVectors
from scipy.stats import kurtosis
from sklearn.externals import joblib
from gensim.corpora import Dictionary
from gensim import corpora, models
##########################################



In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('train.csv')
data['question1'] = data['question1'].astype(str)
data['question2'] = data['question2'].astype(str)
y = data['is_duplicate']
df_train = data
##########################################

In [3]:
##########################################
# Loads in Quora Test Dataset
##########################################
#Test Dataset
df_test = pd.read_csv('test.csv')

#Replaces np.nan with ''
df_test = df_test.replace(np.nan, '', regex=True)

#Saves the cleaned test.csv
# df_test.to_csv('cleaned_test.csv')
##########################################

In [4]:
##########################################
# Initializes variables for Feature Creation
##########################################
stop_words = set(stopwords.words("english"))
model = KeyedVectors.load("300features_10minwords_5context")
##########################################

In [5]:
##########################################
# Function for Magic Features
##########################################
def try_apply_dict(x,dict_to_apply):
    try:
        return dict_to_apply[x]
    except KeyError:
        return 0

def magic_features(data = df_train, test_data = df_test):
    df1 = data[['question1']].copy()
    df2 = data[['question2']].copy()
    df1_test = test_data[['question1']].copy()
    df2_test = test_data[['question2']].copy()

    df2.rename(columns = {'question2':'question1'},inplace=True)
    df2_test.rename(columns = {'question2':'question1'},inplace=True)

    train_questions = df1.append(df2)
    train_questions = train_questions.append(df1_test)
    train_questions = train_questions.append(df2_test)
    train_questions.drop_duplicates(subset = ['question1'],inplace=True)

    train_questions.reset_index(inplace=True,drop=True)

    questions_dict = pd.Series(train_questions.index.values,index=train_questions.question1.values).to_dict()

    train_cp = data.copy()
    test_cp = test_data.copy()
    train_cp.drop(['qid1','qid2'],axis=1,inplace=True)

    test_cp['is_duplicate'] = -1
    test_cp.rename(columns={'test_id':'id'},inplace=True)

    comb = pd.concat([train_cp,test_cp])

    comb['q1_hash'] = comb['question1'].map(questions_dict)
    comb['q2_hash'] = comb['question2'].map(questions_dict)

    q1_vc = comb.q1_hash.value_counts().to_dict()
    q2_vc = comb.q2_hash.value_counts().to_dict()

    #map to frequency space
    comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
    comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))

    train_comb = comb[comb['is_duplicate'] >= 0][['q1_hash','q2_hash','q1_freq','q2_freq']]
    test_comb = comb[comb['is_duplicate'] < 0][['q1_hash','q2_hash','q1_freq','q2_freq']]
    return np.array(train_comb), np.array(test_comb)
##########################################

In [6]:
##########################################
# Function and Transformer for Word2Vec Features
##########################################
def question_to_wordlist(text, remove_stopwords = False):
    text = re.sub("[^a-zA-Z]", " ", text)
    words = text.lower().split()
    
    if remove_stopwords:
        words = [w for w in words if not w in stop_words]
    return(words)

def makeDistributionalFeatures(q1,q2):
    data = pd.concat([q1, q2], axis=1)
    features = []
    
    #For each question in the dataset:
    # 1 - Compute similarity metric from word2vec model using every word combination between question1 and question2
    # 2 - Create the distributional summary statistics for every combination of the disimilar words
    for index in range(0, len(data)):
            #Convert question1 and question2 into a list of words
            question1 = question_to_wordlist(data.question1[index])
            question2 = question_to_wordlist(data.question2[index])
            
            #Finds every word combination between question1 and question2
            combinations = list(itertools.product(question1, question2))
            combinations = [list(combination) for combination in combinations]
            
            #Tracks word2vec similarity metric for every word combination
            values = []
            
            #Loops through each word combination
            for combination in combinations:
                #Checks if the model contains the words in its vocabulary
                # 1 - Yes, adds it to the values list to calculate distributional stats with
                # 2 - No, go to the next word pair
                try:
                    values.append(model.wv.similarity(combination[0], combination[1]))
                except KeyError:
                    pass
      
            #If there is at least one similarity metric calculate its mean and median
            if(len(values) >= 1):
                features.append([np.mean(values), np.median(values), np.std(values), kurtosis(values)])
            else:
                #Since we will not be deleting observations from the test dataset append [-1,-1,-1,-1] as stand in features
                # 1 - The only combination contained a word our model does not contain
                # 2 - Question1 or Question2 or both were ""
                # 3 - We could not tokenize either Question1 or Question2
                features.append([-1,-1,-1,-1])
    return features

class Word2VecStats(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the generates the mean/median/std/kurtosis between each string, returns array of lists"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        stats = makeDistributionalFeatures(q1_list, q2_list)
        return np.array(stats)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self  
##########################################

In [7]:
##########################################
# Function and Transformer for Average Shared Words
##########################################
def shared_words(q1,q2):
    question1_words = []
    question2_words = []

    for word in set(q1.lower().split()):
        if word not in stop_words:
            question1_words.append(word)

    for word in set(q2.lower().split()):
        if word not in stop_words:
            question2_words.append(word)

    #Question contains only stop words (or is an empty string)
    if len(question1_words) == 0 or len(question2_words) == 0:
        return 0

    question1_shared_words = [w for w in question1_words if w in question2_words]
    question2_shared_words = [w for w in question2_words if w in question1_words]

    avg_words_shared = (len(question1_shared_words) + len(question2_shared_words))/(len(question1_words) + len(question2_words))
    return avg_words_shared

class AverageSharedWords(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the average shared words between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        avg_words = [shared_words(q1,q2) for q1, q2 in zip(q1_list, q2_list)]

        return np.array(avg_words).reshape(len(avg_words),1)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
##########################################

In [8]:
##########################################
# Function and Transformer for Word Length
##########################################
def word_lengths(q1,q2):
    data = pd.concat([q1, q2], axis=1)

    #Feature: Length of Question
    data['len_q1'] = data.question1.apply(lambda x: len(x))
    data['len_q2'] = data.question2.apply(lambda x: len(x))

    #Feature: Difference in length between the Questions
    data['len_diff'] = data.len_q1 - data.len_q2

    #Feature: Character count of Question
    data['len_char_q1'] = data.question1.apply(lambda x: len(x.replace(' ', '')))
    data['len_char_q2'] = data.question2.apply(lambda x: len(x.replace(' ', '')))

    #Feature: Word count of Question
    data['len_word_q1'] = data.question1.apply(lambda x: len(x.split()))
    data['len_word_q2'] = data.question2.apply(lambda x: len(x.split()))

    #Feature: Common words between the Questions
    data['len_common_words'] = data.apply(lambda x: len(set(x['question1'].lower().split()).intersection(set(x['question2'].lower().split()))), axis=1)
    return data.ix[:,'len_q1':'len_common_words']

class WordLengths(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the word lengths between each string, returns array of lists"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        word_len = word_lengths(q1_list, q2_list)
        return np.array(word_len)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self  
##########################################

In [9]:
##########################################
# Transformers for Levenshtein and Tfidf
##########################################
class LevDistanceTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the lev distance between each string, returns list"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        
        lev_distance_strings = [[a,b] for a,b in zip(q1_list, q2_list)]
        
        lev_dist_array = np.array([(float(levenshtein(pair[0], pair[1]))/
                                    (float(sum([x.count('') for x in pair[0]])) + float(sum([x.count('') for x in pair[1]])))) 
                                    for pair in lev_distance_strings 
                                    ])
        
        return lev_dist_array.reshape(len(lev_dist_array),1)

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class TfIdfDiffTransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the tfidf difference between each string, returns tfidf matrix"""

    def __init__(self, total_words):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        total_questions = list(q1_list) + list(q2_list)
        total_questions = [x for x in total_questions if type(x) != float]
        
        vectorizer = TfidfVectorizer(stop_words = 'english', vocabulary = total_words)
        vectorizer.fit(total_questions)
        tf_diff = vectorizer.transform(q1_list) - vectorizer.transform(q2_list)
        return tf_diff

    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
##########################################

In [10]:
##########################################
# Transformer for LDA
##########################################
class LDATransformer(BaseEstimator, TransformerMixin):
    """Takes in two lists of strings, extracts the topics and probability for each string, returns list of lists"""

    def __init__(self):
        pass

    def transform(self, question_list):
        q1_list = question_list[0]
        q2_list = question_list[1]
        total_questions = list(q1_list) + list(q2_list)

        #Tokenize each question
        questions = [question_to_wordlist(question, remove_stopwords = True) for question in total_questions]

        #Create a Gensim dictionary from the questions
        dictionary = Dictionary(questions)

        #Remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
        dictionary.filter_extremes(no_below=1, no_above=0.8)

        #Convert the dictionary to a Bag of Words corpus for reference
        corpus = [dictionary.doc2bow(question) for question in questions]

        #Train LDA model
        topics=300
        lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=topics, workers=20)

        #Return Document Topics for Question1
        empty = np.zeros(shape=(len(q1_list),topics*2))
        empty[empty == 0] = -1

        colNames = []
        for i in range(0, topics):
            colNames.append('q1_topic' + str(i))
            colNames.append('q1_proba' + str(i))

        q1_df = pd.DataFrame(empty, columns=colNames)

        for x in tqdm(range(0, len(q1_list))):
            topic_list = lda.get_document_topics(corpus[x])
            for topic in topic_list:
                t = topic[0]
                p = topic[1]
                q1_df['q1_topic'+str(t)][x] = t
                q1_df['q1_proba'+str(t)][x] = p

        #Return Document Topics for Question2
        empty = np.zeros(shape=(len(q2_list),topics*2))
        empty[empty == 0] = -1

        colNames = []
        for i in range(0, topics):
            colNames.append('q2_topic' + str(i))
            colNames.append('q2_proba' + str(i))

        q2_df = pd.DataFrame(empty, columns=colNames)

        for x in tqdm(range(len(q1_list), len(corpus))):
            topic_list = lda.get_document_topics(corpus[x])
            for topic in topic_list:
                t = topic[0]
                p = topic[1]
                q2_df['q2_topic'+str(t)][x-len(q1_list)] = t
                q2_df['q2_proba'+str(t)][x-len(q1_list)] = p
        
        total_df = pd.concat([q1_df, q2_df], axis=1)
        return total_df
    
    def fit(self, question_list, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
##########################################

In [11]:
#Use word vocabulary from training data
vectorizer = TfidfVectorizer(stop_words = 'english')
vectorizer.fit(df_train['question1'] + df_train['question2'])
total_words = list(set(vectorizer.get_feature_names()))

In [12]:
##########################################
# Combining all the features using FeatureUnion
##########################################
#Create Magic Features
magic_train, magic_test = magic_features()

#Feature Union Features
comb_features = FeatureUnion([('tf', TfIdfDiffTransformer(total_words)), 
                              ('lev', LevDistanceTransformer()),
                              ('AvgWords', AverageSharedWords()),
                              ('WordLengths', WordLengths()),
                              ('Word2VecStats', Word2VecStats())
                             ])
##########################################

In [14]:
##########################################
# Create features using FeatureUnion and Magic Features
##########################################
y = df_train.ix[:,'is_duplicate']
all_features = comb_features.transform([df_train['question1'], df_train['question2']])

#Merge FeatureUnion features with Magic Features
total_features = scipy.sparse.hstack(blocks=[all_features, magic_train])
##########################################

In [15]:
total_features

<404290x86721 sparse matrix of type '<type 'numpy.float64'>'
	with 9882154 stored elements in COOrdinate format>

In [16]:
#Saves Training Features
joblib.dump(total_features, 'total_features.pkl')

['total_features.pkl']

In [17]:
##########################################
# Split the dataset into training and testing datasets
##########################################
X_train, X_test, y_train, y_test = train_test_split(total_features, y, test_size=0.2, random_state=1317)
##########################################

In [18]:
##########################################
# Running XGBoost
##########################################
# Set parameters for XGBoost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 8

d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

watchlist = [(d_train, 'train'), (d_test, 'test')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=10)
joblib.dump(bst, 'xgboost_model_1000iterations_8depth.pkl')
##########################################

[0]	train-logloss:0.683926	test-logloss:0.684002
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.608543	test-logloss:0.609292
[20]	train-logloss:0.554786	test-logloss:0.556198
[30]	train-logloss:0.514818	test-logloss:0.516899
[40]	train-logloss:0.484263	test-logloss:0.486984
[50]	train-logloss:0.460863	test-logloss:0.464182
[60]	train-logloss:0.442677	test-logloss:0.446556
[70]	train-logloss:0.428061	test-logloss:0.432454
[80]	train-logloss:0.416287	test-logloss:0.421208
[90]	train-logloss:0.406758	test-logloss:0.412195
[100]	train-logloss:0.398841	test-logloss:0.404739
[110]	train-logloss:0.392199	test-logloss:0.398551
[120]	train-logloss:0.386552	test-logloss:0.393368
[130]	train-logloss:0.381867	test-logloss:0.389113
[140]	train-logloss:0.37781	test-logloss:0.385504
[150]	train-logloss:0.374169	test-logloss:0.382276
[160]	train-logloss:0.371221	test-logloss:0.379

['xgboost_model_400iterations_8depth.pkl']

In [19]:
##########################################
# Create the test features using FeatureUnion
##########################################
test_features = comb_features.transform([df_test['question1'], df_test['question2']])

#Merge FeatureUnion features with Magic Features
total_test_features = scipy.sparse.hstack(blocks=[test_features, magic_test])

joblib.dump(total_test_features, 'test_features.pkl')
##########################################

['test_features.pkl']

In [20]:
total_test_features

<2345796x86721 sparse matrix of type '<type 'numpy.float64'>'
	with 59340301 stored elements in COOrdinate format>

In [21]:
##########################################
# Predicting using XGBoost
##########################################
test = xgb.DMatrix(total_test_features)
test_prediction = bst.predict(test)
##########################################

In [25]:
##########################################
# Creating Submission File
##########################################
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = test_prediction
sub.to_csv('submission.csv', index=False)
##########################################

In [23]:
#Check Submission File Length
len(sub)

2345796

In [24]:
joblib.dump(sub, 'submission.pkl')

['submission.pkl']