In [29]:
# Feature engineering methods

import numpy as np
import pandas
import spacy
import re
import math
from collections import Counter
from bs4 import UnicodeDammit

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load("en")
stops = set(stopwords.words("english"))

question_tokens = set(["why", "how", "what", "when", "which", "who", "whose", "whom"])


def set_overlap_score_model(questions_df):
    """
    appending column of set overlap percentage, where each set
    is a set of tokens excluding stop-words and punctuation,
    and in lemmatised form
    """
    def set_overlap_score(row):
        set1, set2 = \
            (set([w.lemma_.lower() for w in row["cleaned_question1_words"]]),
             set([w.lemma_.lower() for w in row["cleaned_question2_words"]]))
        return 0.0 if not len(set1.union(set2)) else 1.0 * len(set1.intersection(set2)) / len(set1.union(set2))
    questions_df["cleaned_question1_words"] = questions_df["question1"].map(clean_statement)
    questions_df["cleaned_question2_words"] = questions_df["question2"].map(clean_statement)

    questions_df["score"] = questions_df.apply(set_overlap_score, axis=1)

    return questions_df


def remove_punc(s):
    return re.sub(r'[^\w\s]', '', UnicodeDammit(str(s)).markup)


def clean_statement(s):
    """
    Remove punctuation, stop words and standardise casing
    words, and return remaining tokens
    """

    # Remove punctuation
    s = remove_punc(s)
    sentence = nlp(s)
    sentence_with_stop_checks = [(sentence[i], sentence[i].is_stop) for i in range(len(sentence))]

    return sorted([w for (w, stop_bool) in sentence_with_stop_checks if not stop_bool])


def construct_doc_list(df):
    """
    Take the question pairs DF and return a list of 2 docs per
    row with the cleaned up sentence
    """
    for index, row in df.iterrows():
        q1, q2 = row["question1"], row["question2"]
        q1_tokens, q2_tokens = clean_statement(q1), clean_statement(q2)

        q1_doc = [w.lemma_.lower() for w in q1_tokens]
        q2_doc = [w.lemma_.lower() for w in q2_tokens]

        yield q1_doc
        yield q2_doc


def train_lda(n_topics, id2word_dictionary=None, documents=None, corpus=None):
    """
    Training method for LDA. documents is a list of lists of words/tokens
    documents is used to construct a dictionary and a corpus from which the
    topics for LDA are inferred
    """
    # Construct dictionary of words if it's not passed
    if not id2word_dictionary:
        id2word_dictionary = corpora.Dictionary(documents)

    word2idx_dictionary = dict([(w, idx) for (idx, w) in id2word_dictionary.items()])

    # Construct corpus for model
    if documents and not corpus:
        corpus = [id2word_dictionary.doc2bow(document) for document in documents]

    # Cluster the documents into topics using LDA. number of topics is given
    # by n_topics
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word_dictionary,
                         num_topics=n_topics,
                         update_every=1,
                         chunksize=10000,
                         passes=1)

    """
    Default value for topn (number of top words to show by probability) is 10.
    A high enough value should return the words covering most or all of the
    probability mass
    """
    topics = [lda_model.show_topic(idx, topn=50000)
              for idx in range(0, n_topics)]

    return lda_model, id2word_dictionary, word2idx_dictionary, topics


# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1.0 / (count + eps)

    
def tfidf_word_match_share(row, weights):
    q1words = {}
    q2words = {}
    for word in remove_punc(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in remove_punc(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R


def weighted_token_overlap_score(row):
    cleaned_question1_words = clean_statement(row["question1"])
    cleaned_question2_words = clean_statement(row["question2"])
    
    set1, set2 = \
            (set([w.lemma_.lower() for w in cleaned_question1_words]),
             set([w.lemma_.lower() for w in cleaned_question2_words]))
        
    return (1.0 * len(set1.intersection(set2)) / (len(set1.union(set2)) or 1)) * \
            (
                min(len(str(row["question1"])), len(str(row["question2"]))) / 
                (1.0 * max(len(str(row["question1"])), len(str(row["question2"]))))
            )
    
def stops_ratios(row):
    q1_tokens = [t.lower() for t in remove_punc(row["question1"]).split()]
    q2_tokens = [t.lower() for t in remove_punc(row["question2"]).split()]
    q1_stops = set([t for t in q1_tokens if t in stops])
    q2_stops = set([t for t in q2_tokens if t in stops])
    return (
        float(len(q1_stops.intersection(q2_stops))) / (len(q1_stops.union(q2_stops)) or 1.0),
        float(len(q1_stops)) / (len(q1_tokens) or 1.0),
        float(len(q2_stops)) / (len(q2_tokens) or 1.0),
        math.fabs(float(len(q1_stops)) / (len(q1_tokens) or 1.0) - float(len(q2_stops)) / (len(q2_tokens) or 1.0))
    )

def question_tokens_ratio(row):
    q1_quest_tokens = set([t.lower() for t in remove_punc(row["question1"]) if t.lower() in question_tokens])
    q2_quest_tokens = set([t.lower() for t in remove_punc(row["question2"]) if t.lower() in question_tokens])
    return (
        float(len(q1_quest_tokens.intersection(q2_quest_tokens))) / (len(q1_quest_tokens.union(q2_quest_tokens)) or 1.0)
    )


# def features(row, lda_model, word2idx_dict, n_lda_topics=10):
def features(df, lda_model, word2idx_dict, n_lda_topics=10, word_weights={}):
    """
    More features to implement:
    - TF-IDF or similar scheme string similarity (with and without stopwords)
    - Better LDA model by incorporating children, synonyms, related concepts, subtrees
    - Difference in lengths between both questions, ratio of lengths
        - for full original questions
        - noun phrases
        - after filtering stopwords
    - Number of sentences in both questions, ratios, difference in number
    - Question tokens in both questions (why, how, when, what, ..), set intersection, difference, etc
    - Stop words in both questions, stopq1/len(q1), stopq2/len(q2), stopq1.intersect(stopq2),...
    """

    features_col = pandas.Series([[]], index=np.arange(df.shape[0]))

    for (idx, row) in list(df.iterrows()):
        q1, q2 = row["question1"], row["question2"]
        q1_tokens, q2_tokens = clean_statement(q1), clean_statement(q2)
        tf_idf_sim = tfidf_word_match_share(row, word_weights)

        # LDA related features
        q1_lda_doc = [w.lemma_.lower() for w in q1_tokens]
        q2_lda_doc = [w.lemma_.lower() for w in q2_tokens]
        q1_topic_probs = dict(
            lda_model.get_document_topics(Counter([word2idx_dict[w] for w in q1_lda_doc if w in word2idx_dict]).items())
        )
        q2_topic_probs = dict(
            lda_model.get_document_topics(Counter([word2idx_dict[w] for w in q2_lda_doc if w in word2idx_dict]).items())
        )

        q1_topic_probs = [(t, q1_topic_probs[t]) if t in q1_topic_probs else (t, 0.0) for t in range(n_lda_topics)]
        q2_topic_probs = [(t, q2_topic_probs[t]) if t in q2_topic_probs else (t, 0.0) for t in range(n_lda_topics)]

        q1_topic_vector = np.array([prob for (topic, prob) in q1_topic_probs])
        q2_topic_vector = np.array([prob for (topic, prob) in q2_topic_probs])
        diff_topic_vector = q1_topic_vector - q2_topic_vector

        q1_doc = nlp(UnicodeDammit(' '.join([w.lemma_.lower() for w in q1_tokens])).markup) if q1_tokens else None
        q2_doc = nlp(UnicodeDammit(' '.join([w.lemma_.lower() for w in q2_tokens])).markup) if q2_tokens else None

        q1_vector, q2_vector = (
            q1_doc.vector if q1_doc and q1_doc.has_vector else None,
            q2_doc.vector if q2_doc and q2_doc.has_vector else None
        )

        q1_tokens_set = set(q1_tokens)
        q2_tokens_set = set(q2_tokens)

        token_overlap_ratio = (
            0.0 if not len(q1_tokens_set.union(q2_tokens_set))
            else 1.0 * float(len(q1_tokens_set.intersection(q2_tokens_set))) / len(q1_tokens_set.union(q2_tokens_set))
        )
        
        # Weighted TF-IDF sim
        wt_token_overlap_score = weighted_token_overlap_score(row)
        
        # Stop word occurrence
        (stops_ratio, stops_ratio_q1, stops_ratio_q2, stops_diff) = stops_ratios(row)

        if q1_vector is not None and q2_vector is not None:
            dot_product = q1_vector.dot(q2_vector) 
            cosine_sim = cosine_similarity(q1_vector, q2_vector)[0][0]
            euclidean_dist = np.linalg.norm(q1_vector - q2_vector)
            euclidean_lda_probs_dist = np.linalg.norm(diff_topic_vector)
        else:
            dot_product = cosine_sim = 0.0
            euclidean_dist = euclidean_lda_probs_dist = 100.0 # Not a very good hack

        feature_list = [
            token_overlap_ratio,
            float(token_overlap_ratio == 0),
            float(token_overlap_ratio == 1),
            dot_product,
            cosine_sim,
#             euclidean_dist,
#             euclidean_lda_probs_dist,
            tf_idf_sim,
            wt_token_overlap_score,
            stops_ratio,
            stops_ratio_q1,
            stops_ratio_q2,
            stops_diff
        ]
        feature_list.extend(list(diff_topic_vector))

        # return feature_list
        features_col[idx] = feature_list

    df["features"] = features_col
    return df


In [18]:
# Random Forest model
import numpy as np
import scipy as sp

from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix

def predict_rf(row, model):
    """
    Assumes row object has a `features` column
    with the same features as those on which
    `model` was trained
    """
    return float(model.predict_proba(np.array(row["features"]))[0][1])

class RandomForestModel():
    n_trees = 400
    # test_size = 0.3
    rf_max_features = None
    folds = 10

    def train(self, training_df, cv=True):
        """
        Expects a `features` column which holds a
        list of floats to be used as features for
        the classifier and an integer `label` column
        encoding the output to be predicted
        """
        featureMatrix, labelVector = training_df["features"], training_df["label"]
        featureMatrix = np.array([list(f) for f in featureMatrix])
        featureMatrix = np.nan_to_num(featureMatrix)
        labelVector = np.array(list(labelVector))
        labelVector = np.nan_to_num(labelVector)

        auc_list = []
        logloss_list = []

        if cv:
            idx = 1
            for train, test in StratifiedKFold(labelVector, self.folds):
                print "Starting Cross Validation Fold {}".format(idx)

                x_train, y_train = featureMatrix[train], labelVector[train]
                x_test, y_test = featureMatrix[test], labelVector[test]
                x_train = np.asarray(x_train)
                y_train = np.asarray(y_train)
                x_test = np.asarray(x_test)
                y_test = np.asarray(y_test)

                model = RandomForestClassifier(n_estimators=self.n_trees, max_features=self.rf_max_features, class_weight="auto")\
                    if self.rf_max_features else RandomForestClassifier(n_estimators=self.n_trees, class_weight="auto")

                model.fit(x_train, y_train)

                predictions = model.predict_proba(x_test)[:, 1]
                fprArray, tprArray, thres = roc_curve(y_test, predictions)
                roc_auc = auc(fprArray, tprArray)
                logloss = binary_logloss(y_test, predictions)
                auc_list.append(roc_auc)
                logloss_list.append(logloss_list)

                print "CV Fold result: AUC is {auc} and Log Loss is {loss}".format(auc=roc_auc, loss=logloss)
                print "#########"

                idx += 1
            
            # Just for fast testing
            return

            model = RandomForestClassifier(n_estimators=self.n_trees, max_features=self.rf_max_features, class_weight="auto")\
                if self.rf_max_features else RandomForestClassifier(n_estimators=self.n_trees, class_weight="auto")

            roc_auc = np.mean(auc_list)
            logloss = np.mean(logloss_list)
            print "<======================================>"
            print "Finished cross validation experiments!"
            print "Average AUC is {auc} and average Log Loss is {loss}".format(auc=roc_auc, loss=logloss)
            print "Starting full model training!"

            model.fit(featureMatrix, labelVector)

            return {'model': model, 'roc_auc': roc_auc, 'logloss': logloss}
        else:
            model = RandomForestClassifier(n_estimators=self.n_trees, max_features=self.rf_max_features, class_weight="auto")\
                if self.rf_max_features else RandomForestClassifier(n_estimators=self.n_trees, class_weight="auto")

            model.fit(featureMatrix, labelVector)

            return {'model': model}

    def compute_precision_scores(self, y_pred, y_true, prob_thresholds):
        """
        Compute precision scores at different probability thresholds
        This allows us to pick a probability threshold for the classifier
        given a desired precision score
            pr = tpr  /  (tpr + fpr)
        returns: list((precision_score, prob_thres))
        """
        precisions = []
        for prob_thres in prob_thresholds:
            flagged_idxes = filter(lambda idx: y_pred[idx] >= prob_thres, range(len(y_pred)))
            true_flagged_idxes = filter(lambda idx: y_pred[idx] >= prob_thres and y_true[idx] == 1, range(len(y_pred)))
            precision = (len(true_flagged_idxes) / float(len(flagged_idxes))) if len(flagged_idxes) else 0.0
            precisions.append((precision, prob_thres))

        return sorted(precisions, key=lambda (prec, prob): prec)

    def compute_accuracy_scores(self, y_pred, y_true, prob_thresholds):
        """
        Compute accuracy scores at different probability thresholds
        This allows us to pick a probability threshold for the classifier
        given a desired precision score
        returns: list((accuracy_score, prob_thres))
        """
        accuracy_scores = []
        for prob_thres in prob_thresholds:
            correct_predicted_data_points = filter(lambda prob_idx:
                                                   (y_pred[prob_idx] >= prob_thres and y_true[prob_idx] == 1) or
                                                   (y_pred[prob_idx] < prob_thres and y_true[prob_idx] == 0),
                                                   range(len(y_pred)))
            accuracy = len(correct_predicted_data_points) / float(len(y_true)) if len(y_true) else 0.0
            accuracy_scores.append((accuracy, prob_thres))

        return sorted(accuracy_scores, key=lambda (acc, prob): acc)

In [12]:
# XgBoost model
import numpy as np
import scipy as sp

from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix

import xgboost as xgb


def binary_logloss(act, pred):
    """
    act and pred are vectors of actual class
    and prediction probability of class 1,
    respectively
    """
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1 - epsilon, pred)
    ll = sum(act * sp.log(pred) + sp.subtract(1, act) * sp.log(sp.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    return ll


def xgboost_eval(act, pred):
    return 'error', binary_logloss(act, pred)


def predict_xgboost(row, model):
    """
    Assumes row object has a `features` column
    with the same features as those on which
    `model` was trained
    """
    return float(model.predict(np.array(row["features"]))[0])


class XgBoostModel():
    n_boost_rounds = 2000
    max_depth = 5
    objective = 'binary:logistic'
    eval_metric = "logloss"
    early_stopping_rounds = 70
    folds = 10
    learning_rate = 0.1
    scale_pos_weight = 1
    gamma = 0.1

    def train(self, training_df, cv=True):
        """
        Expects a `features` column which holds a
        list of floats to be used as features for
        the classifier and an integer `label` column
        encoding the output to be predicted
        """
        featureMatrix, labelVector = training_df["features"], training_df["label"]
        featureMatrix = np.array([list(f) for f in featureMatrix])
        featureMatrix = np.nan_to_num(featureMatrix)
        labelVector = np.array(list(labelVector))
        labelVector = np.nan_to_num(labelVector)

        auc_list = []
        logloss_list = []

        if cv:
            idx = 1
            for train, test in StratifiedKFold(labelVector, self.folds):
                print "Starting Cross Validation Fold {}".format(idx)

                x_train, y_train = featureMatrix[train], labelVector[train]
                x_test, y_test = featureMatrix[test], labelVector[test]
                x_train = np.asarray(x_train)
                y_train = np.asarray(y_train)
                x_test = np.asarray(x_test)
                y_test = np.asarray(y_test)
                
                params = {}
                params['objective'] = self.objective
                params['eval_metric'] = self.eval_metric
                params['eta'] = self.learning_rate
                params['max_depth'] = self.max_depth
                params['scale_pos_weight'] = self.scale_pos_weight
                params['gamma'] = self.gamma
                params['silent'] = 1

                d_train = xgb.DMatrix(x_train, label=y_train)
                d_valid = xgb.DMatrix(x_test, label=y_test)
                
                watchlist = [(d_train, 'train'), (d_valid, 'valid')]
                
                model = xgb.train(
                    params,
                    d_train,
                    self.n_boost_rounds,
                    watchlist,
                    early_stopping_rounds=self.early_stopping_rounds
                )

                x_test_df = pandas.DataFrame(x_test, columns=["feature_%s" % str(i) for i in range(x_test.shape[1])])
                predictions = model.predict(xgb.DMatrix(x_test_df))
                                
                fprArray, tprArray, thres = roc_curve(y_test, predictions)
                roc_auc = auc(fprArray, tprArray)
                logloss = binary_logloss(y_test, predictions)
                auc_list.append(roc_auc)
                logloss_list.append(logloss_list)

                print "CV Fold result: AUC is {auc} and Log Loss is {loss}".format(auc=roc_auc, loss=logloss)
                print "#########"

                idx += 1
            
            # Just for fast testing
            return

            roc_auc = np.mean(auc_list)
            logloss = np.mean(logloss_list)
            print "<======================================>"
            print "Finished cross validation experiments!"
            print "Average AUC is {auc} and average Log Loss is {loss}".format(auc=roc_auc, loss=logloss)
            print "Starting full model training!"

#             model = xgb.XGBClassifier(max_depth=self.max_depth, n_estimators=self.n_trees)
#             model.fit(featureMatrix, labelVector, eval_metric=self.eval_metric)#, early_stopping_rounds=self.early_stopping_rounds)
            # make prediction
            # preds = model.predict(x_test)

            return {'model': model, 'roc_auc': roc_auc, 'logloss': logloss}
        else:
            model = xgb.XGBClassifier(max_depth=self.max_depth, n_estimators=self.n_trees)
            model.fit(featureMatrix, labelVector, eval_metric=self.eval_metric)#, early_stopping_rounds=self.early_stopping_rounds)

            return {'model': model}

In [4]:
# Read data

train_path = "/Users/mohamedabdelbary/Documents/kaggle_quora/train.csv"
models_path = "/Users/mohamedabdelbary/Documents/kaggle_quora/models.pkl"
train_pred_path = "/Users/mohamedabdelbary/Documents/kaggle_quora/train_preds.csv"

import pickle
import numpy as np
import pandas
from functools import partial
from collections import Counter

def read_data(path):
    return pandas.read_csv(path)

n_sample = 100000
full_df = read_data(train_path)
rows = np.random.choice(full_df.index.values, n_sample)
df = full_df.ix[rows]

questions = pandas.Series(df['question1'].tolist() + df['question2'].tolist()).astype(str)
questions = [remove_punc(q).lower() for q in questions]
eps = 500 
words = (" ".join(questions)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count, eps=eps) for word, count in counts.items()}

# df = read_data(train_path)

In [5]:
# Train topic model
n_lda_topics = 20
print "Starting LDA modelling!"

doc_list_lda_train = list(construct_doc_list(df))
lda_model, id2word_dictionary, word2idx_dictionary, topics = \
    train_lda(n_lda_topics,
              documents=doc_list_lda_train)

Starting LDA modelling!


In [30]:
# Feature construction
print "Starting feature construction!"
feature_method = partial(
    features,
    lda_model=lda_model,
    word2idx_dict=word2idx_dictionary,
    n_lda_topics=n_lda_topics,
    word_weights=weights
    )
df = feature_method(df)
df["label"] = df["is_duplicate"].map(int)

Starting feature construction!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [25]:
# Purely for experimenting!! This oversampling process can lead to overfitting
# and is generally not very good ML practise
pos_train = df[df["is_duplicate"] == 1]
neg_train = df[df["is_duplicate"] == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((float(len(pos_train)) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pandas.concat([neg_train, neg_train])
    scale -=1
neg_train = pandas.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print len(pos_train) / float(len(pos_train) + len(neg_train))

df_resampled = pandas.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

0.191234956605


In [31]:
# Model training
# model = XgBoostModel()
model = RandomForestModel()
model_obj = model.train(df)

Starting Cross Validation Fold 1
CV Fold result: AUC is 0.889879154053 and Log Loss is 0.405203875959
#########
Starting Cross Validation Fold 2
CV Fold result: AUC is 0.893805579495 and Log Loss is 0.400139766151
#########
Starting Cross Validation Fold 3
CV Fold result: AUC is 0.894732860574 and Log Loss is 0.404043903714
#########
Starting Cross Validation Fold 4
CV Fold result: AUC is 0.887795383009 and Log Loss is 0.406265820267
#########
Starting Cross Validation Fold 5
CV Fold result: AUC is 0.887664757364 and Log Loss is 0.406328622121
#########
Starting Cross Validation Fold 6
CV Fold result: AUC is 0.884218301149 and Log Loss is 0.411929958619
#########
Starting Cross Validation Fold 7
CV Fold result: AUC is 0.889516163899 and Log Loss is 0.403089507674
#########
Starting Cross Validation Fold 8
CV Fold result: AUC is 0.888630227929 and Log Loss is 0.406444776011
#########
Starting Cross Validation Fold 9
CV Fold result: AUC is 0.888765576276 and Log Loss is 0.407199716697
##