In [31]:
"""
Various feature generation methods
"""
import numpy as np
import pandas
import spacy
import re
import math
from geotext import GeoText
from nltk.corpus import stopwords
from collections import Counter
from bs4 import UnicodeDammit

# spaCy pipeline shared by the feature functions in this cell.
# NOTE(review): the "en" shortcut name is presumably only resolvable by older
# spaCy releases -- confirm against the installed version.
nlp = spacy.load("en")
# NLTK's English stop-word list, held as a set for membership tests
stops = set(stopwords.words("english"))

# Lower-case interrogative words that question_tokens_ratio looks for
question_tokens = set(["why", "how", "what", "when", "which", "who", "whose", "whom"])

# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    """
    Inverse-frequency weight for a word that occurs `count` times.

    Words occurring fewer than `min_count` times get a weight of 0.
    Otherwise the weight is 1 / (count + eps), where `eps` is the
    smoothing constant added to the count.
    """
    return 0 if count < min_count else 1.0 / (count + eps)


def remove_punc(s):
    """
    Return the text of `s` without punctuation.

    `s` is coerced with str() and passed through UnicodeDammit; every
    character that is neither a word character nor whitespace is then
    deleted.
    """
    markup = UnicodeDammit(str(s)).markup
    return re.sub(r'[^\w\s]', '', markup)


def clean_statement(s):
    """
    Strip punctuation from `s`, run it through the module-level spaCy
    pipeline and return the tokens that are not stop words.

    The result is a list of token objects (not strings) that has been
    sorted using the tokens' own ordering. No case normalisation or
    lemmatisation happens here; callers do that on the returned tokens.
    """
    doc = nlp(remove_punc(s))

    # Keep only the tokens the pipeline does not flag as stop words
    kept_tokens = [token for token in doc if not token.is_stop]
    kept_tokens.sort()
    return kept_tokens


def token_overlap_score(row):
    """
    Jaccard overlap between the two questions of `row`.

    Each question is reduced by clean_statement() to its non-stop-word
    tokens, which are compared as sets of lower-cased lemmas. Returns
    |intersection| / |union|, or 0.0 when both sets are empty.
    """
    tokens_q1 = clean_statement(row["question1"])
    tokens_q2 = clean_statement(row["question2"])

    lemmas_q1 = {token.lemma_.lower() for token in tokens_q1}
    lemmas_q2 = {token.lemma_.lower() for token in tokens_q2}

    combined = lemmas_q1 | lemmas_q2
    if not combined:
        return 0.0
    shared = lemmas_q1 & lemmas_q2
    return 1.0 * len(shared) / len(combined)


def weighted_token_overlap_score(row):
    """
    Token overlap of the two questions, scaled by how similar their
    lengths are.

    The first factor is the Jaccard overlap of the lower-cased lemmas of
    the non-stop-word tokens returned by clean_statement(). The second
    factor is the length of the shorter question divided by the length of
    the longer one, measured in characters of str(question).

    Both denominators fall back to 1 when they would be zero, so a pair of
    empty questions yields 0.0 instead of raising ZeroDivisionError.
    """
    cleaned_question1_words = clean_statement(row["question1"])
    cleaned_question2_words = clean_statement(row["question2"])

    set1 = set(w.lemma_.lower() for w in cleaned_question1_words)
    set2 = set(w.lemma_.lower() for w in cleaned_question2_words)

    overlap = 1.0 * len(set1 & set2) / (len(set1 | set2) or 1)

    len_q1 = len(str(row["question1"]))
    len_q2 = len(str(row["question2"]))
    shorter, longer = min(len_q1, len_q2), max(len_q1, len_q2)
    # `or 1` mirrors the guard on the overlap term above; without it two
    # empty strings divide by zero.
    length_ratio = shorter / (1.0 * (longer or 1))

    return overlap * length_ratio


def noun_phrase_overlap(row):
    """
    Jaccard overlap between the noun-chunk texts of the two questions.

    Each question is coerced with str(), passed through UnicodeDammit and
    parsed by the module-level spaCy pipeline; the texts of its noun
    chunks form a set. Returns |intersection| / |union|, with an empty
    union treated as a denominator of 1.0.
    """
    doc_q1 = nlp(UnicodeDammit(str(row["question1"])).markup)
    doc_q2 = nlp(UnicodeDammit(str(row["question2"])).markup)

    chunks_q1 = {chunk.text for chunk in doc_q1.noun_chunks}
    chunks_q2 = {chunk.text for chunk in doc_q2.noun_chunks}

    shared = chunks_q1 & chunks_q2
    combined = chunks_q1 | chunks_q2
    return len(shared) / (float(len(combined)) or 1.0)


def question_length_ratio(row, max_ratio=5):
    """
    Character-length ratio len(question1) / len(question2), capped at
    `max_ratio`.

    Lengths are taken from str(question). When question2 is empty the
    ratio is undefined: the cap is returned if question1 is non-empty, and
    1.0 (equal lengths) if both are empty. Previously this case raised
    ZeroDivisionError.

    Parameters:
        row: mapping with "question1" and "question2" entries.
        max_ratio: upper bound on the returned ratio (default 5).
    """
    len_q1 = len(str(row["question1"]))
    len_q2 = len(str(row["question2"]))

    if len_q2 == 0:
        # Nothing to divide by: treat "something vs. nothing" as the cap
        # and "nothing vs. nothing" as equal lengths.
        return max_ratio if len_q1 else 1.0

    return min(float(len_q1) / len_q2, max_ratio)


def punctuation_sym_ratio(row):
    """
    Ratio of punctuation-delimited piece counts between the two questions,
    capped at 5.

    Each question is coerced with str(), passed through UnicodeDammit and
    split on every single character that is neither a word character nor
    whitespace. The piece count of question1 is divided by that of
    question2.
    """
    text_q1 = UnicodeDammit(str(row["question1"])).markup
    text_q2 = UnicodeDammit(str(row["question2"])).markup

    pieces_q1 = len(re.split(r'[^\w\s]', text_q1))
    pieces_q2 = len(re.split(r'[^\w\s]', text_q2))

    ratio = (1.0 * pieces_q1) / pieces_q2
    return min(5, ratio)


def countries_mentioned_overlap(row):
    """
    Jaccard overlap of the countries GeoText reports for each question.

    Punctuation is removed from each question, every space-separated word
    is upper-cased, and the words are joined with ". " before being handed
    to GeoText. The keys of `country_mentions` form the set for that
    question. An empty union is treated as a denominator of 1.0.
    """
    text_q1 = remove_punc(row["question1"])
    text_q2 = remove_punc(row["question2"])

    def mentioned_countries(text):
        # One upper-cased word per ". "-separated segment
        geo = GeoText(". ".join(word.upper() for word in text.split(" ")))
        return set(geo.country_mentions.keys())

    countries_q1 = mentioned_countries(text_q1)
    countries_q2 = mentioned_countries(text_q2)

    shared = countries_q1 & countries_q2
    combined = countries_q1 | countries_q2
    return float(len(shared)) / (len(combined) or 1.0)


def stops_ratios(row):
    """
    Stop-word statistics for the question pair.

    Tokens are the lower-cased, whitespace-separated words of each
    question after punctuation removal. Returns a 4-tuple:
      0. Jaccard overlap of the distinct stop words of the two questions
      1. distinct stop words in question1 / number of tokens in question1
      2. distinct stop words in question2 / number of tokens in question2
      3. absolute difference between (1) and (2)
    Any zero denominator is replaced by 1.0.
    """
    tokens_q1 = [tok.lower() for tok in remove_punc(row["question1"]).split()]
    tokens_q2 = [tok.lower() for tok in remove_punc(row["question2"]).split()]

    stops_q1 = {tok for tok in tokens_q1 if tok in stops}
    stops_q2 = {tok for tok in tokens_q2 if tok in stops}

    shared_ratio = float(len(stops_q1 & stops_q2)) / (len(stops_q1 | stops_q2) or 1.0)
    share_q1 = float(len(stops_q1)) / (len(tokens_q1) or 1.0)
    share_q2 = float(len(stops_q2)) / (len(tokens_q2) or 1.0)

    return (shared_ratio, share_q1, share_q2, math.fabs(share_q1 - share_q2))


def question_tokens_ratio(row):
    """
    Jaccard overlap of the interrogative words used by the two questions.

    For each question, punctuation is removed and the whitespace-separated
    words are lower-cased; those present in the module-level
    `question_tokens` set are collected. An empty union is treated as a
    denominator of 1.0.
    """
    def question_words(question):
        lowered = (tok.lower() for tok in remove_punc(question).split())
        return {tok for tok in lowered if tok in question_tokens}

    words_q1 = question_words(row["question1"])
    words_q2 = question_words(row["question2"])

    shared = words_q1 & words_q2
    combined = words_q1 | words_q2
    return float(len(shared)) / (len(combined) or 1.0)


def num_sentences_ratio(row, thres_mult=2.0):
    """
    Flag question pairs whose sentence counts differ by at least a factor
    of `thres_mult`.

    The sentence count of a question is approximated by the number of
    pieces produced by splitting str(question) on "." (always >= 1).

    Returns 1.0 when the larger count divided by the smaller count is
    >= `thres_mult`, otherwise 0.0. The result does not depend on which
    question is question1: the earlier form used `>=` for one direction
    and a strict `<` for the other, so a pair sitting exactly on the
    threshold was flagged in one order but not when swapped.
    """
    count_q1 = len(str(row["question1"]).split("."))
    count_q2 = len(str(row["question2"]).split("."))

    larger, smaller = max(count_q1, count_q2), min(count_q1, count_q2)
    # `smaller` is at least 1 because str.split always returns one piece
    return float(float(larger) / smaller >= thres_mult)


def punc_blocks_ratio(row):
    """
    Absolute difference in the number of punctuation-delimited blocks
    between the two questions, capped at 10.

    A block boundary is any run of characters matched by `[,-.]`. That
    class is the range from "," to ".", which covers comma, hyphen and
    full stop.
    """
    boundary = r"[,-.]+"
    blocks_q1 = len(re.split(boundary, str(row["question1"])))
    blocks_q2 = len(re.split(boundary, str(row["question2"])))

    difference = math.fabs(blocks_q1 - blocks_q2)
    return min(10, difference)
    

In [4]:
import matplotlib.pyplot as plt

def show_var_dist_by_label(df, var_name, var_calc_fun, bins=20):
    """
    Compute a feature for every row of `df` and plot its distribution
    separately for each value of the `is_duplicate` label.

    Parameters:
        df: DataFrame with an `is_duplicate` column. It is modified in
            place: the computed feature is stored in column `var_name`.
        var_name: name of the column to write and of the plotted variable.
        var_calc_fun: callable applied to each row via
            df.apply(..., axis=1).
        bins: forwarded to plt.hist for both histograms.

    Shows the figure with plt.show(); returns nothing.
    """
    df[var_name] = df.apply(var_calc_fun, axis=1)

    not_duplicate_values = df[var_name][df['is_duplicate'] == 0]
    duplicate_values = df[var_name][df['is_duplicate'] == 1]

    plt.figure(figsize=(15, 5))
    # `density=True` replaces the former `normed=True` keyword, which
    # current matplotlib releases no longer accept.
    plt.hist(not_duplicate_values, bins=bins, density=True, label='Not Duplicate')
    plt.hist(duplicate_values, bins=bins, density=True, alpha=0.7, label='Duplicate')
    plt.legend()
    plt.title('Label distribution over %s' % var_name, fontsize=15)
    plt.xlabel(var_name, fontsize=15)
    plt.show()

In [6]:
# Read data

# Absolute, machine-specific file locations.
# NOTE(review): these resolve only on the original author's machine -- adjust
# before running elsewhere. Only train_path is used in the cells shown here;
# models_path and train_pred_path are presumably consumed by later steps.
train_path = "/Users/mohamedabdelbary/Documents/kaggle_quora/train.csv"
models_path = "/Users/mohamedabdelbary/Documents/kaggle_quora/models.pkl"
train_pred_path = "/Users/mohamedabdelbary/Documents/kaggle_quora/train_preds.csv"

import pickle
import numpy as np
import pandas
from functools import partial
from collections import Counter

def read_data(path):
    """Load the CSV at `path` with pandas.read_csv and return the DataFrame."""
    frame = pandas.read_csv(path)
    return frame

# Load the training set
df = read_data(train_path)

# Pool question1 and question2 into one collection of strings
# (astype(str) also turns non-string cells into text)
questions = pandas.Series(df['question1'].tolist() + df['question2'].tolist()).astype(str)
# Strip punctuation and lower-case every question
questions = [remove_punc(q).lower() for q in questions]
# Smoothing constant handed to get_weight (overrides its default of 10000)
eps = 500 
# Every whitespace-separated word across all questions
words = (" ".join(questions)).lower().split()
# Corpus-wide occurrence count per word
counts = Counter(words)
# Per-word weight from get_weight: 0 below its min_count, else 1 / (count + eps)
weights = {word: get_weight(count, eps=eps) for word, count in counts.items()}

In [32]:
# Feature exploration: each call stores one feature as a column of df and plots
# its distribution split by the is_duplicate label. Only the last call is
# active; the commented-out lines are presumably earlier runs, kept with the
# author's verdict on each feature.
# show_var_dist_by_label(df, 'token_overlap_score', token_overlap_score, bins=20)
# show_var_dist_by_label(df, 'question_length_ratio', question_length_ratio, bins=20)
# show_var_dist_by_label(df, 'punctuation_sym_ratio', punctuation_sym_ratio, bins=20)
# show_var_dist_by_label(df, 'noun_phrase_overlap', noun_phrase_overlap, bins=20) # Good feature
# show_var_dist_by_label(df, 'weighted_token_overlap_score', weighted_token_overlap_score, bins=20) # Very good feature
# show_var_dist_by_label(df, 'countries_mentioned_overlap', countries_mentioned_overlap, bins=20) # Good if you have countries in q
# show_var_dist_by_label(df, 'question_tokens_ratio', question_tokens_ratio, bins=20) # good
# show_var_dist_by_label(df, 'num_sentences_ratio', num_sentences_ratio, bins=20) # not very useful
show_var_dist_by_label(df, 'punc_blocks_ratio', punc_blocks_ratio, bins=20) # good feature

SystemExit: 0

To exit: use 'exit', 'quit', or Ctrl-D.
