In [None]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.optimize import minimize
stops = set(stopwords.words("english"))
import xgboost as xgb
import multiprocessing
import difflib

In [None]:
df_train = pd.read_csv("train_org.csv")
df_test = pd.read_csv("test_final.csv")
df_train_clean = pd.read_csv('./processed_data/train_clean.csv')
df_test_clean = pd.read_csv('./processed_data/test_clean.csv')

In [None]:
# Column names of the two question texts in the raw train/test frames.
q1= "question1"
q2= "question2"
# Unigram TF-IDF model using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
# The vocabulary and IDF weights are learned from every question in train and test.
tfidf_txt = pd.Series(df_train[q1].tolist() + df_train[q2].tolist() +
                      df_test[q1].tolist() + df_test[q2].tolist()).astype(str)
# Only the fitted state is used afterwards (get_features_1 calls tfidf.transform),
# so fit() is sufficient; fit_transform() additionally built a document-term
# matrix for the whole corpus that was immediately thrown away.
tfidf.fit(tfidf_txt)
def diff_ratios(st1, st2):
    """Return difflib's similarity ratio of the two inputs, ignoring case.

    Both arguments are converted with ``str()`` and lower-cased before being
    handed to ``difflib.SequenceMatcher``.
    """
    left = str(st1).lower()
    right = str(st2).lower()
    return difflib.SequenceMatcher(None, left, right).ratio()

def word_match_share(row):
    """Share of non-stopword tokens that the two questions of ``row`` have in common.

    ``row['question1']`` and ``row['question2']`` are stringified, lower-cased and
    split on whitespace; tokens found in the module-level ``stops`` set are dropped.
    Returns 0 when either side has no tokens left, otherwise
    ``2 * |shared| / (|q1 tokens| + |q2 tokens|)``.
    """
    q1_tokens = set(tok for tok in str(row['question1']).lower().split() if tok not in stops)
    q2_tokens = set(tok for tok in str(row['question2']).lower().split() if tok not in stops)
    if not q1_tokens or not q2_tokens:
        return 0
    # The shared set is counted once from each side, exactly as in a symmetric overlap.
    n_shared = len(q1_tokens & q2_tokens)
    return (float(n_shared) + float(n_shared)) / (len(q1_tokens) + len(q2_tokens))

def get_features_1(df_features):
    """Compute the ``z_*`` features for a frame with ``question1``/``question2`` columns.

    The feature columns (plus the helper columns ``question1_nouns`` and
    ``question2_nouns``) are written onto ``df_features`` itself, so the caller's
    frame is modified. The return value is a new frame holding only the columns
    whose name starts with ``z``, with missing values replaced by 0.0.

    Relies on the module-level ``tfidf`` vectorizer (already fitted),
    ``diff_ratios`` and ``word_match_share``.
    """
    def _nouns(x):
        # Float cells (NaN produced by read_csv) are stringified as-is. Anything else
        # is decoded only when it is bytes (a Python 2 ``str``); Python 3 text has no
        # ``.decode`` and used to raise AttributeError here.
        if type(x) == float:
            sentence = str(x)
        else:
            if isinstance(x, bytes):
                x = x.decode('utf-8')
            sentence = x.strip().lower()
        # Keep tokens whose POS tag starts with 'N' (the noun tags).
        return [w for w, t in nltk.pos_tag(nltk.word_tokenize(sentence)) if t[:1] in ['N']]

    def _tfidf_stats(x):
        # One transform per question instead of three: (sum, mean, count) of the
        # non-zero TF-IDF weights. The mean of an empty vector is NaN and is turned
        # into 0.0 by the fillna below.
        nonzero = tfidf.transform([str(x)]).data
        return np.sum(nonzero), np.mean(nonzero), len(nonzero)

    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(_nouns)
    df_features['question2_nouns'] = df_features.question2.map(_nouns)
    df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  #takes long
    print('lengths...')
    df_features['z_len1'] = df_features.question1.map(lambda x: len(str(x)))
    df_features['z_len2'] = df_features.question2.map(lambda x: len(str(x)))
    df_features['z_word_len1'] = df_features.question1.map(lambda x: len(str(x).split()))
    df_features['z_word_len2'] = df_features.question2.map(lambda x: len(str(x).split()))
    print('difflib...')
    df_features['z_match_ratio'] = df_features.apply(lambda r: diff_ratios(r.question1, r.question2), axis=1)  #takes long
    print('word match...')
    # No raw=True: word_match_share indexes the row by column label, which needs a
    # Series. With raw=True current pandas passes a bare ndarray and the lookup fails.
    df_features['z_word_match'] = df_features.apply(word_match_share, axis=1)
    print('tfidf...')
    stats1 = df_features.question1.map(_tfidf_stats)
    stats2 = df_features.question2.map(_tfidf_stats)
    df_features['z_tfidf_sum1'] = stats1.map(lambda s: s[0])
    df_features['z_tfidf_sum2'] = stats2.map(lambda s: s[0])
    df_features['z_tfidf_mean1'] = stats1.map(lambda s: s[1])
    df_features['z_tfidf_mean2'] = stats2.map(lambda s: s[1])
    df_features['z_tfidf_len1'] = stats1.map(lambda s: s[2])
    df_features['z_tfidf_len2'] = stats2.map(lambda s: s[2])
    df_features = df_features.fillna(0.0)
    col = [c for c in df_features.columns if c[:1]=='z']
    return df_features[col]

In [None]:
df_train_clean = pd.read_csv('./processed_data/train_clean.csv')
df_test_clean = pd.read_csv('./processed_data/test_clean.csv')

In [None]:
from collections import Counter
import numpy as np
from collections import Counter
# Every question text of the cleaned train / test frames (all question1 values
# followed by all question2 values), coerced to str.
train_qs = pd.Series(df_train_clean['question1'].tolist() + df_train_clean['question2'].tolist()).astype(str)
test_qs = pd.Series(df_test_clean['question1'].tolist() + df_test_clean['question2'].tolist()).astype(str)
def get_weight(count, eps=10000, min_count=2):
    """Inverse-frequency weight of a word seen ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0; otherwise the weight
    is ``1 / (count + eps)``, where ``eps`` damps the weights.
    """
    return 1.0 / (count + eps) if count >= min_count else 0
# NOTE(review): this module-level `eps` is never passed to get_weight below, so the
# weights are built with get_weight's default eps=10000 — confirm which value is intended.
eps = 5000 
# Lower-cased whitespace tokens of all train questions, and their occurrence counts.
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
# word -> weight lookup used by tfidf_word_match_share.
weights = {word: get_weight(count) for word, count in counts.items()}
def tfidf_word_match_share(row):
    """Weighted share of non-stopword tokens the two questions of ``row`` have in common.

    Reads the token sequences ``row['question1_list']`` and ``row['question2_list']``,
    drops tokens found in the module-level ``stops`` set, and weighs the remaining
    tokens with the module-level ``weights`` lookup (unknown words weigh 0).
    Returns 0 when either side has no tokens left, otherwise the summed weight of
    the shared tokens (counted from both sides) over the summed weight of all tokens.
    """
    # dict.fromkeys keeps first-seen order, so the weights are summed in a stable order.
    q1_kept = dict.fromkeys(tok for tok in row['question1_list'] if tok not in stops)
    q2_kept = dict.fromkeys(tok for tok in row['question2_list'] if tok not in stops)

    if not q1_kept or not q2_kept:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared = [weights.get(tok, 0) for tok in q1_kept if tok in q2_kept]
    shared += [weights.get(tok, 0) for tok in q2_kept if tok in q1_kept]
    everything = [weights.get(tok, 0) for tok in q1_kept]
    everything += [weights.get(tok, 0) for tok in q2_kept]

    numerator = np.sum(shared).astype(float)
    return numerator / np.sum(everything)

In [None]:
import Levenshtein
import difflib
def extract_fea_2(df_):
    """Build the ``x_*`` length, overlap and edit-distance features for ``df_``.

    ``df_`` must have ``question1`` and ``question2`` columns; every cell is passed
    through ``str()`` before use. Intermediate length columns are computed but only
    the columns whose name starts with ``x`` are returned.

    Relies on the ``Levenshtein`` package and the module-level ``word_match_share``.
    """
    data = pd.DataFrame()
    #distance feature
    data['len_q1'] = df_['question1'].apply(lambda x: len(str(x)))
    data['len_q2'] = df_['question2'].apply(lambda x: len(str(x)))
    data['x_diff_len'] = abs(data['len_q1'] - data['len_q2'])
    # Number of distinct characters in each question.
    data['x_len_char_q1'] = df_['question1'].apply(lambda x: len(set(str(x))))
    data['x_len_char_q2'] = df_['question2'].apply(lambda x: len(set(str(x))))
    data['x_diff_char_len'] = abs(data['x_len_char_q1'] - data['x_len_char_q2'])
    data['len_word_q1'] = df_['question1'].apply(lambda x:len(str(x).split()))
    data['len_word_q2'] = df_['question2'].apply(lambda x:len(str(x).split()))
    data['x_diff_word_len'] = abs(data['len_word_q1'] - data['len_word_q2'])
    
    #jaccard_similarity
    def jaccard_similarity(a, b):
        a, b = set(a), set(b)
        c = a & b
        union_size = len(a) + len(b) - len(c)
        # Both inputs empty (e.g. a whitespace-only question split into words)
        # used to raise ZeroDivisionError; report no similarity instead.
        if union_size == 0:
            return 0.0
        return float(len(c)) / union_size
    data['x_common_unigram_len'] = df_.apply(lambda x: len(set(str(x['question1']).split()).intersection(set(str(x['question2']).split()))), axis=1)
    data['x_match_char_jaccard'] = df_.apply(lambda x: jaccard_similarity(str(x['question1']), str(x['question2'])), axis=1)
    data['x_match_word_jaccard'] = df_.apply(lambda x: jaccard_similarity(str(x['question1']).split(),str(x['question2']).split()), axis=1)
    #edit distance
    data['x_levenshtein_ratio'] = df_.apply(lambda x: Levenshtein.ratio(str(x['question1']),str(x['question2'])), axis=1)
    data['x_levenshtein_seqratio'] = df_.apply(lambda x: Levenshtein.seqratio(str(x['question1']).split(),str(x['question2']).split()),axis=1)
    data['x_levenshtein_setratio'] = df_.apply(lambda x: Levenshtein.setratio(str(x['question1']).split(),str(x['question2']).split()), axis=1)
    #word matching
    data['x_difflib_sim'] = df_.apply(lambda x: difflib.SequenceMatcher(None, str(x['question1']), str(x['question2'])).ratio(), axis=1)
    # NOTE(review): despite the column name this uses the unweighted word_match_share,
    # not tfidf_word_match_share — confirm that is intended.
    data['x_word_match_tf_idf'] = df_.apply(word_match_share, axis=1)
    col = [c for c in data.columns if c[:1]=='x']
    return data[col]

In [None]:
from fuzzywuzzy import fuzz
def gen_fuzz_feature_3(df_):
    """Return a frame of fuzzywuzzy similarity scores between question1 and question2.

    One column per scorer, computed row by row on the ``str()`` of both questions.
    """
    scorers = [
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ]
    data = pd.DataFrame()
    for column, scorer in scorers:
        # `score=scorer` binds the current scorer at lambda creation time.
        data[column] = df_.apply(
            lambda row, score=scorer: score(str(row['question1']), str(row['question2'])),
            axis=1)
    return data

In [None]:
#magic feature
def magic_feature(q1,q2,train,test):
    """Question-frequency ("magic") features for the train and test frames.

    Every distinct question text found in columns ``q1``/``q2`` of ``train`` and
    ``test`` gets an integer id. For each row, ``q1_freq`` (``q2_freq``) is the
    number of times the row's first (second) question occurs anywhere in the
    combined train+test data, in either question column.

    Parameters
    ----------
    q1, q2 : str
        Names of the two question columns to use.
    train, test : pandas.DataFrame
        Frames holding both columns; ``train`` must also hold ``is_duplicate``
        with non-negative values. Neither frame is modified.

    Returns
    -------
    (train_features, test_features) : tuple of pandas.DataFrame
        Each has the columns ``q1_freq`` and ``q2_freq``.
    """
    # Stack the four question columns, in a fixed order, under a single column name.
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    questions = pd.concat([
        train[[q1]],
        train[[q2]].rename(columns={q2: q1}),
        test[[q1]],
        test[[q2]].rename(columns={q2: q1}),
    ])
    questions = questions.drop_duplicates(subset=[q1]).reset_index(drop=True)
    # question text -> integer id (its position among the distinct questions)
    questions_dict = pd.Series(questions.index.values, index=questions[q1].values).to_dict()

    train_cp = train.copy()
    test_cp = test.copy()
    # Test rows are marked with -1 so they can be separated again after the concat.
    test_cp['is_duplicate'] = -1
    test_cp = test_cp.rename(columns={'test_id': 'id'})
    comb = pd.concat([train_cp, test_cp])
    comb['q1_hash'] = comb[q1].map(questions_dict)
    comb['q2_hash'] = comb[q2].map(questions_dict)

    q1_vc = comb.q1_hash.value_counts().to_dict()
    q2_vc = comb.q2_hash.value_counts().to_dict()

    def _occurrences(question_id):
        # Occurrences as a first question plus occurrences as a second question.
        return q1_vc.get(question_id, 0) + q2_vc.get(question_id, 0)

    comb['q1_freq'] = comb['q1_hash'].map(_occurrences)
    comb['q2_freq'] = comb['q2_hash'].map(_occurrences)

    train_features = comb[comb['is_duplicate'] >= 0][['q1_freq', 'q2_freq']]
    test_features = comb[comb['is_duplicate'] < 0][['q1_freq', 'q2_freq']]
    return train_features,test_features

In [None]:
nodl_train_fea = pd.read_csv('./processed_data/train_extract_nodl_fea.csv')

In [None]:
nodl_train_fea.columns

In [None]:
nodl_test_fea = pd.read_csv('./processed_data/test_extract_nodl_fea.csv')

In [None]:
df_train_extract_1 = get_features_1(df_train_clean)
df_test_extract_1 = get_features_1(df_test_clean)

In [None]:
df_train_extract_2 = extract_fea_2(df_train_clean)
df_test_extract_2 = extract_fea_2(df_test_clean)

In [None]:
def get_new_feature(df_pre,df_now):
    """Copy every column of ``df_now`` into ``df_pre`` and return ``df_pre``.

    ``df_pre`` is modified in place; columns with the same name are overwritten.
    """
    column_names = df_now.columns.tolist()
    for column_name in column_names:
        df_pre[column_name] = df_now[column_name]
    return df_pre

In [None]:
extract_train_feature2 = get_new_feature(df_train_extract_1,df_train_extract_2)
extract_test_feature2 = get_new_feature(df_test_extract_1,df_test_extract_2)

In [None]:
train_fuzz_3 = gen_fuzz_feature_3(df_train_clean)
test_fuzz_3 = gen_fuzz_feature_3(df_test_clean)

In [None]:
extract_train_feature3 = get_new_feature(extract_train_feature2,train_fuzz_3)
extract_test_feature3 = get_new_feature(extract_test_feature2,test_fuzz_3)

In [None]:
# extract_train_feature3.to_csv('./processed_data/train_no_dl_fea.csv',index=False)
# extract_test_feature3.to_csv('./processed_data/test_no_dl_fea.csv',index=False)

In [None]:
def magic_feature_all_4(df_train_clean,df_test_clean):
    """Run ``magic_feature`` on each pre-processed variant of the question columns.

    For every variant in ``clean``, ``no_stops``, ``stems`` and ``stems_no_stops`` the
    columns ``q1_<variant>`` / ``q2_<variant>`` are used, and the resulting
    frequencies are stored as ``magic_<variant>_q1`` / ``magic_<variant>_q2``.
    Returns ``(train_frame, test_frame)``.
    """
    train_out = pd.DataFrame()
    test_out = pd.DataFrame()
    for variant in ('clean', 'no_stops', 'stems', 'stems_no_stops'):
        train_freq, test_freq = magic_feature('q1_' + variant, 'q2_' + variant,
                                              df_train_clean, df_test_clean)
        for side in ('q1', 'q2'):
            target = 'magic_' + variant + '_' + side
            train_out[target] = train_freq[side + '_freq']
            test_out[target] = test_freq[side + '_freq']
    return train_out, test_out

In [None]:
magic_train,magic_test = magic_feature_all_4(df_train_clean,df_test_clean)

In [None]:
extract_train_feature4 = get_new_feature(extract_train_feature3,magic_train)
extract_test_feature4 = get_new_feature(extract_test_feature3,magic_test)

In [None]:
# extract_train_feature4.to_csv('./processed_data/train_no_dl_fea.csv',index=False)
# extract_test_feature4.to_csv('./processed_data/test_no_dl_fea.csv',index=False)

In [None]:
# Two object arrays of token lists are added element-wise, so each entry is the
# whitespace tokens of question1 followed by those of question2 for that row.
all_data =df_train_clean.question1.apply(lambda x: str(x).split()).values+df_train_clean.question2.apply(lambda x: str(x).split()).values

In [None]:
# Flatten the per-row token lists into one 1-D array of tokens. The previous second
# argument `a` was never defined anywhere and made this cell fail with NameError.
one_list_data = np.concatenate(all_data)

In [None]:
def _try_divide(x, y, val=0.0):
    """try to divide two numbers"""
    if y != 0.0:
        val = float(x) / y
    return val
def _is_str_match(str1, str2, threshold=1.0):
    assert threshold >= 0.0 and threshold <= 1.0, "Wrong threshold."
    if float(threshold) == 1.0:
        return str1 == str2
    else:
        return (1. - _edit_dist(str1, str2)) >= threshold

In [None]:
from difflib import SequenceMatcher
def jaccard_coef(A, B):
    """Jaccard coefficient ``|A & B| / |A | B|`` of two collections (0.0 if both are empty).

    Arguments that are not already sets are converted with ``set()``.
    """
    left = A if isinstance(A, set) else set(A)
    right = B if isinstance(B, set) else set(B)
    n_common = len(left.intersection(right))
    n_total = len(left.union(right))
    return _try_divide(float(n_common), n_total)
def dice_dist(A, B):
    """Dice coefficient ``2 * |A & B| / (|A| + |B|)`` of two collections (0.0 if both are empty).

    Arguments that are not already sets are converted with ``set()``.
    """
    left = A if isinstance(A, set) else set(A)
    right = B if isinstance(B, set) else set(B)
    twice_common = 2.*float(len(left.intersection(right)))
    size_sum = len(left) + len(right)
    return _try_divide(twice_common, size_sum)
def edit_dist(str1, str2):
    """Normalized edit distance between two strings.

    Preferred path: Levenshtein distance divided by the length of the longer
    string. If that fails for any ordinary reason (the ``Levenshtein`` module is
    not available, both strings are empty so the division is by zero, ...), the
    result falls back to ``1 - difflib ratio`` with spaces treated as junk.
    """
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit are no longer swallowed during long feature runs.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
    return d
import lzma
def compression_dist(x, y, l_x=None, l_y=None):
    """Compression-based distance between two text strings, using LZMA.

    Computes ``(min(C(xy), C(yx)) - min(C(x), C(y))) / max(C(x), C(y))`` where
    ``C`` is the length of the LZMA-compressed UTF-8 bytes. Returns 0 when the
    strings are equal.

    ``l_x`` / ``l_y`` may carry pre-computed values of ``C(x)`` / ``C(y)``. Each one
    is computed here only if it was not supplied; previously ``l_y`` was left as
    None whenever only ``l_x`` was passed, which broke the ``min``/``max`` below.
    """
    if x == y:
        return 0
    x_b = x.encode('utf-8')
    y_b = y.encode('utf-8')
    if l_x is None:
        l_x = len(lzma.compress(x_b))
    if l_y is None:
        l_y = len(lzma.compress(y_b))
    l_xy = len(lzma.compress(x_b+y_b))
    l_yx = len(lzma.compress(y_b+x_b))
    longest = max(l_x, l_y)
    # Guarded division, equivalent to the notebook's _try_divide helper.
    if longest == 0:
        return 0.0
    return float(min(l_xy, l_yx) - min(l_x, l_y)) / longest
def cosine_d(a,b):
    """Overlap score: ``|set(a) & set(b)| / (|set(a)| * |set(b)|)``.

    Returns 0.0 when either collection is empty. Note that the denominator is the
    plain product of the two set sizes, not its square root.
    """
    left, right = set(a), set(b)
    size_product = len(left) * len(right)
    if size_product == 0:
        return 0.0
    return float(len(left & right)) / size_product

In [None]:
def IntersectCount_Ngram(obs_ngrams,target_ngrams):
    """Number (as a float) of n-grams in ``obs_ngrams`` that match at least one n-gram of ``target_ngrams``."""
    hits = sum(
        1 for observed in obs_ngrams
        if any(_is_str_match(observed, candidate) for candidate in target_ngrams)
    )
    return float(hits)
def IntersectRatio_Ngram(obs_ngrams,target_ngrams):
    """Fraction of n-grams in ``obs_ngrams`` that match at least one n-gram of ``target_ngrams``.

    Returns 0.0 when ``obs_ngrams`` is empty.
    """
    hits = sum(
        1 for observed in obs_ngrams
        if any(_is_str_match(observed, candidate) for candidate in target_ngrams)
    )
    return _try_divide(float(hits), len(obs_ngrams))
def CooccurrenceCount_Ngram(obs_ngrams,target_ngrams):
    """Number (as a float) of matching (observed, target) n-gram pairs, counting every pair."""
    pair_hits = sum(
        1
        for observed in obs_ngrams
        for candidate in target_ngrams
        if _is_str_match(observed, candidate)
    )
    return float(pair_hits)
def CooccurrenceRatio_Ngram(obs_ngrams,target_ngrams):
    """Matching (observed, target) n-gram pairs divided by the number of possible pairs.

    Returns 0.0 when either list is empty.
    """
    pair_hits = sum(
        1
        for observed in obs_ngrams
        for candidate in target_ngrams
        if _is_str_match(observed, candidate)
    )
    possible_pairs = len(obs_ngrams)*len(target_ngrams)
    return _try_divide(float(pair_hits), possible_pairs)

In [None]:
from nltk import word_tokenize
from nltk.util import ngrams
def feature5(df_,q1,q2):
    """N-gram overlap and distance features between columns ``q1`` and ``q2`` of ``df_``.

    Every cell is converted with ``str()``, tokenized with ``nltk.word_tokenize`` and
    turned into word bigrams and trigrams. The returned frame shares ``df_``'s index
    and has one column per feature (Jaccard, Dice, intersect/co-occurrence counts
    and ratios for 2- and 3-grams, plus ``edit_dist`` and ``cosine_dist`` on the raw
    strings).

    Differences from the previous implementation:
    * each question is tokenized once per row instead of once per feature;
    * ``.decode('utf-8')`` is applied only to byte strings, so Python 3 text (which
      has no ``.decode``) no longer raises AttributeError.
    """
    def _tokens(text_value):
        if isinstance(text_value, bytes):  # Python 2 `str`
            text_value = text_value.decode('utf-8')
        return nltk.word_tokenize(text_value)

    columns = [
        'JaccardCoef_2gram', 'DiceDistance_2gram',
        'JaccardCoef_3gram', 'DiceDistance_3gram',
        'edit_dist', 'cosine_dist',
        'IntersectCount_2gram', 'IntersectRatio_2gram',
        'CooccurrenceCount_2gram', 'CooccurrenceRatio_2gram',
        'IntersectCount_3gram', 'IntersectRatio_3gram',
        'CooccurrenceCount_3gram', 'CooccurrenceRatio_3gram',
    ]
    rows = []
    for raw1, raw2 in zip(df_[q1], df_[q2]):
        s1, s2 = str(raw1), str(raw2)
        tok1, tok2 = _tokens(s1), _tokens(s2)
        bi1, bi2 = list(ngrams(tok1, 2)), list(ngrams(tok2, 2))
        tri1, tri2 = list(ngrams(tok1, 3)), list(ngrams(tok2, 3))
        # Values are appended in the same order as `columns`.
        rows.append((
            jaccard_coef(bi1, bi2), dice_dist(bi1, bi2),
            jaccard_coef(tri1, tri2), dice_dist(tri1, tri2),
            edit_dist(s1, s2), cosine_d(s1.split(), s2.split()),
            IntersectCount_Ngram(bi1, bi2), IntersectRatio_Ngram(bi1, bi2),
            CooccurrenceCount_Ngram(bi1, bi2), CooccurrenceRatio_Ngram(bi1, bi2),
            IntersectCount_Ngram(tri1, tri2), IntersectRatio_Ngram(tri1, tri2),
            CooccurrenceCount_Ngram(tri1, tri2), CooccurrenceRatio_Ngram(tri1, tri2),
        ))
    return pd.DataFrame(rows, columns=columns, index=df_.index)

In [None]:
data_fea_train5 =feature5(df_train_clean,'question1','question2')

In [None]:
data_fea_train5.head(50)

In [None]:
data_fea_test5 =feature5(df_test_clean,'question1','question2')

In [None]:
data_fea_test5.to_csv('./processed_data/fea_5_test.csv',index=False)

In [None]:
data_fea_train5.to_csv('./processed_data/fea_5_train.csv',index=False)

In [None]:
nodl_train_fea = pd.read_csv('./processed_data/train_no_dl_fea.csv')
nodl_test_fea = pd.read_csv('./processed_data/test_no_dl_fea.csv')

In [None]:
for c in data_fea_train5.columns:
    nodl_train_fea[c] = data_fea_train5[c]

In [None]:
for c in data_fea_test5.columns:
    nodl_test_fea[c] = data_fea_test5[c]

In [None]:
nodl_train_fea.to_csv('./processed_data/train_no_dl_fea.csv',index=False)
# The test frame received the same new columns above and a later cell re-reads
# test_no_dl_fea.csv from disk, so it has to be written as well to keep the
# train and test feature files in step.
nodl_test_fea.to_csv('./processed_data/test_no_dl_fea.csv',index=False)

In [None]:
nodl_train_fea.columns

In [None]:
def str_common_word(str1, str2, minLength=1, string_only=False):
    """Count the whitespace-separated words of ``str1`` that occur as whole words in ``str2``.

    Parameters
    ----------
    str1, str2 : str
    minLength : int
        Words of ``str1`` shorter than this are ignored.
    string_only : bool
        When true, words of ``str1`` containing a digit are ignored.

    Returns
    -------
    tuple
        ``(num, total_entries, cnt_unique_letters, ratio_num, ratio_letters, words)``:
        number of considered words found in ``str2``; total occurrences of those
        words in ``str2``; summed length of the found words; found / considered
        words; found / considered letters; the found words joined by spaces.
        Both ratios are 0 when no word was considered.
    """
    # Local import: `re` is not imported at file level, so string_only=True used to
    # fail with NameError.
    import re

    padded_target = ' ' + str2 + ' '  # padding makes ' word ' match whole words only
    word_list = []
    num = 0
    total_entries = 0
    cnt_unique_letters = 0
    all_num = 0
    all_cnt_letters = 0
    for word in str1.split():
        if len(word) < minLength:
            continue
        if string_only and re.search(r'\d', word):
            continue
        all_num += 1
        all_cnt_letters += len(word)
        occurrences = padded_target.count(' ' + word + ' ')
        if occurrences:
            num += 1
            total_entries += occurrences
            cnt_unique_letters += len(word)
            word_list.append(word)

    ratio_num = 1.0*num/all_num if all_num else 0
    ratio_letters = 1.0*cnt_unique_letters/all_cnt_letters if all_cnt_letters else 0

    return num, total_entries, cnt_unique_letters, ratio_num, ratio_letters, " ".join(word_list)

In [None]:
def seq_matcher(s1,s2):
    """Return ``(ratio, scaled_ratio)`` for two sequences.

    ``ratio`` is difflib's similarity ratio rounded to 7 decimals; ``scaled_ratio``
    multiplies it by ``longer length / shorter length`` (also rounded to 7 decimals).
    Both values are 0 when either sequence is empty.
    """
    ratio = round(difflib.SequenceMatcher(None, s1, s2).ratio(), 7)
    len1, len2 = len(s1), len(s2)
    if len1 == 0 or len2 == 0:
        return 0, 0
    scaled = round(ratio*max(len1, len2)/min(len1, len2), 7)
    return ratio, scaled

In [None]:
def feature6(df_):  
    """Word-overlap and sequence-match features on the stemmed question columns.

    Reads ``q1_stemmed`` and ``q2_stemmed`` from ``df_`` and returns a frame with the
    same index and the columns ``f_word_in_letratio`` (letter ratio reported by
    ``str_common_word``), ``f_seqmatch_ratio`` and ``f_seqmatch_ratioscaled`` (the two
    values reported by ``seq_matcher``).

    Cells are converted with ``str()`` first, as the other feature builders in this
    notebook do, so a missing value (NaN float) is treated as the text 'nan'
    instead of crashing on ``.split()`` / ``len()``.
    """
    pairs = [(str(a), str(b)) for a, b in zip(df_['q1_stemmed'], df_['q2_stemmed'])]
    df_all = pd.DataFrame(index=df_.index)
    # Element 4 of str_common_word's result tuple is the letter ratio.
    df_all['f_word_in_letratio'] = [str_common_word(a, b)[4] for a, b in pairs]
    seq_results = [seq_matcher(a, b) for a, b in pairs]
    df_all['f_seqmatch_ratio'] = [r[0] for r in seq_results]
    df_all['f_seqmatch_ratioscaled'] = [r[1] for r in seq_results]
    return df_all

In [None]:
train_fea6 = feature6(df_train_clean)

In [None]:
test_fea6 = feature6(df_test_clean)

In [None]:
def feature7(df_,name,q1,q2):
    """Same features as ``feature5`` for columns ``q1``/``q2``, with ``name`` appended to every column name.

    The body used to be a line-for-line copy of ``feature5`` differing only in the
    column names, so it now delegates to ``feature5`` (defined in an earlier cell)
    and renames the result. ``name`` is appended as-is, without a separator.
    """
    data = feature5(df_, q1, q2)
    data.columns = [column + name for column in data.columns]
    return data

In [None]:
import pandas as pd
df_train_clean = pd.read_csv('./processed_data/train_clean.csv')

In [None]:
df_test_clean = pd.read_csv('./processed_data/test_clean.csv')

In [None]:
import nltk

In [None]:
data_fea_train7 =feature7(df_train_clean,'no_stops','q1_no_stops','q2_no_stops')

In [None]:
data_fea_test7 =feature7(df_test_clean,'no_stops','q1_no_stops','q2_no_stops')

In [None]:
nodl_train_fea = pd.read_csv('./processed_data/train_no_dl_fea.csv')
nodl_test_fea = pd.read_csv('./processed_data/test_no_dl_fea.csv')

In [None]:
for c in data_fea_train7.columns:
    nodl_train_fea[c] = data_fea_train7[c]
for c in data_fea_test7.columns:
    nodl_test_fea[c] = data_fea_test7[c]

In [None]:
nodl_train_fea.to_csv('./processed_data/train_no_dl_fea.csv',index=False)
nodl_test_fea.to_csv('./processed_data/test_no_dl_fea.csv',index=False)

In [None]:
nodl_test_fea.columns

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime
import operator
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split
from collections import Counter
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from pylab import plot, show, subplot, specgram, imshow, savefig

RS = 12357
ROUNDS = 315

print("Started")
np.random.seed(RS)

In [None]:
from nltk.corpus import stopwords
def add_word_count(x, df, word):
    """Add 0/1 indicator columns for ``word`` to the feature frame ``x`` (in place).

    ``q1_<word>`` / ``q2_<word>`` flag whether ``word`` occurs as a substring of the
    lower-cased ``question1`` / ``question2`` text of ``df``; ``<word>_both`` is their
    product. Returns None.
    """
    q1_column = 'q1_' + word
    q2_column = 'q2_' + word

    def contains_word(cell):
        return int(word in str(cell).lower())

    x[q1_column] = df['question1'].apply(contains_word)
    x[q2_column] = df['question2'].apply(contains_word)
    x[word + '_both'] = x[q1_column] * x[q2_column]
def feature8(train,test):
    """Build the ``h_*`` and question-word features for the train and test frames.

    Both frames must have ``question1`` and ``question2`` columns. They are copied,
    concatenated (train rows first) and processed together; the result is split
    back by position.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        Feature frames for the train rows and the test rows.

    Changes from the previous version:
    * ``total_weights`` now sums the weights of both questions (it added question
      1's weights twice);
    * ``word_shares`` is applied without ``raw=True`` because it indexes the row by
      column label;
    * unused locals (``feature_names``, ``y_train``) were removed.
    """
    df_train = train.copy()
    df_test  = test.copy()
    print("Original data: X_train: {}, X_test: {}".format(df_train.shape, df_test.shape))

    print("Features processing, be patient...")

    # If a word appears only once, we ignore it completely (likely a typo)
    # Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
    def get_weight(count, eps=10000, min_count=2):
        return 0.0 if count < min_count else 1.0 / (count + eps)

    # Word weights are derived from the train questions only.
    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
    words = (" ".join(train_qs)).lower().split()
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    stops = set(stopwords.words("english"))
    def word_shares(row):
        """Return eight ':'-separated scores for one question pair (all zeros if
        either question has no non-stopword tokens)."""
        q1_list = str(row['question1']).lower().split()
        q1 = set(q1_list)
        q1words = q1.difference(stops)
        if len(q1words) == 0:
            return '0:0:0:0:0:0:0:0'

        q2_list = str(row['question2']).lower().split()
        q2 = set(q2_list)
        q2words = q2.difference(stops)
        if len(q2words) == 0:
            return '0:0:0:0:0:0:0:0'

        # Share of positions holding the same word, relative to the longer question.
        words_hamming = sum(1.0 for i in zip(q1_list, q2_list) if i[0]==i[1])/float(max(len(q1_list), len(q2_list)))

        q1stops = q1.intersection(stops)
        q2stops = q2.intersection(stops)

        q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])
        q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])

        shared_2gram = q1_2gram.intersection(q2_2gram)

        shared_words = q1words.intersection(q2words)
        shared_weights = [weights.get(w, 0) for w in shared_words]
        q1_weights = [weights.get(w, 0) for w in q1words]
        q2_weights = [weights.get(w, 0) for w in q2words]
        # Fixed: was `q1_weights + q1_weights`, which ignored question 2 entirely.
        total_weights = q1_weights + q2_weights

        R1 = np.sum(shared_weights) / float(np.sum(total_weights)) #tfidf share
        R2 = len(shared_words) / float(len(q1words) + len(q2words) - len(shared_words)) #count share
        R31 = len(q1stops) / float(len(q1words)) #stops in q1
        R32 = len(q2stops) / float(len(q2words))#stops in q2
        Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights)).astype(float)*np.sqrt(np.dot(q2_weights,q2_weights)))
        Rcosine = float(np.dot(shared_weights, shared_weights))/Rcosine_denominator
        if len(q1_2gram) + len(q2_2gram) == 0:
            R2gram = 0.0
        else:
            R2gram = float(len(shared_2gram)) / (len(q1_2gram) + len(q2_2gram))
        return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, float(len(shared_words)), R31, R32, R2gram, Rcosine, words_hamming)

    df = pd.concat([df_train, df_test])
    # No raw=True: word_shares looks fields up by column label and needs a Series.
    df['word_shares'] = df.apply(word_shares, axis=1)

    x = pd.DataFrame()

    # NOTE(review): field 0 of word_shares is the weighted ("tfidf") share and field 1
    # the plain count share, so the two column names below look swapped. They are
    # kept as they were so previously saved feature files stay comparable — confirm.
    x['h_word_match']       = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
    x['h_word_match_2root'] = np.sqrt(x['h_word_match'])
    x['h_tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
    x['h_shared_count']     = df['word_shares'].apply(lambda x: float(x.split(':')[2]))

    x['h_stops1_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
    x['h_stops2_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
    x['h_shared_2gram']     = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
    x['h_cosine']           = df['word_shares'].apply(lambda x: float(x.split(':')[6]))
    x['h_words_hamming']    = df['word_shares'].apply(lambda x: float(x.split(':')[7]))
    x['h_diff_stops_r']     = x['h_stops1_ratio'] - x['h_stops2_ratio']

    x['h_len_q1'] = df['question1'].apply(lambda x: len(str(x)))
    x['h_len_q2'] = df['question2'].apply(lambda x: len(str(x)))
    x['h_diff_len'] = x['h_len_q1'] - x['h_len_q2']

    x['h_caps_count_q1'] = df['question1'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
    x['h_caps_count_q2'] = df['question2'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
    x['h_diff_caps'] = x['h_caps_count_q1'] - x['h_caps_count_q2']

    x['h_len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
    x['h_len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
    x['h_diff_len_char'] = x['h_len_char_q1'] - x['h_len_char_q2']

    x['h_len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
    x['h_len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
    x['h_diff_len_word'] = x['h_len_word_q1'] - x['h_len_word_q2']

    x['h_avg_world_len1'] = x['h_len_char_q1'] / x['h_len_word_q1']
    x['h_avg_world_len2'] = x['h_len_char_q2'] / x['h_len_word_q2']
    x['h_diff_avg_word'] = x['h_avg_world_len1'] - x['h_avg_world_len2']

    x['h_exactly_same'] = (df['question1'] == df['question2']).astype(int)
    x['h_duplicated'] = df.duplicated(['question1','question2']).astype(int)
    for question_word in ('how', 'what', 'which', 'who', 'where', 'when', 'why'):
        add_word_count(x, df, question_word)

    print(x.columns)
    print(x.describe())

    # Train rows come first in the concatenated frame, so a positional split recovers them.
    x_train = x[:df_train.shape[0]]
    x_test  = x[df_train.shape[0]:]
    return x_train,x_test

In [None]:
import pandas as pd
df_train = pd.read_csv('train_org.csv')

In [None]:
df_test = pd.read_csv('test_final.csv')

In [None]:
train_fea8,test_fea8 = feature8(df_train,df_test)

In [None]:
test_fea8.to_csv('./processed_data/test_no_dl_fea1.csv',index=False)

In [None]:
train_fea8.to_csv('./processed_data/train_no_dl_fea1.csv',index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
seed = 1024
np.random.seed(seed)
train = pd.read_csv("train_org.csv")
test = pd.read_csv("test_final.csv")

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
def stem_str(x,stemmer=SnowballStemmer('english')):
    """Replace every character of ``x`` outside ``[a-zA-Z0-9]`` with a space.

    Despite the name, no stemming happens here: the stemming and the
    whitespace-collapsing steps below are commented out, so ``stemmer`` is
    accepted but never used.

    Args:
        x: input string.
        stemmer: stemmer object; currently ignored. The default instance is
            created once, when this ``def`` statement is executed.

    Returns:
        ``x`` with each non-alphanumeric character replaced by a single space
        (runs of such characters become runs of spaces).
    """
    # `text.re` is presumably the stdlib `re` module as exposed by
    # sklearn.feature_extraction.text -- TODO confirm.
    x = text.re.sub("[^a-zA-Z0-9]"," ", x)
    #x = " ".join([stemmer.stem(z) for z in x.split(" ")])
    #x = " ".join(x.split())
    return x

# Stemmer instances; only `porter` is handed to stem_str in this cell.
porter = PorterStemmer()
snowball = SnowballStemmer('english')


print('Generate porter')
# NOTE: stem_str currently ignores its stemmer argument (its stemming line is
# commented out), so the *_porter columns hold the lowercased question with
# non-alphanumeric characters replaced by spaces, not Porter stems.
# astype(str) turns missing questions into the literal string 'nan' first.
train['question1_porter'] = train['question1'].astype(str).apply(lambda x:stem_str(x.lower(),porter))
test['question1_porter'] = test['question1'].astype(str).apply(lambda x:stem_str(x.lower(),porter))

train['question2_porter'] = train['question2'].astype(str).apply(lambda x:stem_str(x.lower(),porter))
test['question2_porter'] = test['question2'].astype(str).apply(lambda x:stem_str(x.lower(),porter))

In [None]:
train.to_csv('./processed_data/train_porter.csv')
test.to_csv('./processed_data/test_porter.csv')

In [None]:
# train = pd.read_csv(path+"train_porter.csv")
# test = pd.read_csv(path+"test_porter.csv")
# Give every test row a -1 placeholder so train and test can be stacked below.
# NOTE(review): the column is spelled 'is_duplicated' here, while the training
# label read elsewhere in this notebook is 'is_duplicate' -- confirm which name
# is intended, otherwise the stacked frame carries both columns.
test['is_duplicated']=[-1]*test.shape[0]
# Number of training rows, i.e. the position where test rows start in data_all.
len_train = train.shape[0]
data_all = pd.concat([train,test])

In [None]:
import distance
def _try_divide(x, y, val=0.0):
    """Return ``float(x) / y``, falling back to ``val`` when ``y`` is zero."""
    if y != 0.0:
        return float(x) / y
    return val


def calc_set_intersection(text_a, text_b):
    """Share of the distinct tokens of ``text_a`` that also occur in ``text_b``.

    Both strings are split on whitespace. The result is
    ``len(A & B) / len(A)`` and is 0.0 when ``text_a`` has no tokens.
    """
    tokens_a = set(text_a.split())
    tokens_b = set(text_b.split())
    overlap = tokens_a & tokens_b
    return _try_divide(len(overlap) * 1.0, len(tokens_a))
def str_jaccard(str1, str2):
    """Jaccard distance between the token lists of two strings.

    Each string is split on single spaces (so consecutive spaces yield empty
    tokens) and the two lists are handed to ``distance.jaccard``.
    """
    return distance.jaccard(str1.split(" "), str2.split(" "))

# shortest alignment
def str_levenshtein_1(str1, str2):
    """Normalized Levenshtein distance between two strings, ``method=1``.

    The raw strings are passed straight to ``distance.nlevenshtein``, so the
    comparison is made character by character rather than word by word. The
    space-split token lists that were built here before were never used and
    have been removed.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        The value of ``distance.nlevenshtein(str1, str2, method=1)``.
    """
    return distance.nlevenshtein(str1, str2, method=1)

# longest alignment
def str_levenshtein_2(str1, str2):
    """Normalized Levenshtein distance between two strings, ``method=2``.

    The raw strings are passed straight to ``distance.nlevenshtein``, so the
    comparison is made character by character rather than word by word. The
    space-split token lists that were built here before were never used and
    have been removed.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        The value of ``distance.nlevenshtein(str1, str2, method=2)``.
    """
    return distance.nlevenshtein(str1, str2, method=2)

def str_sorensen(str1, str2):
    """Sorensen distance between the token lists of two strings.

    Each string is split on single spaces and the two lists are handed to
    ``distance.sorensen``.
    """
    tokens_1, tokens_2 = str1.split(' '), str2.split(' ')
    return distance.sorensen(tokens_1, tokens_2)
def feature9(df_):
    """Compute string-overlap and edit-distance features for question pairs.

    Every metric is evaluated twice per row: once on the
    ``question1``/``question2`` columns and once on the
    ``question1_porter``/``question2_porter`` columns.

    NOTE: a later cell defines a zero-argument function that is also named
    ``feature9``; once that cell has run, this definition is replaced.

    Args:
        df_: DataFrame holding the four question columns named above. All
            values are converted to ``str`` before the metrics are applied.

    Returns:
        A new DataFrame with the columns ``w_interaction``, ``w_jaccard``,
        ``w_levenshtein_1``, ``w_levenshtein_2`` and ``w_sorensen``, each
        immediately followed by its ``w_porter_*`` counterpart.
    """
    # Convert to str once; this used to be redone for every one of the ten
    # features, copying the whole frame each time.
    as_text = df_.astype(str)

    def pairwise(metric, col_a, col_b):
        # Apply `metric` to the two named columns of every row.
        return as_text.apply(lambda row: metric(row[col_a], row[col_b]), axis=1)

    # (progress label, output column suffix, metric) in output-column order.
    metrics = [
        ('intersection', 'interaction', calc_set_intersection),
        ('jaccard', 'jaccard', str_jaccard),
        ('levenshtein_1', 'levenshtein_1', str_levenshtein_1),
        ('levenshtein_2', 'levenshtein_2', str_levenshtein_2),
        ('sorensen', 'sorensen', str_sorensen),
    ]

    data = pd.DataFrame()
    for label, suffix, metric in metrics:
        print('Generate ' + label)
        data['w_' + suffix] = pairwise(metric, 'question1', 'question2')
        print('Generate porter ' + label)
        data['w_porter_' + suffix] = pairwise(metric, 'question1_porter', 'question2_porter')
    return data

In [None]:
train_fea9 = feature9(train)

In [None]:
test_fea9 = feature9(test)

In [None]:
for c in train_fea9.columns:
    train_fea8[c] = train_fea9[c]

In [None]:
for c in test_fea9.columns:
    test_fea8[c] = test_fea9[c]

In [None]:
test_fea8.to_csv('./processed_data/test_no_dl_fea1.csv',index=False)
train_fea8.to_csv('./processed_data/train_no_dl_fea1.csv',index=False)

In [None]:
import pandas as pd

In [None]:
train_orig = pd.read_csv('train_org.csv')

In [None]:
test_orig = pd.read_csv('test_final.csv')

In [None]:
# Stack the train and test question pairs into one frame with a fresh
# 0..n-1 index. `drop` is a boolean flag: True discards the old index instead
# of keeping it as a column (the former drop='index' only worked because a
# non-empty string is truthy).
ques = pd.concat(
    [train_orig[['question1', 'question2']], test_orig[['question1', 'question2']]],
    axis=0,
).reset_index(drop=True)
ques.shape

In [None]:
from collections import defaultdict

In [None]:
# Map each question to the set of questions it is paired with anywhere in
# `ques` (train and test together).
q_dict = defaultdict(set)
# Iterate the two columns in lockstep instead of doing two Series lookups per
# row index, which is far slower on millions of rows. The loop variables are
# deliberately not called q1/q2: those names hold column names elsewhere in
# this notebook.
for first_question, second_question in zip(ques.question1, ques.question2):
    q_dict[first_question].add(second_question)
    q_dict[second_question].add(first_question)

In [None]:
def q1_q2_intersect(row):
    """Count the questions paired with both questions of ``row``.

    Looks up ``row['question1']`` and ``row['question2']`` in the
    notebook-level ``q_dict`` and returns the size of the intersection of the
    two partner collections.
    """
    partners_of_q1 = set(q_dict[row['question1']])
    partners_of_q2 = set(q_dict[row['question2']])
    return len(partners_of_q1 & partners_of_q2)

In [None]:
# q1_q2_intersect indexes each row by column name, so rows must arrive as
# labelled Series. raw=True makes pandas pass bare ndarrays, for which
# row['question1'] raises an IndexError, so it is not used here.
train_orig['q1_q2_intersect'] = train_orig.apply(q1_q2_intersect, axis=1)
test_orig['q1_q2_intersect'] = test_orig.apply(q1_q2_intersect, axis=1)

In [None]:
temp = train_orig.q1_q2_intersect.value_counts()

In [None]:
train_feat = train_orig[['q1_q2_intersect']]
test_feat = test_orig[['q1_q2_intersect']]

In [None]:
train_feat.describe()

In [None]:
test_fea8 = pd.read_csv('./processed_data/test_no_dl_fea1.csv')

In [None]:
test_fea8 = pd.read_csv('./processed_data/test_no_dl_fea1.csv')
train_fea8 = pd.read_csv('./processed_data/train_no_dl_fea1.csv')

In [None]:
train_fea8['magicfea2'] = train_feat.q1_q2_intersect

In [None]:
test_fea8['magicfea2'] = test_feat.q1_q2_intersect

In [None]:
import argparse
import functools
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import log_loss
# sklearn.cross_validation was removed from scikit-learn; train_test_split now
# lives in sklearn.model_selection. Fall back to the old module only on
# installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from xgboost import XGBClassifier


def word_match_share(row, stops=None):
    """Share of distinct non-stopword tokens that the two questions have in common.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are iterables of tokens.
        stops: container of tokens to ignore. ``None`` (the default) means
            "ignore nothing"; previously the default raised a TypeError.

    Returns:
        ``2 * |Q1 & Q2| / (|Q1| + |Q2|)`` over the distinct remaining tokens,
        or 0 when either question has no tokens left after stopword removal.
    """
    if stops is None:
        stops = ()
    q1words = {word for word in row['question1'] if word not in stops}
    q2words = {word for word in row['question2'] if word not in stops}
    if not q1words or not q2words:
        # Some questions consist of nothing but stopwords.
        return 0
    # Each shared word is counted once per question, hence the factor 2.
    shared = len(q1words & q2words)
    return (2.0 * shared) / (len(q1words) + len(q2words))

def jaccard(row):
    """Jaccard similarity of the distinct tokens of the two questions.

    Returns ``|Q1 & Q2| / |Q1 | Q2|``; when both questions are empty the
    denominator is taken as 1, so the result is 0.0.
    """
    tokens_1 = set(row['question1'])
    tokens_2 = set(row['question2'])
    union_size = len(tokens_1.union(tokens_2))
    if union_size == 0:
        union_size = 1
    return float(len(tokens_1 & tokens_2)) / union_size

def common_words(row):
    """Number of distinct tokens that occur in both questions of ``row``."""
    tokens_1 = set(row['question1'])
    tokens_2 = set(row['question2'])
    return len(tokens_1 & tokens_2)

def total_unique_words(row):
    """Number of distinct tokens across both questions of ``row``."""
    vocabulary = set(row['question1'])
    vocabulary.update(row['question2'])
    return len(vocabulary)

def total_unq_words_stop(row, stops):
    """Number of distinct tokens across both questions that are not in ``stops``."""
    vocabulary = set(row['question1']).union(row['question2'])
    return sum(1 for token in vocabulary if token not in stops)

def wc_diff(row):
    """Absolute difference between the token counts of the two questions."""
    count_1 = len(row['question1'])
    count_2 = len(row['question2'])
    return abs(count_1 - count_2)

def wc_ratio(row):
    """Ratio of the smaller to the larger token count of the two questions.

    The previous condition ``if l1 / l2:`` only tested the ratio for being
    non-zero, so the function returned ``l2 / l1`` for every non-empty
    ``question1`` and the feature depended on the order of the questions.
    The comparison against 1 that the two branches imply is now explicit.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are token sequences.

    Returns:
        A float in ``[0, 1]``, or ``np.nan`` when ``question2`` is empty.
    """
    l1 = len(row['question1']) * 1.0
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    # Always divide the shorter by the longer so the result never exceeds 1.
    if l1 / l2 > 1:
        return l2 / l1
    return l1 / l2

def wc_diff_unique(row):
    """Absolute difference between the distinct-token counts of the two questions."""
    unique_1 = len(set(row['question1']))
    unique_2 = len(set(row['question2']))
    return abs(unique_1 - unique_2)

def wc_ratio_unique(row):
    """Ratio of the smaller to the larger distinct-token count of the two questions.

    The previous condition ``if l1 / l2:`` only tested the ratio for being
    non-zero, so ``l2 / l1`` was returned for every non-empty ``question1``
    and the feature depended on the order of the questions. The comparison
    against 1 that the two branches imply is now explicit.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are token sequences.

    Returns:
        A float in ``[0, 1]``, or ``np.nan`` when ``question2`` is empty.
    """
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    # Always divide the smaller count by the larger one.
    if l1 / l2 > 1:
        return l2 / l1
    return l1 / l2

def wc_diff_unique_stop(row, stops=None):
    """Absolute difference between the distinct non-stopword token counts.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are token iterables.
        stops: container of tokens to ignore. ``None`` (the default) means
            "ignore nothing"; previously the default raised a TypeError.

    Returns:
        ``abs(n1 - n2)`` where ``n1``/``n2`` count the distinct tokens of each
        question that are not in ``stops``.
    """
    if stops is None:
        stops = ()
    kept_1 = sum(1 for token in set(row['question1']) if token not in stops)
    kept_2 = sum(1 for token in set(row['question2']) if token not in stops)
    return abs(kept_1 - kept_2)

def wc_ratio_unique_stop(row, stops=None):
    """Ratio of the smaller to the larger distinct non-stopword token count.

    Two defects of the previous version are fixed: the condition
    ``if l1 / l2:`` only tested the ratio for being non-zero (so ``l2 / l1``
    was returned for every non-empty ``question1``), and the default
    ``stops=None`` raised a TypeError.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are token iterables.
        stops: container of tokens to ignore; ``None`` means "ignore nothing".

    Returns:
        A float in ``[0, 1]``, or ``np.nan`` when ``question2`` has no tokens
        left after stopword removal.
    """
    if stops is None:
        stops = ()
    l1 = sum(1 for token in set(row['question1']) if token not in stops) * 1.0
    l2 = sum(1 for token in set(row['question2']) if token not in stops)
    if l2 == 0:
        return np.nan
    # Always divide the smaller count by the larger one.
    if l1 / l2 > 1:
        return l2 / l1
    return l1 / l2

def same_start_word(row):
    """Tell whether both questions begin with the same token.

    Returns 1 or 0 for non-empty questions and ``np.nan`` when either question
    is empty.
    """
    if row['question1'] and row['question2']:
        first_1 = row['question1'][0]
        first_2 = row['question2'][0]
        return int(first_1 == first_2)
    return np.nan

def char_diff(row):
    """Absolute difference in total character count between the two questions.

    Characters are counted over the concatenated tokens, so separators between
    tokens do not contribute.
    """
    chars_1 = len(''.join(row['question1']))
    chars_2 = len(''.join(row['question2']))
    return abs(chars_1 - chars_2)

def char_ratio(row):
    """Ratio of the smaller to the larger character count of the two questions.

    Characters are counted over the concatenated tokens. The previous
    condition ``if l1 / l2:`` only tested the ratio for being non-zero, so
    ``l2 / l1`` was returned for every non-empty ``question1`` and the feature
    depended on the order of the questions. The comparison against 1 that the
    two branches imply is now explicit.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are sequences of string tokens.

    Returns:
        A float in ``[0, 1]``, or ``np.nan`` when ``question2`` has no
        characters.
    """
    l1 = len(''.join(row['question1'])) * 1.0
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    # Always divide the smaller count by the larger one.
    if l1 / l2 > 1:
        return l2 / l1
    return l1 / l2

def char_diff_unique_stop(row, stops=None):
    """Absolute difference in characters over distinct non-stopword tokens.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are iterables of string tokens.
        stops: container of tokens to ignore. ``None`` (the default) means
            "ignore nothing"; previously the default raised a TypeError.

    Returns:
        ``abs(c1 - c2)`` where ``c1``/``c2`` are the summed lengths of each
        question's distinct tokens that are not in ``stops``.
    """
    if stops is None:
        stops = ()
    chars_1 = len(''.join(token for token in set(row['question1']) if token not in stops))
    chars_2 = len(''.join(token for token in set(row['question2']) if token not in stops))
    return abs(chars_1 - chars_2)


def get_weight(count, eps=10000, min_count=2):
    """Inverse-frequency weight for a word seen ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0.0; all others get
    ``1 / (count + eps)``, where ``eps`` damps the weight of rare words.
    """
    return 0.0 if count < min_count else 1.0 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    """Weighted share of non-stopword tokens that the two questions have in common.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are token iterables.
        stops: container of tokens to ignore; ``None`` means "ignore nothing"
            (the default used to raise a TypeError).
        weights: mapping from token to weight; tokens missing from it weigh 0.
            ``None`` is treated as an empty mapping (the default used to raise
            an AttributeError).

    Returns:
        0 when either question has no tokens left after stopword removal;
        ``np.nan`` when the remaining tokens carry no weight at all (this used
        to be an implicit 0/0 that also emitted a RuntimeWarning); otherwise
        the summed weight of shared tokens, counted once per question, divided
        by the summed weight of all distinct tokens of both questions.
        Weights are assumed to be non-negative.
    """
    if stops is None:
        stops = ()
    if weights is None:
        weights = {}
    # dict.fromkeys de-duplicates while keeping first-seen order.
    q1words = dict.fromkeys(word for word in row['question1'] if word not in stops)
    q2words = dict.fromkeys(word for word in row['question2'] if word not in stops)
    if not q1words or not q2words:
        # Some questions consist of nothing but stopwords.
        return 0

    shared_weights = ([weights.get(w, 0) for w in q1words if w in q2words] +
                      [weights.get(w, 0) for w in q2words if w in q1words])
    total_weights = ([weights.get(w, 0) for w in q1words] +
                     [weights.get(w, 0) for w in q2words])

    total = np.sum(total_weights)
    if total == 0:
        return np.nan
    return float(np.sum(shared_weights)) / total

def tfidf_word_match_share(row, weights=None):
    """Weighted share of tokens that the two questions have in common.

    Unlike ``tfidf_word_match_share_stops`` no stopword filtering is applied.

    Args:
        row: mapping-like object whose ``'question1'`` and ``'question2'``
            entries are token iterables.
        weights: mapping from token to weight; tokens missing from it weigh 0.
            ``None`` is treated as an empty mapping (the default used to raise
            an AttributeError).

    Returns:
        0 when either question has no tokens; ``np.nan`` when the tokens carry
        no weight at all (this used to be an implicit 0/0 that also emitted a
        RuntimeWarning); otherwise the summed weight of shared tokens, counted
        once per question, divided by the summed weight of all distinct tokens
        of both questions. Weights are assumed to be non-negative.
    """
    if weights is None:
        weights = {}
    # dict.fromkeys de-duplicates while keeping first-seen order.
    q1words = dict.fromkeys(row['question1'])
    q2words = dict.fromkeys(row['question2'])
    if not q1words or not q2words:
        # At least one of the questions is empty.
        return 0

    shared_weights = ([weights.get(w, 0) for w in q1words if w in q2words] +
                      [weights.get(w, 0) for w in q2words if w in q1words])
    total_weights = ([weights.get(w, 0) for w in q1words] +
                     [weights.get(w, 0) for w in q2words])

    total = np.sum(total_weights)
    if total == 0:
        return np.nan
    return float(np.sum(shared_weights)) / total


def build_features(data, stops, weights):
    """Compute the ``k_*`` word-overlap, length and character features per row.

    Every feature function receives one row of ``data`` and indexes it by
    column name (``row['question1']``), so rows must be passed as labelled
    Series. The former ``raw=True`` asked pandas for bare ndarrays instead,
    which makes that indexing fail; it has been removed.

    Args:
        data: DataFrame whose ``question1``/``question2`` cells hold token
            sequences.
        stops: container of stopword tokens forwarded to the ``*_stop``
            features and the word-match features.
        weights: mapping from token to weight forwarded to the TF-IDF features.

    Returns:
        A new DataFrame with one ``k_*`` column per feature, in the order
        listed below.
    """
    feature_funcs = [
        ('k_word_match', functools.partial(word_match_share, stops=stops)),  # 1
        ('k_tfidf_wm', functools.partial(tfidf_word_match_share, weights=weights)),  # 2
        ('k_tfidf_wm_stops',
         functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)),  # 3
        ('k_jaccard', jaccard),  # 4
        ('k_wc_diff', wc_diff),  # 5
        ('k_wc_ratio', wc_ratio),  # 6
        ('k_wc_diff_unique', wc_diff_unique),  # 7
        ('k_wc_ratio_unique', wc_ratio_unique),  # 8
        ('k_wc_diff_unq_stop', functools.partial(wc_diff_unique_stop, stops=stops)),  # 9
        ('k_wc_ratio_unique_stop', functools.partial(wc_ratio_unique_stop, stops=stops)),  # 10
        ('k_same_start', same_start_word),  # 11
        ('k_char_diff', char_diff),  # 12
        ('k_char_diff_unq_stop', functools.partial(char_diff_unique_stop, stops=stops)),  # 13
        # 14: ('common_words', common_words) is disabled.
        ('k_total_unique_words', total_unique_words),  # 15
        ('k_total_unq_words_stop', functools.partial(total_unq_words_stop, stops=stops)),  # 16
        ('k_char_ratio', char_ratio),  # 17
    ]

    X = pd.DataFrame()
    for column, func in feature_funcs:
        X[column] = data.apply(func, axis=1)
    return X


def feature9():
    """Build the ``k_*`` feature frames for the train and test question pairs.

    Reads ``train_org.csv`` and ``test_final.csv`` from the working directory,
    then for each set computes

    * three "leaky" graph features from how often questions are paired with
      each other across train and test together (``k_q1_q2_intersect``,
      ``k_q1_freq``, ``k_q2_freq``), and
    * the text features produced by ``build_features``, using word weights
      derived from the training questions only.

    Differences from the previous version of this function:

    * rows are no longer requested with ``raw=True``, which hands the row
      functions bare ndarrays that cannot be indexed by column name;
    * missing test questions are filled with ``' '`` before the pairing graph
      is built, exactly like missing train questions, and the test file is
      read only once;
    * the pairing graph is built by iterating the columns directly instead of
      doing Series lookups per row index;
    * the commented-out model-training and prediction code was removed.

    NOTE: an earlier cell defines a one-argument function that is also named
    ``feature9``; this definition replaces it once this cell has run.

    Returns:
        ``(X_train, x_test)``: two DataFrames holding the ``build_features``
        columns followed by the three leaky columns.
    """
    df_train = pd.read_csv('train_org.csv').fillna(' ')
    df_test = pd.read_csv('test_final.csv').fillna(' ')

    # question -> set of every question it is paired with in train or test.
    q_dict = defaultdict(set)
    for frame in (df_train, df_test):
        for first, second in zip(frame['question1'], frame['question2']):
            q_dict[first].add(second)
            q_dict[second].add(first)

    def leaky_features(frame):
        # Neighbourhood sizes and overlap in the pairing graph for each row.
        feats = pd.DataFrame(index=frame.index)
        feats['k_q1_q2_intersect'] = [
            len(q_dict[first] & q_dict[second])
            for first, second in zip(frame['question1'], frame['question2'])
        ]
        feats['k_q1_freq'] = [len(q_dict[question]) for question in frame['question1']]
        feats['k_q2_freq'] = [len(q_dict[question]) for question in frame['question2']]
        return feats

    # Must be computed before the questions are tokenized below, because
    # q_dict is keyed by the original question strings.
    train_leaky = leaky_features(df_train)
    test_leaky = leaky_features(df_test)

    stops = set(stopwords.words("english"))

    # Lowercase and whitespace-tokenize every question in place.
    for frame in (df_train, df_test):
        for column in ('question1', 'question2'):
            frame[column] = frame[column].map(lambda x: str(x).lower().split())

    # Word weights come from the training questions only.
    counts = Counter()
    for column in ('question1', 'question2'):
        for tokens in df_train[column]:
            counts.update(tokens)
    weights = {word: get_weight(count) for word, count in counts.items()}

    print('Building Features')
    X_train = pd.concat((build_features(df_train, stops, weights), train_leaky), axis=1)
    x_test = pd.concat((build_features(df_test, stops, weights), test_leaky), axis=1)
    return X_train,x_test

In [None]:
train_9,test_9= feature9()

In [None]:
test_fea9 = pd.read_csv('./processed_data/test_no_dl_fea1.csv')
train_fea9 = pd.read_csv('./processed_data/train_no_dl_fea1.csv')

In [None]:
train_fea9.columns

In [None]:
for c in train_9.columns:
    train_fea9[c] = train_9[c]

In [None]:
train_fea9.to_csv('./processed_data/train_no_dl_fea1.csv',index=False)

In [None]:
for c in test_9.columns:
    test_fea9[c] = test_9[c]

In [None]:
test_fea9.to_csv('./processed_data/test_no_dl_fea1.csv',index=False)

In [None]:
test_fea9.head()

In [1]:
text = "I do not like green eggs and ham, I do not like them Sam I am!"

In [2]:
# NOTE: `ngrams` is only imported in a later cell (`from nltk.util import ngrams`),
# so running this cell first raises the NameError recorded below.
list(ngrams(nltk.word_tokenize(text),2))

NameError: name 'ngrams' is not defined

In [None]:
# Score every bigram of `text` by raw frequency and list the bigrams in sorted order.
# NOTE: on a fresh kernel this cell cannot run where it is placed --
# `bigram_measures` is only assigned in the following cell, and no import of
# `BigramCollocationFinder` is visible in this part of the notebook
# (TODO confirm it is imported elsewhere).
tokens = nltk.wordpunct_tokenize(text)
finder = BigramCollocationFinder.from_words(tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)
sorted(bigram for bigram, score in scored)

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [None]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [3]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
text = "Hi How are you? i am fine and you"
token=nltk.word_tokenize(text)
bigrams=ngrams(token,2)