In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from sklearn import ensemble
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import xgboost as xgb
from collections import Counter
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from collections import defaultdict

eng_stopwords = set(stopwords.words('english'))
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LTX_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LTX_\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [4]:
# Supported data_clean_type (DO NOT forget to put on "()", also, if data_clean_type is NOT empty string, please put a " " before (cleaned)):
# empty string, no character
# (cleaned)
# (cleaned)(hyper_cleaned)
# (cleaned)(hyper_cleaned)(punctuation_removed)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(words_shortened)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)(words_shortened)
data_clean_type = " (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)(words_shortened)"
train_df = pd.read_csv("train" + data_clean_type + ".csv")
test_df = pd.read_csv("test" + data_clean_type + ".csv")
print(train_df.shape)
print(test_df.shape)

(399787, 14)
(4290, 4)


  exec(code_obj, self.user_global_ns, self.user_ns)


# Add word match share feature (feature 1)

In [5]:
train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist()).astype(str)

# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [6]:
def word_match_share(row):
    stops = set(stopwords.words("english"))
    q1words = {}
    q2words = {}
    for word in str(row[3]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[4]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def word_match_share_test(row):
    stops = set(stopwords.words("english"))
    q1words = {}
    q2words = {}
    for word in str(row[1]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[2]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

train_df['word_match_share'] = train_df.apply(word_match_share, axis=1, raw=True)
test_df['word_match_share'] = test_df.apply(word_match_share_test, axis=1, raw=True)

# Add TF-IDF word match share feature (feature 2)

In [7]:
def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row[3]).lower().split():
        q1words[word] = 1
    for word in str(row[4]).lower().split():
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share_test(row):
    q1words = {}
    q2words = {}
    for word in str(row[1]).lower().split():
        q1words[word] = 1
    for word in str(row[2]).lower().split():
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

train_df['tfidf_word_match_share'] = train_df.apply(tfidf_word_match_share, axis=1, raw=True)
test_df['tfidf_word_match_share'] = test_df.apply(tfidf_word_match_share_test, axis=1, raw=True)

  R = np.sum(shared_weights) / np.sum(total_weights)


# Add TF-IDF word match share for stop words feature (feature 3)

In [8]:
def tfidf_word_match_share_stop(row):
    stops = set(stopwords.words("english"))
    q1words = {}
    q2words = {}
    for word in str(row[3]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[4]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share_stop_test(row):
    stops = set(stopwords.words("english"))
    q1words = {}
    q2words = {}
    for word in str(row[1]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[2]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

train_df['tfidf_word_match'] = train_df.apply(tfidf_word_match_share_stop, axis=1, raw=True)
test_df['tfidf_word_match'] = test_df.apply(tfidf_word_match_share_stop_test, axis=1, raw=True)

  R = np.sum(shared_weights) / np.sum(total_weights)


# Add "unigrams" feature (feature 4 and 5)

In [9]:
def get_unigrams(que):
    return [word for word in word_tokenize(que.lower()) if word not in eng_stopwords]

def get_common_unigrams(row):
    return len( set(row["unigrams_ques1"]).intersection(set(row["unigrams_ques2"])) )

def get_common_unigram_ratio(row):
    return float(row["unigrams_common_count"]) / max(len( set(row["unigrams_ques1"]).union(set(row["unigrams_ques2"])) ),1)

train_df["unigrams_ques1"] = train_df['question1'].apply(lambda x: get_unigrams(str(x)))
train_df["unigrams_ques2"] = train_df['question2'].apply(lambda x: get_unigrams(str(x)))
train_df["unigrams_common_count"] = train_df.apply(lambda row: get_common_unigrams(row),axis=1)
train_df["unigrams_common_ratio"] = train_df.apply(lambda row: get_common_unigram_ratio(row), axis=1)

test_df["unigrams_ques1"] = test_df['question1'].apply(lambda x: get_unigrams(str(x)))
test_df["unigrams_ques2"] = test_df['question2'].apply(lambda x: get_unigrams(str(x)))
test_df["unigrams_common_count"] = test_df.apply(lambda row: get_common_unigrams(row),axis=1)
test_df["unigrams_common_ratio"] = test_df.apply(lambda row: get_common_unigram_ratio(row), axis=1)

# Add frequency + intersection features(feature 6, 7, and 8)

In [10]:
ques = pd.concat([train_df[['question1', 'question2']], \
        test_df[['question1', 'question2']]], axis=0).reset_index(drop='index')

In [11]:
from collections import defaultdict
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

In [12]:
def q1_freq(row):
    return(len(q_dict[row[3]]))
    
def q2_freq(row):
    return(len(q_dict[row[4]]))
    
def q1_q2_intersect(row):
    return(len(set(q_dict[row[3]]).intersection(set(q_dict[row[4]]))))

train_df['q1_q2_intersect'] = train_df.apply(q1_q2_intersect, axis=1, raw=True)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1, raw=True)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1, raw=True)

In [13]:
def q1_freq_test(row):
    return(len(q_dict[row[1]]))
    
def q2_freq_test(row):
    return(len(q_dict[row[2]]))
    
def q1_q2_intersect_test(row):
    return(len(set(q_dict[row[1]]).intersection(set(q_dict[row[2]]))))

test_df['q1_q2_intersect'] = test_df.apply(q1_q2_intersect_test, axis=1, raw=True)
test_df['q1_freq'] = test_df.apply(q1_freq_test, axis=1, raw=True)
test_df['q2_freq'] = test_df.apply(q2_freq_test, axis=1, raw=True)

# Add jaccard feature (feature 9)

In [14]:
def jaccard(row):
    wic = set(str(row[3]).lower().split()).intersection(set(str(row[4]).lower().split()))
    uw = set(str(row[3]).lower().split()).union(set(str(row[4]).lower().split()))
    if len(uw) == 0:
        return np.nan
    else:
        return (len(wic) / len(uw))

def jaccard_test(row):
    wic = set(str(row[1]).lower().split()).intersection(set(str(row[2]).lower().split()))
    uw = set(str(row[1]).lower().split()).union(set(str(row[2]).lower().split()))
    if len(uw) == 0:
        return np.nan
    else:
        return (len(wic) / len(uw))

train_df['jaccard'] = train_df.apply(jaccard, axis=1, raw=True)
test_df['jaccard'] = test_df.apply(jaccard_test, axis=1, raw=True)

# Add common words feature (feature 10)

In [15]:
def common_words(row):
    return len(set(str(row[3]).lower().split()).intersection(set(str(row[4]).lower().split())))

def common_words_test(row):
    return len(set(str(row[1]).lower().split()).intersection(set(str(row[2]).lower().split())))

train_df['common_words'] = train_df.apply(common_words, axis=1, raw=True)
test_df['common_words'] = test_df.apply(common_words_test, axis=1, raw=True)

# Add common words for non-stop words feature (feature 11)

In [16]:
def common_words_stop(row):
    stops = set(stopwords.words('english'))
    return len([x for x in set(str(row[3]).lower().split()).intersection(set(str(row[4]).lower().split())) if x not in stops])

def common_words_stop_test(row):
    stops = set(stopwords.words('english'))
    return len([x for x in set(str(row[1]).lower().split()).intersection(set(str(row[2]).lower().split())) if x not in stops])

train_df['common_words_stop'] = train_df.apply(common_words_stop, axis=1, raw=True)
test_df['common_words_stop'] = test_df.apply(common_words_stop_test, axis=1, raw=True)

# Add total unique words feature (feature 12)

In [17]:
def total_unique_words(row):
    return len(set(str(row[3]).lower().split()).union(set(str(row[4]).lower().split())))

def total_unique_words_test(row):
    return len(set(str(row[1]).lower().split()).union(set(str(row[2]).lower().split())))

train_df['total_unique_words'] = train_df.apply(total_unique_words, axis=1, raw=True)
test_df['total_unique_words'] = test_df.apply(total_unique_words_test, axis=1, raw=True)

# Add total unique words for non-stop words feature (feature 13)

In [18]:
def total_unq_words_stop(row):
    stops = set(stopwords.words('english'))
    return len([x for x in set(str(row[3]).lower().split()).union(set(str(row[4]).lower().split())) if x not in stops])

def total_unq_words_stop_test(row):
    stops = set(stopwords.words('english'))
    return len([x for x in set(str(row[1]).lower().split()).union(set(str(row[2]).lower().split())) if x not in stops])

train_df['total_unq_words_stop'] = train_df.apply(total_unq_words_stop, axis=1, raw=True)
test_df['total_unq_words_stop'] = test_df.apply(total_unq_words_stop_test, axis=1, raw=True)

# Add word count difference feature (feature 14)

In [19]:
def wc_diff(row):
    return abs(len(str(row[3]).lower().split()) - len(str(row[4]).lower().split()))

def wc_diff_test(row):
    return abs(len(str(row[1]).lower().split()) - len(str(row[2]).lower().split()))

train_df['wc_diff'] = train_df.apply(wc_diff, axis=1, raw=True)
test_df['wc_diff'] = test_df.apply(wc_diff_test, axis=1, raw=True)

# Add word count ratio feature (feature 15)

In [20]:
def wc_ratio(row):
    l1 = len(str(row[3]).lower().split())
    l2 = len(str(row[4]).lower().split())
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2
    
def wc_ratio_test(row):
    l1 = len(str(row[1]).lower().split())
    l2 = len(str(row[2]).lower().split())
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2
    
train_df['wc_ratio'] = train_df.apply(wc_ratio, axis=1, raw=True)
test_df['wc_ratio'] = test_df.apply(wc_ratio_test, axis=1, raw=True)

# Add word count difference for unique words feature (feature 16)

In [21]:
def wc_diff_unique(row):
    return abs(len(set(str(row[3]).lower().split())) - len(set(str(row[4]).lower().split())))

def wc_diff_unique_test(row):
    return abs(len(set(str(row[1]).lower().split())) - len(set(str(row[2]).lower().split())))

train_df['wc_diff_unique'] = train_df.apply(wc_diff_unique, axis=1, raw=True)
test_df['wc_diff_unique'] = test_df.apply(wc_diff_unique_test, axis=1, raw=True)

# Add word count ratio for unique words feature (feature 17)

In [22]:
def wc_ratio_unique(row):
    l1 = len(set(str(row[3]).lower().split()))
    l2 = len(set(str(row[4]).lower().split()))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2
    
def wc_ratio_unique_test(row):
    l1 = len(set(str(row[1]).lower().split()))
    l2 = len(set(str(row[2]).lower().split()))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2
    
train_df['wc_ratio_unique'] = train_df.apply(wc_ratio_unique, axis=1, raw=True)
test_df['wc_ratio_unique'] = test_df.apply(wc_ratio_unique_test, axis=1, raw=True)

# Add word count difference for non-stop words feature (feature 18)

In [23]:
def wc_diff_unique_stop(row):
    stops = set(stopwords.words('english'))
    return abs(len([x for x in set(str(row[3]).lower().split()) if x not in stops]) - len([x for x in set(str(row[4]).lower().split()) if x not in stops]))

def wc_diff_unique_stop_test(row):
    stops = set(stopwords.words('english'))
    return abs(len([x for x in set(str(row[1]).lower().split()) if x not in stops]) - len([x for x in set(str(row[2]).lower().split()) if x not in stops]))

train_df['wc_diff_unique_stop'] = train_df.apply(wc_diff_unique_stop, axis=1, raw=True)
test_df['wc_diff_unique_stop'] = test_df.apply(wc_diff_unique_stop_test, axis=1, raw=True)

# Add wc ratio for stop words feature (feature 19)

In [24]:
def wc_ratio_unique_stop(row):
    stops = set(stopwords.words('english'))
    l1 = len([x for x in set(str(row[3]).lower().split()) if x not in stops]) 
    l2 = len([x for x in set(str(row[4]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2 > 1:
        return l2 / l1
    else:
        return l1 / l2
    
def wc_ratio_unique_stop_test(row):
    stops = set(stopwords.words('english'))
    l1 = len([x for x in set(str(row[1]).lower().split()) if x not in stops])
    l2 = len([x for x in set(str(row[2]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2 > 1:
        return l2 / l1
    else:
        return l1 / l2
    
train_df['wc_ratio_unique_stop'] = train_df.apply(wc_ratio_unique_stop, axis=1, raw=True)
test_df['wc_ratio_unique_stop'] = test_df.apply(wc_ratio_unique_stop_test, axis=1, raw=True)

# Add same start word feature (feature 20)

In [25]:
def same_start_word(row):
    if len(str(row[3]).lower().split()) == 0 or len(str(row[4]).lower().split()) == 0:
        return np.nan
    return int(str(row[3]).lower().split()[0] == str(row[4]).lower().split()[0])

def same_start_word_test(row):
    if len(str(row[1]).lower().split()) == 0 or len(str(row[2]).lower().split()) == 0:
        return np.nan
    return int(str(row[1]).lower().split()[0] == str(row[2]).lower().split()[0])

train_df['same_start_word'] = train_df.apply(same_start_word, axis=1, raw=True)
test_df['same_start_word'] = test_df.apply(same_start_word_test, axis=1, raw=True)

# Add character difference feature (feature 21)

In [26]:
def char_diff(row):
    return abs(len(''.join(str(row[3]).lower())) - len(''.join(str(row[4]).lower())))

def char_diff_test(row):
    return abs(len(''.join(str(row[1]).lower())) - len(''.join(str(row[2]).lower())))

train_df['char_diff'] = train_df.apply(char_diff, axis=1, raw=True)
test_df['char_diff'] = test_df.apply(char_diff_test, axis=1, raw=True)

# Add character length ratio feature (feature 22)

In [27]:
def char_ratio(row):
    l1 = len(''.join(str(row[3]).lower())) 
    l2 = len(''.join(str(row[4]).lower()))
    if l2 == 0:
        return np.nan
    if l1 / l2 > 1:
        return l2 / l1
    else:
        return l1 / l2
    
def char_ratio_test(row):
    l1 = len(''.join(str(row[1]).lower())) 
    l2 = len(''.join(str(row[2]).lower()))
    if l2 == 0:
        return np.nan
    if l1 / l2 > 1:
        return l2 / l1
    else:
        return l1 / l2
    
train_df['char_ratio'] = train_df.apply(char_ratio, axis=1, raw=True)
test_df['char_ratio'] = test_df.apply(char_ratio_test, axis=1, raw=True)

# Add number of character difference for non-stop words feature (feature 23)

In [28]:
def char_diff_unique_stop(row):
    stops = set(stopwords.words('english'))
    return abs(len(''.join([x for x in set(str(row[3]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[4]).lower().split()) if x not in stops])))

def char_diff_unique_stop_test(row):
    stops = set(stopwords.words('english'))
    return abs(len(''.join([x for x in set(str(row[1]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[2]).lower().split()) if x not in stops])))

train_df['char_diff_unique_stop'] = train_df.apply(char_diff_unique_stop, axis=1, raw=True)
test_df['char_diff_unique_stop'] = test_df.apply(char_diff_unique_stop_test, axis=1, raw=True)

# Add Q1 to Q2 word count difference feature (feature 24)

In [29]:
def q1_to_q2_wc_diff(row):
    return len(str(row[3]).lower().split()) - len(str(row[4]).lower().split())

def q1_to_q2_wc_diff_test(row):
    return len(str(row[1]).lower().split()) - len(str(row[2]).lower().split())

train_df['q1_to_q2_wc_diff'] = train_df.apply(q1_to_q2_wc_diff, axis=1, raw=True)
test_df['q1_to_q2_wc_diff'] = test_df.apply(q1_to_q2_wc_diff_test, axis=1, raw=True)

# Add Q1 to Q2 word count ratio feature (feature 25)

In [30]:
def q1_to_q2_wc_ratio(row):
    l1 = len(str(row[3]).lower().split())
    l2 = len(str(row[4]).lower().split())
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
def q1_to_q2_wc_ratio_test(row):
    l1 = len(str(row[1]).lower().split())
    l2 = len(str(row[2]).lower().split())
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
train_df['q1_to_q2_wc_ratio'] = train_df.apply(q1_to_q2_wc_ratio, axis=1, raw=True)
test_df['q1_to_q2_wc_ratio'] = test_df.apply(q1_to_q2_wc_ratio_test, axis=1, raw=True)

# Add Q1 to Q2 word count difference for unique words feature (feature 26)

In [31]:
def q1_to_q2_wc_diff_unique(row):
    return len(set(str(row[3]).lower().split())) - len(set(str(row[4]).lower().split()))

def q1_to_q2_wc_diff_unique_test(row):
    return len(set(str(row[1]).lower().split())) - len(set(str(row[2]).lower().split()))

train_df['q1_to_q2_wc_diff_unique'] = train_df.apply(q1_to_q2_wc_diff_unique, axis=1, raw=True)
test_df['q1_to_q2_wc_diff_unique'] = test_df.apply(q1_to_q2_wc_diff_unique_test, axis=1, raw=True)

# Add Q1 to Q2 word count ratio for unique words feature (feature 27)

In [32]:
def q1_to_q2_wc_ratio_unique(row):
    l1 = len(set(str(row[3]).lower().split()))
    l2 = len(set(str(row[4]).lower().split()))
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
def q1_to_q2_wc_ratio_unique_test(row):
    l1 = len(set(str(row[1]).lower().split()))
    l2 = len(set(str(row[2]).lower().split()))
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
train_df['q1_to_q2_wc_ratio_unique'] = train_df.apply(q1_to_q2_wc_ratio_unique, axis=1, raw=True)
test_df['q1_to_q2_wc_ratio_unique'] = test_df.apply(q1_to_q2_wc_ratio_unique_test, axis=1, raw=True)

# Add Q1 to Q2 word count difference for non-stop words feature (feature 28)

In [33]:
def q1_to_q2_wc_diff_unique_stop(row):
    stops = set(stopwords.words('english'))
    return len([x for x in set(str(row[3]).lower().split()) if x not in stops]) - len([x for x in set(str(row[4]).lower().split()) if x not in stops])

def q1_to_q2_wc_diff_unique_stop_test(row):
    stops = set(stopwords.words('english'))
    return len([x for x in set(str(row[1]).lower().split()) if x not in stops]) - len([x for x in set(str(row[2]).lower().split()) if x not in stops])

train_df['q1_to_q2_wc_diff_unique_stop'] = train_df.apply(q1_to_q2_wc_diff_unique_stop, axis=1, raw=True)
test_df['q1_to_q2_wc_diff_unique_stop'] = test_df.apply(q1_to_q2_wc_diff_unique_stop_test, axis=1, raw=True)

# Add Q1 to Q2 wc ratio for stop words feature (feature 29)

In [34]:
def q1_to_q2_wc_ratio_unique_stop(row):
    stops = set(stopwords.words('english'))
    l1 = len([x for x in set(str(row[3]).lower().split()) if x not in stops]) 
    l2 = len([x for x in set(str(row[4]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
def q1_to_q2_wc_ratio_unique_stop_test(row):
    stops = set(stopwords.words('english'))
    l1 = len([x for x in set(str(row[1]).lower().split()) if x not in stops])
    l2 = len([x for x in set(str(row[2]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
train_df['q1_to_q2_wc_ratio_unique_stop'] = train_df.apply(q1_to_q2_wc_ratio_unique_stop, axis=1, raw=True)
test_df['q1_to_q2_wc_ratio_unique_stop'] = test_df.apply(q1_to_q2_wc_ratio_unique_stop_test, axis=1, raw=True)

# Add Q1 to Q2 character difference feature (feature 30)

In [35]:
def q1_to_q2_char_diff(row):
    return len(''.join(str(row[3]).lower())) - len(''.join(str(row[4]).lower()))

def q1_to_q2_char_diff_test(row):
    return len(''.join(str(row[1]).lower())) - len(''.join(str(row[2]).lower()))

train_df['q1_to_q2_char_diff'] = train_df.apply(q1_to_q2_char_diff, axis=1, raw=True)
test_df['q1_to_q2_char_diff'] = test_df.apply(q1_to_q2_char_diff_test, axis=1, raw=True)

# Add Q1 to Q2 character length ratio feature (feature 31)

In [36]:
def q1_to_q2_char_ratio(row):
    l1 = len(''.join(str(row[3]).lower())) 
    l2 = len(''.join(str(row[4]).lower()))
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
def q1_to_q2_char_ratio_test(row):
    l1 = len(''.join(str(row[1]).lower())) 
    l2 = len(''.join(str(row[2]).lower()))
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
train_df['q1_to_q2_char_ratio'] = train_df.apply(q1_to_q2_char_ratio, axis=1, raw=True)
test_df['q1_to_q2_char_ratio'] = test_df.apply(q1_to_q2_char_ratio_test, axis=1, raw=True)

# Add Q1 to Q2 number of character difference for non-stop words feature (feature 32)

In [37]:
def q1_to_q2_char_diff_unique_stop(row):
    stops = set(stopwords.words('english'))
    return len(''.join([x for x in set(str(row[3]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[4]).lower().split()) if x not in stops]))

def q1_to_q2_char_diff_unique_stop_test(row):
    stops = set(stopwords.words('english'))
    return len(''.join([x for x in set(str(row[1]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[2]).lower().split()) if x not in stops]))

train_df['q1_to_q2_char_diff_unique_stop'] = train_df.apply(q1_to_q2_char_diff_unique_stop, axis=1, raw=True)
test_df['q1_to_q2_char_diff_unique_stop'] = test_df.apply(q1_to_q2_char_diff_unique_stop_test, axis=1, raw=True)

# Add word match share feature using alternative stop words (feature 33)

In [38]:
def word_match_share_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    q1words = {}
    q2words = {}
    for word in str(row[3]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[4]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def word_match_share_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    q1words = {}
    q2words = {}
    for word in str(row[1]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[2]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

train_df['word_match_share_alternative_stop'] = train_df.apply(word_match_share_alternative_stop, axis=1, raw=True)
test_df['word_match_share_alternative_stop'] = test_df.apply(word_match_share_alternative_stop_test, axis=1, raw=True)

# Add TF-IDF word match share for stop words feature using alternative stop words (feature 34)

In [39]:
def tfidf_word_match_share_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    q1words = {}
    q2words = {}
    for word in str(row[3]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[4]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    q1words = {}
    q2words = {}
    for word in str(row[1]).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row[2]).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

train_df['tfidf_word_match_share_alternative_stop'] = train_df.apply(tfidf_word_match_share_alternative_stop, axis=1, raw=True)
test_df['tfidf_word_match_share_alternative_stop'] = test_df.apply(tfidf_word_match_share_alternative_stop_test, axis=1, raw=True)

  R = np.sum(shared_weights) / np.sum(total_weights)


# Add common words for non-stop words feature using alternative stop words (feature 35)

In [40]:
def common_words_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len([x for x in set(str(row[3]).lower().split()).intersection(set(str(row[4]).lower().split())) if x not in stops])

def common_words_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len([x for x in set(str(row[1]).lower().split()).intersection(set(str(row[2]).lower().split())) if x not in stops])

train_df['common_words_alternative_stop'] = train_df.apply(common_words_alternative_stop, axis=1, raw=True)
test_df['common_words_alternative_stop'] = test_df.apply(common_words_alternative_stop_test, axis=1, raw=True)

# Add total unique words for non-stop words feature using alternative stop words (feature 36)

In [41]:
def total_unq_words_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len([x for x in set(str(row[3]).lower().split()).union(set(str(row[4]).lower().split())) if x not in stops])

def total_unq_words_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len([x for x in set(str(row[1]).lower().split()).union(set(str(row[2]).lower().split())) if x not in stops])

train_df['total_unq_words_alternative_stop'] = train_df.apply(total_unq_words_alternative_stop, axis=1, raw=True)
test_df['total_unq_words_alternative_stop'] = test_df.apply(total_unq_words_alternative_stop_test, axis=1, raw=True)

# Add word count difference for non-stop words feature using alternative stop words (feature 37)

In [42]:
def wc_diff_unique_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return abs(len([x for x in set(str(row[3]).lower().split()) if x not in stops]) - len([x for x in set(str(row[4]).lower().split()) if x not in stops]))

def wc_diff_unique_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return abs(len([x for x in set(str(row[1]).lower().split()) if x not in stops]) - len([x for x in set(str(row[2]).lower().split()) if x not in stops]))

train_df['wc_diff_unique_alternative_stop'] = train_df.apply(wc_diff_unique_alternative_stop, axis=1, raw=True)
test_df['wc_diff_unique_alternative_stop'] = test_df.apply(wc_diff_unique_alternative_stop_test, axis=1, raw=True)

# Add wc ratio for stop words feature using alternative stop words (feature 38)

In [43]:
def wc_ratio_unique_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    l1 = len([x for x in set(str(row[3]).lower().split()) if x not in stops]) 
    l2 = len([x for x in set(str(row[4]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2 > 1:
        return l2 / l1
    else:
        return l1 / l2
    
def wc_ratio_unique_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    l1 = len([x for x in set(str(row[1]).lower().split()) if x not in stops])
    l2 = len([x for x in set(str(row[2]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2 > 1:
        return l2 / l1
    else:
        return l1 / l2
    
train_df['wc_ratio_unique_alternative_stop'] = train_df.apply(wc_ratio_unique_alternative_stop, axis=1, raw=True)
test_df['wc_ratio_unique_alternative_stop'] = test_df.apply(wc_ratio_unique_alternative_stop_test, axis=1, raw=True)

# Add number of character difference for non-stop words feature using alternative stop words (feature 39)

In [44]:
def char_diff_unique_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return abs(len(''.join([x for x in set(str(row[3]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[4]).lower().split()) if x not in stops])))

def char_diff_unique_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return abs(len(''.join([x for x in set(str(row[1]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[2]).lower().split()) if x not in stops])))

train_df['char_diff_unique_alternative_stop'] = train_df.apply(char_diff_unique_alternative_stop, axis=1, raw=True)
test_df['char_diff_unique_alternative_stop'] = test_df.apply(char_diff_unique_alternative_stop_test, axis=1, raw=True)

# Add Q1 to Q2 word count difference for non-stop words feature using alternative stop words (feature 40)

In [45]:
def q1_to_q2_wc_diff_unique_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len([x for x in set(str(row[3]).lower().split()) if x not in stops]) - len([x for x in set(str(row[4]).lower().split()) if x not in stops])

def q1_to_q2_wc_diff_unique_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len([x for x in set(str(row[1]).lower().split()) if x not in stops]) - len([x for x in set(str(row[2]).lower().split()) if x not in stops])

train_df['q1_to_q2_wc_diff_unique_alternative_stop'] = train_df.apply(q1_to_q2_wc_diff_unique_alternative_stop, axis=1, raw=True)
test_df['q1_to_q2_wc_diff_unique_alternative_stop'] = test_df.apply(q1_to_q2_wc_diff_unique_alternative_stop_test, axis=1, raw=True)

# Add Q1 to Q2 wc ratio for stop words feature using alternative stop words (feature 41)

In [46]:
def q1_to_q2_wc_ratio_unique_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    l1 = len([x for x in set(str(row[3]).lower().split()) if x not in stops]) 
    l2 = len([x for x in set(str(row[4]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
def q1_to_q2_wc_ratio_unique_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    l1 = len([x for x in set(str(row[1]).lower().split()) if x not in stops])
    l2 = len([x for x in set(str(row[2]).lower().split()) if x not in stops])
    if l2 == 0:
        return np.nan
    else:
        return l1 / l2
    
train_df['q1_to_q2_wc_ratio_unique_alternative_stop'] = train_df.apply(q1_to_q2_wc_ratio_unique_alternative_stop, axis=1, raw=True)
test_df['q1_to_q2_wc_ratio_unique_alternative_stop'] = test_df.apply(q1_to_q2_wc_ratio_unique_alternative_stop_test, axis=1, raw=True)

# Add Q1 to Q2 number of character difference for non-stop words feature using alternative stop words (feature 42)

In [47]:
def q1_to_q2_char_diff_unique_alternative_stop(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len(''.join([x for x in set(str(row[3]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[4]).lower().split()) if x not in stops]))

def q1_to_q2_char_diff_unique_alternative_stop_test(row):
    stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
    return len(''.join([x for x in set(str(row[1]).lower().split()) if x not in stops])) - len(''.join([x for x in set(str(row[2]).lower().split()) if x not in stops]))

train_df['q1_to_q2_char_diff_unique_alternative_stop'] = train_df.apply(q1_to_q2_char_diff_unique_alternative_stop, axis=1, raw=True)
test_df['q1_to_q2_char_diff_unique_alternative_stop'] = test_df.apply(q1_to_q2_char_diff_unique_alternative_stop_test, axis=1, raw=True)

# Export features

In [48]:
train_df.to_csv('train_with_features' + data_clean_type + '.csv', index=False)
test_df.to_csv('test_with_features' + data_clean_type + '.csv', index=False)