In [1]:
import re
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk import word_tokenize
from fuzzywuzzy import fuzz
import distance



In [2]:
# Custom list of lower-case English stop words (separate from
# nltk.corpus.stopwords, which is imported above).
# Each word appears exactly once; order of first occurrence is preserved.
STOP_WORDS = [
    'it', 'is', 'the', 'had', 'have', 'has', 'i', 'a', 'and', 'our', 'are',
    'you', 'do', 'my', 'am', 'were', 'was', 'by', 'until', 'but',
    'myself', 'itself', 'them', 'themself', 'themselves', 'at', 'ours',
    'his', 'ourself', 'ourselves', 'must', 'we', 'be', 'here', 'there',
    'some', 'for', 'while', 'should', 'her', 'hers', 'their', 'theirs',
    'on', 'about', 'could', 'would', 'of', 'against', 'more', 'him', 'that',
    'with', 'than', 'those', 'he', 'me', 'in', 'any', 'if', 'again', 'no',
    'same', 'other', 'such', 'yours', 'your', 'so', 'having', 'once',
]

In [3]:
# Load the spaCy pipeline once at module level; pure_nlp() below calls this
# global `nlp` on every text. The 'en_core_web_lg' model package must already
# be installed in the kernel's environment for this call to succeed.
nlp = spacy.load('en_core_web_lg')

In [4]:
def pure_nlp(train, nlp_df, q_col):
    """Tag every text in ``train`` with spaCy and store per-token data in ``nlp_df``.

    Each text is passed through the module-level ``nlp`` pipeline. Four columns
    are then written to ``nlp_df`` in place, each holding one list per text:

    * ``'lemma'``      -- ``token.lemma_`` of every token
    * ``'POS'``        -- ``token.pos_`` of every token
    * ``'dependency'`` -- ``token.dep_`` of every token
    * ``'alpha'``      -- 1 where ``token.is_alpha`` is true, otherwise 0

    Args:
        train: Sized iterable of texts (e.g. list, array or pandas Series).
            Texts are processed in iteration order.
        nlp_df: Object supporting ``nlp_df[name] = list`` assignment (a
            DataFrame in this notebook). It is mutated in place.
        q_col: Label shown in the progress-bar description only.

    Returns:
        None. Results are delivered through the mutation of ``nlp_df``.

    Note:
        The output column names are fixed and do not include ``q_col``, so a
        second call with the same ``nlp_df`` overwrites the first call's columns.
    """
    lemmas, pos_tags, dep_tags, alpha_flags = [], [], [], []

    # Iterate over the values themselves rather than ``train[i]``: a pandas
    # Series resolves ``train[i]`` by index *label*, so positional access fails
    # (or picks the wrong row) once the index is no longer 0..n-1, e.g. after
    # filtering, shuffling or a train/test split.
    for raw_text in tqdm(train, total=len(train), desc='For {} column'.format(q_col)):
        doc = nlp(raw_text)
        lemmas.append([token.lemma_ for token in doc])
        pos_tags.append([token.pos_ for token in doc])
        dep_tags.append([token.dep_ for token in doc])
        alpha_flags.append([1 if token.is_alpha else 0 for token in doc])

    nlp_df['lemma'] = lemmas
    nlp_df['POS'] = pos_tags
    nlp_df['dependency'] = dep_tags
    nlp_df['alpha'] = alpha_flags

In [9]:
def get_pair_features(q1, q2):
    """Compute eight token-overlap features for a pair of questions.

    Both questions are tokenised with ``word_tokenize``. If either one yields
    no tokens, a list of eight ``0.0`` values is returned.

    Feature layout of the returned list:

    0. common distinct tokens / (smaller distinct-token count + 0.01)
    1. common distinct tokens / (larger distinct-token count + 0.01)
    2. common distinct tokens / (smaller total token count + 0.01)
    3. common distinct tokens / (larger total token count + 0.01)
    4. 1 if the last tokens are equal, else 0
    5. 1 if the first tokens are equal, else 0
    6. absolute difference of the total token counts
    7. mean of the two total token counts

    Args:
        q1: First question text.
        q2: Second question text.

    Returns:
        list: Eight numeric features in the order described above.
    """
    pair_features = [0.0] * 8
    t_1 = word_tokenize(q1)
    t_2 = word_tokenize(q2)
    if not t_1 or not t_2:
        return pair_features

    w_1, w_2 = set(t_1), set(t_2)
    # The "word" and "token" overlaps are the same set intersection, so it is
    # computed once; only the denominators below differ.
    # NOTE(review): no stop-word filtering is applied to the "word" sets --
    # confirm whether that is intended.
    common = len(w_1 & w_2)
    eps = 0.01  # keeps the ratios finite and strictly below 1

    fewest_words, most_words = min(len(w_1), len(w_2)), max(len(w_1), len(w_2))
    fewest_tokens, most_tokens = min(len(t_1), len(t_2)), max(len(t_1), len(t_2))

    pair_features[0] = common / (fewest_words + eps)
    # Slot 1 is normalised by the LARGER set; it previously duplicated slot 0.
    pair_features[1] = common / (most_words + eps)
    pair_features[2] = common / (fewest_tokens + eps)
    pair_features[3] = common / (most_tokens + eps)
    pair_features[4] = int(t_1[-1] == t_2[-1])
    pair_features[5] = int(t_1[0] == t_2[0])
    pair_features[6] = abs(len(t_1) - len(t_2))
    pair_features[7] = (len(t_1) + len(t_2)) / 2
    return pair_features

In [10]:
def longest_substring_ratio(s1, s2):
    """Return the longest-common-substring length relative to the shorter input.

    The common substrings come from ``distance.lcsubstrings``. The length of
    the first one returned is divided by ``min(len(s1), len(s2)) + 1``.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        ``0`` when no common substring is found, otherwise the ratio as a float.
    """
    common = list(distance.lcsubstrings(s1, s2))
    if not common:
        return 0
    shorter_len = min(len(s1), len(s2))
    # The +1 keeps the denominator non-zero.
    return len(common[0]) / (shorter_len + 1)

In [None]:
def insert_features(train_df, nlp_df):
    pair_features = train_df.apply(lambda x: get_pair_features(x['question1'], x['question2']), axis=1)
    nlp_df['min_common_word_count'] = list(map(lambda x: x[0], pair_features))
    nlp_df['max_common_word_count'] = list(map(lambda x: x[1], pair_features))
    nlp_df['min_common_token_count'] = list(map(lambda x: x[2], pair_features))
    nlp_df['max_common_token_count'] = list(map(lambda x: x[3], pair_features))
    nlp_df['last_token_check'] = list(map(lambda x: x[4], pair_features))
    nlp_df['first_token_check'] = list(map(lambda x: x[5], pair_features))
    nlp_df['diff_tokens']