In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import joblib
from bs4 import BeautifulSoup
from tqdm import tqdm

from fuzzywuzzy import fuzz
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from gensim.models.keyedvectors import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer

# SciPy distribution statistics and distance metrics used by the distance features
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock,canberra, euclidean, minkowski
from scipy.spatial.distance import braycurtis, chebyshev, correlation, mahalanobis
from scipy.spatial.distance import seuclidean, hamming, jaccard, kulsinski, rogerstanimoto,\
                        russellrao, sokalmichener, sokalsneath, kulsinski, yule

import spacy
# spaCy pipeline used for the per-token vectors in the TF-IDF weighted features.
# NOTE(review): this comment previously referred to en_vectors_web_lg ("over 1 million
# unique vectors"), but the model loaded below is en_core_web_sm — confirm that this is
# the same model the saved classifier was trained with.
nlp = spacy.load('en_core_web_sm')

In [2]:
class LoadEssentials():
    """Loads the three artefacts the inference pipeline needs.

    The artefacts are word vectors in word2vec text format, a word -> idf
    dictionary fitted on a question-pair CSV, and a saved model.

    Args:
        glove_file: path of the word-vector file (word2vec text format).
        word2tfidf_file: path of a CSV with 'question1' and 'question2' columns.
        model_file: path of the joblib-serialised model.
    """

    def __init__(self, glove_file, word2tfidf_file, model_file):
        self.glove_file = glove_file
        self.word2tfidf_file = word2tfidf_file
        self.model_file = model_file

    def load_all(self):
        """Load everything and return (glove_model, word2tfidf, model)."""
        glove_model = self.load_glove_model(self.glove_file)
        # 3000 is forwarded as data_points, which load_word2tfidf does not use.
        word2tfidf = self.load_word2tfidf(self.word2tfidf_file, 3000)
        model = self.load_ml_model(self.model_file)
        return glove_model, word2tfidf, model

    def load_glove_model(self, file):
        """Read word vectors stored in word2vec text format, ignoring undecodable bytes."""
        return KeyedVectors.load_word2vec_format(file, binary=False, unicode_errors='ignore')

    def load_word2tfidf(self, file, data_points):
        """Fit a TF-IDF vocabulary on the CSV and return a word -> idf dictionary.

        Args:
            file: CSV path; must contain 'question1' and 'question2' columns.
            data_points: accepted for backward compatibility but not used; the
                whole file is always read. NOTE(review): possibly meant as a row
                limit — confirm against the training code before wiring it in.

        Returns:
            dict mapping each vocabulary term to its idf weight.
        """
        df = pd.read_csv(file)
        first = df['question1'].apply(str)
        second = df['question2'].apply(str)
        # The two questions are concatenated WITHOUT a separator, so the last word
        # of question1 and the first word of question2 fuse into one token. Left
        # unchanged: altering it would change the idf values fed to the saved model.
        questions = list(first + second)

        tfidf = TfidfVectorizer(lowercase=False)
        # Only the fitted vocabulary and idf_ are needed, so the transformed
        # matrix that fit_transform would build is never computed.
        tfidf.fit(questions)

        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer the replacement and fall back for old installations.
        if hasattr(tfidf, 'get_feature_names_out'):
            vocabulary = tfidf.get_feature_names_out()
        else:
            vocabulary = tfidf.get_feature_names()

        # dict key: word, value: idf score
        return dict(zip(vocabulary, tfidf.idf_))

    def load_ml_model(self, file):
        """Deserialise the saved model.

        joblib.load unpickles the file and can execute arbitrary code, so only
        point this at model files from a trusted source.
        """
        return joblib.load(file)
    
###################################################################################################
###################################################################################################
###################################################################################################

class BasicFeature():
    """Adds simple frequency, length and word-overlap columns to a question-pair frame.

    The frame passed in is modified in place and also returned by
    extract_features(). It must provide 'qid1', 'qid2', 'question1' and
    'question2' columns, with the two question columns holding strings.
    """

    def __init__(self, df):
        self.df = df

    @staticmethod
    def _word_sets(row):
        """Return the lower-cased, stripped word sets of both questions in a row."""
        first = {token.lower().strip() for token in row['question1'].split(" ")}
        second = {token.lower().strip() for token in row['question2'].split(" ")}
        return first, second

    def extract_features(self):
        """Append the basic feature columns to the frame and return it."""
        frame = self.df

        # How often each question id occurs within this frame.
        frame['freq_qid1'] = frame.groupby('qid1')['qid1'].transform('count')
        frame['freq_qid2'] = frame.groupby('qid2')['qid2'].transform('count')

        # Character and word counts of each question.
        frame['q1len'] = frame['question1'].str.len()
        frame['q2len'] = frame['question2'].str.len()
        frame['q1_n_words'] = frame['question1'].apply(lambda text: len(text.split(" ")))
        frame['q2_n_words'] = frame['question2'].apply(lambda text: len(text.split(" ")))

        # Row-wise word overlap statistics.
        frame['word_Common'] = frame.apply(self.normalized_word_Common, axis=1)
        frame['word_Total'] = frame.apply(self.normalized_word_Total, axis=1)
        frame['word_share'] = frame.apply(self.normalized_word_share, axis=1)

        # Sum and absolute difference of the two id frequencies.
        frame['freq_q1+q2'] = frame['freq_qid1'] + frame['freq_qid2']
        frame['freq_q1-q2'] = abs(frame['freq_qid1'] - frame['freq_qid2'])
        return self.df

    def normalized_word_Common(self, row):
        """Number of distinct words the two questions share, as a float."""
        first, second = self._word_sets(row)
        return 1.0 * len(first & second)

    def normalized_word_Total(self, row):
        """Sum of the distinct-word counts of the two questions, as a float."""
        first, second = self._word_sets(row)
        return 1.0 * (len(first) + len(second))

    def normalized_word_share(self, row):
        """Shared distinct words divided by the sum of both distinct-word counts."""
        first, second = self._word_sets(row)
        shared = 1.0 * len(first & second)
        return shared / (len(first) + len(second))
    
###################################################################################################
###################################################################################################
###################################################################################################

class AdvanceFeature():
    """Token-overlap and fuzzy string-matching features for question pairs.

    extract_features() first normalises both question columns with
    preprocess() (overwriting them), then appends ten token-level columns and
    four fuzzywuzzy similarity columns to the frame held in self.df.
    """

    # Every non-word character is blanked out by preprocess(). Compiled once here
    # instead of once per row, and written as a raw string so that it no longer
    # triggers the invalid-escape warning that '\W' does.
    _NON_WORD = re.compile(r'\W')

    # Literal substitutions applied by preprocess(), in this exact order
    # (later rules see the output of earlier ones).
    _REPLACEMENTS = (
        (",000,000", "m"), (",000", "k"), ("′", "'"), ("’", "'"),
        ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
        ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
        ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
        ("he's", "he is"), ("she's", "she is"), ("'s", " own"),
        ("%", " percent "), ("₹", " rupee "), ("$", " dollar "),
        ("€", " euro "), ("'ll", " will"),
    )

    # Output columns, in the order get_token_features() fills its result list.
    _TOKEN_COLUMNS = (
        "cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq", "abs_len_diff", "mean_len",
    )

    def __init__(self, df):
        self.df = df
        # Small constant added to every denominator so the ratios never divide by zero.
        self.SAFE_DIV = 0.0001
        self.STOP_WORDS = stopwords.words("english")
        # One stemmer instance is reused for every row.
        self._stemmer = PorterStemmer()

    def extract_features(self):
        """Preprocess both questions, append all feature columns and return the frame."""
        # preprocessing each question (missing values become empty strings)
        self.df["question1"] = self.df["question1"].fillna("").apply(self.preprocess)
        self.df["question2"] = self.df["question2"].fillna("").apply(self.preprocess)

        # One ten-element list per row, unpacked into one column per position.
        token_features = self.df.apply(
            lambda row: self.get_token_features(row["question1"], row["question2"]), axis=1)
        for position, column in enumerate(self._TOKEN_COLUMNS):
            self.df[column] = [features[position] for features in token_features]

        # Fuzzy features, see:
        # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
        # https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
        # https://github.com/seatgeek/fuzzywuzzy
        # (token_sort_ratio tokenises, sorts the tokens alphabetically, re-joins
        # them and then compares the two strings with a simple ratio.)
        fuzzy_scorers = (
            ("token_set_ratio", fuzz.token_set_ratio),
            ("token_sort_ratio", fuzz.token_sort_ratio),
            ("fuzz_ratio", fuzz.QRatio),
            ("fuzz_partial_ratio", fuzz.partial_ratio),
        )
        for column, scorer in fuzzy_scorers:
            self.df[column] = self.df.apply(
                lambda row: scorer(row["question1"], row["question2"]), axis=1)
        return self.df

    def get_token_features(self, q1, q2):
        """Compute the ten token-level features for one question pair.

        Args:
            q1, q2: preprocessed question strings.

        Returns:
            A list of ten numbers in the order of _TOKEN_COLUMNS. All zeros when
            either question has no whitespace-separated tokens.
        """
        token_features = [0.0] * 10

        q1_tokens = q1.split()
        q2_tokens = q2.split()
        if not q1_tokens or not q2_tokens:
            return token_features

        # Set lookup instead of scanning the stop-word list for every token.
        stop_words = set(self.STOP_WORDS)

        # Distinct non-stop-words and stop-words of each question.
        q1_words = {word for word in q1_tokens if word not in stop_words}
        q2_words = {word for word in q2_tokens if word not in stop_words}
        q1_stops = {word for word in q1_tokens if word in stop_words}
        q2_stops = {word for word in q2_tokens if word in stop_words}

        common_word_count = len(q1_words & q2_words)
        common_stop_count = len(q1_stops & q2_stops)
        common_token_count = len(set(q1_tokens) & set(q2_tokens))

        # Overlap ratios against the smaller (min) and larger (max) side. Note the
        # token ratios divide by the raw token counts, duplicates included.
        token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + self.SAFE_DIV)
        token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + self.SAFE_DIV)
        token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + self.SAFE_DIV)
        token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + self.SAFE_DIV)
        token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + self.SAFE_DIV)
        token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + self.SAFE_DIV)
        # Last word of both questions is the same or not
        token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
        # First word of both questions is the same or not
        token_features[7] = int(q1_tokens[0] == q2_tokens[0])
        # Absolute difference and mean of the two token counts
        token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
        token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2
        return token_features

    def preprocess(self, x):
        """Normalise one question string.

        Lower-cases the text, expands contractions and currency/percent symbols,
        abbreviates thousands and millions, replaces every non-word character
        with a space, then passes the result through the stemmer and
        BeautifulSoup's text extraction.
        """
        x = str(x).lower()
        for old, new in self._REPLACEMENTS:
            x = x.replace(old, new)
        x = re.sub(r"([0-9]+)000000", r"\1m", x)
        x = re.sub(r"([0-9]+)000", r"\1k", x)

        x = self._NON_WORD.sub(' ', x)
        # NOTE(review): the stemmer receives the whole sentence as one string, not
        # one token at a time. Kept as-is because the saved model presumably saw
        # features built this way — confirm against the training code.
        x = self._stemmer.stem(x)
        # NOTE(review): no parser is named here, so BeautifulSoup picks one itself.
        return BeautifulSoup(x).get_text()
    
###################################################################################################
###################################################################################################
###################################################################################################

class DistanceFeature():
    """Word-embedding distance features for question pairs.

    Each question is represented by the average of its in-vocabulary word
    vectors; a set of SciPy distances between the two averages, plus the
    Word Mover's Distance, is appended to the frame.

    Args:
        df: frame with 'question1' and 'question2' string columns.
        glove: word-vector model offering key_to_index, list indexing and wmdistance.
    """

    def __init__(self, df, glove):
        self.df = df
        self.glove_model = glove

    def extract_features(self):
        """Append the distance columns to the frame and return it."""
        # Tokenise from self.df. The instance's own frame is the only input;
        # nothing is read from a notebook-level variable.
        tokens_q1 = [sent.split() for sent in self.df['question1'].values]
        tokens_q2 = [sent.split() for sent in self.df['question2'].values]

        # Use the model's own dimensionality so the zero vector used for fully
        # out-of-vocabulary questions always matches; 300 is the previous constant.
        dim = getattr(self.glove_model, 'vector_size', 300)
        avgw2v_q1 = self.avg_w2v(tokens_q1, self.glove_model, dim)
        avgw2v_q2 = self.avg_w2v(tokens_q2, self.glove_model, dim)

        self.df['Word_Mover_Dist'] = self.df.apply(
            lambda row: self.wmd(row['question1'], row['question2'], self.glove_model), axis=1)

        # Column name -> metric, in the column order the rest of the pipeline sees.
        # 'dist_kulsinski' used to be computed twice with identical results; the
        # repeat is dropped. NOTE(review): sokalsneath is imported by this file but
        # has no column — the repeat may have been meant for it. Not added here,
        # since a new column would change the feature set the saved model expects.
        # dist_minkowski is called without an order, i.e. with SciPy's default.
        metrics = [
            ('dist_cosine', cosine),
            ('dist_cityblock', cityblock),
            ('dist_canberra', canberra),
            ('dist_euclidean', euclidean),
            ('dist_minkowski', minkowski),
            ('dist_braycurtis', braycurtis),
            ('dist_chebyshev', chebyshev),
            ('dist_correlation', correlation),
            ('dist_hamming', hamming),
            ('dist_jaccard', jaccard),
            ('dist_kulsinski', kulsinski),
            ('dist_rogerstanimoto', rogerstanimoto),
            ('dist_russellrao', russellrao),
            ('dist_sokalmichener', sokalmichener),
            ('dist_yule', yule),
        ]
        for column, metric in metrics:
            self.df[column] = [metric(x, y) for (x, y) in zip(avgw2v_q1, avgw2v_q2)]

        # Replace NaN results (e.g. a distance involving an all-zero vector) with 0.
        # Infinite values are not touched by fillna.
        self.df = self.df.fillna(0)
        return self.df

    def wmd(self, s1, s2, model):
        """Word Mover's Distance between two sentences, split on whitespace."""
        first_tokens = str(s1).split()
        second_tokens = str(s2).split()
        return model.wmdistance(first_tokens, second_tokens)

    def avg_w2v(self, list_of_sent, model, d):
        '''
        Returns, for each tokenised sentence, the average of the vectors of
        the words found in the model. A sentence with no known word gets a
        zero vector of length d.
        '''
        sent_vectors = []
        for sent in list_of_sent:
            known = [word for word in sent if word in model.key_to_index]
            if known:
                sent_vec = np.mean(model[known], axis=0)
            else:
                sent_vec = np.zeros(d)
            sent_vectors.append(sent_vec)
        return sent_vectors
       
###################################################################################################
###################################################################################################
###################################################################################################

class Tfidf_Weighted_W2V_Features():
    """Idf-weighted spaCy vector features for both questions of each pair.

    Args:
        df: frame with 'id', 'question1' and 'question2' columns.
        w2v: dict mapping a word to its idf weight (despite the parameter name).
    """

    def __init__(self, df, w2v):
        self.df = df
        self.word2tfidf = w2v

    def extract_features(self):
        """Return self.df left-joined on 'id' with one column per vector component.

        Everything is read from self.df; no notebook-level variable is used.
        The component columns of the two questions share the labels 0..n-1, so
        the merge below tells them apart with pandas' default suffixes.
        """
        q1_vectors = list(self.tifidf_to_wieghtedW2V(self.df['question1'], self.word2tfidf))
        q2_vectors = list(self.tifidf_to_wieghtedW2V(self.df['question2'], self.word2tfidf))

        # One row per question pair, one column per vector component.
        df_q1 = pd.DataFrame(q1_vectors, index=self.df.index)
        df_q2 = pd.DataFrame(q2_vectors, index=self.df.index)
        df_q1['id'] = self.df['id']
        df_q2['id'] = self.df['id']

        # The merge keeps 'id', so it is not re-assigned afterwards. Doing so
        # aligned on the index and could blank ids on a non-default index.
        result = df_q1.merge(df_q2, on='id', how='left')
        return self.df.merge(result, on='id', how='left')

    def tifidf_to_wieghtedW2V(self, text_df, tfidf_vectors):
        """Return one idf-weighted vector per question.

        Args:
            text_df: iterable of question strings.
            tfidf_vectors: dict mapping word -> idf weight; words missing from
                it contribute nothing (weight 0).

        Returns:
            list of 1-D arrays, one per question.

        NOTE(review): a question for which the pipeline yields no tokens raises
        IndexError at doc[0] — unchanged from the previous behaviour.
        """
        vecs = []
        # tqdm (https://github.com/noamraph/tqdm) prints the progress bar
        for ques in tqdm(list(text_df)):
            doc = nlp(ques)
            # The vector width is read from the first token, so it follows
            # whichever spaCy model is loaded.
            mean_vec = np.zeros([len(doc), len(doc[0].vector)])
            for word in doc:
                try:
                    idf = tfidf_vectors[str(word)]
                except KeyError:
                    # Word not seen when the idf table was fitted. Only a missing
                    # key is treated as "no weight"; other errors propagate.
                    idf = 0
                # Broadcasting adds the weighted vector to EVERY row, so all rows
                # end up holding the same weighted sum of the token vectors.
                mean_vec += word.vector * idf
            # Averaging identical rows therefore yields that weighted sum. The
            # computation is kept exactly as before so feature values do not change.
            mean_vec = mean_vec.mean(axis=0)
            vecs.append(mean_vec)
        return vecs
    
###################################################################################################
###################################################################################################
###################################################################################################

class CleanFeature():
    """Turns the feature frame into a purely numeric float64 frame for the model.

    Args:
        df: frame containing 'id', 'qid1', 'qid2', 'question1', 'question2'
            plus the feature columns.
    """

    def __init__(self, df):
        self.df = df

    def clean_feature(self):
        """Drop the identifier/text columns and non-finite rows; return a float64 frame.

        Rows containing NaN or +/-inf in any feature are removed, so the result
        can have fewer rows than the input (possibly none). The returned frame
        has a fresh 0..n-1 index.
        """
        self.df = self.df.drop(['id', 'qid1', 'qid2', 'question1', 'question2'], axis=1)
        self.df = self.clean_dataset()
        # Defensive only: clean_dataset() has already removed rows with NaN.
        self.df = self.df.replace(np.nan, 0)
        # Rebuilding from the raw values also resets the index, so the former
        # reset_index() call, whose result was thrown away, is not needed.
        self.df = pd.DataFrame(np.array(self.df.values, dtype=np.float64), columns=list(self.df.columns))
        return self.df

    def clean_dataset(self):
        """Return the rows of self.df free of NaN and +/-inf, cast to float64.

        self.df is rebound to the NaN-free frame rather than modified in place,
        so the frame object held by the caller is left untouched.
        """
        assert isinstance(self.df, pd.DataFrame), "df needs to be a pd.DataFrame"
        self.df = self.df.dropna()
        # axis must be passed by keyword: newer pandas rejects a positional axis.
        indices_to_keep = ~self.df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
        return self.df[indices_to_keep].astype(np.float64)

## Applying Model

In [3]:
# Paths of the saved artefacts, relative to the notebook's working directory.
load = LoadEssentials(
    glove_file="data/glove_vectors.txt",
    word2tfidf_file="data/00_train.csv",
    model_file="models/TfidfAvgW2V_XGBOOST.joblib",
)
# load_all() returns the word vectors, the word -> idf dict and the saved model, in that order.
glove, w2v, model = load.load_all()

In [19]:
def questions_to_df(id, qid1, qid2,  q1, q2): 
    """Wrap one question pair in a single-row DataFrame.

    The columns are, in order: 'id', 'qid1', 'qid2', 'question1', 'question2'.
    """
    single_row = {
        "id": [id],
        'qid1': [qid1],
        'qid2': [qid2],
        'question1': [q1],
        'question2': [q2],
    }
    return pd.DataFrame(single_row)

In [20]:
# Example 1: a question pair expected to be classified as duplicate (label 1).
dup_q1 = "How do I read and find my YouTube comments?"
dup_q2 = "Read my Youtube comments?"

# Build a one-row frame; 4, 3 and 5 are passed as id, qid1 and qid2.
# With a single row, the per-id frequency features computed by BasicFeature are always 1.
df =  questions_to_df(4, 3, 5, dup_q1, dup_q2)
# Run the feature stages in order; each stage receives the frame returned by the previous one.
df = BasicFeature(df).extract_features()
df = AdvanceFeature(df).extract_features()
df = DistanceFeature(df, glove).extract_features()
df = Tfidf_Weighted_W2V_Features(df, w2v).extract_features()
# Drop the id/text columns and keep only finite float64 features for the model.
df = CleanFeature(df).clean_feature()
# Print the predicted label, then show the per-class probabilities as the cell's value.
print(model.predict(df))
model.predict_proba(df)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]

[1.]





array([[0.49106997, 0.50893   ]], dtype=float32)

In [21]:
# Example 2: a question pair expected to be classified as not duplicate (label 0).
not_dup_q1 = "What is the story of Kohinoor (Koh-i-Noor) Diamond?"
not_dup_q2 = "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?"

# Build a one-row frame; 4, 3 and 5 are passed as id, qid1 and qid2.
# With a single row, the per-id frequency features computed by BasicFeature are always 1.
df =  questions_to_df(4, 3, 5, not_dup_q1, not_dup_q2)
# Run the feature stages in order; each stage receives the frame returned by the previous one.
df = BasicFeature(df).extract_features()
df = AdvanceFeature(df).extract_features()
df = DistanceFeature(df, glove).extract_features()
df = Tfidf_Weighted_W2V_Features(df, w2v).extract_features()
# Drop the id/text columns and keep only finite float64 features for the model.
df = CleanFeature(df).clean_feature()
# Print the predicted label, then show the per-class probabilities as the cell's value.
print(model.predict(df))
model.predict_proba(df)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.01it/s]

[0.]





array([[0.5545331 , 0.44546688]], dtype=float32)