# The world is cruel!

In [7]:
import functools
import pickle
import re

import distance
import numpy as np
import pandas as pd
import spacy
import xgboost as xgb
from bs4 import BeautifulSoup
from joblib import dump, load
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from thefuzz import fuzz

# To predict in real time

In [8]:
def get_simple_features(question_one, quesion_two):
    """Compute ten token-overlap features for a pair of questions.

    Both questions are split on whitespace. Tokens are then divided into
    stop words (NLTK English list) and non-stop words.

    Returns a list of ten numbers:
        [0] shared non-stop words / smaller non-stop-word set size
        [1] shared non-stop words / larger non-stop-word set size
        [2] shared stop words / smaller stop-word set size
        [3] shared stop words / larger stop-word set size
        [4] shared distinct tokens / smaller token count
        [5] shared distinct tokens / larger token count
        [6] 1 if the last tokens are equal, else 0
        [7] 1 if the first tokens are equal, else 0
        [8] absolute difference of the token counts
        [9] mean of the two token counts

    If either question has no tokens, all ten features are 0.0.
    """
    SAFE_DIV = 0.0001  # keeps the ratios finite when a denominator is 0

    simple_features = [0.0] * 10

    question_one_tokens = question_one.split()
    quesion_two_tokens = quesion_two.split()

    # Guard first: an empty question needs no stop-word lookup at all.
    if not question_one_tokens or not quesion_two_tokens:
        return simple_features

    # A frozenset gives O(1) membership tests; the list returned by NLTK
    # would be scanned linearly for every token.
    stop_words = frozenset(stopwords.words("english"))

    question_one_words = {tok for tok in question_one_tokens if tok not in stop_words}
    quesion_two_words = {tok for tok in quesion_two_tokens if tok not in stop_words}

    question_one_stops = {tok for tok in question_one_tokens if tok in stop_words}
    quesion_two_stops = {tok for tok in quesion_two_tokens if tok in stop_words}

    common_word_count = len(question_one_words & quesion_two_words)
    common_stop_count = len(question_one_stops & quesion_two_stops)
    common_token_count = len(set(question_one_tokens) & set(quesion_two_tokens))

    word_sizes = (len(question_one_words), len(quesion_two_words))
    stop_sizes = (len(question_one_stops), len(quesion_two_stops))
    token_sizes = (len(question_one_tokens), len(quesion_two_tokens))

    simple_features[0] = common_word_count / (min(word_sizes) + SAFE_DIV)
    simple_features[1] = common_word_count / (max(word_sizes) + SAFE_DIV)
    simple_features[2] = common_stop_count / (min(stop_sizes) + SAFE_DIV)
    simple_features[3] = common_stop_count / (max(stop_sizes) + SAFE_DIV)
    simple_features[4] = common_token_count / (min(token_sizes) + SAFE_DIV)
    simple_features[5] = common_token_count / (max(token_sizes) + SAFE_DIV)
    simple_features[6] = int(question_one_tokens[-1] == quesion_two_tokens[-1])
    simple_features[7] = int(question_one_tokens[0] == quesion_two_tokens[0])
    simple_features[8] = abs(token_sizes[0] - token_sizes[1])
    simple_features[9] = (token_sizes[0] + token_sizes[1]) / 2

    return simple_features


def get_longest_substr_ratio(a, b):
    """Return the common-substring length relative to the shorter input.

    The candidates come from ``distance.lcsubstrings(a, b)``; the length of
    the first one is divided by ``min(len(a), len(b)) + 1``. The ``+ 1``
    keeps the denominator non-zero. Returns the integer 0 when no common
    substring is reported.
    """
    candidates = list(distance.lcsubstrings(a, b))

    if not candidates:
        return 0

    shorter_length = min(len(a), len(b))
    return len(candidates[0]) / (shorter_length + 1)


def get_vectors_predict(question, tfidf_model, nlp_model):  # Step - 3.2
    """Return the IDF-weighted sum of the token vectors of ``question``.

    Args:
        question: Text to embed; passed straight to ``nlp_model``.
        tfidf_model: Mapping indexed by token text whose values are used as
            IDF weights. Tokens missing from it get weight 0.
        nlp_model: Callable that turns text into a sequence of tokens, each
            exposing a ``.vector`` array (a spaCy pipeline in this file).

    Returns:
        A list with one number per vector dimension. For a question that
        yields no tokens, a zero vector of length
        ``nlp_model.vocab.vectors_length`` is returned, so the feature row
        keeps its width.

    Note:
        The result is a weighted *sum*, not an average. The previous
        implementation added every weighted vector to all rows of an
        (n_tokens x dim) matrix and then averaged the rows, which gives the
        same sum (up to floating-point rounding). The sum is kept on purpose
        so the features stay comparable with whatever the saved models were
        trained on -- TODO confirm against the training notebook.
    """
    doc = nlp_model(question)

    # An empty doc has no doc[0] to read the vector width from.
    if len(doc) == 0:
        return [0.0] * nlp_model.vocab.vectors_length

    weighted_sum = np.zeros(len(doc[0].vector))

    for token in doc:
        try:
            idf = tfidf_model[str(token)]
        except KeyError:
            # Out-of-vocabulary token: contributes nothing.
            idf = 0

        weighted_sum += token.vector * idf

    return list(weighted_sum)

In [9]:
# Literal substitutions applied by preprocess(), in this exact order.
# Order matters: e.g. "won't" and "can't" must be expanded before the generic
# "n't" rule, and ",000,000" before ",000" before "000".
_PREPROCESS_REPLACEMENTS = (
    (",000,000", "m"),
    (",000", "k"),
    ("000", "k"),
    ("′", "'"),
    ("’", "'"),
    ("won't", "will not"),
    ("cannot", "can not"),
    ("can't", "can not"),
    ("n't", " not"),
    ("what's", "what is"),
    ("it's", "it is"),
    ("'ve", " have"),
    ("i'm", "i am"),
    ("'re", " are"),
    ("he's", "he is"),
    ("she's", "she is"),
    ("'s", " own"),
    ("%", " percent "),
    ("₹", " rupee "),
    ("$", " dollar "),
    ("€", " euro "),
    ("'ll", " will"),
)

# Raw string: '\W' in a normal string literal is an invalid escape sequence.
_NON_WORD_CHAR = re.compile(r"\W")


def preprocess(question):
    """Normalise a question string before feature extraction.

    Steps, in order:
        1. Convert to ``str`` and lower-case.
        2. Apply the literal substitutions in ``_PREPROCESS_REPLACEMENTS``
           (number suffixes, apostrophe variants, contractions, currency and
           percent symbols).
        3. Rewrite ``<digits>000000`` as ``<digits>m``.
        4. Run the Porter stemmer on the result.
        5. Strip markup with BeautifulSoup and keep the text.
        6. Replace every non-word character with a space.

    The transformation is intentionally kept identical to the previous
    version, because the saved models presumably expect features built from
    exactly this text -- TODO confirm against the training code.

    Args:
        question: Any object; it is converted with ``str()``.

    Returns:
        The normalised question as a ``str``.
    """
    question = str(question).lower()

    for old, new in _PREPROCESS_REPLACEMENTS:
        question = question.replace(old, new)

    # NOTE(review): every "000" was already turned into "k" above, so this
    # pattern appears unable to match any more. Left in place to keep the
    # pipeline byte-compatible; it could equally be done with str.replace.
    question = re.sub(r"([0-9]+)000000", r"\1m", question)

    porter = PorterStemmer()

    # NOTE(review): stem() receives the whole question as one string rather
    # than individual words (intended effect: programming -> program).
    # Unchanged so the features do not shift.
    question = porter.stem(question)

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; consider pinning one once the training setup is confirmed.
    question = BeautifulSoup(question).get_text()

    return _NON_WORD_CHAR.sub(' ', question)


def get_features_predict(question_one, quesion_two):  # Step - 1
    """Build the hand-crafted feature list for one question pair.

    Both questions are run through ``preprocess`` first. The returned list
    starts with the ten values from ``get_simple_features`` and is followed,
    in order, by:

        * four fuzzy-matching scores (token set, token sort, QRatio, partial)
        * the longest-common-substring ratio
        * two constants, 1 and 1
        * character length of each preprocessed question
        * piece count of each question when split on a single space
        * shared distinct pieces, total pieces, and their ratio
        * two constants, 2 and 0

    The four constants are presumably stand-ins for question-frequency
    features that cannot be computed for unseen questions -- verify against
    the training feature order.
    """
    q1 = preprocess(question_one)
    q2 = preprocess(quesion_two)

    # split(" ") (not split()) is used here, so empty pieces between
    # consecutive spaces are counted too.
    q1_pieces = q1.split(" ")
    q2_pieces = q2.split(" ")
    shared_pieces = float(len(set(q1_pieces).intersection(set(q2_pieces))))
    total_pieces = float(len(q1_pieces) + len(q2_pieces))

    features = get_simple_features(q1, q2)

    features.extend([
        fuzz.token_set_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2),
        fuzz.QRatio(q1, q2),
        fuzz.partial_ratio(q1, q2),
        get_longest_substr_ratio(q1, q2),
        1,
        1,
        len(q1),
        len(q2),
        len(q1_pieces),
        len(q2_pieces),
        shared_pieces,
        total_pieces,
        shared_pieces / total_pieces,
        2,
        0,
    ])

    return features


@functools.lru_cache(maxsize=1)
def _load_tfidf_weights():
    """Load the pickled token -> IDF lookup once and reuse it afterwards."""
    # SECURITY: unpickling runs arbitrary code from the file. Only point this
    # at a model file you produced yourself or otherwise trust.
    with open('Models/tfid_model.p', 'rb') as fp:
        return pickle.load(fp)


@functools.lru_cache(maxsize=1)
def _load_spacy_pipeline():
    """Load the spaCy pipeline once; reloading it per call is very slow."""
    return spacy.load("en_core_web_lg")


def transform(question_one, quesion_two):
    """Turn a question pair into a single model-ready feature row.

    The row is the hand-crafted features from ``get_features_predict``
    followed by the weighted vector of each question from
    ``get_vectors_predict``.

    The TF-IDF lookup and the spaCy pipeline are loaded on first use and
    cached for the life of the process, instead of being re-read from disk
    on every call.

    NOTE(review): the vectors are computed from the raw questions, while the
    hand-crafted features use the preprocessed text. Kept as-is; confirm it
    matches how the training data was built.

    Returns:
        A list containing one feature list (a 1-row "matrix").
    """
    features = get_features_predict(question_one, quesion_two)

    tfid = _load_tfidf_weights()
    nlp_model = _load_spacy_pipeline()

    question_one_vector = get_vectors_predict(question_one, tfid, nlp_model)
    question_two_vector = get_vectors_predict(quesion_two, tfid, nlp_model)

    return [features + question_one_vector + question_two_vector]


@functools.lru_cache(maxsize=1)
def load_models():
    """Load every saved model needed for prediction.

    The result is cached, so the files under ``Models/`` are read on the
    first call only and later calls return the same objects. Call
    ``load_models.cache_clear()`` to force a reload after the files change.

    Returns:
        A tuple ``(lr_cal_model, svm_cal_model, stack_model, bst, keys)``:
        the three joblib-loaded models, the XGBoost booster, and the array of
        column names read from ``Boost_Keys.csv`` for the booster's input
        frame.
    """
    # SECURITY: joblib.load unpickles; only load model files you trust.
    lr_cal_model = load("Models/Logistic_Calibrated_Quora.pkl")
    svm_cal_model = load("Models/SVM_Calibrated_Quora.pkl")
    stack_model = load("Models/Stack_F_Quora.pkl")

    # The CSV carries an extra 'Unnamed: 0' column that is dropped here; the
    # first remaining column holds the feature names.
    keys = pd.read_csv("Models/Boost_Keys.csv", encoding='latin-1')
    keys = keys.drop(columns=['Unnamed: 0'])
    keys = keys.T.to_numpy()[0]

    bst = xgb.Booster()
    bst.load_model('Models/Boost_F_Quora.json')

    return (lr_cal_model, svm_cal_model, stack_model, bst, keys)

In [10]:
def predict_real_time(question_one, quesion_two):
    """Score one question pair with all four models and print the results.

    Prints, for the logistic, SVM and stacking models, one minus the first
    column of ``predict_proba`` (presumably the probability of the positive
    class -- verify the class order), the raw booster prediction, and the
    mean of the four values. Returns ``None``.
    """
    feature_rows = transform(question_one, quesion_two)

    logistic, svm, stacker, booster, column_names = load_models()

    # The booster is fed a one-row DataFrame labelled with the saved column
    # names; the other models take the plain nested list.
    single_row = np.array(feature_rows[0]).reshape(1, -1)
    booster_input = xgb.DMatrix(pd.DataFrame(data=single_row, columns=column_names))

    p_boost = booster.predict(booster_input)
    p_lr = logistic.predict_proba(feature_rows)
    p_svm = svm.predict_proba(feature_rows)
    p_stack = stacker.predict_proba(feature_rows)

    logistic_score = 1 - p_lr[0][0]
    svm_score = 1 - p_svm[0][0]
    stacking_score = 1 - p_stack[0][0]

    print("Logistic:", logistic_score, "\nSVM:", svm_score,
          "\nBoosting: ", p_boost[0], "\nStacking: ", stacking_score)

    # Kept in the original left-to-right form on purpose: regrouping the
    # terms would change the floating-point rounding and the printed digits.
    print("\nMean:", (p_boost[0] + 1-p_lr[0][0] + 1-p_svm[0][0] + 1-p_stack[0][0]) / 4)

    return None

In [11]:
# Demo: score one hand-written question pair with every loaded model.
# predict_real_time prints its results and returns None.
question_one = "Who am I?"
quesion_two = "What am I?"

predict_real_time(question_one, quesion_two)

Logistic: 0.3214695739700255 
SVM: 0.46391128045886343 
Boosting:  0.26260194 
Stacking:  0.41151093990545595

Mean: 0.36487343403957606
