# Quora Question Pair Similarity
### Kaggle Competition link: https://www.kaggle.com/c/quora-question-pairs

### Our strategy for feature engineering is:
0. Load raw & processed data
1. Feature Engineering (on raw data - basic)
2. EDA of new features
3. Feature Engineering (on preprocessed data - thefuzz & raios)
4. EDA of new features
5. Feature Engineering (on processed data - TFIDF Weighted W2V)
6. EDA of new features
7. Combine all features 
8. Store the final features in sqlite database

In [2]:

# Imports 

# General 
from datetime import datetime
import os 
from tqdm import tqdm


# Data
import pandas as pd 
import numpy as np

# Visualization
import matplotlib.pyplot as plt 
import seaborn as sns 

# Feature engineering
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
#nltk.download('punkt')
from thefuzz import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
#!python -m spacy download en_core_web_lg

# Database
import sqlite3
from sqlalchemy import create_engine



#### 1. Load data

In [184]:
start = datetime.now()

# Read data
raw_data = pd.read_csv("../data/raw/train.csv", sep=",")

# Processed data 
processed_data = pd.read_csv("../data/processed/processed_data.csv", sep=",")

print('Data loading completed.\nTime taken: {0}'.format(datetime.now() - start))

Data loading completed.
Time taken: 0:00:01.876207


In [8]:
raw_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [19]:
processed_data.drop(["Unnamed: 0"], axis=1, inplace=True)
processed_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,step step guide invest share market india,step step guide invest share market,0
1,1,3,4,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,2,5,6,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,3,7,8,mentally lonely solve,find remainder math2324math divided 2423,0
4,4,9,10,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0


#### 1. Feature Engineering (on raw data)
<li>q1_frequency = Frequency of Question1 in train corpus </li>
<li>q2_frequency = Frequency of Question2 in train corpus </li>
<li>q1_len = Length of Question1 </li>
<li>q2_len = Length of Question2 </li>
<li>q1_tokens_count = # of Tokens in Question1 </li>
<li>q2_tokens_count = # of Tokens in Question2 </li>
<li>q1_words_count = # of Words in Question1 </li>
<li>q2_words_count = # of Words in Question2 </li>
<li>q1_nonstopwords_count = # of Non-Stopwords in Question1 </li>
<li>q2_nonstopwords_count = # of Non-Stopwords in Question2 </li>
<li>common_tokens_count = # of Common Tokens in Question1 & Question2 </li>
<li>common_tokens_share = (# of Common Tokens in Question1 & Question2) / (Total Tokens in Question1 & Question2) </li>
<li>common_words_count = # of Common Words in Question1 & Question2 </li>
<li>common_words_share = (# of Common Words in Question1 & Question2) / (Total Words in Question1 & Question2) </li>
<li>common_nonstopwords_count = # of Common Non-Stopwords in Question1 & Question2 </li>
<li>common_nonstopwords_share = (# of Common Non-Stopwords in Question1 & Question2) / (Total Non-Stopwords in Question1 & Question2) </li>


In [167]:
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove('not')

def compute_common_tokens_count(row):
    """
    Returns number of common tokens in Question1 & Question2
    
    """
    t1 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question1']) ))
    t2 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question2']) ))  
    return len(t1 & t2)

def compute_common_tokens_share(row):
    """
    Returns percentage of common tokens in Question1 & Question2 i.e. common tokens / total tokens
    
    """
    t1 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question1']) ))
    t2 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question2']) ))  
    return len(t1 & t2) / (len(t1)+len(t2))

def compute_common_words_count(row):
    """
    Returns number of common words in Question1 & Question2
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), row['question1'].split(" ") ))
    w2 = set(map(lambda word: str(word).lower().strip(), row['question2'].split(" ") ))    
    return len(w1 & w2)   

def compute_common_words_share(row):
    """
    Returns percentage of common words in Question1 & Question2 i.e. common words / total words
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), row['question1'].split(" ") ))
    w2 = set(map(lambda word: str(word).lower().strip(), row['question2'].split(" ") ))    
    return len(w1 & w2) / (len(w1) + len(w2))

def compute_common_nonstopwords_count(row):
    """
    Returns number of common nonstopwords in Question1 & Question2
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question1'].split(" ") if word not in STOPWORDS] ))
    w2 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question2'].split(" ") if word not in STOPWORDS] ))    
    return len(w1 & w2)    

def compute_common_nonstopwords_share(row):
    """
    Returns percentage of common nonstopwords in Question1 & Question2 i.e. common nonstopwords / total nonstopwords
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question1'].split(" ") if word not in STOPWORDS] ))
    w2 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question2'].split(" ") if word not in STOPWORDS] ))    
    return len(w1 & w2) / (len(w1) + len(w2))


In [168]:
def feature_engg_basic(df):
    """
    Returns set of features:
	q1_frequency = Frequency of Question1 in train corpus 
	q2_frequency = Frequency of Question2 in train corpus 
	q1_len = Length of Question1 
	q2_len = Length of Question2 
	q1_tokens_count = # of Tokens in Question1 
	q2_tokens_count = # of Tokens in Question2 
	q1_words_count = # of Words in Question1 
	q2_words_count = # of Words in Question2 
	q1_nonstopwords_count = # of Non-Stopwords in Question1 
	q2_nonstopwords_count = # of Non-Stopwords in Question2 
	common_tokens_count = # of Common Tokens in Question1 & Question2 
	common_tokens_share = (# of Common Tokens in Question1 & Question2) / (Total Tokens in Question1 & Question2) 
	common_words_count = # of Common Words in Question1 & Question2 
	common_words_share = (# of Common Words in Question1 & Question2) / (Total Words in Question1 & Question2) 
	common_nonstopwords_count = # of Common Non-Stopwords in Question1 & Question2 
	common_nonstopwords_share = (# of Common Non-Stopwords in Question1 & Question2) / (Total Non-Stopwords in Question1 & Question2)     
    
    """
    # Convert all questions to string first
    df['question1'] = df['question1'].apply(lambda x: str(x))
    df['question2'] = df['question2'].apply(lambda x: str(x))
    
    # q1_frequency
    start = datetime.now()
    df['q1_frequency'] = df.groupby(by='qid1')['qid1'].transform('count')
    print("Feature 'q1_frequency' created. Time taken: {0}".format(datetime.now() - start))
    
    # q2_frequency
    start = datetime.now()
    df['q2_frequency'] = df.groupby(by='qid2')['qid2'].transform('count')
    print("Feature 'q2_frequency' created. Time taken: {0}".format(datetime.now() - start))
    
    # q1_length
    start = datetime.now()
    df['q1_length'] = df['question1'].str.len()
    print("Feature 'q1_length' created. Time taken: {0}".format(datetime.now() - start))
    
    # q2_length
    start = datetime.now()
    df['q2_length'] = df['question2'].str.len()
    print("Feature 'q2_length' created. Time taken: {0}".format(datetime.now() - start))
    
    # q1_tokens_count
    start = datetime.now()
    df['q1_tokens_count'] = df['question1'].apply(lambda x: len(word_tokenize(x)))
    print("Feature 'q1_tokens_count' created. Time taken: {0}".format(datetime.now() - start))
    
    # q2_tokens_count
    start = datetime.now()
    df['q2_tokens_count'] = df['question2'].apply(lambda x: len(word_tokenize(x)))
    print("Feature 'q2_tokens_count' created. Time taken: {0}".format(datetime.now() - start))

    # q1_words_count
    start = datetime.now()
    df['q1_words_count'] = df['question1'].apply(lambda x: len(x.split(" ")))
    print("Feature 'q1_words_count' created. Time taken: {0}".format(datetime.now() - start))

    # q2_words_count
    start = datetime.now()
    df['q2_words_count'] = df['question2'].apply(lambda x: len(x.split(" ")))
    print("Feature 'q2_words_count' created. Time taken: {0}".format(datetime.now() - start))
    
    # q1_nonstopwords_count
    start = datetime.now()
    df['q1_nonstopwords_count'] = df['question1'].apply(lambda x: len([word for word in str.lower(x).split(" ") if word not in STOPWORDS]))
    print("Feature 'q1_nonstopwords_count' created. Time taken: {0}".format(datetime.now() - start))
    
    # q2_nonstopwords_count
    start = datetime.now()
    df['q2_nonstopwords_count'] = df['question2'].apply(lambda x: len([word for word in str.lower(x).split(" ") if word not in STOPWORDS]))   
    print("Feature 'q2_nonstopwords_count' created. Time taken: {0}".format(datetime.now() - start)) 
    
    # common_tokens_count
    start = datetime.now()
    df['common_tokens_count'] =  df.apply(compute_common_tokens_count, axis=1)
    print("Feature 'common_tokens_count' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_tokens_share
    start = datetime.now()
    df['common_tokens_share'] =  df.apply(compute_common_tokens_share, axis=1)
    print("Feature 'common_tokens_share' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_words_count
    start = datetime.now()
    df['common_words_count'] =  df.apply(compute_common_words_count, axis=1)
    print("Feature 'common_words_count' created. Time taken: {0}".format(datetime.now() - start))
        
    # common_words_share
    start = datetime.now()
    df['common_words_share'] =  df.apply(compute_common_words_share, axis=1)
    print("Feature 'common_words_share' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_nonstopwords_count
    start = datetime.now()
    df['common_nonstopwords_count'] =  df.apply(compute_common_nonstopwords_count, axis=1)
    print("Feature 'common_nonstopwords_count' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_nonstopwords_share
    start = datetime.now()
    df['common_nonstopwords_share'] =  df.apply(compute_common_nonstopwords_share, axis=1)
    print("Feature 'common_nonstopwords_share' created. Time taken: {0}".format(datetime.now() - start))
    
    

In [152]:
raw_data_featurized = raw_data.copy()

In [153]:
if os.path.isfile("../data/processed/raw_data_featurized.csv"):
    print('Featurized data already available. Getting it from the disk...')
    raw_data_featurized = pd.read_csv("../data/processed/raw_data_featurized.csv", sep=",", encoding='latin-1')
    raw_data_featurized.fillna("")
    print('Data loaded!')
else:
    start = datetime.now()
    print("Feature engineering on raw_data started!")
    feature_engg_basic(raw_data_featurized)
    print("Feature engineering on raw_data completed!\n\nTotal time taken: {0}".format(datetime.now() - start))

Feature engineering on raw_data started!
Feature 'q1_frequency' created. Time taken: 0:00:00.076525
Feature 'q2_frequency' created. Time taken: 0:00:00.056018
Feature 'q1_length' created. Time taken: 0:00:00.100526
Feature 'q2_length' created. Time taken: 0:00:00.117523
Feature 'q1_tokens_count' created. Time taken: 0:00:38.699519
Feature 'q2_tokens_count' created. Time taken: 0:00:40.492525
Feature 'q1_words_count' created. Time taken: 0:00:00.373136
Feature 'q2_words_count' created. Time taken: 0:00:00.385225
Feature 'q1_nonstopwords_count' created. Time taken: 0:00:00.931859
Feature 'q2_nonstopwords_count' created. Time taken: 0:00:00.926097
Feature 'common_tokens_count' created. Time taken: 0:01:34.755135
Feature 'common_tokens_share' created. Time taken: 0:01:54.588778
Feature 'common_words_count' created. Time taken: 0:00:07.703769
Feature 'common_words_share' created. Time taken: 0:00:07.634031
Feature 'common_nonstopwords_count' created. Time taken: 0:00:07.646243
Feature 'comm

In [156]:
print('Shape of raw_data: {0}'.format(raw_data.shape))
print('Shape of raw_data_featurized: {0}'.format(raw_data_featurized.shape))


Shape of raw_data: (404290, 6)
Shape of raw_data_featurized: (404290, 22)


In [186]:
# Export raw_data_featurized to csv
raw_data_featurized.to_csv('../data/processed/raw_data_featurized.csv', index=False)
print('raw_data_featurized exported!')

raw_data_featurized exported!


#### 3. Feature Engineering (on processed data)
Here we will use the library thefuzz (https://github.com/seatgeek/thefuzz) to generate features and we will also use some advanced ratios:

<li>Fuzz ratio </li>
<li>Fuzz partial ratio </li>
<li>Fuzz token sort ratio </li>
<li>Fuzz token set ratio </li>
<li>Fuzz partial token sort ratio </li>


<li>common_tokens_count_min = (# of Common Tokens in Question1 & Question2) / MIN(Total Tokens in Question1, Total Tokens in Question2) </li>
<li>common_tokens_count_max = (# of Common Tokens in Question1 & Question2) / MAX(Total Tokens in Question1, Total Tokens in Question2) </li>
<li>common_words_count_min = (# of Common Words in Question1 & Question2) / MIN(Total Words in Question1,Total Words in Question2) </li>
<li>common_words_count_max = (# of Common Words in Question1 & Question2) / MAX(Total Words in Question1, Total Words in Question2) </li>
<li>common_nonstopwords_count_min = (# of Common Non-Stopwords in Question1 & Question2) / MIN(Total Non-Stopwords in Question1, Total Non-Stopwords in Question2) </li>
<li>common_nonstopwords_count_max = (# of Common Non-Stopwords in Question1 & Question2) / MAX(Total Non-Stopwords in Question1, Total Non-Stopwords in Question2) </li>


In [177]:
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove('not')

SAFE_DIVISION = 0.0001

def compute_common_tokens_count_min(row):
    """
    Returns number of common tokens in Question1 & Question2
    
    """
    t1 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question1']) ))
    t2 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question2']) ))  
    return len(t1 & t2) / (min(len(t1), len(t2)) + SAFE_DIVISION)

def compute_common_tokens_count_max(row):
    """
    Returns number of common tokens in Question1 & Question2
    
    """
    t1 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question1']) ))
    t2 = set(map(lambda word: str(word).lower().strip(), word_tokenize(row['question2']) ))  
    return len(t1 & t2) / (max(len(t1), len(t2)) + SAFE_DIVISION)


def compute_common_words_count_min(row):
    """
    Returns number of common words in Question1 & Question2
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), row['question1'].split(" ") ))
    w2 = set(map(lambda word: str(word).lower().strip(), row['question2'].split(" ") ))    
    return len(w1 & w2) / (min(len(w1), len(w2)) + SAFE_DIVISION)

def compute_common_words_count_max(row):
    """
    Returns number of common words in Question1 & Question2
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), row['question1'].split(" ") ))
    w2 = set(map(lambda word: str(word).lower().strip(), row['question2'].split(" ") ))    
    return len(w1 & w2) / (max(len(w1), len(w2)) + SAFE_DIVISION)

def compute_common_nonstopwords_count_min(row):
    """
    Returns number of common nonstopwords in Question1 & Question2
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question1'].split(" ") if word not in STOPWORDS] ))
    w2 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question2'].split(" ") if word not in STOPWORDS] ))    
    return len(w1 & w2) / (min(len(w1), len(w2)) + SAFE_DIVISION)

def compute_common_nonstopwords_count_max(row):
    """
    Returns number of common nonstopwords in Question1 & Question2
    
    """
    w1 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question1'].split(" ") if word not in STOPWORDS] ))
    w2 = set(map(lambda word: str(word).lower().strip(), [word for word in row['question2'].split(" ") if word not in STOPWORDS] ))    
    return len(w1 & w2) / (max(len(w1), len(w2)) + SAFE_DIVISION)



In [178]:
def feature_engg_adv(df):
    """
    Returns set of features:
    fuzz_ratio 
    fuzz_partial_ratio 
    fuzz_token_sort_ratio 
    fuzz_token_set_ratio 
    fuzz_partial_token_sort_ratio 
    common_tokens_count_min = (# of Common Tokens in Question1 & Question2) / MIN(Total Tokens in Question1, Total Tokens in Question2) 
    common_tokens_count_max = (# of Common Tokens in Question1 & Question2) / MAX(Total Tokens in Question1, Total Tokens in Question2) 
    common_words_count_min = (# of Common Words in Question1 & Question2) / MIN(Total Words in Question1,Total Words in Question2) 
    common_words_count_max = (# of Common Words in Question1 & Question2) / MAX(Total Words in Question1, Total Words in Question2) 
    common_nonstopwords_count_min = (# of Common Non-Stopwords in Question1 & Question2) / MIN(Total Non-Stopwords in Question1, Total Non-Stopwords in Question2) 
    common_nonstopwords_count_max = (# of Common Non-Stopwords in Question1 & Question2) / MAX(Total Non-Stopwords in Question1, Total Non-Stopwords in Question2) 
    
    """
    start = datetime.now()
    
    # Convert all questions to string
    df['question1'] = df['question1'].apply(lambda x: str(x))
    df['question2'] = df['question2'].apply(lambda x: str(x))
    
    # fuzz_ratio
    start = datetime.now()
    df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(x['question1'], x['question2']), axis=1 )
    print("Feature 'fuzz_ratio' created. Time taken: {0}".format(datetime.now() - start))
    
    # fuzz_partial_ratio
    start = datetime.now()
    df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(x['question1'], x['question2']), axis=1 )
    print("Feature 'fuzz_partial_ratio' created. Time taken: {0}".format(datetime.now() - start))
    
    # fuzz_token_sort_ratio
    start = datetime.now()
    df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(x['question1'], x['question2']), axis=1 )
    print("Feature 'fuzz_token_sort_ratio' created. Time taken: {0}".format(datetime.now() - start))
    
    # fuzz_token_set_ratio
    start = datetime.now()
    df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(x['question1'], x['question2']), axis=1 )
    print("Feature 'fuzz_token_set_ratio' created. Time taken: {0}".format(datetime.now() - start))

    # fuzz_partial_token_sort_ratio
    start = datetime.now()
    df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(x['question1'], x['question2']), axis=1 )
    print("Feature 'fuzz_partial_token_sort_ratio' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_tokens_count_min
    start = datetime.now()
    df['common_tokens_count_min'] =  df.apply(compute_common_tokens_count_min, axis=1)
    print("Feature 'common_tokens_count_min' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_tokens_count_max
    start = datetime.now()
    df['common_tokens_count_max'] =  df.apply(compute_common_tokens_count_max, axis=1)
    print("Feature 'common_tokens_count_max' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_words_count_min
    start = datetime.now()
    df['common_words_count_min'] =  df.apply(compute_common_words_count_min, axis=1)
    print("Feature 'common_words_count_min' created. Time taken: {0}".format(datetime.now() - start))
        
    # common_words_count_max
    start = datetime.now()
    df['common_words_count_max'] =  df.apply(compute_common_words_count_max, axis=1)
    print("Feature 'common_words_count_max' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_nonstopwords_count_min
    start = datetime.now()
    df['common_nonstopwords_count_min'] =  df.apply(compute_common_nonstopwords_count_min, axis=1)
    print("Feature 'common_nonstopwords_count_min' created. Time taken: {0}".format(datetime.now() - start))
    
    # common_nonstopwords_count_max
    start = datetime.now()
    df['common_nonstopwords_count_max'] =  df.apply(compute_common_nonstopwords_count_max, axis=1)
    print("Feature 'common_nonstopwords_count_max' created. Time taken: {0}".format(datetime.now() - start))
    
    

In [182]:
processed_data_featurized = processed_data.copy()

In [179]:
if os.path.isfile("../data/processed/processed_data_featurized.csv"):
    print('Featurized data already available. Getting it from the disk...')
    processed_data_featurized = pd.read_csv("../data/processed/processed_data_featurized.csv", sep=",", encoding='latin-1')
    processed_data_featurized.fillna("")
    print('Data loaded!')
else:
    start = datetime.now()
    print("Feature engineering on processed_data started!")
    feature_engg_adv(processed_data_featurized)
    print("Feature engineering on processed_data completed!\n\nTotal time taken: {0}".format(datetime.now() - start))

Feature engineering on processed_data started!
Feature 'fuzz_ratio' created. Time taken: 0:00:37.170625
Feature 'fuzz_partial_ratio' created. Time taken: 0:03:38.835334
Feature 'fuzz_token_sort_ratio' created. Time taken: 0:00:56.323212
Feature 'fuzz_token_set_ratio' created. Time taken: 0:01:16.347247
Feature 'fuzz_partial_token_sort_ratio' created. Time taken: 0:04:07.361945
Feature 'common_tokens_count_min' created. Time taken: 0:01:13.918511
Feature 'common_tokens_count_max' created. Time taken: 0:01:14.509648
Feature 'common_words_count_min' created. Time taken: 0:00:06.311327
Feature 'common_words_count_max' created. Time taken: 0:00:06.898898
Feature 'common_nonstopwords_count_min' created. Time taken: 0:00:08.161611
Feature 'common_nonstopwords_count_max' created. Time taken: 0:00:07.421651
Feature engineering on processed_data completed!

Total time taken: 0:13:33.516169


In [185]:
print('Shape of processed_data: {0}'.format(processed_data.shape))
print('Shape of processed_data_featurized: {0}'.format(processed_data_featurized.shape))

Shape of processed_data: (404290, 7)
Shape of processed_data_featurized: (404290, 17)


In [187]:
# Export processed_data_featurized to csv
processed_data_featurized.to_csv('../data/processed/processed_data_featurized.csv', index=False)
print('processed_data_featurized exported!')

processed_data_featurized exported!


#### 5. Feature Engineering (on processed data - TFIDF Weighted W2V)

Here, we will convert each question into a 300 dimensional vector using below strategy:<br>
1. For each question, we will compute its TF-IDF value for each word
2. We will use a pre-trained model (GloVe) to get W2V vector of each word
3. For each question, we will element wise multiplication of all words' TF-IDF values by its W2V value and then divide its sum by sum of all word's TF-IDF values <br>



In [204]:
# Compute TF-IDF for all words in all the questions
questions = list(processed_data['question1']) + list(processed_data['question2'])
questions = [x for x in questions if pd.notnull(x)]

start = datetime.now()

tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)
print('All words are vectorized using TFIDF.\n\nTime taken: {0}'.format(datetime.now()-start))

All words are vectorized using TFIDF.

Time taken: 0:00:05.156417


In [210]:
# Store the words and their TF IDF values in dictionary
word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

In [211]:
word2tfidf['hi']

9.443780468237108

In [216]:
# Load pre-trained GloVe model
nlp = spacy.load('en_core_web_lg')

In [222]:
# Testing
x=nlp("man")
print("Length of vector: {0}".format(len(x.vector)))
print("Vector: {0}".format(x.vector))


Length of vector: 300
Vector: [ -1.2867    -0.7992    -2.092     -0.77679   -2.5057     2.7123
   0.59127    3.2927    -1.5826     6.4515     1.3452    -1.9711
   0.93059    2.8943     4.2116     1.6        2.6821    -8.4476
   2.3301     6.0751    -0.39937    7.3433    -2.2546    -5.9357
   3.6748    -4.9191    -3.1941    -4.2882     3.4951    -3.1585
   0.69749    0.48132   -0.6059     0.22147   -2.9045     0.27525
  -6.0088     5.0995    -3.367      2.6089    -5.6207    -2.6762
   6.0931     3.1168     3.2641    -4.0576    -4.435      1.4214
   0.59049    8.941      2.0718     5.3188     2.8866     0.0945
  -0.25755    0.93984    7.9412    -2.2701    -0.65029    1.4952
  -2.5503    -3.7978    -5.853     -1.7847     1.4484    -3.9781
  -1.3968   -10.793     -4.5546    -0.12542    4.4986     1.7492
   0.50073   -1.1922     2.0405    -2.1606    -1.5879    10.005
   1.5086    -2.7168    -1.2617    -2.1364     1.2624    -4.1934
   0.87337    2.2741    -1.8725     4.7847    -0.19699    0.

Convert question1 & question2 to vectors

In [274]:
def compute_tfidf_avg_w2v(question):
    question = str(question)
        
    question_nlp = nlp(question)
    avg_vector = np.zeros([len(question_nlp), len(question_nlp[0].vector)]) # Initialize the mean vector with 0s
    
    for word in question_nlp:
        word_vector = word.vector
        try:
            idf = word2tfidf[str(word)]
        except:
            idf = 0
        avg_vector += word_vector * idf
    return avg_vector.mean(axis=0)
     
        

In [275]:
processed_data_tfidfavgw2v = processed_data.copy()

In [276]:
# Check if features are already saved in disk. If yes, load it from disk, else regrenerate.

if os.path.isfile("../data/processed/q1_tfidfavgw2v_features.csv"):
    print('Featurized data already available. Getting it from the disk...')
    q1_tfidfavgw2v_features = pd.read_csv("../data/processed/q1_tfidfavgw2v_features.csv", sep=",", encoding='latin-1')
    q1_tfidfavgw2v_features.fillna("")
    print('Data loaded!')
else:
    start = datetime.now()
    print("Computing TFIDF Avg. W2V for Question1")
    processed_data_tfidfavgw2v['q1_tfidfavgw2v_features'] = processed_data_tfidfavgw2v['question1'].apply(compute_tfidf_avg_w2v)
    print("Completed!\nTime taken: {0}".format(datetime.now() - start))
    q1_tfidfavgw2v_features = pd.DataFrame(processed_data_tfidfavgw2v['q1_tfidfavgw2v_features'].values.tolist(), index=processed_data_tfidfavgw2v.index)


if os.path.isfile("../data/processed/q2_tfidfavgw2v_features.csv"):
    print('Featurized data already available. Getting it from the disk...')
    q2_tfidfavgw2v_features = pd.read_csv("../data/processed/q2_tfidfavgw2v_features.csv", sep=",", encoding='latin-1')
    q2_tfidfavgw2v_features.fillna("")
    print('Data loaded!')
else:
    start = datetime.now()
    print("Computing TFIDF Avg. W2V for Question1")
    processed_data_tfidfavgw2v['q2_tfidfavgw2v_features'] = processed_data_tfidfavgw2v['question2'].apply(compute_tfidf_avg_w2v)
    print("Completed!\nTime taken: {0}".format(datetime.now() - start))
    q2_tfidfavgw2v_features = pd.DataFrame(processed_data_tfidfavgw2v['q2_tfidfavgw2v_features'].values.tolist(), index=processed_data_tfidfavgw2v.index)


Computing TFIDF Avg. W2V for Question1
Completed!
Time taken: 0:38:07.884703
Computing TFIDF Avg. W2V for Question1
Completed!
Time taken: 0:39:23.286483


In [296]:
print("Shape of q1_tfidfavgw2v_features: {0}".format(q1_tfidfavgw2v_features.shape))
print("Shape of q2_tfidfavgw2v_features: {0}".format(q2_tfidfavgw2v_features.shape))

Shape of q1_tfidfavgw2v_features: (404290, 300)
Shape of q2_tfidfavgw2v_features: (404290, 300)


In [301]:
q1_tfidfavgw2v_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,77.973457,44.453701,-173.21983,-4.410628,75.493393,41.393043,-41.580649,84.183355,-278.957716,23.654842,...,26.416055,-84.681734,-88.431,37.586479,4.812845,-50.764075,-21.439394,-47.322849,-114.61161,-0.633502
1,-4.109793,2.340805,-21.485545,-59.833237,25.337529,-48.522722,20.918352,88.40415,-18.247384,0.922639,...,18.760534,28.946899,40.868272,41.995686,-16.039801,-6.515951,17.120571,4.826173,-36.251713,-4.849846
2,-46.598287,26.09858,-58.658402,62.726847,51.278867,-51.657226,61.282318,218.317215,-145.487613,51.764659,...,16.600708,10.613136,44.777887,13.170672,-40.014347,90.090577,110.328283,-130.407567,-71.836666,49.603967
3,35.360342,60.163725,-27.901776,-53.709725,-35.968431,24.307124,47.461727,82.214472,-20.191026,8.70573,...,40.188845,-64.445338,3.300989,-3.460388,-5.688108,-39.654131,-8.5016,41.287611,-22.82638,72.29597
4,34.598532,-100.970966,-192.003554,32.115457,105.654387,-244.451265,0.215317,295.835997,140.65161,-64.710206,...,53.694146,-93.960855,280.22713,-71.913438,-244.190637,108.494457,354.313131,51.786041,66.434969,-46.842632


Change column names

In [320]:
q1_features_headers = []
q2_features_headers = []
for i in range(300):
    q1_features_headers.append("q1_feat_"+str(i+1))
    q2_features_headers.append("q2_feat_"+str(i+1))
    

In [321]:
q1_tfidfavgw2v_features.columns = q1_features_headers
q1_tfidfavgw2v_features.head()

Unnamed: 0,q1_feat_1,q1_feat_2,q1_feat_3,q1_feat_4,q1_feat_5,q1_feat_6,q1_feat_7,q1_feat_8,q1_feat_9,q1_feat_10,...,q1_feat_291,q1_feat_292,q1_feat_293,q1_feat_294,q1_feat_295,q1_feat_296,q1_feat_297,q1_feat_298,q1_feat_299,q1_feat_300
0,77.973457,44.453701,-173.21983,-4.410628,75.493393,41.393043,-41.580649,84.183355,-278.957716,23.654842,...,26.416055,-84.681734,-88.431,37.586479,4.812845,-50.764075,-21.439394,-47.322849,-114.61161,-0.633502
1,-4.109793,2.340805,-21.485545,-59.833237,25.337529,-48.522722,20.918352,88.40415,-18.247384,0.922639,...,18.760534,28.946899,40.868272,41.995686,-16.039801,-6.515951,17.120571,4.826173,-36.251713,-4.849846
2,-46.598287,26.09858,-58.658402,62.726847,51.278867,-51.657226,61.282318,218.317215,-145.487613,51.764659,...,16.600708,10.613136,44.777887,13.170672,-40.014347,90.090577,110.328283,-130.407567,-71.836666,49.603967
3,35.360342,60.163725,-27.901776,-53.709725,-35.968431,24.307124,47.461727,82.214472,-20.191026,8.70573,...,40.188845,-64.445338,3.300989,-3.460388,-5.688108,-39.654131,-8.5016,41.287611,-22.82638,72.29597
4,34.598532,-100.970966,-192.003554,32.115457,105.654387,-244.451265,0.215317,295.835997,140.65161,-64.710206,...,53.694146,-93.960855,280.22713,-71.913438,-244.190637,108.494457,354.313131,51.786041,66.434969,-46.842632


In [322]:
q2_tfidfavgw2v_features.columns = q2_features_headers
q2_tfidfavgw2v_features.head()

Unnamed: 0,q2_feat_1,q2_feat_2,q2_feat_3,q2_feat_4,q2_feat_5,q2_feat_6,q2_feat_7,q2_feat_8,q2_feat_9,q2_feat_10,...,q2_feat_291,q2_feat_292,q2_feat_293,q2_feat_294,q2_feat_295,q2_feat_296,q2_feat_297,q2_feat_298,q2_feat_299,q2_feat_300
0,79.057064,52.681846,-169.935391,-3.243233,53.222714,59.307491,-58.016089,84.049185,-274.510704,30.463488,...,13.756101,-75.554456,-83.103968,43.434313,26.063637,-52.79759,-17.386567,-35.062074,-102.498085,-11.97082
1,-73.991234,18.972404,-104.731602,-26.501467,62.58638,36.496242,37.162907,132.71887,60.264024,-1.595595,...,58.229712,59.083779,20.090896,9.392637,-76.729526,42.790878,-84.178477,41.36093,-90.049836,65.292126
2,19.054749,8.966757,-32.669609,64.714678,70.89238,-41.697565,32.054564,218.153316,-99.107224,12.227849,...,15.184921,27.294229,30.089786,-1.801036,-60.964952,89.953825,121.342901,-75.809698,-74.579956,76.083279
3,-67.871643,-1.291338,-28.803573,-6.979753,117.594441,34.824081,-1.18307,65.186494,-35.391374,-34.799505,...,-13.915117,-33.911907,-27.728121,-14.628237,-42.69283,20.24197,-47.275902,-47.144042,3.134417,-20.761216
4,10.763654,6.005122,-111.848332,17.465527,62.162129,-88.357357,-31.264316,156.222801,-70.721046,58.084528,...,87.066215,-51.738576,95.601252,-65.619994,-139.510619,-99.868408,5.537125,-11.700133,-26.135739,16.903017


Export data to csv

In [323]:
start = datetime.now()
q1_tfidfavgw2v_features.to_csv('../data/processed/q1_tfidfavgw2v_features.csv', index=False)
print('q1_tfidfavgw2v_features exported!\nTime taken: {0}'.format(datetime.now()-start))

start = datetime.now()
q2_tfidfavgw2v_features.to_csv('../data/processed/q2_tfidfavgw2v_features.csv', index=False)
print('\nq2_tfidfavgw2v_features exported!\nTime taken: {0}'.format(datetime.now()-start))


q1_tfidfavgw2v_features exported!
Time taken: 0:03:06.010511

q2_tfidfavgw2v_features exported!
Time taken: 0:03:10.378680


#### 7. Combine all features

We have below DataFrames with different features<br>
1. raw_data_featurized - This has id, qid, qid2, question1, question2, is_duplicate and 16 new features
2. processed_data_featurized - This has id, qid, qid2, question1, question2, is_duplicate and 11 new features
3. q1_tfidfavgw2v_features - This has 300 features derived from question1
4. q2_tfidfavgw2v_features - This has 300 features derived from question2<br><br>

So, in total, we have 627 new features. Lets combine all these features into single DataFrame and dump it into sqlite3.

In [304]:
print("Shape of raw_data_featurized: {0}".format(raw_data_featurized.shape))
print("Shape of processed_data_featurized: {0}".format(processed_data_featurized.shape))
print("Shape of q1_tfidfavgw2v_features: {0}".format(q1_tfidfavgw2v_features.shape))
print("Shape of q2_tfidfavgw2v_features: {0}".format(q2_tfidfavgw2v_features.shape))

Shape of raw_data_featurized: (404290, 22)
Shape of processed_data_featurized: (404290, 17)
Shape of q1_tfidfavgw2v_features: (404290, 300)
Shape of q2_tfidfavgw2v_features: (404290, 300)


In [333]:
start = datetime.now()
print("Combining all features")
processed_final_features = pd.concat([raw_data_featurized, processed_data_featurized.iloc[:,6:], q1_tfidfavgw2v_features, q2_tfidfavgw2v_features], axis=1)
print("Operation complete.\nTime taken: {0}".format(datetime.now()-start))
print("\nShape of processed_final_features: {0}".format(processed_final_features.shape))

Combining all features
Operation complete.
Time taken: 0:00:00.894343

Shape of processed_final_features: (404290, 633)


In [334]:
processed_final_features.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_frequency,q2_frequency,q1_length,q2_length,...,q2_feat_291,q2_feat_292,q2_feat_293,q2_feat_294,q2_feat_295,q2_feat_296,q2_feat_297,q2_feat_298,q2_feat_299,q2_feat_300
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66.0,57.0,...,13.756101,-75.554456,-83.103968,43.434313,26.063637,-52.79759,-17.386567,-35.062074,-102.498085,-11.97082
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51.0,88.0,...,58.229712,59.083779,20.090896,9.392637,-76.729526,42.790878,-84.178477,41.36093,-90.049836,65.292126
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1,1,73.0,59.0,...,15.184921,27.294229,30.089786,-1.801036,-60.964952,89.953825,121.342901,-75.809698,-74.579956,76.083279
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,1,1,50.0,65.0,...,-13.915117,-33.911907,-27.728121,-14.628237,-42.69283,20.24197,-47.275902,-47.144042,3.134417,-20.761216
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3,1,76.0,39.0,...,87.066215,-51.738576,95.601252,-65.619994,-139.510619,-99.868408,5.537125,-11.700133,-26.135739,16.903017


Export

In [335]:
start = datetime.now()
processed_final_features.to_csv('../data/processed/processed_final_features.csv', index=False)
print('processed_final_features exported!\nTime taken: {0}'.format(datetime.now()-start))

processed_final_features exported!
Time taken: 0:06:36.372394


#### 8. Store the final features in sqlite database

In [6]:
start = datetime.now()
disk_engine = create_engine('sqlite:///train.db')
processed_final_features.to_sql('train_data', disk_engine, if_exists='append')
print("Wrote to SQLite.\nTime taken: {0}".format(datetime.now() - start))

Wrote to SQLite.
Time taken: 1:21:49.501035
