# Project 1: Quora Question Pairs

## Description:

This notebook uses NLP to generate predictions for the Quora Question Pairs dataset from https://www.kaggle.com/c/quora-question-pairs/data

In [1]:
from pathlib import Path
import random

import spacy
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer


## Function definitions, Training Set Import, Preprocessing

### Define helper functions to calculate cosine similarity

In [2]:
def parse(nlp, docs):
    parsed_docs = []
    
    for doc in nlp.pipe(list(docs), n_threads=10):
        parsed_docs.append(doc)
    
    return parsed_docs


def get_similarity(docs1, docs2):
    similarity = []
    
    for idx in range(len(docs1)):
        similarity.append(docs1[idx].similarity(docs2[idx]))            
        
    return similarity


def get_sentiment(docs):
    sid = SentimentIntensityAnalyzer()    
    sentiment = {'compound': [],
                 'neg': [],
                 'neu': [],
                 'pos': [],}
    
    for doc in list(docs):
        polarity = sid.polarity_scores(doc)
        
        sentiment['compound'].append(polarity['compound'])
        sentiment['neg'].append(polarity['neg'])
        sentiment['neu'].append(polarity['neu'])
        sentiment['pos'].append(polarity['pos'])
        
    return sentiment

### Load in train.csv. For faster computation, only load 2.5% of the full sample, or about 10,000 rows

In [3]:
csv = Path.cwd().joinpath('train.csv')
p = 0.025
df = pd.read_csv(csv,
                 index_col='id',
                 skiprows=lambda i: i>0 and random.random() > p)
df['is_duplicate'].value_counts()

0    6448
1    3668
Name: is_duplicate, dtype: int64

### Calculate cosine similarity between question 1 and question 2, then concatenate the questions for TFIDF generation

In [4]:
nlp = spacy.load('en_core_web_lg')

q1_parsed = parse(nlp, df['question1'].astype(str))
q2_parsed = parse(nlp, df['question2'].astype(str))

df['similarity'] = get_similarity(q1_parsed, q2_parsed)
df['q_concat'] = df['question1'].map(str) + ' ' + df['question2']

df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,similarity,q_concat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
46,93,94,How did Darth Vader fought Darth Maul in Star ...,Does Quora have a character limit for profile ...,0,0.486686,How did Darth Vader fought Darth Maul in Star ...
214,429,430,How many derivatives of an object's position w...,When does drawing become fun?,0,0.827193,How many derivatives of an object's position w...
247,495,496,Why can flash run so fast?,The Flash (DC character): How fast can the Fla...,0,0.8956,Why can flash run so fast? The Flash (DC chara...
271,542,543,Why aren't there more apps like Word Lens?,What does Word Lens use for OCR?,0,0.866187,Why aren't there more apps like Word Lens? Wha...
352,703,704,How do I send message from one Android phone t...,How do I transfer my contacts from one Android...,0,0.977695,How do I send message from one Android phone t...


### Calculate polarity scores for each question separately

In [5]:
sentiment1 = pd.DataFrame(
    get_sentiment(list(df['question1'].astype(str))),
    index=df.index
)
sentiment2 = pd.DataFrame(
    get_sentiment(list(df['question2'].astype(str))),
    index=df.index
)

print(sentiment1.head())
print(sentiment2.head())

     compound    neg    neu    pos
id                                
46    -0.7096  0.396  0.604  0.000
214    0.4767  0.000  0.819  0.181
247    0.0000  0.000  1.000  0.000
271    0.4173  0.000  0.716  0.284
352    0.0000  0.000  1.000  0.000
     compound  neg    neu    pos
id                              
46     0.0000  0.0  1.000  0.000
214    0.5106  0.0  0.548  0.452
247    0.0000  0.0  1.000  0.000
271    0.0000  0.0  1.000  0.000
352    0.0000  0.0  1.000  0.000


### Calculate absolute differences in sentimentality for each question-pair

In [6]:
df['compound_diff'] = (sentiment1['compound'] - sentiment2['compound']).abs()
df['neg_diff'] = (sentiment1['neg'] - sentiment2['neg']).abs()
df['neu_diff'] = (sentiment1['neu'] - sentiment2['neu']).abs()
df['pos_diff'] = (sentiment1['pos'] - sentiment2['pos']).abs()

df.head(10)

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,similarity,q_concat,compound_diff,neg_diff,neu_diff,pos_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
46,93,94,How did Darth Vader fought Darth Maul in Star ...,Does Quora have a character limit for profile ...,0,0.486686,How did Darth Vader fought Darth Maul in Star ...,0.7096,0.396,0.396,0.0
214,429,430,How many derivatives of an object's position w...,When does drawing become fun?,0,0.827193,How many derivatives of an object's position w...,0.0339,0.0,0.271,0.271
247,495,496,Why can flash run so fast?,The Flash (DC character): How fast can the Fla...,0,0.8956,Why can flash run so fast? The Flash (DC chara...,0.0,0.0,0.0,0.0
271,542,543,Why aren't there more apps like Word Lens?,What does Word Lens use for OCR?,0,0.866187,Why aren't there more apps like Word Lens? Wha...,0.4173,0.0,0.284,0.284
352,703,704,How do I send message from one Android phone t...,How do I transfer my contacts from one Android...,0,0.977695,How do I send message from one Android phone t...,0.0,0.0,0.0,0.0
380,758,759,Which one is better among KMC Manipal and KMC ...,What is the brief comparison of KMC in Manipal...,1,0.861096,Which one is better among KMC Manipal and KMC ...,0.4404,0.0,0.244,0.244
518,1034,1035,What are scraping techniques?,What are some good free web scrapers / scrapin...,1,0.909148,What are scraping techniques? What are some go...,0.7351,0.0,0.47,0.47
528,1054,1055,Who do I activate the dlc of skyrim in a lapto...,I'm tired of the laptops Ive bought getting ol...,0,0.945128,Who do I activate the dlc of skyrim in a lapto...,0.2784,0.07,0.259,0.189
540,1078,1079,How can I become a true computer science engin...,How do I become a good computer science engineer?,1,0.985212,How can I become a true computer science engin...,0.0189,0.0,0.008,0.008
554,1106,1107,How much weight will I lose by not eating for ...,How much weight do I lose by purging after eve...,0,0.962932,How much weight will I lose by not eating for ...,0.0,0.018,0.018,0.0


### Train-test split

In [7]:
x = df.drop(['question1', 
             'question2', 
             'qid1', 
             'qid2', 
             'is_duplicate'], axis=1)
y = df['is_duplicate']

x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, random_state=42
    )

x_train.head()

Unnamed: 0_level_0,similarity,q_concat,compound_diff,neg_diff,neu_diff,pos_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
96458,0.854179,How many Tatkal tickets can be booked through ...,0.0,0.0,0.0,0.0
246514,0.913031,How can we improve our English language skills...,0.356,0.0,0.148,0.148
203730,0.985838,What are some unexpected things first-time vis...,0.0,0.0,0.0,0.0
381650,0.859498,What are non expensive dandiya nights in banga...,0.4939,0.219,0.219,0.0
89277,0.952381,Should poop not stink? Why does poop stink?,0.7108,0.474,0.045,0.429


## TF-IDF Vectorizer

Generate TF-IDF's for the train and test sets

In [8]:
vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(
        x_train['q_concat'].values.astype('U')
    )
test_tfidf = vectorizer.transform(
        x_test['q_concat'].values.astype('U')
    )
x_train_bow = pd.merge(
        x_train.drop('q_concat', axis=1), 
        pd.DataFrame(train_tfidf.todense(), index=x_train.index), 
        on=x_train.index
    ).set_index('key_0')
x_test_bow = pd.merge(
        x_test.drop('q_concat', axis=1), 
        pd.DataFrame(test_tfidf.todense(), index=x_test.index), 
        on=x_test.index
    ).set_index('key_0')

x_train_bow.head()

Unnamed: 0_level_0,similarity,compound_diff,neg_diff,neu_diff,pos_diff,0,1,2,3,4,...,12951,12952,12953,12954,12955,12956,12957,12958,12959,12960
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96458,0.854179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246514,0.913031,0.356,0.0,0.148,0.148,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203730,0.985838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
381650,0.859498,0.4939,0.219,0.219,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89277,0.952381,0.7108,0.474,0.045,0.429,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model 1: Logistic Regression

In [9]:
logit = LogisticRegression(solver='liblinear', random_state=42)
logit.fit(x_train_bow, y_train)
preds = logit.predict(x_test_bow)
print(accuracy_score(y_test, preds))
print(confusion_matrix(y_test, preds))

0.756425464610518
[[1423  189]
 [ 427  490]]


## Model 2: Multinomial Naive Bayes

Multinomial Naive Bayes shows a strong bias towards non-duplicate predictions

In [10]:
mnb = MultinomialNB()
mnb.fit(x_train_bow, y_train)
preds = mnb.predict(x_test_bow)
print(accuracy_score(y_test, preds))
print(confusion_matrix(y_test, preds))

0.6903914590747331
[[1573   39]
 [ 744  173]]


## Feature transformation: Singular Value Decomposition

Using sklearn's TruncatedSVD class, reduce the TF-IDF's into a lower feature space of 100 components

In [11]:
svd = TruncatedSVD(n_components=100, random_state=42)
train_tfidf_lsa = svd.fit_transform(train_tfidf)
test_tfidf_lsa = svd.transform(test_tfidf)

x_train_lsa = pd.merge(
        x_train.drop('q_concat', axis=1), 
        pd.DataFrame(train_tfidf_lsa, index=x_train.index), 
        on=x_train.index
    ).set_index('key_0')
x_test_lsa = pd.merge(
        x_test.drop('q_concat', axis=1), 
        pd.DataFrame(test_tfidf_lsa, index=x_test.index), 
        on=x_test.index
    ).set_index('key_0')

x_train_lsa.head()

Unnamed: 0_level_0,similarity,compound_diff,neg_diff,neu_diff,pos_diff,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96458,0.854179,0.0,0.0,0.0,0.0,0.089077,0.04786,0.018747,-0.021163,-0.003157,...,0.03129,0.011947,-0.035358,0.005949,0.010662,-0.01035,0.018888,0.003911,-0.002653,0.015167
246514,0.913031,0.356,0.0,0.148,0.148,0.218779,0.084972,0.25808,-0.04028,0.02391,...,-0.006558,-0.011903,-0.013266,-0.038212,-0.014277,-0.03651,0.020632,-0.012096,-0.022315,-0.014182
203730,0.985838,0.0,0.0,0.0,0.0,0.142902,-0.067568,0.032803,0.159766,0.016165,...,0.048632,-0.105241,0.115322,0.025983,0.039766,-0.108821,0.034258,0.126028,0.000153,-0.041487
381650,0.859498,0.4939,0.219,0.219,0.0,0.14848,-0.05984,0.022469,0.036854,0.023903,...,-0.000536,-0.031192,0.002404,-0.024569,-0.012066,-0.005212,0.007402,-0.003633,0.024728,-0.019789
89277,0.952381,0.7108,0.474,0.045,0.429,0.037406,0.027118,-0.041854,-0.015871,-0.013738,...,0.018752,0.012213,0.014776,0.032842,-0.027788,0.001827,-0.010946,-0.001521,-0.008642,-0.000536


## Model 1: Logistic Regression

Not much improvement over the non-reduced dataset

In [12]:
logit = LogisticRegression(C=999999, solver='liblinear', random_state=42)
logit.fit(x_train_lsa, y_train)
preds = logit.predict(x_test_lsa)
print(accuracy_score(y_test, preds))
print(confusion_matrix(y_test, preds))

0.7425860023724793
[[1377  235]
 [ 416  501]]


## Model 2: Support Vector Machine

Using cosine similarity, sentiment differences, and the decomposed TF-IDF's as features, the linear Support Vector Machine Classifier demonstrates greatly improved performance over Multinomial Naive Bayes, with much less bias toward non-duplicate predictions

In [13]:
svc = SVC(kernel='linear', random_state=42)
svc.fit(x_train_lsa, y_train)
preds = svc.predict(x_test_lsa)
print(accuracy_score(y_test, preds))
print(confusion_matrix(y_test, preds))

0.723210755239225
[[1445  167]
 [ 533  384]]
