# Project 1: Quora Question Pairs

## Description:

This notebook uses NLP to generate predictions for the Quora Question Pairs dataset from https://www.kaggle.com/c/quora-question-pairs/data

In [1]:
from pathlib import Path
import random
import io

import spacy
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer


## Function definitions, Training Set Import, Preprocessing

### Define helper functions to parse text, calculate cosine similarity, and score sentiment

In [2]:
def parse(nlp, docs):
    """Run every text in ``docs`` through the pipeline ``nlp``.

    Parameters
    ----------
    nlp : object
        A loaded pipeline exposing ``pipe(texts)`` (here: a spaCy model).
    docs : iterable of str
        The texts to parse.

    Returns
    -------
    list
        The parsed documents, in the order ``nlp.pipe`` yields them.
    """
    # ``n_threads`` is intentionally not passed: it was deprecated in
    # spaCy 2.1 and removed in spaCy 3, where it raises a TypeError.
    return list(nlp.pipe(docs))


def get_similarity(docs):
    """Return the similarity between the first two documents in ``docs``.

    Parameters
    ----------
    docs : sequence or pandas.Series
        Holds two parsed documents; the first must expose
        ``similarity(other)``. When used with ``DataFrame.apply(axis=1)``
        this is a row Series labelled by column name.

    Returns
    -------
    The value returned by ``first.similarity(second)``.
    """
    # A row Series is labelled by column name, so ``docs[0]`` would rely on
    # pandas' deprecated positional fallback; ``.iloc`` is always positional.
    # Plain tuples/lists have no ``.iloc`` and are indexed directly as before.
    items = docs.iloc if hasattr(docs, 'iloc') else docs
    return items[0].similarity(items[1])


def get_sentiment(text):
    """Score ``text`` with VADER and return its four polarity values.

    Parameters
    ----------
    text : object
        The text to score. Non-string values (for example a missing
        question read from the CSV as NaN) are converted with ``str()``.

    Returns
    -------
    tuple
        ``(compound, neg, neu, pos)`` as reported by ``polarity_scores``.
    """
    # Build the analyzer once and cache it on the function object: this
    # function is called once per row, and constructing a fresh
    # SentimentIntensityAnalyzer on every call is needlessly expensive.
    sid = getattr(get_sentiment, '_analyzer', None)
    if sid is None:
        sid = get_sentiment._analyzer = SentimentIntensityAnalyzer()

    polarity = sid.polarity_scores(str(text))

    return polarity['compound'], polarity['neg'], polarity['neu'], polarity['pos']


# Element-wise wrapper so get_sentiment can be applied to a whole array of
# questions. get_sentiment returns four values, so the wrapper's result is
# indexed below as [0]..[3] for compound, neg, neu and pos respectively.
sentiment_vectorized = np.vectorize(get_sentiment)

### Load in train.csv. For faster computation, only load 2.5% of the full sample, or about 10,000 rows

In [3]:
# Seed the RNG so the same random subset of rows is drawn on every run.
random.seed(42)
p = 0.025
csv = Path.cwd() / 'train.csv'

# Row 0 (the header) is always kept; every other row is skipped unless its
# uniform draw falls at or below p.
df = pd.read_csv(
    csv,
    skiprows=lambda i: i > 0 and random.random() > p,
    index_col='id',
)
df['is_duplicate'].value_counts()

0    6378
1    3659
Name: is_duplicate, dtype: int64

### Calculate cosine similarity between question 1 and question 2, then concatenate the questions for TFIDF generation

In [4]:
nlp = spacy.load('en_core_web_lg')

# Parse question1 first, then question2; each column is cast to str before
# being handed to the pipeline.
df['q1_parsed'], df['q2_parsed'] = (
    parse(nlp, df[column].astype(str))
    for column in ('question1', 'question2')
)

df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,q1_parsed,q2_parsed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19,39,40,Which is the best digital marketing institutio...,Which is the best digital marketing institute ...,0,"(Which, is, the, best, digital, marketing, ins...","(Which, is, the, best, digital, marketing, ins..."
100,201,202,Will there really be any war between India and...,Will there be a nuclear war between India and ...,1,"(Will, there, really, be, any, war, between, I...","(Will, there, be, a, nuclear, war, between, In..."
124,249,250,What is the alternative to machine learning?,How do I over-sample a multi-class imbalance d...,0,"(What, is, the, alternative, to, machine, lear...","(How, do, I, over, -, sample, a, multi, -, cla..."
126,253,254,What is the mean of future budget?,What is the meaning of the future?,0,"(What, is, the, mean, of, future, budget, ?)","(What, is, the, meaning, of, the, future, ?)"
269,538,539,What are the rights of a prisoner?,"If I am in prison, what rights would I have?",1,"(What, are, the, rights, of, a, prisoner, ?)","(If, I, am, in, prison, ,, what, rights, would..."


In [5]:
# Row-wise similarity between the two parsed questions.
df['similarity'] = df[['q1_parsed', 'q2_parsed']].apply(get_similarity, axis=1)

# Cast BOTH questions to str before concatenating. Previously only question1
# was cast, so a missing question2 turned the whole concatenation into NaN
# and the question1 text was lost from the TF-IDF input.
df['q_concat'] = df['question1'].astype(str) + ' ' + df['question2'].astype(str)

df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,q1_parsed,q2_parsed,similarity,q_concat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19,39,40,Which is the best digital marketing institutio...,Which is the best digital marketing institute ...,0,"(Which, is, the, best, digital, marketing, ins...","(Which, is, the, best, digital, marketing, ins...",0.969762,Which is the best digital marketing institutio...
100,201,202,Will there really be any war between India and...,Will there be a nuclear war between India and ...,1,"(Will, there, really, be, any, war, between, I...","(Will, there, be, a, nuclear, war, between, In...",0.947574,Will there really be any war between India and...
124,249,250,What is the alternative to machine learning?,How do I over-sample a multi-class imbalance d...,0,"(What, is, the, alternative, to, machine, lear...","(How, do, I, over, -, sample, a, multi, -, cla...",0.835211,What is the alternative to machine learning? H...
126,253,254,What is the mean of future budget?,What is the meaning of the future?,0,"(What, is, the, mean, of, future, budget, ?)","(What, is, the, meaning, of, the, future, ?)",0.957273,What is the mean of future budget? What is the...
269,538,539,What are the rights of a prisoner?,"If I am in prison, what rights would I have?",1,"(What, are, the, rights, of, a, prisoner, ?)","(If, I, am, in, prison, ,, what, rights, would...",0.849611,What are the rights of a prisoner? If I am in ...


### Calculate polarity scores for each question

In [6]:
# Cast to str first (as the parsing cell does) so a missing question, which
# pandas reads as a float NaN, is never handed to the sentiment analyzer.
sentiment1 = sentiment_vectorized(df['question1'].astype(str).values)
sentiment2 = sentiment_vectorized(df['question2'].astype(str).values)

In [None]:
# Each sentiment result holds four arrays in the order
# (compound, neg, neu, pos); unpack them straight into their columns.
df['compound1'], df['neg1'], df['neu1'], df['pos1'] = sentiment1
df['compound2'], df['neg2'], df['neu2'], df['pos2'] = sentiment2

df.head()

### Calculate absolute differences in sentiment scores for each question pair

In [None]:
# |score of question 1 - score of question 2| for each of the four
# sentiment scores, assigned in the order compound, neg, neu, pos.
df['compound_diff'], df['neg_diff'], df['neu_diff'], df['pos_diff'] = (
    (df[score + '1'] - df[score + '2']).abs()
    for score in ('compound', 'neg', 'neu', 'pos')
)

df.head(10)

### Train-test split

In [None]:
# Keep only the columns the models can consume: similarity, the sentiment
# differences, and q_concat (vectorized later with TF-IDF).
x = df.drop(columns=[
    # Raw text and ids.
    'question1',
    'question2',
    'qid1',
    'qid2',
    # Parsed document objects: only needed to compute `similarity`. They are
    # not numeric, so leaving them in makes every downstream model fit fail.
    'q1_parsed',
    'q2_parsed',
    # Per-question sentiment scores, replaced by the *_diff features.
    'compound1',
    'neg1',
    'neu1',
    'pos1',
    'compound2',
    'neg2',
    'neu2',
    'pos2',
    # Target.
    'is_duplicate',
])
y = df['is_duplicate']

# Stratify on the target so train and test keep the same class balance.
x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, random_state=42
    )

x_train.head()

## TF-IDF Vectorizer

Generate TF-IDF feature matrices for the train and test sets

In [None]:
# Fit the vocabulary on the training questions only, then reuse it for test.
vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(
        x_train['q_concat'].values.astype('U')
    )
test_tfidf = vectorizer.transform(
        x_test['q_concat'].values.astype('U')
    )

# Attach the TF-IDF columns to the other features by index.
# - `.toarray()` gives a plain ndarray (`.todense()` returns np.matrix).
# - `add_prefix` turns the integer column labels into strings, so the
#   combined frame does not mix int and str column names, which recent
#   scikit-learn versions reject when fitting on a DataFrame.
# - `join` aligns on the shared index directly, keeping its original name
#   instead of the 'key_0' produced by merging on an index array.
x_train_bow = x_train.drop('q_concat', axis=1).join(
        pd.DataFrame(train_tfidf.toarray(), index=x_train.index)
          .add_prefix('tfidf_')
    )
x_test_bow = x_test.drop('q_concat', axis=1).join(
        pd.DataFrame(test_tfidf.toarray(), index=x_test.index)
          .add_prefix('tfidf_')
    )

x_train_bow.head()

## Model 1: Logistic Regression

In [None]:
# Fit on the bag-of-words features, then report accuracy followed by the
# confusion matrix for the held-out set.
logit = LogisticRegression(solver='sag', random_state=42).fit(x_train_bow, y_train)
preds = logit.predict(x_test_bow)
print(
    accuracy_score(y_test, preds),
    confusion_matrix(y_test, preds),
    sep='\n',
)

## Model 2: Multinomial Naive Bayes

Multinomial Naive Bayes shows a strong bias towards non-duplicate predictions

In [None]:
# Same bag-of-words features and the same report as the logistic model.
mnb = MultinomialNB().fit(x_train_bow, y_train)
preds = mnb.predict(x_test_bow)
print(
    accuracy_score(y_test, preds),
    confusion_matrix(y_test, preds),
    sep='\n',
)

## Feature transformation: Singular Value Decomposition

Using sklearn's TruncatedSVD class, reduce the TF-IDF matrices to a lower-dimensional feature space of 100 components

In [None]:
# Fit the decomposition on the training TF-IDF matrix only, then project
# the test matrix with the same components.
svd = TruncatedSVD(n_components=100, random_state=42)
train_tfidf_lsa = svd.fit_transform(train_tfidf)
test_tfidf_lsa = svd.transform(test_tfidf)

# Attach the components to the other features by index. `add_prefix` gives
# them string names so the frame does not mix int and str column labels
# (rejected by recent scikit-learn versions), and `join` aligns on the
# shared index directly, keeping its original name instead of 'key_0'.
x_train_lsa = x_train.drop('q_concat', axis=1).join(
        pd.DataFrame(train_tfidf_lsa, index=x_train.index).add_prefix('lsa_')
    )
x_test_lsa = x_test.drop('q_concat', axis=1).join(
        pd.DataFrame(test_tfidf_lsa, index=x_test.index).add_prefix('lsa_')
    )

x_train_lsa.head()

## Model 1: Logistic Regression

Not much improvement over the non-reduced dataset

In [None]:
# C is scikit-learn's inverse regularization strength; a value this large
# leaves the penalty with almost no effect.
logit = LogisticRegression(
    C=999999, solver='liblinear', random_state=42
).fit(x_train_lsa, y_train)
preds = logit.predict(x_test_lsa)
print(
    accuracy_score(y_test, preds),
    confusion_matrix(y_test, preds),
    sep='\n',
)

## Model 2: Support Vector Machine

Using cosine similarity, sentiment differences, and the SVD-reduced TF-IDF components as features, the linear Support Vector Machine classifier demonstrates greatly improved performance over Multinomial Naive Bayes, with much less bias toward non-duplicate predictions

In [None]:
# Linear-kernel SVM on the reduced feature set, reported the same way as
# the other models: accuracy first, then the confusion matrix.
svc = SVC(kernel='linear', random_state=42).fit(x_train_lsa, y_train)
preds = svc.predict(x_test_lsa)
print(
    accuracy_score(y_test, preds),
    confusion_matrix(y_test, preds),
    sep='\n',
)