# Project 1: Quora Question Pairs

## Description:

This notebook uses NLP to generate predictions for the Quora Question Pairs dataset from https://www.kaggle.com/c/quora-question-pairs/data

In [None]:
from pathlib import Path
import random
import io

import spacy
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer


## Function definitions, Training Set Import, Preprocessing

### Define helper functions to parse questions and calculate cosine similarity and sentiment scores

In [None]:
def parse(nlp, docs):
    """Run every text in ``docs`` through the pipeline ``nlp``.

    Parameters
    ----------
    nlp : object exposing ``pipe(texts)``, e.g. a loaded spaCy pipeline.
    docs : iterable of str
        Texts to parse; materialised into a list before being piped.

    Returns
    -------
    list
        The objects yielded by ``nlp.pipe``, in the order they are yielded.
    """
    # ``n_threads`` is deliberately not forwarded: spaCy deprecated the
    # keyword in v2.1 and current releases no longer accept it.
    return list(nlp.pipe(list(docs)))


def get_similarity(docs):
    """Return ``first.similarity(second)`` for the first two items of ``docs``.

    ``docs`` may be a list, a tuple, or a DataFrame row handed over by
    ``DataFrame.apply(..., axis=1)``.  The pair is taken by iteration order
    instead of ``docs[0]`` / ``docs[1]`` because integer keys on a
    label-indexed pandas Series are resolved through a deprecated positional
    fallback.

    Raises
    ------
    ValueError
        If ``docs`` holds fewer than two items.
    """
    first, second = tuple(docs)[:2]
    return first.similarity(second)


# Shared analyzer, created on first use so it is not rebuilt for every row.
_VADER_ANALYZER = None


def get_sentiment(doc):
    """Return the polarity scores of ``doc`` as a 4-tuple.

    Parameters
    ----------
    doc : str or any object
        Text to score.  Non-string values (for example a NaN coming from a
        missing question) are converted with ``str`` before scoring.

    Returns
    -------
    tuple
        ``(compound, neg, neu, pos)`` taken from
        ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()

    polarity = _VADER_ANALYZER.polarity_scores(str(doc))

    return (
        polarity['compound'],
        polarity['neg'],
        polarity['neu'],
        polarity['pos'],
    )

### Load train.csv. For faster computation, load only a random 2.5% sample of the rows (about 10,000 rows)

In [None]:
# Seed the stdlib RNG so the same rows are drawn on every run.
random.seed(42)

csv = Path.cwd() / 'train.csv'
p = 0.025  # chance that any given data row is kept


def _skip_row(i):
    """Decide whether ``read_csv`` skips line ``i``.

    Line 0 (the header) is always kept; every other line is skipped unless
    its random draw is at most ``p``.
    """
    return not (i == 0 or random.random() <= p)


df = pd.read_csv(csv, index_col='id', skiprows=_skip_row)
df['is_duplicate'].value_counts()

### Calculate cosine similarity between question 1 and question 2, then concatenate the questions for TFIDF generation

In [None]:
nlp = spacy.load('en_core_web_lg')

# Questions are cast to str first so missing values do not reach the parser
# as floats.
df['q1_parsed'] = parse(nlp, df['question1'].astype(str))
df['q2_parsed'] = parse(nlp, df['question2'].astype(str))

# Row-wise similarity between the two parsed questions of each pair.
df['similarity'] = df[['q1_parsed', 'q2_parsed']].apply(get_similarity, axis=1)

# Cast BOTH questions to str before concatenating: with only question1
# cast, a missing question2 turns the whole concatenation into NaN and the
# text of question1 is lost.
df['q_concat'] = df['question1'].astype(str) + ' ' + df['question2'].astype(str)

df.head()

### Calculate polarity scores for each question separately

In [None]:
# Order of the tuple returned by get_sentiment.
score_names = ['compound', 'neg', 'neu', 'pos']

# Series.apply/map has no ``axis``/``result_type`` options (those belong to
# DataFrame.apply), so the per-question tuples are expanded into columns
# explicitly.  Questions are cast to str so missing values can be scored.
sentiment1 = pd.DataFrame(
    df['question1'].astype(str).map(get_sentiment).tolist(),
    index=df.index,
    columns=[name + '1' for name in score_names],
)
# The second set of scores must come from question2, not question1.
sentiment2 = pd.DataFrame(
    df['question2'].astype(str).map(get_sentiment).tolist(),
    index=df.index,
    columns=[name + '2' for name in score_names],
)

# Column-by-column assignment keeps the cell safe to re-run.
for scores in (sentiment1, sentiment2):
    for column in scores.columns:
        df[column] = scores[column]

print(sentiment1.head())
print(sentiment2.head())

### Calculate absolute differences in sentiment scores for each question pair

In [None]:
# For each sentiment score, store the absolute gap between the value for
# question 1 and the value for question 2.
for _score in ('compound', 'neg', 'neu', 'pos'):
    df[_score + '_diff'] = df[_score + '1'].sub(df[_score + '2']).abs()

df.head(10)

### Train-test split

In [None]:
# Columns that must not reach the models: raw text, question ids, the parsed
# spaCy objects (not numeric, so estimators cannot consume them), the
# per-question sentiment scores (only their differences are kept) and the
# target itself.
x = df.drop(columns=['question1',
                     'question2',
                     'qid1',
                     'qid2',
                     'q1_parsed',
                     'q2_parsed',
                     'compound1',
                     'neu1',
                     'neg1',
                     'pos1',
                     'compound2',
                     'neu2',
                     'neg2',
                     'pos2',
                     'is_duplicate'])
y = df['is_duplicate']

# Stratify on the target so both splits keep the same class balance.
x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, random_state=42
    )

x_train.head()

## TF-IDF Vectorizer

Generate TF-IDF features for the train and test sets

In [None]:
vectorizer = TfidfVectorizer()
# The vectorizer is fitted on the training questions only; the test
# questions are transformed with that fitted vocabulary.
train_tfidf = vectorizer.fit_transform(
        x_train['q_concat'].values.astype('U')
    )
test_tfidf = vectorizer.transform(
        x_test['q_concat'].values.astype('U')
    )

# Combine the hand-built features with the dense TF-IDF matrix by joining on
# the shared row index.
# * select_dtypes keeps numeric columns only, since estimators cannot consume
#   object columns.
# * add_prefix gives the TF-IDF columns string labels ('tfidf_0', ...);
#   recent scikit-learn releases reject frames whose column labels mix
#   strings and integers.
x_train_bow = (
    x_train.drop('q_concat', axis=1)
    .select_dtypes(include='number')
    .join(
        pd.DataFrame(train_tfidf.toarray(), index=x_train.index)
        .add_prefix('tfidf_')
    )
)
x_test_bow = (
    x_test.drop('q_concat', axis=1)
    .select_dtypes(include='number')
    .join(
        pd.DataFrame(test_tfidf.toarray(), index=x_test.index)
        .add_prefix('tfidf_')
    )
)

x_train_bow.head()

## Model 1: Logistic Regression

In [None]:
# Fit on the TF-IDF feature frame, predict the held-out split, then report
# accuracy followed by the confusion matrix.
logit = LogisticRegression(solver='liblinear', random_state=42).fit(
    x_train_bow, y_train
)
preds = logit.predict(x_test_bow)
for metric in (accuracy_score, confusion_matrix):
    print(metric(y_test, preds))

## Model 2: Multinomial Naive Bayes

Multinomial Naive Bayes shows a strong bias towards non-duplicate predictions

In [None]:
# Train the Naive Bayes model and predict the held-out split in one chain,
# then print accuracy and the confusion matrix on separate lines.
mnb = MultinomialNB()
preds = mnb.fit(x_train_bow, y_train).predict(x_test_bow)
print(
    accuracy_score(y_test, preds),
    confusion_matrix(y_test, preds),
    sep='\n',
)

## Feature transformation: Singular Value Decomposition

Using sklearn's TruncatedSVD class, reduce the TF-IDF matrix to a lower-dimensional feature space of 100 components

In [None]:
svd = TruncatedSVD(n_components=100, random_state=42)
# The decomposition is fitted on the training TF-IDF matrix only and then
# applied unchanged to the test matrix.
train_tfidf_lsa = svd.fit_transform(train_tfidf)
test_tfidf_lsa = svd.transform(test_tfidf)

# Combine the hand-built features with the reduced components by joining on
# the shared row index.
# * select_dtypes keeps numeric columns only, since estimators cannot consume
#   object columns.
# * add_prefix gives the components string labels ('lsa_0', ...); recent
#   scikit-learn releases reject frames whose column labels mix strings and
#   integers.
x_train_lsa = (
    x_train.drop('q_concat', axis=1)
    .select_dtypes(include='number')
    .join(
        pd.DataFrame(train_tfidf_lsa, index=x_train.index)
        .add_prefix('lsa_')
    )
)
x_test_lsa = (
    x_test.drop('q_concat', axis=1)
    .select_dtypes(include='number')
    .join(
        pd.DataFrame(test_tfidf_lsa, index=x_test.index)
        .add_prefix('lsa_')
    )
)

x_train_lsa.head()

## Model 1 (LSA features): Logistic Regression

Not much improvement over the non-reduced dataset

In [None]:
# Same model family as before, now trained on the LSA-reduced features.
logit = LogisticRegression(
    C=999999,
    solver='liblinear',
    random_state=42,
).fit(x_train_lsa, y_train)
preds = logit.predict(x_test_lsa)
print(
    accuracy_score(y_test, preds),
    confusion_matrix(y_test, preds),
    sep='\n',
)

## Model 2 (LSA features): Support Vector Machine

Using cosine similarity, sentiment differences, and the decomposed TF-IDF features as inputs, the linear Support Vector Machine classifier demonstrates greatly improved performance over Multinomial Naive Bayes, with much less bias toward non-duplicate predictions.

In [None]:
# Linear-kernel SVM trained on the LSA-reduced features; report accuracy and
# then the confusion matrix for the held-out split.
svc = SVC(kernel='linear', random_state=42).fit(x_train_lsa, y_train)
preds = svc.predict(x_test_lsa)
for metric in (accuracy_score, confusion_matrix):
    print(metric(y_test, preds))