This notebook is to train and store models to disc.\
This Notebook has to be clean (do not define functions here, do them in an
external utils.py and import them).\
This notebook has to be reproducible (if you run it twice, the same output has to
be displayed and stored to disk).

In [2]:
# import libraries and set randome seed for reproducibility
import pandas as pd
import numpy as np
import sklearn
import scipy
from sklearn import *
import os
import pickle # to save model
from utils import *
RANDOM_SEED = 123 # taken from task description
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
# use this to train and VALIDATE your solution
train_df = pd.read_csv("./quora_train_data.csv")

# use this to provide the expected generalization results
test_df = pd.read_csv("./quora_test_data.csv")

We need to convert our questions to strings in order to work with CountVectorizer.

In [4]:
# extract questions (documents) and cast to strings
q1_train =  cast_list_as_strings(list(train_df["question1"]))
q2_train =  cast_list_as_strings(list(train_df["question2"]))
all_questions = q1_train + q2_train

# fit on train set
count_vectorizer_v1 = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer_v1.fit(all_questions)

X_tr_q1q2 = get_features_from_df(train_df,count_vectorizer_v1)
X_te_q1q2  = get_features_from_df(test_df, count_vectorizer_v1)

X_tr_q1q2.shape, train_df.shape, X_te_q1q2.shape, test_df.shape

((323432, 156550), (323432, 6), (80858, 156550), (80858, 6))

Divide processed train set into train and validation according to task description split. Using `random_seed = 123`.

In [20]:
# divide into train and validation set
tr_df, va_df, y_train, y_val = sklearn.model_selection.train_test_split(X_tr_q1q2, train_df["is_duplicate"].values, test_size=0.05, random_state=RANDOM_SEED)
print('tr_df.shape=',tr_df.shape)
print('va_df.shape=',va_df.shape)
print('te_df.shape=',X_te_q1q2.shape)
print('y_train.shape=',y_train.shape)
print('y_val.shape=',y_val.shape)

tr_df.shape= (307260, 156550)
va_df.shape= (16172, 156550)
te_df.shape= (80858, 156550)
y_train.shape= (307260,)
y_val.shape= (16172,)


Train model on count vectorized matrix of question1 and question2 using train set.

In [None]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear",random_state=RANDOM_SEED)
logistic.fit(tr_df, y_train)

In [None]:
# save the model to disk
save_model("logreg",logistic)

In [None]:
# Validation
print("Validation Results")
predictions = logistic.predict(va_df)
result = roc_auc_score(y_val, predictions)
print("Val ROC-AUC: %.3f"%(result))
      
# Test   
print("\nTest Results")
predictions = logistic.predict(X_te_q1q2)
result = roc_auc_score(test_df["is_duplicate"].values, predictions)
print("Test ROC-AUC: %.3f"%(result))

### Improved version using cosine similiarity and preprocessing
Our manually written preprocess function is very inefficient and takes quite long to run, instead we use CountVectorizer with similar hyperparameters.

In [6]:
from utils import preprocess # own preprocess function takes to long
# divide into train and validation set

SUBSET = 30000
tr_df, va_df, y_train, y_val = sklearn.model_selection.train_test_split(train_df, train_df["is_duplicate"].values, test_size=0.05, random_state=RANDOM_SEED)
q1_train =  tr_df["question1"].fillna(' ')[:SUBSET]
q2_train =  tr_df["question2"].fillna(' ')[:SUBSET]
q1_val =  va_df["question1"][:SUBSET]
q2_val =  va_df["question2"][:SUBSET]
q1_test =  test_df["question1"][:SUBSET]
q2_test =  test_df["question2"][:SUBSET]
all_questions = q1_train + q2_train

In [7]:
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# count_vectorizer = CountVectorizer(ngram_range=(1,1),lowercase=True,tokenizer=tokenize)
count_vectorizer = CountVectorizer(ngram_range=(1,1),lowercase=True,stop_words='english')
sparse_matrix = count_vectorizer.fit(all_questions) # vocabulary
q1 = count_vectorizer.transform(q1_train)
q2 = count_vectorizer.transform(q2_train)
result = cosine_similarity(q1,q2).diagonal()

In [41]:
# Train Set Performance
print("Train Results")
predictions = np.where(result > 0.5,1,0)
result = roc_auc_score(y_train[:SUBSET], predictions)
print("Train ROC-AUC: %.3f"%(result))

Train Results
Train ROC-AUC: 0.676


In [42]:
# Validation
print("Validation Results")
q1 = count_vectorizer.transform(q1_val)
q2 = count_vectorizer.transform(q2_val)
result = cosine_similarity(q1,q2).diagonal()
predictions = np.where(result > 0.5,1,0)
result = roc_auc_score(y_val[:SUBSET], predictions)
print("Val ROC-AUC: %.3f"%(result))
      
# Test   
print("\nTest Results")
q1 = count_vectorizer.transform(q1_test)
q2 = count_vectorizer.transform(q2_test)
result = cosine_similarity(q1,q2).diagonal()
predictions = np.where(result > 0.5,1,0)
result = roc_auc_score(test_df["is_duplicate"].values[:SUBSET], predictions)
print("Test ROC-AUC: %.3f"%(result))

Validation Results
Val ROC-AUC: 0.680

Test Results
Test ROC-AUC: 0.683


Use TD-IDF to compute feature vectors and cosine similiarity to compute if questions are similar.

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer # can reuse wordcount from previous cells
transformer = TfidfTransformer(use_idf=True) # use_idf needs to be set to true for td-idf
