This notebook is to train and store models to disc.\
This Notebook has to be clean (do not define functions here, do them in an
external utils.py and import them).\
This notebook has to be reproducible (if you run it twice, the same output has to
be displayed and stored to disk).

In [24]:
# import libraries and set randome seed for reproducibility
import pandas as pd
import numpy as np
import sklearn
import scipy
from sklearn import *
import os
import pickle # to save model
from utils import *
RANDOM_SEED = 123 # taken from task description
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

Divide processed train set into train and validation according to task description split. Using `random_seed = 123`.

In [2]:
# use this to train and VALIDATE your solution
data = pd.read_csv("./quora_train_data.csv")

# this split is to create same splits across teams
A_df, test_df, y_A, y_test = train_test_split(data, data["is_duplicate"].values, test_size=0.05, random_state=RANDOM_SEED)
train_df, va_df, y_train, y_val = train_test_split(A_df,y_A, test_size=0.05, random_state=RANDOM_SEED)

print('tr_df.shape=',train_df.shape) # tr_df.shape= (307260, 156550)
print('va_df.shape=',va_df.shape) # va_df.shape= (16172, 156550)
print('te_df.shape=',test_df.shape) # te_df.shape= (80858, 156550)

tr_df.shape= (291897, 6)
va_df.shape= (15363, 6)
te_df.shape= (16172, 6)


We need to convert our questions to strings in order to work with CountVectorizer.

In [3]:
# extract questions (documents) and cast to strings
q1_train =  cast_list_as_strings(list(train_df["question1"]))
q2_train =  cast_list_as_strings(list(train_df["question2"]))
all_questions = q1_train + q2_train

# fit on train set
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
count_vectorizer.fit(all_questions)

X_tr_q1q2 = get_features_from_df(train_df, count_vectorizer)
X_va_q1q2  = get_features_from_df(va_df, count_vectorizer)

X_tr_q1q2.shape, train_df.shape, X_va_q1q2.shape, va_df.shape

((291897, 149650), (291897, 6), (15363, 149650), (15363, 6))

Train model on count vectorized matrix of question1 and question2 using train set.

In [4]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear",random_state=RANDOM_SEED)
logistic.fit(X_tr_q1q2, y_train)

LogisticRegression(random_state=123, solver='liblinear')

In [5]:
# save the model to disk
with open("logreg.sav", 'wb') as fout:
    pickle.dump((count_vectorizer, logistic), fout)

In [6]:
# Validation
print("Validation Results")
va_df_prep = get_features_from_df(va_df,count_vectorizer)
predictions = logistic.predict(va_df_prep)
result = roc_auc_score(y_val, predictions)
print("Val ROC-AUC: %.3f"%(result))
      
# Test   
print("\nTest Results")
te_df_prep = get_features_from_df(test_df,count_vectorizer)
predictions = logistic.predict(te_df_prep)
result = roc_auc_score(test_df["is_duplicate"].values, predictions)
print("Test ROC-AUC: %.3f"%(result))

Validation Results
Val ROC-AUC: 0.720

Test Results
Test ROC-AUC: 0.729


### Improved version using cosine similiarity and preprocessing
Our manually written preprocess function is very inefficient and takes quite long to run, instead we use CountVectorizer with similar hyperparameters.

In [7]:
from utils import preprocess # own preprocess function takes to long

q1_train =  train_df["question1"].fillna(' ')
q2_train =  train_df["question2"].fillna(' ')
q1_val =  va_df["question1"]
q2_val =  va_df["question2"]
q1_test =  test_df["question1"]
q2_test =  test_df["question2"]
all_questions = q1_train + q2_train

(291897,)

In [8]:
#count_vectorizer = CountVectorizer(tokenizer=tokenize) # with stemming
# sparse_matrix = count_vectorizer.fit(all_questions) # vocabulary
q1 = count_vectorizer.transform(q1_train)
q2 = count_vectorizer.transform(q2_train)
# result = cosine_similarity(q1,q2).diagonal() # takes to much memory for big matrices

In [9]:
# import required libraries
import numpy as np
from numpy.linalg import norm
from sklearn.preprocessing import normalize

result = []
for i in range(q1.shape[0]): # for loop to avoid running out of memory
    result.append(cosine_similarity(q1[i],q2[i]))

In [11]:
result = [item for sub_list in result for item in sub_list]
result = [item for sub_list in result for item in sub_list] # needs to be done 2 times

In [None]:
result = np.asarray(result)

In [16]:
# save the dataframe as a csv file
pd.DataFrame(result).to_csv("train_cosine_similiarity.csv",index=False)

In [19]:
# Train Set Performance
print("Train Results")
predictions = np.where(np.asarray(result) > 0.5,1,0)
# result = roc_auc_score(y_train[:SUBSET], predictions)
result = roc_auc_score(y_train, predictions)
print("Train ROC-AUC: %.3f"%(result))

Train Results
Train ROC-AUC: 0.672


In [23]:
# Validation
print("Validation Results")
q1 = count_vectorizer.transform(q1_val)
q2 = count_vectorizer.transform(q2_val)
result = get_cosine_sim(q1,q2)
pd.DataFrame(result).to_csv("val_cosine_similiarity.csv",index=False)
predictions = np.where(result > 0.5,1,0)
result = roc_auc_score(y_val, predictions)
print("Val ROC-AUC: %.3f"%(result))
      
# Test   
print("\nTest Results")
q1 = count_vectorizer.transform(q1_test)
q2 = count_vectorizer.transform(q2_test)
result = get_cosine_sim(q1,q2)
pd.DataFrame(result).to_csv("test_cosine_similiarity.csv",index=False)
predictions = np.where(result > 0.5,1,0)
result = roc_auc_score(test_df["is_duplicate"].values, predictions)
print("Test ROC-AUC: %.3f"%(result))

Validation Results
Val ROC-AUC: 0.669

Test Results
Test ROC-AUC: 0.668


Use TD-IDF to compute feature vectors and cosine similiarity to compute if questions are similar.

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer # can reuse wordcount from previous cells
tfidf_transformer = TfidfTransformer(use_idf=True) # use_idf needs to be set to true for td-idf
word_count = count_vectorizer.transform(all_questions)
tfidf = tfidf_transformer.fit_transform(word_count)

In [26]:
df = pd.DataFrame(tfidf[0].T.todense(), index=count_vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

                TF-IDF
backend       0.550495
java          0.379052
or            0.297714
gui           0.293719
amongst       0.289712
startups      0.222999
develop       0.201415
popular       0.192509
development   0.189478
software      0.173458
language      0.165008
most          0.127981
will          0.113445
have          0.102124
which         0.101078
for           0.078056
and           0.072000
do            0.064819
how           0.058484
is            0.052148
the           0.048485
peacemaker    0.000000
peacekeeping  0.000000
peach         0.000000
peacekeepers  0.000000


In [None]:
from sklearn.metrics.pairwise import linear_kernel
q1_mat = tfidf_transformer.transform(count_vectorizer.transform(q1_train))
q2_mat = tfidf_transformer.transform(count_vectorizer.transform(q2_train))

In [28]:
train_tfidf_cos = get_cosine_sim(q1_mat, q2_mat)

In [29]:
# save the dataframe as a csv file
pd.DataFrame(train_tfidf_cos).to_csv("train_tfidf_cos.csv",index=False)

In [30]:
# Train Set Performance
print("Train Results")
predictions = np.where(train_tfidf_cos > 0.5,1,0)
result = roc_auc_score(y_train, predictions)
print("Train ROC-AUC: %.3f"%(result))

Train Results
Train ROC-AUC: 0.675


In [35]:
# Validation
print("Validation Results")
q1 = tfidf_transformer.transform(count_vectorizer.transform(q1_val))
q2 = tfidf_transformer.transform(count_vectorizer.transform(q2_val))
val_result = get_cosine_sim(q1, q2)
pd.DataFrame(val_result).to_csv("val_tfidf_cos.csv",index=False)
predictions = np.where(val_result > 0.5,1,0)
result = roc_auc_score(y_val, predictions)
print("Val ROC-AUC: %.3f"%(result))

Validation Results
Val ROC-AUC: 0.666


In [36]:
# Test   
print("\nTest Results")
q1 = tfidf_transformer.transform(count_vectorizer.transform(q1_test))
q2 = tfidf_transformer.transform(count_vectorizer.transform(q2_test))
test_result = cosine_similarity(q1,q2).diagonal()
pd.DataFrame(test_result).to_csv("test_tfidf_cos.csv",index=False)
predictions = np.where(test_result > 0.5,1,0)
result = roc_auc_score(test_df["is_duplicate"], predictions)
print("Test ROC-AUC: %.3f"%(result))


Test Results
Test ROC-AUC: 0.669
