This notebook is to train and store models to disc.\
This Notebook has to be clean (do not define functions here, do them in an
external utils.py and import them).\
This notebook has to be reproducible (if you run it twice, the same output has to
be displayed and stored to disk).

In [2]:
# import libraries and set randome seed for reproducibility
import pandas as pd
import numpy as np
import sklearn
import scipy
from sklearn import *
import os
import pickle # to save model
from utils import *
RANDOM_SEED = 123 # taken from task description

In [3]:
# use this to train and VALIDATE your solution
train_df = pd.read_csv("./quora_train_data.csv")

# use this to provide the expected generalization results
test_df = pd.read_csv("./quora_test_data.csv")

We need to convert our questions to strings in order to work with CountVectorizer.

In [5]:
# extract questions (documents) and cast to strings
q1_train =  cast_list_as_strings(list(train_df["question1"]))
q2_train =  cast_list_as_strings(list(train_df["question2"]))
all_questions = q1_train + q2_train

# fit on train set
count_vectorizer_v1 = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer_v1.fit(all_questions)

X_tr_q1q2 = get_features_from_df(train_df,count_vectorizer_v1)
X_te_q1q2  = get_features_from_df(test_df, count_vectorizer_v1)

X_tr_q1q2.shape, train_df.shape, X_te_q1q2.shape, test_df.shape

((323432, 156550), (323432, 6), (80858, 156550), (80858, 6))

Divide processed train set into train and validation according to task description split. Using `random_seed = 123`.

In [6]:
# divide into train and validation set
tr_df, va_df, y_train, y_val = sklearn.model_selection.train_test_split(X_tr_q1q2, train_df["is_duplicate"].values, test_size=0.05, random_state=RANDOM_SEED)
print('tr_df.shape=',tr_df.shape)
print('va_df.shape=',va_df.shape)
print('te_df.shape=',X_te_q1q2.shape)
print('y_train.shape=',y_train.shape)
print('y_val.shape=',y_val.shape)

tr_df.shape= (307260, 156550)
va_df.shape= (16172, 156550)
te_df.shape= (80858, 156550)
y_train.shape= (307260,)
y_val.shape= (16172,)


Train model on count vectorized matrix of question1 and question2 using train set.

In [7]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear",random_state=RANDOM_SEED)
logistic.fit(tr_df, y_train)

LogisticRegression(random_state=123, solver='liblinear')

In [8]:
# save the model to disk
save_model("logreg",logistic)

In [9]:
from sklearn.metrics import roc_auc_score
# Validation
print("Validation Results")
predictions = logistic.predict(va_df)
result = roc_auc_score(y_val, predictions)
print("Val ROC-AUC: %.3f"%(result))
      
# Test   
print("\nTest Results")
predictions = logistic.predict(X_te_q1q2)
result = roc_auc_score(test_df["is_duplicate"].values, predictions)
print("Test ROC-AUC: %.3f"%(result))

Validation Results
Val ROC-AUC: 0.727

Test Results
Test ROC-AUC: 0.724


In [48]:
import nltk
# nltk.download('punkt') # download if not exist
# nltk.download('stopwords') # download if not exist
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def preprocess(sentence):
    """
    Function to preprocess sentences in question1 and question2 column. First words are lower cased,
    then punctuation and stop words (using nltk library) are removed. Then we stem the words
    to remove pre- or postfixes.
    """
    text = sentence.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    print(text)
    text = text.lower() # lower case
    print(text)
    text_tokens = word_tokenize(text) # tokenizing words
    print(text_tokens)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()] # remove stop words
    
    ps = PorterStemmer()
    tokens_stem = [ps.stem(word) for word in tokens_without_sw]
    return tokens_stem

why do i get easily bored with everything?


In [47]:
for i in range(5):
    print(preprocess(train_df["question1"][i]))

Why do I get easily bored with everything
why do i get easily bored with everything
['why', 'do', 'i', 'get', 'easily', 'bored', 'with', 'everything']
['get', 'easili', 'bore', 'everyth']
How do I study for Honeywell company recruitment
how do i study for honeywell company recruitment
['how', 'do', 'i', 'study', 'for', 'honeywell', 'company', 'recruitment']
['studi', 'honeywel', 'compani', 'recruit']
Which search engine algorithm is Quora using
which search engine algorithm is quora using
['which', 'search', 'engine', 'algorithm', 'is', 'quora', 'using']
['search', 'engin', 'algorithm', 'quora', 'use']
How can I smartly cut myself
how can i smartly cut myself
['how', 'can', 'i', 'smartly', 'cut', 'myself']
['smartli', 'cut']
How do I see who is viewing my Instagram videos
how do i see who is viewing my instagram videos
['how', 'do', 'i', 'see', 'who', 'is', 'viewing', 'my', 'instagram', 'videos']
['see', 'view', 'instagram', 'video']
