# Quora Question Pair Similarity
### Kaggle Competition link: https://www.kaggle.com/c/quora-question-pairs

<p>We have built features to train the model on. Here we will load data with all our 627 features. We will first build a random or simple (Naive Bayes) base model and then will try out different machine learning algorithms and compare against our base model. After that, we will choose the best one and tune it to generalize it on future data.</p> 
<p> The metrics we will evaluate the models on are:<br>
* log-loss <br>
* Binary Confusion Matrix <br> 
</p>

Our strategy is:
1. Load the data
2. Split data into train test (70:30)
3. Normalize data
4. <b>Build random model:</b> A model that randomly assigns probabilities.
5. Apply models with default parameters:<br>
   i. <b>Build Logistic Regression:</b> A statistical model that uses a logistic function to model the probability of a binary response based on one or more predictor variables.<br>
   ii. <b>Build Naive Bayes:</b> A probabilistic algorithm based on Bayes' theorem that assumes the independence of the features in the input data<br>
   iii. <b>Build Support Vector Machines:</b> Works by finding the best hyperplane that separates different classes of data points<br>
   iv. <b>Build Gradient Boosting:</b> A powerful ensemble method that combines multiple weak models to create a strong classifier<br>
   v: <b>XG Boost:</b>



In [2]:
# Imports

# General
from datetime import datetime 
import pickle

# Data 
import pandas as pd
import numpy as np 
import sqlite3
from sqlalchemy import create_engine
from collections import Counter
from sklearn.model_selection import train_test_split

# Vectorization
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler 
from sklearn.impute import SimpleImputer

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# CV
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix


#### 1. Load data from SQLite

In [3]:
start = datetime.now()
try:
    conn = sqlite3.connect("train.db")
    data = pd.read_sql_query("SELECT * FROM train_data ORDER BY RANDOM() LIMIT 100000", conn)
    conn.commit()
    conn.close()
    print("Data loaded!\nTime taken: {0}".format(datetime.now()-start))
except Exception as e:
    print(e)

Data loaded!
Time taken: 0:02:47.199440


In [4]:
print("Shape of data: {0}".format(data.shape))

Shape of data: (100000, 634)


In [5]:
# Remove unnecessary columns
data = data.iloc[:,6:]
print("Shape of data after removing unnecessary columns: {0}".format(data.shape))

Shape of data after removing unnecessary columns: (100000, 628)


In [6]:
data.describe()

Unnamed: 0,is_duplicate,q1_frequency,q2_frequency,q1_length,q2_length,q1_tokens_count,q2_tokens_count,q1_words_count,q2_words_count,q1_nonstopwords_count,...,q2_feat_291,q2_feat_292,q2_feat_293,q2_feat_294,q2_feat_295,q2_feat_296,q2_feat_297,q2_feat_298,q2_feat_299,q2_feat_300
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.37056,2.84368,3.06252,59.5165,60.03607,12.43623,12.67911,10.94449,11.172,5.64594,...,46.830614,-27.859523,12.504153,-11.290065,-43.511224,-3.299467,26.084432,-20.810463,-66.57113,36.151218
std,0.482957,4.511923,6.084101,29.852672,33.841389,6.0597,7.071772,5.410749,6.300228,3.065338,...,60.902564,50.678152,69.079162,60.120607,56.651796,59.411909,57.067377,68.73701,67.284168,58.686727
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,-1181.601844,-869.310289,-1082.525471,-1091.648775,-773.917976,-768.730118,-402.046199,-864.229748,-2196.106435,-364.145416
25%,0.0,1.0,1.0,39.0,39.0,9.0,8.0,7.0,7.0,4.0,...,10.555332,-54.93907,-21.404714,-42.356623,-69.761781,-36.626309,-6.141048,-57.998502,-96.638794,0.665372
50%,0.0,1.0,1.0,52.0,51.0,11.0,11.0,10.0,10.0,5.0,...,41.185087,-26.160044,14.176917,-10.020232,-35.516233,-5.113182,22.14294,-18.186085,-57.04795,28.246542
75%,1.0,3.0,2.0,72.0,72.0,15.0,15.0,13.0,13.0,7.0,...,77.560943,-0.450381,49.923865,21.925329,-8.349561,27.59417,54.677847,18.421707,-24.613595,62.833737
max,1.0,50.0,120.0,354.0,1169.0,100.0,272.0,71.0,237.0,37.0,...,1487.891279,774.577598,779.584843,852.355989,569.228259,907.38918,661.97204,574.29996,371.700851,1080.074251


#### 2. Split data into train test (70:30)

In [7]:
# Split data into X & y first
X = data.drop('is_duplicate', axis=1)
y = data['is_duplicate']

print("Shape of X: {0}".format(X.shape))
print("Shape of y: {0}".format(y.shape))

Shape of X: (100000, 627)
Shape of y: (100000,)


In [8]:
# Split into train & test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y)
print("Shape of X_train: {0}".format(X_train.shape))
print("Shape of X_test: {0}".format(X_test.shape))
print("Shape of y_train: {0}".format(y_train.shape))
print("Shape of y_test: {0}".format(y_test.shape))

Shape of X_train: (70000, 627)
Shape of X_test: (30000, 627)
Shape of y_train: (70000,)
Shape of y_test: (30000,)


In [9]:
print("Distribution of target variable in train")
train_counter = Counter(y_train)
train_len = len(y_train)
print("Class 0: {0} % \nClass 1: {1} %".format((train_counter[0]/train_len)*100, (train_counter[1]/train_len)*100))


print("\nDistribution of target variable in test")
test_counter = Counter(y_test)
test_len = len(y_test)
print("Class 0: {0} % \nClass 1: {1} %".format((test_counter[0]/test_len)*100, (test_counter[1]/test_len)*100))


Distribution of target variable in train
Class 0: 62.94428571428572 % 
Class 1: 37.05571428571429 %

Distribution of target variable in test
Class 0: 62.94333333333333 % 
Class 1: 37.056666666666665 %


In [10]:
# Replace NaN with 0
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)


#### 3. Normalize data
Before we proceed to build the models, lets normalize all the features first

In [11]:
numerical_features = list(X_train.columns)

In [12]:
numerical_pipeline = Pipeline(steps=[("normalizer", MinMaxScaler()), (("imputer", SimpleImputer(strategy="most_frequent")))])

vectorizer = ColumnTransformer([("num_pipeline", numerical_pipeline, numerical_features)])

start = datetime.now()
print("Vectorizing X_train")
X_train = vectorizer.fit_transform(X_train)
print("Normalization of X_train is completed.\n\nTime taken: {0}".format(datetime.now()-start))

start = datetime.now()
print("\nNormalizing X_test")
X_test = vectorizer.transform(X_test)
print("Normalization of X_test is completed.\n\nTime taken: {0}".format(datetime.now()-start))


Vectorizing X_train
Normalization of X_train is completed.

Time taken: 0:00:05.002509

Normalizing X_test
Normalization of X_test is completed.

Time taken: 0:00:00.436034


In [13]:
# Lets save our vectorizer to .pkl file
vectorizer_file = "../models/vectorizer.pkl"
with open(vectorizer_file, 'wb') as f:
    pickle.dump(vectorizer, f)
print('Dumped the vectorizer in {} file'.format(vectorizer_file))

Dumped the vectorizer in ../models/vectorizer.pkl file


#### 4. Build random model
Here we will randomly assign a class based on random probability to each test data point and measure its log loss.<br>
A strategy we will follow for this is:
1. Generatea list of 2 random numbers for each test row
2. Divide each random number by its sum so we get their sum as 1
3. Take the index of maximum of the 2 numbers in the list
4. This index will be the class of given test row 

In [14]:
y_pred_prob = np.zeros((test_len,2))
for i in range(test_len):
    random_probs = np.random.rand(1,2)
    y_pred_prob[i] = ((random_probs/sum(sum(random_probs)))[0])

print("Test log-loss of random model: {0}".format(log_loss(y_test, y_pred_prob, eps=1e-15)))

y_pred = np.argmax(y_pred_prob, axis=1)

print("\nTest accuracy score of random model: {0}".format(accuracy_score(y_test, y_pred)))

print("\nTest confusion matrix of random model: \n{0}".format(confusion_matrix(y_test, y_pred)))

print("\nTest confusion matrix of random model (%): \n{0}".format(np.round(confusion_matrix(y_test, y_pred)/len(y_test)*100,2)))

    

Test log-loss of random model: 0.89532403134099

Test accuracy score of random model: 0.4961

Test confusion matrix of random model: 
[[9373 9554]
 [5563 5510]]

Test confusion matrix of random model (%): 
[[31.24 31.85]
 [18.54 18.37]]


We wil ltake this as benchmark to compare our future models

#### 5. Apply ML models

In [15]:
result = []
for classifier in [LogisticRegression(solver='lbfgs', max_iter=3000), BernoulliNB(), SVC(), GradientBoostingClassifier() ]:
    
    # Training
    start = datetime.now()
    clf_str = str(classifier).split("(")[0]
    print("{0} started.".format(clf_str))
    classifier.fit(X_train, y_train)
    print("{0} training completed. Time taken: {1}\n".format(clf_str, datetime.now()-start))
    
    # Prediction
    y_pred = classifier.predict(X_test)
    
    # Evaluation
    lg_loss = log_loss(y_test,y_pred)
    acc = accuracy_score(y_test,y_pred)
    cm = np.round((confusion_matrix(y_test,y_pred)/len(X_test)*100),2)
    
    # Add to result
    temp = list()
    temp.append(clf_str)
    temp.append("Default")
    temp.append(lg_loss)
    temp.append(acc)
    temp.append(cm)
    temp.append(datetime.now()-start)
    result.append(temp)



LogisticRegression started.
LogisticRegression training completed. Time taken: 0:00:47.715623

BernoulliNB started.
BernoulliNB training completed. Time taken: 0:00:00.554632

SVC started.
SVC training completed. Time taken: 0:22:43.016640

GradientBoostingClassifier started.
GradientBoostingClassifier training completed. Time taken: 0:51:01.605331



Now we will train XGBoost using sklearn API (not Learning API). 

In [33]:
# Training
start = datetime.now()
xgbst = XGBClassifier()

print("XGBoost started.")
xgbst.fit(X_train, y_train)
print("XGBoost training completed. Time taken: {0}\n".format(datetime.now()-start))

# Prediction
y_pred = xgbst.predict(X_test)

# Evaluation
lg_loss = log_loss(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
cm = np.round((confusion_matrix(y_test,y_pred)/len(X_test)*100),2)

# Add to result
temp = list()
temp.append("XGBoost")
temp.append("Default")
temp.append(lg_loss)
temp.append(acc)
temp.append(cm)
temp.append(datetime.now()-start)
result.append(temp)

XGBoost started.
XGBoost training completed. Time taken: 0:06:35.676606



In [34]:
pd.DataFrame(result, columns=['Algorithm', 'Hyperparameters', 'Log-loss', 'Accuracy', 'Confusion Matrix (TP,FP,FN,TN)', 'Time taken'])

Unnamed: 0,Algorithm,Hyperparameters,Log-loss,Accuracy,"Confusion Matrix (TP,FP,FN,TN)",Time taken
0,LogisticRegression,Default,8.010101,0.777767,"[[53.57, 9.52], [12.7, 24.21]]",0 days 00:00:47.783151
1,BernoulliNB,Default,12.445874,0.6547,"[[38.9, 24.19], [10.34, 26.57]]",0 days 00:00:00.805230
2,SVC,Default,7.282019,0.797967,"[[53.89, 9.2], [11.0, 25.91]]",0 days 00:34:24.827754
3,GradientBoostingClassifier,Default,6.320855,0.824633,"[[56.07, 7.02], [10.51, 26.4]]",0 days 00:51:01.899364
4,XGBoost,Default,5.989254,0.833833,"[[55.71, 7.38], [9.23, 27.68]]",0 days 00:06:35.918235


As per above, XGBoostClassifier has given us the best result with minimal training time. We will hypertune this to see how we can improve this. 

In [44]:
pd.DataFrame(zip(numerical_features, xgbst.feature_importances_), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)[:10]

Unnamed: 0,Feature,Importance
15,common_nonstopwords_share,0.075802
0,q1_frequency,0.022391
1,q2_frequency,0.020055
24,common_words_count_max,0.013781
18,fuzz_token_sort_ratio,0.012756
21,common_tokens_count_min,0.007614
14,common_nonstopwords_count,0.007455
10,common_tokens_count,0.007186
25,common_nonstopwords_count_min,0.007142
19,fuzz_token_set_ratio,0.007078


In [45]:
print(xgbst)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)


In [13]:
start = datetime.now()
print("GridSearchCV started.")
#     'gamma' : [i/10.0 for i in range(0,4)],
parameters = {
    'n_estimators': (100,200),
    'max_depth': [i for i in range(3,9,2)],
    'reg_alpha' : [1e-2, 0.1, 1],
    'learning_rate': [1e-2, 0.1, 0.25]
}

bst = XGBClassifier()
clf = GridSearchCV(bst, parameters, return_train_score=True, scoring="neg_log_loss", cv=5)
clf.fit(X_train, y_train)
print("GridSearchCV completed. Time taken: {0}".format(datetime.now()-start))


GridSearchCV started.
GridSearchCV completed. Time taken: 1 day, 2:30:28.404633


In [19]:
cv_result = pd.DataFrame.from_dict(clf.cv_results_)
cv_result.to_csv("../models/cv_result.csv")

In [20]:
clf.best_estimator_

In [21]:
clf.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'reg_alpha': 1}

In [18]:
cv_result.sort_values(by='mean_test_score', ascending=False)[:3]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_reg_alpha,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
35,625.874169,15.887039,0.094382,0.014431,0.1,7,200,1.0,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",-0.326015,...,-0.329985,0.006394,1,-0.148817,-0.145842,-0.148142,-0.148319,-0.152427,-0.148709,0.002123
33,627.068382,34.131511,0.099374,0.019783,0.1,7,200,0.01,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",-0.328543,...,-0.330771,0.004665,2,-0.152286,-0.149122,-0.147583,-0.149894,-0.150018,-0.14978,0.001524
34,657.869789,20.192025,0.096362,0.009753,0.1,7,200,0.1,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",-0.329161,...,-0.331602,0.005049,3,-0.147848,-0.144877,-0.149461,-0.151994,-0.149598,-0.148755,0.002348


In [27]:
file = open('gridsearchcv.pkl','wb')
pickle.dump(clf, file)
print('Dumped: {}'.format(clf))
file.close()

Dumped: GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
           

In [23]:
# Training
start = datetime.now()
xgbst = XGBClassifier(learning_rate= 0.1, max_depth= 7, n_estimators= 200, reg_alpha= 1)

print("XGBoost started with parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'reg_alpha': 1}")
xgbst.fit(X_train, y_train)
print("XGBoost training completed. Time taken: {0}\n".format(datetime.now()-start))

# Prediction
y_pred = xgbst.predict(X_test)

# Evaluation
lg_loss = log_loss(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)
cm = np.round((confusion_matrix(y_test,y_pred)/len(X_test)*100),2)

# Add to result
temp = list()
temp.append("XGBoost")
temp.append("'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'reg_alpha': 1")
temp.append(lg_loss)
temp.append(acc)
temp.append(cm)
temp.append(datetime.now()-start)
result.append(temp)

XGBoost started with parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'reg_alpha': 1}
XGBoost training completed. Time taken: 0:14:57.686727



NameError: name 'result' is not defined

In [25]:
temp

['XGBoost',
 "'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'reg_alpha': 1",
 5.871511137087184,
 0.8371,
 array([[56.21,  6.73],
        [ 9.56, 27.5 ]]),
 datetime.timedelta(seconds=897, microseconds=960767)]

So we brought log_loss down to 5.8715111 from 5.989254 after hyperparameters tuning. Lets save this model.

In [28]:
file = open('xgboost_tuned.pkl','wb')
pickle.dump(xgbst, file)
print('Dumped: {}'.format(xgbst))

file.close()

Dumped: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
