# Quora Insincere Questions Classification
# Part 2: Classifier

## Introduction

In this part, we will create and tune several classifiers, and test different techniques such as weight balancing on the Quora Insincere Questions Classification challenge.
Split and projected train and test data are already ready to use. 

## Simple Model
Let's start with a simple model. No weight balancing, only one or two layers and no regularization. 

In [3]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import src.quora.preproc as pp

from importlib import reload

reload(pp)

<module 'src.quora.preproc' from '/home/marc/work/kaggle/01-quora/src/quora/preproc.py'>

In [12]:
data_dir = "data/embedded"
train_index,train_data, train_targets = pp.load_projections(os.path.join(data_dir,"training_90p_50d_50000.proj"))
print(train_data.shape)

(50000, 50)


In [6]:
def set_perceptron(X,input_dim):
    
    # Simple perceptron
    W = tf.get_variable("W",[1,input_dim],regularizer=tf.contrib.layers.l2_regularizer(0.01),
                        initializer=tf.contrib.layers.xavier_initializer(seed = 1))
    b = tf.get_variable("b",[1,1],regularizer=tf.contrib.layers.l2_regularizer(0.01),
                        initializer=tf.zeros_initializer())
    
    Z = tf.matmul(W,X) + b
    #A = tf.sigmoid(Z,name="sig")
    
    model_dict = {"W":W,"b":b,"Z":Z}
        
    return model_dict

def run_perceptron(train,targets,epochs,epoch_print=500,learning_rate=0.0001):
    
    # Input dim -> vector length (proj_dim,1), minibatch size is determined at runtime... I think
    # We know output is a log regression so 1
    X = tf.placeholder(name="X",dtype=tf.float32,shape=[train.shape[0],None])
    Y = tf.placeholder(name="Y",dtype=tf.float32,shape=[1,None])
    
    model_dict = set_perceptron(X,train.shape[0])
    
    # Weighted loss function
    ratio = 1-np.sum(targets)/targets.shape[1]
    print("Label learning weight ration = {}".format(ratio))
    wp = tf.multiply(Y,tf.constant(ratio,dtype=tf.float32))
    wn = tf.multiply(1-Y,tf.constant(1-ratio,dtype=tf.float32))
    weights = tf.add(wp,wn)
    #cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=model_dict["Z"],labels=Y))
    cost = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(Y,model_dict["Z"],weights=weights))
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
    
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(epochs):
            _ , epoch_cost = sess.run([optimizer, cost], feed_dict={X:train, Y:targets})
            if(epoch % epoch_print == 0 or epoch == epochs - 1):
                print("Epoch: {}, cost={}".format(epoch,epoch_cost))
                correct_prediction = tf.equal(tf.round(tf.sigmoid(model_dict["Z"])),Y)
                accuracy = tf.reduce_mean(tf.cast(correct_prediction,"float"))
                print("Accuracy = {}".format(accuracy.eval({X:train,Y:targets})))
                
        parameters = {"W":model_dict["W"],"b":model_dict["b"]}
        model = sess.run(parameters)
        
        # printing proportion of correctly classified by class
        nb_p = np.sum(targets)
        nb_n = np.sum(1-targets)
        mask_p = targets.astype(bool)
        mask_n = np.logical_not(mask_p)
        predictions = tf.equal(tf.round(tf.sigmoid(model_dict["Z"])),Y).eval({X:train,Y:targets})
        print("positive ratio : {} / {} = {}".format(np.sum(np.logical_and(mask_p,predictions)),
                                                     nb_p,
                                                     np.sum(np.logical_and(mask_p,predictions))/nb_p))
        print("negative ratio : {} / {} = {}".format(np.sum(np.logical_and(mask_n,predictions)),
                                                     nb_n,
                                                     np.sum(np.logical_and(mask_n,predictions))/nb_n))
        
        
        
        
    return model
    


In [13]:
tf.reset_default_graph()
%time little_model = run_perceptron(train_data.T,train_targets.reshape(1,len(train_targets)),5000,epoch_print=1000)

Label learning weight ration = 0.94004
Epoch: 0, cost=0.07986059039831161
Accuracy = 0.12421999871730804
Epoch: 1000, cost=0.05576595664024353
Accuracy = 0.7840999960899353
Epoch: 2000, cost=0.05272633954882622
Accuracy = 0.7897400259971619
Epoch: 3000, cost=0.051876455545425415
Accuracy = 0.7906000018119812
Epoch: 4000, cost=0.05162222310900688
Accuracy = 0.7904999852180481
Epoch: 4999, cost=0.05153188109397888
Accuracy = 0.7910000085830688
positive ratio : 2384 / 2998.0 = 0.7951967978652434
negative ratio : 37166 / 47002.0 = 0.7907323092634356
CPU times: user 1min 47s, sys: 8.17 s, total: 1min 55s
Wall time: 1min 22s


Best results after tunning it for a while is about 80% accuracy for both group. Not great. Let's train on the whole dataset and test on test data.

In [14]:
train_index,train_data, train_targets = pp.load_projections(os.path.join(data_dir,"training_90p_50d.proj"))
test_index,test_data,test_targets = pp.load_projections(os.path.join(data_dir,"test_90p_50d.proj"))
print(train_data.shape)
print(test_data.shape)

(1175509, 50)
(130613, 50)


In [15]:
tf.reset_default_graph()
%time model = run_perceptron(train_data.T,train_targets.reshape(1,len(train_targets)),2000,epoch_print=200)

Label learning weight ration = 0.9381297803759903
Epoch: 0, cost=0.0822649747133255
Accuracy = 0.12650689482688904
Epoch: 200, cost=0.06896921247243881
Accuracy = 0.7215036153793335
Epoch: 400, cost=0.06376713514328003
Accuracy = 0.7654828429222107
Epoch: 600, cost=0.060645777732133865
Accuracy = 0.7777805328369141
Epoch: 800, cost=0.05857668071985245
Accuracy = 0.783149242401123
Epoch: 1000, cost=0.05714581534266472
Accuracy = 0.7861751914024353
Epoch: 1200, cost=0.05612807348370552
Accuracy = 0.7883895635604858
Epoch: 1400, cost=0.05538638308644295
Accuracy = 0.7894461154937744
Epoch: 1600, cost=0.054834283888339996
Accuracy = 0.7907493710517883
Epoch: 1800, cost=0.054416149854660034
Accuracy = 0.7915719747543335
Epoch: 1999, cost=0.05409679189324379
Accuracy = 0.7921623587608337
positive ratio : 57236 / 72729.0 = 0.7869763093126538
negative ratio : 873958 / 1102780.0 = 0.7925043979760242
CPU times: user 20min 27s, sys: 7min 27s, total: 27min 54s
Wall time: 27min 48s


In [51]:
# Save models
little_model_path = "data/little_model/little_model.ckpt"
W = tf.Variable(little_model.get("W"))
b = tf.Variable(little_model.get("b"))
saver = tf.train.Saver({"W":W,"b":b})
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    spath = saver.save(sess,little_model_path)
    print("Little model saved under {}".format(spath))
    
model_path = "data/model/model.ckpt"
W = tf.Variable(model.get("W"))
b = tf.Variable(model.get("b"))
saver = tf.train.Saver({"W":W,"b":b})
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    spath = saver.save(sess,model_path)
    print("Model saved under {}".format(spath))

Little model saved under data/little_model/little_model.ckpt
Model saved under data/model/model.ckpt


In [52]:
# Check saved model
tf.reset_default_graph()
W = tf.get_variable(name="W",shape=little_model.get("W").shape)
b = tf.get_variable(name="b",shape=little_model.get("b").shape)
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess,little_model_path)
    print("W = %s" % W.eval())
    print("b = %s" % b.eval())
    
tf.reset_default_graph()
W = tf.get_variable(name="W",shape=model.get("W").shape)
b = tf.get_variable(name="b",shape=model.get("b").shape)
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess,model_path)
    print("W = %s" % W.eval())
    print("b = %s" % b.eval())

INFO:tensorflow:Restoring parameters from data/little_model/little_model.ckpt
W = [[-1.5350993  -1.0646862   0.93980783 -0.05377308  1.1097102   2.1315331
   0.16534835  0.10330456 -0.9122293  -0.01724127 -1.2123718  -0.23289846
   0.9106384  -0.39506698 -1.0739824  -0.50956106  0.9283245  -0.23896891
   0.1469524   1.4991794  -0.48674285  1.9710633  -0.31396177  0.19180572
  -0.47586817 -0.9097925   0.2967707   1.6963593  -0.35719016 -0.14206137
  -0.6773655   1.2112517  -1.3445208   0.53827786 -1.1259295  -1.5763471
   0.5442923  -1.4968342  -1.970133   -0.63807523 -0.91635275 -0.5067819
   1.0440195   1.0228881   1.0398076  -0.19438055 -2.1268923  -1.2733794
   0.11653239 -1.4713463 ]]
b = [[0.27551776]]
INFO:tensorflow:Restoring parameters from data/model/model.ckpt
W = [[-1.0220009e+00 -7.4180824e-01  4.7776654e-01 -6.3137639e-01
   1.0705887e+00  1.1300802e+00  2.7454108e-01  2.4078566e-01
  -7.3606294e-01 -3.4437899e-02 -7.6920635e-01 -8.6861145e-01
   4.7860834e-01  7.5270870e-

In [53]:
predictions = tf.equal(tf.round(tf.sigmoid(model.get("Z")),Y).eval({X:test_data[:10],Y:test_targets[:10]}))
accuracy = tf.reduce_mean(tf.cast(prediction,"float"))
print("Accuracy = {}".format(accuracy))
nb_p = np.sum(targets)
nb_n = np.sum(1-targets)
mask_p = targets.astype(bool)
mask_n = np.logical_not(mask_p)
print("positive ratio : {} / {} = {}".format(np.sum(np.logical_and(mask_p,predictions)),
                                             nb_p,
                                             np.sum(np.logical_and(mask_p,predictions))/nb_p))
print("negative ratio : {} / {} = {}".format(np.sum(np.logical_and(mask_n,predictions)),
                                             nb_n,
                                             np.sum(np.logical_and(mask_n,predictions))/nb_n))

ValueError: None values not supported.

In [24]:
model.keys()

dict_keys(['W', 'b'])