# Quora Insincere Questions Classification
# Part 2: Classifier

## Introduction

In this part, we will create and tune several classifiers, and test different techniques such as weight balancing on the Quora Insincere Questions Classification challenge.
The train and test sets have already been split and projected, so they are ready to use.

## Simple Model
Let's start with a simple model. No weight balancing, only one or two layers and no regularization. 

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import src.quora.preproc as pp

from importlib import reload

# Re-execute the preprocessing module so that edits made to its source file
# are picked up without restarting the notebook kernel.
reload(pp)

<module 'src.quora.preproc' from '/home/marc/work/kaggle/01-quora/src/quora/preproc.py'>

In [2]:
# Directory holding the pre-computed projection files.
data_dir = "data/embedded"

# Each projection file yields an (index, data, targets) triple.
train_index, train_data, train_targets = pp.load_projections(
    os.path.join(data_dir, "training_90p_50d_50000.proj")
)
test_index, test_data, test_targets = pp.load_projections(
    os.path.join(data_dir, "test_90p_50d.proj")
)

# Sanity check on what was loaded.
print(train_data.shape)
print(test_data.shape)

(50000, 50)
(130613, 50)


In [83]:
def set_perceptron(X,input_dim):
    """Build the graph of a single linear unit, Z = W.X + b.

    No activation is applied here: "Z" holds the raw pre-sigmoid values, so
    the caller is responsible for applying a sigmoid (or a logits-based loss).

    Args:
        X: input tensor that is left-multiplied by W, so its first dimension
            must be `input_dim`.
        input_dim: number of input features; W is created with shape
            [1, input_dim].

    Returns:
        dict with the keys "W" (weight variable), "b" (bias variable of shape
        [1, 1]) and "Z" (the linear output tensor).
    """
    # Both variables get an L2 regularizer with scale 0.01 attached; the
    # weights use a seeded Xavier initializer, the bias starts at zero.
    weights = tf.get_variable(
        "W",
        [1, input_dim],
        regularizer=tf.contrib.layers.l2_regularizer(0.01),
        initializer=tf.contrib.layers.xavier_initializer(seed=1),
    )
    bias = tf.get_variable(
        "b",
        [1, 1],
        regularizer=tf.contrib.layers.l2_regularizer(0.01),
        initializer=tf.zeros_initializer(),
    )

    linear_output = tf.add(tf.matmul(weights, X), bias)

    return {"W": weights, "b": bias, "Z": linear_output}

def run_perceptron(train,targets,epochs,epoch_print=500,learning_rate=0.001):
    """Train the perceptron with a class-weighted loss and print its accuracy.

    The complete `train` / `targets` arrays are fed at every optimizer step.
    Progress (cost and accuracy on the same data) is printed every
    `epoch_print` steps and on the last step; after training, the number of
    correctly classified examples is printed separately for each class.

    Args:
        train: feature array fed to the X placeholder, whose shape is
            [train.shape[0], None]; the first dimension is the input size.
        targets: label array fed to the Y placeholder of shape [1, None].
            The weighting below assumes the labels are 0/1.
        epochs: number of optimizer steps to run.
        epoch_print: period, in steps, of the progress printout.
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        dict with the keys "W" and "b" holding the trained variable values as
        returned by `sess.run`.
    """
    # The first dimension is the feature count; the second is left as None so
    # the number of examples is only fixed when the placeholders are fed.
    # The output is a single logistic unit, hence the leading 1 in Y.
    input_dim = train.shape[0]
    X = tf.placeholder(name="X",dtype=tf.float32,shape=[input_dim,None])
    Y = tf.placeholder(name="Y",dtype=tf.float32,shape=[1,None])

    model_dict = set_perceptron(X,input_dim)
    logits = model_dict["Z"]

    # Weighted loss function: with 0/1 labels, `ratio` is the fraction of
    # examples that are not positive. Positives are weighted by `ratio` and
    # negatives by `1 - ratio`, so the rarer class counts more per example.
    ratio = 1-np.sum(targets)/targets.shape[1]
    print("Label learning weight ration = {}".format(ratio))
    wp = tf.multiply(Y,tf.constant(ratio,dtype=tf.float32))
    wn = tf.multiply(1-Y,tf.constant(1-ratio,dtype=tf.float32))
    weights = tf.add(wp,wn)
    # tf.losses.sigmoid_cross_entropy already reduces to a scalar with its
    # default reduction, so no extra tf.reduce_mean is needed around it.
    cost = tf.losses.sigmoid_cross_entropy(Y,logits,weights=weights)
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

    # Build the evaluation ops once, before the session starts. Creating them
    # inside the training loop would add new nodes to the graph at every
    # progress printout.
    correct_prediction = tf.equal(tf.round(tf.sigmoid(logits)),Y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction,"float"))

    feed = {X:train, Y:targets}
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(epochs):
            _ , epoch_cost = sess.run([optimizer, cost], feed_dict=feed)
            if epoch % epoch_print == 0 or epoch == epochs - 1:
                print("Epoch: {}, cost={}".format(epoch,epoch_cost))
                print("Accuracy = {}".format(sess.run(accuracy, feed_dict=feed)))

        # Fetch the trained values while the session is still open.
        model = sess.run({"W":model_dict["W"],"b":model_dict["b"]})

        # printing proportion of correctly classified by class
        nb_p = np.sum(targets)
        nb_n = np.sum(1-targets)
        mask_p = targets.astype(bool)
        mask_n = np.logical_not(mask_p)
        # Boolean array: True where the rounded sigmoid output equals the label.
        correct = sess.run(correct_prediction, feed_dict=feed)
        hits_p = np.sum(np.logical_and(mask_p,correct))
        hits_n = np.sum(np.logical_and(mask_n,correct))
        print("positive ratio : {} / {} = {}".format(hits_p, nb_p, hits_p/nb_p))
        print("negative ratio : {} / {} = {}".format(hits_n, nb_n, hits_n/nb_n))

    return model
    


In [85]:
# Clear the default graph so the model is built from scratch each time this
# cell is executed.
tf.reset_default_graph()

# run_perceptron takes one example per column and the targets as a single row.
model = run_perceptron(
    train=train_data.T,
    targets=train_targets.reshape(1, len(train_targets)),
    epochs=5000,
    epoch_print=1000,
)

Label learning weight ration = 0.94004
Epoch: 0, cost=0.07986059039831161
Accuracy = 0.12421999871730804
Epoch: 1000, cost=0.05576595664024353
Accuracy = 0.7840999960899353
Epoch: 2000, cost=0.05272633954882622
Accuracy = 0.7897400259971619
Epoch: 3000, cost=0.051876455545425415
Accuracy = 0.7906000018119812
Epoch: 4000, cost=0.05162222310900688
Accuracy = 0.7904999852180481
Epoch: 4999, cost=0.05153188109397888
Accuracy = 0.7910000085830688
positive ratio : 2384 / 2998.0 = 0.7951967978652434
negative ratio : 37166 / 47002.0 = 0.7907323092634356


The best result after tuning it for a while is about 80% accuracy for both classes. Not great.
The next step is to create a deeper model with an Estimator or Keras and see if we can do better.

## Deeper Model