In [1]:
import pandas as pd
from sklearn import preprocessing

import tensorflow as tf
import numpy as np

In [56]:
adult_data = pd.read_csv("adult.data", sep=',', 
                         header=None, names=['age','workclass','fnlwgt','education','education-num', 'marital-status', 'occupation',
                                             'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                                             'native-country', 'labels'])
adult_test = pd.read_csv("adult.test", sep=',', 
                         header=None, names=['age','workclass','fnlwgt','education','education-num', 'marital-status', 'occupation',
                                             'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                                             'native-country', 'labels'])
num_classes = 2
adult_test = adult_test.drop([0])
adult_test = adult_test.reset_index(drop=True)
adult_test[['age']] = adult_test[['age']].astype('int64')
adult_test[['fnlwgt']] = adult_test[['fnlwgt']].astype('int64')
adult_test[['education-num']] = adult_test[['education-num']].astype('int64')
adult_test[['capital-gain']] = adult_test[['capital-gain']].astype('int64')
adult_test[['capital-loss']] = adult_test[['capital-loss']].astype('int64')
adult_test[['hours-per-week']] = adult_test[['hours-per-week']].astype('int64')
adult_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,labels
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [57]:
train_obj = adult_data.select_dtypes(include=[object])
train_val = adult_data.select_dtypes(include=['int64'])
train_val = np.array(train_val)

test_obj = adult_test.select_dtypes(include=[object])
test_val = adult_test.select_dtypes(include=['int64'])
print(test_val.columns)
test_val = np.array(test_val)

test_obj = test_obj.dropna()
test_obj.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'labels'],
      dtype='object')

In [58]:
train_obj.shape

(32561, 9)

In [59]:
le_features = preprocessing.LabelEncoder()
train_obj = train_obj.apply(le_features.fit_transform)
test_obj = test_obj.apply(le_features.fit_transform)
list(le_features.classes_)

[' <=50K.', ' >50K.']

In [60]:
enc_labels = preprocessing.OneHotEncoder()

enc_labels.fit(train_obj)
train_onehotlabels = enc_labels.transform(train_obj).toarray()

test_onehotlabels = enc_labels.transform(test_obj).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [61]:
train_labels_onehotlabels = train_onehotlabels[:, -2:]
train_features_onehotlabels = train_onehotlabels[:, :-2]

train_fea = np.hstack([train_features_onehotlabels, train_val])
train_lab = train_labels_onehotlabels

test_labels_onehotlabels = test_onehotlabels[:, -2:]
test_features_onehotlabels = test_onehotlabels[:, :-2]
print(train_features_onehotlabels.shape)
print(test_features_onehotlabels.shape)
test_fea = np.hstack([test_features_onehotlabels, test_val])
test_lab = test_labels_onehotlabels

print(train_fea.shape)
print(test_fea.shape)

(32561, 102)
(16281, 102)
(32561, 108)
(16281, 108)


In [68]:
tf.reset_default_graph()

#NN parameters
learning_rate = 0.01
num_steps = 1000
batch_size = 100
display_steps = 20
beta = 0

num_input = 108
num_hidden1 = 500
num_hidden2 = 500
num_hidden3 = 500
num_classes = 2
dropout = 0.8

In [69]:
class dataset():
    def __init__(self, features, labels, batch_size):
        self.fea_batches = []
        self.lab_batches = []
        self.batch_size = batch_size
        self.size = len(features)
        i = 0
        while(i < self.size - batch_size):
            self.fea_batches.append(features[i:i+batch_size])
            self.lab_batches.append(labels[i:i+ batch_size])
            i+= batch_size
        self.idx = 0
    
    def next_batch(self):
        self.idx += 1
        if (self.idx == self.size // self.batch_size):
            self.idx = 0
        return self.fea_batches[self.idx], self.lab_batches[self.idx]
    

train_data = dataset(train_fea, train_lab, batch_size)

In [70]:
X = tf.placeholder(tf.float64, shape=[None, num_input])
Y = tf.placeholder(tf.float64, shape=[None, num_classes])

def neural_net(x, weights, biases, dropout):
    #reshape x back to be an image
    #x = tf.reshape(x, [-1, 28, 28 ,1])
    
    layer1 = tf.matmul(x, weights['w1']) + biases['b1']
    layer1 = tf.contrib.nn.alpha_dropout(tf.nn.selu(layer1), dropout)
    
    layer2 = tf.matmul(layer1, weights['w2']) + biases['b2']
    layer2 = tf.contrib.nn.alpha_dropout(tf.nn.selu(layer2), dropout)
    
    layer3 = tf.matmul(layer2, weights['w3']) + biases['b3']
    layer3 = tf.contrib.nn.alpha_dropout(tf.nn.selu(layer3), dropout)
    
    output = tf.matmul(layer3, weights['w_out']) + biases['b_out']
    return output

In [71]:
weights = {
    'w1': tf.get_variable('w1', shape=[num_input, num_hidden1], dtype=tf.float64, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN')),
    'w2': tf.get_variable('w2', shape=[num_hidden1, num_hidden2],dtype=tf.float64, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN')),
    'w3': tf.get_variable('w3', shape=[num_hidden2, num_hidden3],dtype=tf.float64, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN')),
    'w_out': tf.get_variable('w_out', shape=[num_hidden3, num_classes], dtype=tf.float64,initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN'))
}

biases = {
    'b1': tf.get_variable('b1', shape=[num_hidden1],dtype=tf.float64, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN')),
    'b2': tf.get_variable('b2', shape=[num_hidden2],dtype=tf.float64, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN')),
    'b3': tf.get_variable('b3', shape=[num_hidden3],dtype=tf.float64, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN')),
    'b_out': tf.get_variable('b_out', shape=[num_classes],dtype=tf.float64, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_IN'))
}


logits = neural_net(X, weights, biases, dropout)

regularizer = tf.nn.l2_loss(weights['w1']) + tf.nn.l2_loss(weights['w2']) + tf.nn.l2_loss(weights['w_out'])
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = Y) + beta * regularizer)
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
train_op = optimizer.minimize(loss_op)

correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()

In [72]:
with tf.Session() as sess:
    sess.run(init)

    for step in range(1, num_steps+1):
        batch_x, batch_y = train_data.next_batch()

        sess.run(train_op, feed_dict = {X:batch_x, Y:batch_y})

        if step % display_steps == 0:
            loss, acc = sess.run([loss_op, accuracy], feed_dict = {X:batch_x, Y:batch_y})
            print("Step" + str(step) + ", Minibatch loss " +"{:.4f}".format(loss) + ", Trainning accuracy " + "{:.3f}".format(acc))

    print("Optimization done!")

    test_accu = sess.run(accuracy, feed_dict = {X:test_fea , Y:test_lab})

    print("Test accuracy:", test_accu)

Step20, Minibatch loss 59.4271, Trainning accuracy 0.510
Step40, Minibatch loss 8.4485, Trainning accuracy 0.650
Step60, Minibatch loss 3.4815, Trainning accuracy 0.790
Step80, Minibatch loss 1.7394, Trainning accuracy 0.750
Step100, Minibatch loss 0.7620, Trainning accuracy 0.820
Step120, Minibatch loss 0.8093, Trainning accuracy 0.790
Step140, Minibatch loss 1.0417, Trainning accuracy 0.750
Step160, Minibatch loss 0.5906, Trainning accuracy 0.780
Step180, Minibatch loss 0.6080, Trainning accuracy 0.710
Step200, Minibatch loss 0.6101, Trainning accuracy 0.740
Step220, Minibatch loss 0.6413, Trainning accuracy 0.800
Step240, Minibatch loss 0.5999, Trainning accuracy 0.730
Step260, Minibatch loss 0.6472, Trainning accuracy 0.690
Step280, Minibatch loss 0.5641, Trainning accuracy 0.750
Step300, Minibatch loss 0.5292, Trainning accuracy 0.760
Step320, Minibatch loss 0.5540, Trainning accuracy 0.760
Step340, Minibatch loss 0.4997, Trainning accuracy 0.800
Step360, Minibatch loss 0.6372, Tr