In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.linear_model import (LogisticRegression, SGDClassifier, 
                                  SGDRegressor, LinearRegression)
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.testing import all_estimators
from sklearn.model_selection import GridSearchCV

import time

import tensorflow as tf
from tensorflow import keras


import warnings
warnings.filterwarnings('ignore')

## Functions

In [2]:
def printScore(y1, y2, n):
    score = sum(y1 == y2)/n
    print("Score: {:.3f}".format(score))
    
    return score
    
def predict_and_save(clf, X_test, title):
    categories = clf.predict(X_test)
    ids = range(16281)
    
    pd.DataFrame(data={"Id": ids, "Category": categories}). \
        to_csv("submission_{}.csv".format(title), index=False)
    
    return categories

def get_best_classifiers(X_train, y_train, X_valid, y_valid):
    estimators = all_estimators()
    best_clf = {}

    for name, est in estimators:
        if name == "ARDRegression":
            continue
        start_time = time.time()
        try:
            if hasattr(est, 'predict'):
                print(name)
                clf = est().fit(X_train, y_train)
                y_hat = clf.predict(X_valid)
                score = printScore(y_valid, y_hat, y_hat.shape[0])
                if score >= 0.7:
                    best_clf[name] = est
        except Exception as e:
            print(e)

        print('Time taken: {}\n'.format(time.time() - start_time))
        
    
    return best_clf

## Load Data

In [3]:
attributes = {
    "age": None,
    "workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", 
                  "Federal-gov", "Local-gov", "State-gov", 
                  "Without-pay", "Never-worked"],
    "fnlwgt": None,
    "education": ["Bachelors", "Some-college", "11th", "HS-grad",
                  "Prof-school", "Assoc-acdm", "Assoc-voc", "9th",
                  "7th-8th", "12th", "Masters", "1st-4th", "10th",
                  "Doctorate", "5th-6th", "Preschool"],
    "education-num": None,
    "marital-status": ["Married-civ-spouse", "Divorced", "Never-married",
                       "Separated", "Widowed", "Married-spouse-absent",
                       "Married-AF-spouse"],
    "occupation": ["Tech-support", "Craft-repair", "Other-service",
                   "Sales", "Exec-managerial", "Prof-specialty", 
                   "Handlers-cleaners", "Machine-op-inspct", 
                   "Adm-clerical", "Farming-fishing", "Transport-moving",
                   "Priv-house-serv", "Protective-serv", "Armed-Forces"],
    "relationship": ["Wife", "Own-child", "Husband", "Not-in-family",
                     "Other-relative", "Unmarried"],
    "race": ["White", "Asian-Pac-Islander", 'Amer-Indian-Eskimo',
             "Other", "Black"],
    "sex": ["Female", "Male"],
    "capital-gain": None,
    "capital-loss": None,
    "hours-per-week": None,
    "native-country": ["United-States", "Cambodia", "England", "Puerto-Rico", 
                       "Canada", "Germany", "Outlying-US(Guam-USVI-etc)",
                       "India", "Japan", "Greece", "South", "China",
                       "Cuba", "Iran", "Honduras", "Philippines", "Italy",
                       "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal",
                       "Ireland", "France", "Dominican-Republic", "Laos",
                       "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary",
                       "Guatemala", "Nicaragua", "Scotland", "Thailand",
                       "Yugoslavia", "El-Salvador", "Trinadad&Tobago",
                       "Peru", "Hong", "Holand-Netherlands"],
    "income": None #Binary (0 means <=50K, 1 means >50K)
}
cols = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
       "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
       "hours-per-week", "native-country", "income"]
indices = [i for i in range(15)]
columns = {i: j for i, j in zip(indices, cols)}


In [4]:
train_data = pd.read_csv("data/train.data", header=None)
train_data = train_data.rename(columns=columns)
train_data = train_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
train_data = train_data.replace('?', np.nan)
train_data = pd.get_dummies(train_data)


In [5]:
print(train_data.shape)
train_data.head()

(32561, 106)


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
test_data = pd.read_csv("data/test.data", header=None)
test_data = test_data.rename(columns=columns)
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
test_data = test_data.replace('?', np.nan)
test_data = pd.get_dummies(test_data)


In [7]:
print(test_data.shape)
test_data.head()

(16281, 104)


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,0,0,30,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Divide data

In [8]:
cols = test_data.columns

In [9]:
X = train_data[cols]
y = train_data['income']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25)

X_test = test_data[:]


## Tensorflow

In [10]:
y_train = np.array([y_train])
y_train = np.swapaxes(y_train, 0, 1)

In [11]:
y_valid = np.array([y_valid])
y_valid = np.swapaxes(y_valid, 0, 1)

In [12]:
feature_count = X_train.shape[1]
label_count = 1

In [16]:
training_epochs = 3000
learning_rate = 0.01
hidden_layers = X_train.shape[1] - 1
cost_history = np.empty(shape=[1], dtype=float)

X = tf.placeholder(tf.float32, [None, feature_count])
Y = tf.placeholder(tf.float32, [None, label_count])
is_training = tf.Variable(True, dtype=tf.bool)

In [17]:
initializer = tf.contrib.layers.xavier_initializer()
h0 = tf.layers.dense(X, hidden_layers, activation=tf.nn.relu, kernel_initializer=initializer)
# h0 = tf.nn.dropout(h0, 0.95)
h1 = tf.layers.dense(h0, label_count, activation=None)

cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=h1)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# prediction = tf.argmax(h0, 1)
# correct_prediction = tf.equal(prediction, tf.argmax(Y_one_hot, 1))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

predicted = tf.nn.sigmoid(h1)
correct_pred = tf.equal(tf.round(predicted), Y)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [19]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for step in range(training_epochs + 1):
        sess.run(optimizer, feed_dict={X: X_train, Y: y_train})
        loss, _, acc = sess.run([cost, optimizer, accuracy], feed_dict={
                                 X: X_train, Y: y_train})
        cost_history = np.append(cost_history, acc)
        if step % 500 == 0:
            print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(
                step, loss, acc))
            
    # Test model and check accuracy
    print('Test Accuracy:', sess.run([accuracy, tf.round(predicted)], feed_dict={X: X_valid, Y: y_valid}))
    
    # Save test result
    test_predict_result = sess.run(tf.cast(tf.round(predicted), tf.int32), feed_dict={X: X_test})
    print(test_predict_result)
    


Step:     0	Loss: 2122.853	Acc: 24.01%
Step:   500	Loss: 71.320	Acc: 24.13%
Step:  1000	Loss: 31.593	Acc: 78.88%
Step:  1500	Loss: 0.490	Acc: 80.46%
Step:  2000	Loss: 0.331	Acc: 85.00%
Step:  2500	Loss: 0.360	Acc: 83.59%
Step:  3000	Loss: 0.354	Acc: 84.05%
Test Accuracy: [0.8281538, array([[0.],
       [1.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]], dtype=float32)]
[[0]
 [0]
 [0]
 ...
 [0]
 [1]
 [1]]


In [21]:
test_predict_result.reshape

(16281, 1)

In [27]:
title = "nn"
ids = range(16281)
    
pd.DataFrame(data={"Id": ids, "Category": test_predict_result.reshape(-1)}). \
    to_csv("submission_{}.csv".format(title), index=False)