In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
df = pd.read_csv("datasets/census_data.csv")
df.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df["income_bracket"] = df["income_bracket"].apply(lambda x: 0 if x == " <=50K" else 1)
df["gender"] = df["gender"].apply(lambda x: 0 if " Male" else 1)

In [4]:
df.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,0,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0


In [5]:
dep_cols = ['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country']
X = df[dep_cols]
y = df["income_bracket"]

In [6]:
age = tf.feature_column.numeric_column("age")

workclass = tf.feature_column \
              .categorical_column_with_vocabulary_list("workclass", 
                    vocabulary_list=[' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
                                     ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
                                     ' Never-worked'])
    
workclass_embedding = tf.feature_column.embedding_column(workclass, dimension=9)
    
education = tf.feature_column \
              .categorical_column_with_vocabulary_list("education",
                    vocabulary_list=[' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
                                     ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
                                     ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
                                     ' Preschool', ' 12th'])
education_embedding = tf.feature_column.embedding_column(education, dimension=16)
    
education_num = tf.feature_column.numeric_column("education_num")

marital_status = tf.feature_column \
                   .categorical_column_with_vocabulary_list("marital_status",
                        vocabulary_list=[' Never-married', ' Married-civ-spouse', ' Divorced',
                                         ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
                                         ' Widowed'])

marital_status_embedding = tf.feature_column.embedding_column(marital_status, dimension=7)

occupation = tf.feature_column \
               .categorical_column_with_vocabulary_list("occupation",
                        vocabulary_list=[' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
                                        ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
                                        ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
                                        ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
                                        ' Priv-house-serv'])

occupation_embedding = tf.feature_column.embedding_column(occupation, dimension=15)

relationship = tf.feature_column \
                 .categorical_column_with_vocabulary_list("relationship",
                        vocabulary_list=[' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
                                        ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
                                        ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
                                        ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
                                        ' Priv-house-serv'])

relationship_embedding = tf.feature_column.embedding_column(relationship, dimension=15)
    
race = tf.feature_column \
         .categorical_column_with_vocabulary_list("race",
                        vocabulary_list=[' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',
                                         ' Other'])

race_embedding = tf.feature_column.embedding_column(race, dimension=5)

gender = tf.feature_column.numeric_column("gender")

capital_gain = tf.feature_column.numeric_column("capital_gain")

capital_loss = tf.feature_column.numeric_column("capital_loss")

hours_per_week = tf.feature_column.numeric_column("hours_per_week")

native_country = tf.feature_column \
                   .categorical_column_with_vocabulary_list("native_country",
                        vocabulary_list=[' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
                                        ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
                                        ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
                                        ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
                                        ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
                                        ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
                                        ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
                                        ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
                                        ' Ireland', ' Hungary', ' Holand-Netherlands'])

native_country_embedding = tf.feature_column.embedding_column(native_country, dimension=42)

feat_cols = [age, workclass_embedding, education_embedding, education_num, marital_status_embedding,
             occupation_embedding, relationship_embedding, race_embedding, gender, capital_gain,
             capital_loss, hours_per_week, native_country_embedding]

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [8]:
train_function = tf.estimator.inputs.pandas_input_fn(X_train, y_train, shuffle=True, num_epochs=1000,
                                                     batch_size=10)
test_function = tf.estimator.inputs.pandas_input_fn(X_test, y_test, shuffle=False)

In [9]:
dnn_estimator = tf.estimator.DNNClassifier(hidden_units=[30, 30, 30, 30], feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpa4mdugmv', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6d20d89cf8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [10]:
dnn_estimator.train(input_fn=train_function, steps=10000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpa4mdugmv/model.ckpt.
INFO:tensorflow:loss = 7.3872576, step = 1
INFO:tensorflow:global_step/sec: 61.6554
INFO:tensorflow:loss = 40.076103, step = 101 (1.626 sec)
INFO:tensorflow:global_step/sec: 111.968
INFO:tensorflow:loss = 11.375727, step = 201 (0.892 sec)
INFO:tensorflow:global_step/sec: 93.3793
INFO:tensorflow:loss = 2.808196, step = 301 (1.076 sec)
INFO:tensorflow:global_step/sec: 94.6367
INFO:tensorflow:loss = 3.1784146, step = 401 (1.054 sec)
INFO:tensorflow:global_step/sec: 80.1863
INFO:tensorflow:loss = 2.0526948, step = 501 (1.253 sec)
INFO:tensorflow:global_step/sec: 91.8485
INFO:tensorflow:loss = 6.0254064, step = 601 (1.087 sec)
INFO:tensorflow:global_step/sec: 76.7898
INFO:tensorflow:l

INFO:tensorflow:global_step/sec: 58.1761
INFO:tensorflow:loss = 4.0737305, step = 8101 (1.712 sec)
INFO:tensorflow:global_step/sec: 59.4095
INFO:tensorflow:loss = 1.0111527, step = 8201 (1.710 sec)
INFO:tensorflow:global_step/sec: 56.8456
INFO:tensorflow:loss = 1.3702941, step = 8301 (1.736 sec)
INFO:tensorflow:global_step/sec: 61.1702
INFO:tensorflow:loss = 3.192575, step = 8401 (1.643 sec)
INFO:tensorflow:global_step/sec: 57.0965
INFO:tensorflow:loss = 1.3611094, step = 8501 (1.739 sec)
INFO:tensorflow:global_step/sec: 58.0734
INFO:tensorflow:loss = 2.5777082, step = 8601 (1.729 sec)
INFO:tensorflow:global_step/sec: 56.1413
INFO:tensorflow:loss = 2.0858414, step = 8701 (1.777 sec)
INFO:tensorflow:global_step/sec: 61.6146
INFO:tensorflow:loss = 3.1641831, step = 8801 (1.630 sec)
INFO:tensorflow:global_step/sec: 58.8385
INFO:tensorflow:loss = 2.792842, step = 8901 (1.697 sec)
INFO:tensorflow:global_step/sec: 66.1643
INFO:tensorflow:loss = 1.4624512, step = 9001 (1.506 sec)
INFO:tensorf

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f6d21aca470>

In [11]:
dnn_estimator.evaluate(input_fn=test_function)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-10-17:29:55
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpa4mdugmv/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-10-17:30:00
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.84072065, accuracy_baseline = 0.7592384, auc = 0.8966845, auc_precision_recall = 0.73977774, average_loss = 0.33734277, global_step = 10000, label/mean = 0.2407616, loss = 42.79872, prediction/mean = 0.25810185


{'accuracy': 0.84072065,
 'accuracy_baseline': 0.7592384,
 'auc': 0.8966845,
 'auc_precision_recall': 0.73977774,
 'average_loss': 0.33734277,
 'global_step': 10000,
 'label/mean': 0.2407616,
 'loss': 42.79872,
 'prediction/mean': 0.25810185}