# California Census Data - TensorFlow Estimator Classification

We will classify whether an individual's income falls in the `<=50K` bracket or the `>50K` bracket.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Read the raw census records from a CSV file given by a relative path.
census_csv_path = 'census_data.csv'
census = pd.read_csv(census_csv_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
census.head(n=5)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
def to_zero_or_one(column):
    """Encode a single income-bracket label as a binary class id.

    Parameters
    ----------
    column : object
        One cell value from the ``income_bracket`` column (despite the
        parameter name, this is a scalar label, not a whole column).

    Returns
    -------
    int
        0 when the label is ``<=50K`` (surrounding whitespace is ignored),
        1 for every other value.
    """
    # Strip whitespace so both ' <=50K' (raw file) and '<=50K' (trimmed) match.
    # Non-string values fall through to class 1, as they did before.
    if isinstance(column, str) and column.strip() == '<=50K':
        return 0
    return 1
    

In [5]:
# Encode the label only while it still holds the raw strings. Re-running this
# cell on the already-encoded 0/1 column would otherwise map every row to 1,
# because an integer never equals the '<=50K' string.
if census['income_bracket'].dtype == object:
    census['income_bracket'] = census['income_bracket'].apply(to_zero_or_one)

In [6]:
# Preview the first five rows again to confirm the label is now 0/1.
census.head(n=5)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [7]:
# Feature matrix: every column except the label.
x_data = census.drop(labels='income_bracket', axis='columns')

In [8]:
# Target vector: the binary-encoded income bracket.
y_data = census.loc[:, 'income_bracket']

In [9]:
 # Hold out 30% for evaluation; a fixed random_state keeps the split (and the
 # metrics reported below) reproducible across Restart & Run All.
 X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=101)

In [10]:
# List the column names so the feature columns below can be matched to them.
census.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [11]:
def _hashed_categorical(key):
    """Build a hash-bucketed categorical feature column for `key` with 1000 buckets."""
    return tf.feature_column.categorical_column_with_hash_bucket(key, hash_bucket_size=1000)


# One hashed categorical column per string-valued feature.
workclass = _hashed_categorical('workclass')
education = _hashed_categorical('education')
marital_status = _hashed_categorical('marital_status')
occupation = _hashed_categorical('occupation')
relationship = _hashed_categorical('relationship')
race = _hashed_categorical('race')
gender = _hashed_categorical('gender')
native_country = _hashed_categorical('native_country')
# NOTE(review): 'income_bracket' is the label, not a feature. This column is
# not part of feat_cols; the name is kept because a later cell references it.
income_bracket = _hashed_categorical('income_bracket')

In [12]:
# One numeric feature column per continuous input, built in a single pass.
age, education_num, capital_gain, capital_loss, hours_per_week = (
    tf.feature_column.numeric_column(key)
    for key in ('age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week')
)


In [13]:
# Feature columns for the linear model: categorical first, then numeric
# (same order as before).
categorical_feats = [
    workclass,
    education,
    marital_status,
    occupation,
    relationship,
    race,
    gender,
    native_country,
]
numeric_feats = [age, education_num, capital_gain, capital_loss, hours_per_week]
feat_cols = categorical_feats + numeric_feats

In [14]:
# Training input: shuffled mini-batches of 16 rows, repeating indefinitely
# (num_epochs=None), so the `steps` argument to train() decides when to stop.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=16,
    num_epochs=None,
    shuffle=True,
)

## Model 1

In [15]:
# Model 1: a linear (logistic-regression style) binary classifier.
LC_model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\nickj\\AppData\\Local\\Temp\\tmpzjaq6vfe', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [16]:
# Train the linear model for 10,000 mini-batch steps.
LC_model.train(steps=10000, input_fn=input_func)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\nickj\AppData\Local\Temp\tmpzjaq6vfe\model.ckpt.
INFO:tensorflow:loss = 11.0904, step = 1
INFO:tensorflow:global_step/sec: 316.616
INFO:tensorflow:loss = 3.27534, step = 101 (0.319 sec)
INFO:tensorflow:global_step/sec: 468.238
INFO:tensorflow:loss = 16.818, step = 201 (0.214 sec)
INFO:tensorflow:global_step/sec: 566.668
INFO:tensorflow:loss = 304.359, step = 301 (0.176 sec)
INFO:tensorflow:global_step/sec: 533.34
INFO:tensorflow:loss = 6.78985, step = 401 (0.187 sec)
INFO:tensorflow:global_step/sec: 554.079
INFO:tensorflow:loss = 323.15, step = 501 (0.179 sec)
INFO:tensorflow:global_step/sec: 544.994
INFO:tensorflow:loss = 10.7967, step = 601 (0.183 sec)
INFO:tensorflow:global_step/sec: 573.186
INFO:tensorflow:loss = 7.39841, step = 701 (0.174 sec)
INFO:tensorflow:global_step/sec: 530.488
INFO:tensorflow:loss = 32.7954, step = 801 (0.192 sec)
INFO:tensorflow:global_step/sec: 533.351
INFO:

INFO:tensorflow:global_step/sec: 484.146
INFO:tensorflow:loss = 3.66295, step = 8401 (0.206 sec)
INFO:tensorflow:global_step/sec: 424.401
INFO:tensorflow:loss = 9.57244, step = 8501 (0.239 sec)
INFO:tensorflow:global_step/sec: 366.67
INFO:tensorflow:loss = 6.6771, step = 8601 (0.274 sec)
INFO:tensorflow:global_step/sec: 400.54
INFO:tensorflow:loss = 8.86774, step = 8701 (0.248 sec)
INFO:tensorflow:global_step/sec: 428.041
INFO:tensorflow:loss = 12.1239, step = 8801 (0.233 sec)
INFO:tensorflow:global_step/sec: 402.147
INFO:tensorflow:loss = 5.6601, step = 8901 (0.251 sec)
INFO:tensorflow:global_step/sec: 354.932
INFO:tensorflow:loss = 3.37005, step = 9001 (0.280 sec)
INFO:tensorflow:global_step/sec: 514.095
INFO:tensorflow:loss = 12.5142, step = 9101 (0.196 sec)
INFO:tensorflow:global_step/sec: 468.236
INFO:tensorflow:loss = 32.702, step = 9201 (0.214 sec)
INFO:tensorflow:global_step/sec: 519.45
INFO:tensorflow:loss = 4.43969, step = 9301 (0.192 sec)
INFO:tensorflow:global_step/sec: 530

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x273421f6da0>

## Model 2

In [17]:
# Width of every embedding vector.
# NOTE(review): 1000 looks very large for features that are hashed into only
# 1000 buckets - confirm this is intended.
EMBEDDING_DIM = 1000


def _embed(categorical_column):
    """Wrap a categorical column in a dense embedding column of width EMBEDDING_DIM."""
    return tf.feature_column.embedding_column(categorical_column, dimension=EMBEDDING_DIM)


# The DNN needs dense inputs, so each hashed categorical column is embedded.
embedded_workclass = _embed(workclass)
embedded_education = _embed(education)
embedded_marital_status = _embed(marital_status)
embedded_occupation = _embed(occupation)
embedded_relationship = _embed(relationship)
embedded_race = _embed(race)
embedded_gender = _embed(gender)
embedded_native_country = _embed(native_country)
# NOTE(review): this embeds the label column and is not used in feat_cols_dnn.
embedded_income_bracket = _embed(income_bracket)

In [18]:
# Feature columns for the DNN: embedded categoricals first, then the numeric
# columns (same order as before).
embedded_feats = [
    embedded_workclass,
    embedded_education,
    embedded_marital_status,
    embedded_occupation,
    embedded_relationship,
    embedded_race,
    embedded_gender,
    embedded_native_country,
]
dnn_numeric_feats = [age, education_num, capital_gain, capital_loss, hours_per_week]
feat_cols_dnn = embedded_feats + dnn_numeric_feats

In [19]:
# Model 2: a feed-forward network with three hidden layers of 10 units each.
DNN_model = tf.estimator.DNNClassifier(
    hidden_units=[10, 10, 10],
    feature_columns=feat_cols_dnn,
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\nickj\\AppData\\Local\\Temp\\tmpph2wg9pq', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [20]:
# Train the DNN on the same input function and step budget as the linear model.
DNN_model.train(steps=10000, input_fn=input_func)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\nickj\AppData\Local\Temp\tmpph2wg9pq\model.ckpt.
INFO:tensorflow:loss = 9.08961, step = 1
INFO:tensorflow:global_step/sec: 284.955
INFO:tensorflow:loss = 5.23682, step = 101 (0.351 sec)
INFO:tensorflow:global_step/sec: 347.505
INFO:tensorflow:loss = 8.54874, step = 201 (0.289 sec)
INFO:tensorflow:global_step/sec: 358.756
INFO:tensorflow:loss = 10.1845, step = 301 (0.278 sec)
INFO:tensorflow:global_step/sec: 361.357
INFO:tensorflow:loss = 7.76892, step = 401 (0.278 sec)
INFO:tensorflow:global_step/sec: 328.073
INFO:tensorflow:loss = 4.95863, step = 501 (0.305 sec)
INFO:tensorflow:global_step/sec: 291.62
INFO:tensorflow:loss = 6.25512, step = 601 (0.343 sec)
INFO:tensorflow:global_step/sec: 302.225
INFO:tensorflow:loss = 7.65273, step = 701 (0.331 sec)
INFO:tensorflow:global_step/sec: 310.699
INFO:tensorflow:loss = 6.72161, step = 801 (0.321 sec)
INFO:tensorflow:global_step/sec: 361.355
INF

INFO:tensorflow:global_step/sec: 351.176
INFO:tensorflow:loss = 10.9709, step = 8401 (0.285 sec)
INFO:tensorflow:global_step/sec: 360.051
INFO:tensorflow:loss = 5.94685, step = 8501 (0.278 sec)
INFO:tensorflow:global_step/sec: 351.176
INFO:tensorflow:loss = 7.46901, step = 8601 (0.284 sec)
INFO:tensorflow:global_step/sec: 328.073
INFO:tensorflow:loss = 8.22701, step = 8701 (0.306 sec)
INFO:tensorflow:global_step/sec: 295.072
INFO:tensorflow:loss = 5.08359, step = 8801 (0.339 sec)
INFO:tensorflow:global_step/sec: 298.604
INFO:tensorflow:loss = 6.33517, step = 8901 (0.335 sec)
INFO:tensorflow:global_step/sec: 308.774
INFO:tensorflow:loss = 2.05091, step = 9001 (0.324 sec)
INFO:tensorflow:global_step/sec: 342.732
INFO:tensorflow:loss = 3.7657, step = 9101 (0.292 sec)
INFO:tensorflow:global_step/sec: 312.647
INFO:tensorflow:loss = 11.6147, step = 9201 (0.320 sec)
INFO:tensorflow:global_step/sec: 370.759
INFO:tensorflow:loss = 5.19351, step = 9301 (0.269 sec)
INFO:tensorflow:global_step/sec

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x273421f6e80>

In [21]:
# Prediction input: the whole test set as a single unshuffled batch, so the
# prediction order matches the row order of y_test.
pred_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=len(X_test),
    shuffle=False,
)

In [22]:
# Generator of per-row prediction dicts from the linear model.
pred_gen = LC_model.predict(input_fn=pred_fn)

In [23]:
# Exhaust the generator so every prediction is held in a list.
LC_predictions = [prediction for prediction in pred_gen]

INFO:tensorflow:Restoring parameters from C:\Users\nickj\AppData\Local\Temp\tmpzjaq6vfe\model.ckpt-10000


In [24]:
# Keep only the predicted class id (first entry of 'class_ids') for each row.
LC_final_preds = list(entry['class_ids'][0] for entry in LC_predictions)

In [25]:
# Generator of per-row prediction dicts from the DNN (reuses the `pred_gen` name).
pred_gen = DNN_model.predict(input_fn=pred_fn)

In [26]:
# Exhaust the generator so every prediction is held in a list.
DNN_predictions = [prediction for prediction in pred_gen]

INFO:tensorflow:Restoring parameters from C:\Users\nickj\AppData\Local\Temp\tmpph2wg9pq\model.ckpt-10000


In [27]:
# Keep only the predicted class id (first entry of 'class_ids') for each row.
DNN_final_preds = list(entry['class_ids'][0] for entry in DNN_predictions)

# Results

### Model 1

In [28]:
# Per-class precision / recall / F1 for the linear model on the held-out set.
LC_report = classification_report(y_true=y_test, y_pred=LC_final_preds)
print(LC_report)

             precision    recall  f1-score   support

          0       0.90      0.91      0.90      7406
          1       0.70      0.68      0.69      2363

avg / total       0.85      0.85      0.85      9769



### Model 2

In [29]:
# Per-class precision / recall / F1 for the DNN on the held-out set.
DNN_report = classification_report(y_true=y_test, y_pred=DNN_final_preds)
print(DNN_report)

             precision    recall  f1-score   support

          0       0.88      0.94      0.91      7406
          1       0.76      0.58      0.66      2363

avg / total       0.85      0.85      0.85      9769

