In [1]:
import pandas as pd
from six.moves import urllib
import shutil
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
TRAIN_FILE_NAME = 'census/adult.data'
TEST_FILE_NAME = 'census/adult.test'

In [3]:
urllib.request.urlretrieve(
                        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                         TRAIN_FILE_NAME)

('census/adult.data', <http.client.HTTPMessage at 0x7f0e00c85e48>)

In [4]:
urllib.request.urlretrieve(
                        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
                         TEST_FILE_NAME)

('census/adult.test', <http.client.HTTPMessage at 0x7f0e00c83eb8>)

In [5]:
CSV_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'maritial_status', 'occupation', 'relationship', 'race', 'gender',
    'capitail_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_bracket'
]

In [6]:
df = pd.read_csv(
        TRAIN_FILE_NAME,
        names = CSV_COLUMNS,
        skipinitialspace = True,
        skiprows = 1)

In [7]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,maritial_status,occupation,relationship,race,gender,capitail_gain,capital_loss,hours_per_week,native_country,income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [8]:
TRIMMED_REORDERED_COLUMNS = [
    'age', 'workclass', 'education', 'education_num',
    'maritial_status', 'occupation', 'relationship', 'race', 'gender',
    'hours_per_week', 'native_country', 'income_bracket'
]

In [9]:
df = df[TRIMMED_REORDERED_COLUMNS]

In [10]:
df.head()

Unnamed: 0,age,workclass,education,education_num,maritial_status,occupation,relationship,race,gender,hours_per_week,native_country,income_bracket
0,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
1,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
2,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
3,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K
4,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,United-States,<=50K


In [11]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [12]:
df['race'].unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)

In [13]:
df['workclass'].unique()

array(['Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [14]:
df['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [15]:
df['maritial_status'].unique()

array(['Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
       'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'],
      dtype=object)

In [16]:
df['relationship'].unique()

array(['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)

In [17]:
gender = tf.feature_column.categorical_column_with_vocabulary_list(
            'gender', ['Male', 'Female'])

race = tf.feature_column.categorical_column_with_vocabulary_list(
            'race', ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'])

education = tf.feature_column.categorical_column_with_vocabulary_list(
            'education', ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
                          'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
                          '5th-6th', '10th', '1st-4th', 'Preschool', '12th'])
maritial_status = tf.feature_column.categorical_column_with_vocabulary_list(
            'maritial_status', ['Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
                                'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
            'relationship', ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
                             'Other-relative'])

workclass = tf.feature_column.categorical_column_with_vocabulary_list(
            'workclass', ['Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])

In [18]:
age = tf.feature_column.numeric_column('age')

education_num = tf.feature_column.numeric_column('education_num')

hours_per_week = tf.feature_column.numeric_column('hours_per_week')

In [19]:
age_buckets = tf.feature_column.bucketized_column(
                age, boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

In [20]:
df['occupation'].unique()

array(['Exec-managerial', 'Handlers-cleaners', 'Prof-specialty',
       'Other-service', 'Adm-clerical', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

In [21]:
df['native_country'].unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [22]:
occupation = tf.feature_column.categorical_column_with_hash_bucket(
                'occupation', hash_bucket_size=1000)

native_country = tf.feature_column.categorical_column_with_hash_bucket(
                'native_country', hash_bucket_size=1000)

In [23]:
base_columns = [
        gender, race, education, maritial_status, workclass, occupation,
        native_country, age_buckets
]

In [24]:
crossed_columns = [
    tf.feature_column.crossed_column(
        ['education', 'occupation'], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, 'education', 'occupation'], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ['native_country', 'occupation'], hash_bucket_size=1000)
]

In [25]:
deep_columns = [education_num, hours_per_week]

In [31]:
def input_fn(filename, num_epochs, shuffle):
    df = pd.read_csv(
        filename,
        names = CSV_COLUMNS,
        skipinitialspace = True,
        skiprows = 1)
    df = df[TRIMMED_REORDERED_COLUMNS]
    
    df = df.dropna(how = 'any', axis = 0)
    
    labels = df['income_bracket'].apply(lambda x: ">50K" in x).astype(int)
    
    return tf.estimator.inputs.pandas_input_fn(
                x = df,
                y = labels,
                batch_size=100,
                num_epochs=num_epochs,
                shuffle = shuffle,
                num_threads=5)

In [32]:
MODEL_DIR = './linear_classifier'

In [None]:
shutil.rmtree(MODEL_DIR)

In [33]:
linear_estimator = tf.estimator.LinearClassifier(
                        model_dir=MODEL_DIR, feature_columns=base_columns + crossed_columns + deep_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './linear_classifier', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0df9314710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [34]:
linear_estimator.train(
            input_fn=input_fn(TRAIN_FILE_NAME, num_epochs=None, shuffle=True),
            steps = 1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ./linear_classifier/model.ckpt.
INFO:tensorflow:loss = 69.31472, step = 1
INFO:tensorflow:global_step/sec: 207.911
INFO:tensorflow:loss = 39.02631, step = 101 (0.482 sec)
INFO:tensorflow:global_step/sec: 309.863
INFO:tensorflow:loss = 24.85404, step = 201 (0.323 sec)
INFO:tensorflow:global_step/sec: 316.429
INFO:tensorflow:loss = 29.73993, step = 301 (0.316 sec)
INFO:tensorflow:global_step/sec: 318.324
INFO:tensorflow:loss = 37.392517, step = 401 (0.314 sec)
INFO:tensorflow:global_step/sec: 309.124
INFO:tensorflow:loss = 36.8241, step = 501 (0.323 sec)
INFO:tensorflow:global_step/sec: 315.813
INFO:tensorflow:loss = 31.21426, step = 601 (0.317 sec)
INFO:tensorflow:global_step/sec: 311.897
INFO:tensorflow:loss

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7f0df92fa390>

In [35]:
result = linear_estimator.evaluate(
            input_fn=input_fn(TEST_FILE_NAME, num_epochs=1, shuffle=False),
            steps = None)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-13-14:37:35
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./linear_classifier/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-13-14:37:39
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.8354524, accuracy_baseline = 0.76377374, auc = 0.8850185, auc_precision_recall = 0.6955819, average_loss = 0.35199788, global_step = 1000, label/mean = 0.23622628, loss = 35.15876, prediction/mean = 0.21882468


In [37]:
for key in sorted(result):
    print('%s: %s'% (key, result[key]))

accuracy: 0.8354524
accuracy_baseline: 0.76377374
auc: 0.8850185
auc_precision_recall: 0.6955819
average_loss: 0.35199788
global_step: 1000
label/mean: 0.23622628
loss: 35.15876
prediction/mean: 0.21882468
