In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.linear_model import (LogisticRegression, SGDClassifier, 
                                  SGDRegressor, LinearRegression)
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.testing import all_estimators
from sklearn.model_selection import GridSearchCV

import time

import tensorflow as tf
from tensorflow import keras


import warnings
warnings.filterwarnings('ignore')

## Functions

In [2]:
def printScore(y1, y2, n):
    score = sum(y1 == y2)/n
    print("Score: {:.3f}".format(score))
    
    return score
    
def predict_and_save(clf, X_test, title):
    categories = clf.predict(X_test)
    ids = range(16281)
    
    pd.DataFrame(data={"Id": ids, "Category": categories}). \
        to_csv("submission_{}.csv".format(title), index=False)
    
    return categories

def get_best_classifiers(X_train, y_train, X_valid, y_valid):
    estimators = all_estimators()
    best_clf = {}

    for name, est in estimators:
        if name == "ARDRegression":
            continue
        start_time = time.time()
        try:
            if hasattr(est, 'predict'):
                print(name)
                clf = est().fit(X_train, y_train)
                y_hat = clf.predict(X_valid)
                score = printScore(y_valid, y_hat, y_hat.shape[0])
                if score >= 0.7:
                    best_clf[name] = est
        except Exception as e:
            print(e)

        print('Time taken: {}\n'.format(time.time() - start_time))
        
    
    return best_clf

## Load Data

In [3]:
attributes = {
    "age": None,
    "workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", 
                  "Federal-gov", "Local-gov", "State-gov", 
                  "Without-pay", "Never-worked"],
    "fnlwgt": None,
    "education": ["Bachelors", "Some-college", "11th", "HS-grad",
                  "Prof-school", "Assoc-acdm", "Assoc-voc", "9th",
                  "7th-8th", "12th", "Masters", "1st-4th", "10th",
                  "Doctorate", "5th-6th", "Preschool"],
    "education-num": None,
    "marital-status": ["Married-civ-spouse", "Divorced", "Never-married",
                       "Separated", "Widowed", "Married-spouse-absent",
                       "Married-AF-spouse"],
    "occupation": ["Tech-support", "Craft-repair", "Other-service",
                   "Sales", "Exec-managerial", "Prof-specialty", 
                   "Handlers-cleaners", "Machine-op-inspct", 
                   "Adm-clerical", "Farming-fishing", "Transport-moving",
                   "Priv-house-serv", "Protective-serv", "Armed-Forces"],
    "relationship": ["Wife", "Own-child", "Husband", "Not-in-family",
                     "Other-relative", "Unmarried"],
    "race": ["White", "Asian-Pac-Islander", 'Amer-Indian-Eskimo',
             "Other", "Black"],
    "sex": ["Female", "Male"],
    "capital-gain": None,
    "capital-loss": None,
    "hours-per-week": None,
    "native-country": ["United-States", "Cambodia", "England", "Puerto-Rico", 
                       "Canada", "Germany", "Outlying-US(Guam-USVI-etc)",
                       "India", "Japan", "Greece", "South", "China",
                       "Cuba", "Iran", "Honduras", "Philippines", "Italy",
                       "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal",
                       "Ireland", "France", "Dominican-Republic", "Laos",
                       "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary",
                       "Guatemala", "Nicaragua", "Scotland", "Thailand",
                       "Yugoslavia", "El-Salvador", "Trinadad&Tobago",
                       "Peru", "Hong", "Holand-Netherlands"],
    "income": None #Binary (0 means <=50K, 1 means >50K)
}
cols = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
       "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
       "hours-per-week", "native-country", "income"]
indices = [i for i in range(15)]
columns = {i: j for i, j in zip(indices, cols)}


In [4]:
train_data = pd.read_csv("data/train.data", header=None)
train_data = train_data.rename(columns=columns)
train_data = train_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
train_data = train_data.replace('?', np.nan)


In [5]:
print(train_data.shape)
train_data.head()

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [6]:
test_data = pd.read_csv("data/test.data", header=None)
test_data = test_data.rename(columns=columns)
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
test_data = test_data.replace('?', np.nan)


In [7]:
print(test_data.shape)
test_data.head()

(16281, 14)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States


#### Divide data

In [8]:
cols = test_data.columns

In [9]:
X = train_data[cols]
y = train_data['income']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25)

X_test = test_data[:]


## Tensorflow

In [47]:
X_train.dtypes[X_train.dtypes == 'object'].index

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

In [48]:
CONTI_FEATURES  = list(list(X_train.dtypes[X_train.dtypes != 'object'].index))
CATE_FEATURES = list(X_train.dtypes[X_train.dtypes == 'object'].index)




In [49]:
continuous_features = [tf.feature_column.numeric_column(k) for k in CONTI_FEATURES]	
categorical_features = [tf.feature_column.categorical_column_with_hash_bucket(k, hash_bucket_size=1000) for k in CATE_FEATURES]


In [50]:
model = tf.estimator.LinearClassifier(
    n_classes = 2,
    model_dir="ongoing/train", 
    feature_columns=categorical_features+ continuous_features)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13679d0f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [51]:
FEATURES = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
       "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
       "hours-per-week", "native-country"]
LABEL = 'income'
def get_input_fn(data_set, num_epochs=None, n_batch = 128, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
       x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
       y = pd.Series(data_set[LABEL].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

In [52]:
model.train(input_fn=get_input_fn(train_data, 
                                      num_epochs=None,
                                      n_batch = 128,
                                      shuffle=False),
                                      steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ongoing/train/model.ckpt-0
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.InternalError'>, Unable to get element as bytes.
INFO:tensorflow:Saving checkpoints for 0 into ongoing/train/model.ckpt.


InternalError: Unable to get element as bytes.