In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None

<h5>Load data</h5>

In [2]:
columns = ['age', 'workClass', 'fnlwgt', 'education', 'education-num','marital-status', 'occupation', 'relationship',
          'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

train_data = pd.read_csv('data/adult.data', names=columns, sep=' *, *', na_values='?')
# test data has NAN on first row so skip the first row
test_data  = pd.read_csv('data/adult.test', names=columns, skiprows=1, sep=' *, *', na_values='?')

<h5>Dealing with Missing Values</h5>

In [3]:
data = pd.concat([train_data, test_data], axis=0)
data.isnull().sum()

age                  0
workClass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

<p>There are some missing values in workClass, occupation, native-country</p>

In [4]:
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(data[['workClass','occupation','native-country' ]].values)
imputed_cat_data = imputer.transform(data[['workClass','occupation','native-country']].values)
imputed_cat_data.shape

(48842, 3)

In [5]:
data['workClass'] = imputed_cat_data[:,0]
data['occupation'] = imputed_cat_data[:,1]
data['native-country'] = imputed_cat_data[:,2]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 16280
Data columns (total 15 columns):
age               48842 non-null int64
workClass         48842 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        48842 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null int64
capital-loss      48842 non-null int64
hours-per-week    48842 non-null int64
native-country    48842 non-null object
income            48842 non-null object
dtypes: int64(6), object(9)
memory usage: 6.0+ MB


In [6]:
data['income'] = data['income'].apply(lambda x: 0 if x == '<=50K' else 1)
data.head()

Unnamed: 0,age,workClass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


<h6>Dealing with categorical value</h6>

In [7]:
def convertCatColumn(df, col):
    df = pd.concat([df, pd.get_dummies(df[col],prefix=col,prefix_sep=':')], axis=1)
    return df

clean_data = convertCatColumn(data, 'workClass')
clean_data = convertCatColumn(clean_data, 'marital-status')
clean_data = convertCatColumn(clean_data, 'occupation')
clean_data = convertCatColumn(clean_data, 'relationship')
clean_data = convertCatColumn(clean_data, 'race')
clean_data = convertCatColumn(clean_data, 'sex')
clean_data = convertCatColumn(clean_data, 'native-country')

clean_data.drop('education', axis=1,inplace=True)
clean_data.drop('workClass', axis=1,inplace=True)
clean_data.drop('marital-status', axis=1,inplace=True)
clean_data.drop('occupation', axis=1,inplace=True)
clean_data.drop('relationship', axis=1,inplace=True)
clean_data.drop('race', axis=1,inplace=True)
clean_data.drop('sex', axis=1,inplace=True)
clean_data.drop('native-country', axis=1,inplace=True)
clean_data.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workClass:Federal-gov,workClass:Local-gov,workClass:Never-worked,...,native-country:Portugal,native-country:Puerto-Rico,native-country:Scotland,native-country:South,native-country:Taiwan,native-country:Thailand,native-country:Trinadad&Tobago,native-country:United-States,native-country:Vietnam,native-country:Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<h5>Train Test Validate Split</h5>

In [8]:
features = clean_data.drop('income', axis=1)
labels = clean_data['income']

In [9]:
from sklearn.model_selection  import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=42)

In [10]:
X_train.shape

(39073, 89)

In [11]:
from keras.models import Sequential
from keras.layers import Dense

model1_L3_12_8_1 = Sequential()
model1_L3_12_8_1.add(Dense(12, input_dim=89, activation='relu'))
model1_L3_12_8_1.add(Dense(8, activation='relu'))
model1_L3_12_8_1.add(Dense(1, activation='sigmoid'))
# compile the keras model
model1_L3_12_8_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model1_L3_12_8_1.fit(X_train, y_train, epochs=20, batch_size=10)
# evaluate the keras model
score_model1_L3_12_8_1 = model1_L3_12_8_1.evaluate(X_test, y_test, batch_size=128)


Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
model2_L3_32_12_1 = Sequential()
model2_L3_32_12_1.add(Dense(32, input_dim=89, activation='relu'))
model2_L3_32_12_1.add(Dense(12, activation='relu'))
model2_L3_32_12_1.add(Dense(1, activation='sigmoid'))
# compile the keras model
model2_L3_32_12_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model2_L3_32_12_1.fit(X_train, y_train, epochs=20, batch_size=10)
# evaluate the keras model
score_model2_L3_32_12_1 = model2_L3_32_12_1.evaluate(X_test, y_test, batch_size=128)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
model3_L3_64_32_1 = Sequential()
model3_L3_64_32_1.add(Dense(64, input_dim=89, activation='relu'))
model3_L3_64_32_1.add(Dense(32, activation='relu'))
model3_L3_64_32_1.add(Dense(1, activation='sigmoid'))
# compile the keras model
model3_L3_64_32_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model3_L3_64_32_1.fit(X_train, y_train, epochs=20, batch_size=10)
# evaluate the keras model
score_model3_L3_64_32_1 = model3_L3_64_32_1.evaluate(X_test, y_test, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
model4_L4_64_32_16_1 = Sequential()
model4_L4_64_32_16_1.add(Dense(64, input_dim=89, activation='relu'))
model4_L4_64_32_16_1.add(Dense(32, activation='relu'))
model4_L4_64_32_16_1.add(Dense(16, activation='relu'))
model4_L4_64_32_16_1.add(Dense(1, activation='sigmoid'))
# compile the keras model
model4_L4_64_32_16_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model4_L4_64_32_16_1.fit(X_train, y_train, epochs=20, batch_size=10)
# evaluate the keras model
score_model4_L4_64_32_16_1 = model4_L4_64_32_16_1.evaluate(X_test, y_test, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
print('score_model1_L3_12_8_1: {}'.format(score_model1_L3_12_8_1))
print('score_model2_L3_32_12_1: {}'.format(score_model2_L3_32_12_1))
print('score_model3_L3_64_32_1: {}'.format(score_model3_L3_64_32_1))
print('score_model4_L4_64_32_16_1: {}'.format(score_model4_L4_64_32_16_1))

score_model1_L3_12_8_1: [8.055237013798969, 0.4947282075881958]
score_model2_L3_32_12_1: [7.9740785452223495, 0.5052717924118042]
score_model3_L3_64_32_1: [8.055237013798969, 0.4947282075881958]
score_model4_L4_64_32_16_1: [8.055237013798969, 0.4947282075881958]


<p> Model with three layer with (32, 16, 1) has more performance that other. it also noted that model 
with higger layer have 
    same performace as model with lower layer
</p>