# How to use this

Run each cell from top to bottom. 
View README.md for more infos. 

In [1]:
# Init global infos

import numpy as np

from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD

inputs = (
    ("age", ("continuous",)), 
    ("workclass", ("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov", "Without-pay", "Never-worked")), 
    ("fnlwgt", ("continuous",)), 
    ("education", ("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool")), 
    ("education-num", ("continuous",)), 
    ("marital-status", ("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed", "Married-spouse-absent", "Married-AF-spouse")), 
    ("occupation", ("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv", "Armed-Forces")), 
    ("relationship", ("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried")), 
    ("race", ("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black")), 
    ("sex", ("Female", "Male")), 
    ("capital-gain", ("continuous",)), 
    ("capital-loss", ("continuous",)), 
    ("hours-per-week", ("continuous",)), 
    ("native-country", ("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"))
)

input_shape = []
for i in inputs:
    count = len(i[1 ])
    input_shape.append(count)
input_dim = sum(input_shape)
print("input_shape:", input_shape)
print("input_dim:", input_dim)
print()


outputs = (0, 1)  # (">50K", "<=50K")
output_dim = 2  # len(outputs)
print("output_dim:", output_dim)
print()


input_shape: [1, 8, 1, 16, 1, 7, 14, 6, 5, 2, 1, 1, 1, 41]
input_dim: 105

output_dim: 2



In [2]:
# Functions to load and prepare data

def isFloat(string):
    # credits: http://stackoverflow.com/questions/2356925/how-to-check-whether-string-might-be-type-cast-to-float-in-python
    try:
        float(string)
        return True
    except ValueError:
        return False
    
def find_means_for_continuous_types(X):
    means = []
    for col in range(len(X[0])):
        summ = 0
        count = 0.000000000000000000001
        for value in X[:, col]:
            if isFloat(value): 
                summ += float(value)
                count +=1
        means.append(summ/count)
    return means

def prepare_data(raw_data, means):
    
    X = raw_data[:, :-1]
    y = raw_data[:, -1:]
    
    # X:
    def flatten_persons_inputs_for_model(person_inputs):
        global inputs
        global input_shape
        global input_dim
        global means
        float_inputs = []

        for i in range(len(input_shape)):
            features_of_this_type = input_shape[i]
            is_feature_continuous = features_of_this_type == 1

            if is_feature_continuous:
                mean = means[i]
                if isFloat(person_inputs[i]):
                    scale_factor = 1/(2*mean)  # we prefer inputs mainly scaled from -1 to 1. 
                    float_inputs.append(float(person_inputs[i])*scale_factor)
                else:
                    float_inputs.append(mean)
            else:
                for j in range(features_of_this_type):
                    feature_name = inputs[i][1][j]

                    if feature_name == person_inputs[i]:
                        float_inputs.append(1.)
                    else:
                        float_inputs.append(0)
        return float_inputs
    
    new_X = []
    for person in range(len(X)):
        formatted_X = flatten_persons_inputs_for_model(X[person])
        new_X.append(formatted_X)
    new_X = np.array(new_X)
    
    # y:
    new_y = []
    for i in range(len(y)):
        if y[i] == ">50k":
            new_y.append((1, 0))
        else:  # y[i] == "<=50k":
            new_y.append((0, 1))
    new_y = np.array(new_y)
    
    return (new_X, new_y)

In [3]:
# Building training and test data

training_data = np.genfromtxt('data/adult.data.txt', delimiter=', ', dtype=str, autostrip=True)
print("Training data count:", len(training_data))
test_data = np.genfromtxt('data/adult.test.txt', delimiter=', ', dtype=str, autostrip=True)
print("Test data count:", len(test_data))

means = find_means_for_continuous_types(np.concatenate((training_data, test_data), 0))
print("Mean values for data types (if continuous):", means)

X_train, y_train = prepare_data(training_data, means)
X_test, y_test = prepare_data(test_data, means)

percent = sum([i[0] for i in y_train])/len(y_train)
print("Training data percentage that is >50k:", percent*100, "%")

Training data count: 32561
Test data count: 16281
Mean values for data types (if continuous): [38.64358543876172, 0.0, 189664.13459727284, 0.0, 10.078088530363212, 0.0, 0.0, 0.0, 0.0, 0.0, 1079.0676262233324, 87.50231358257237, 40.422382375824085, 0.0, 0.0]
Training data percentage that is >50k: 24.0809557446 %


In [4]:
# Explanation on data format

print("Training data format example:")
print(X_train[4])  # 4 is a random person, from cuba. 
print()

print("In fact, we just crushed the data in such a way that it will optimise the neural network (model). \n\
It is crushed according to the `input_shape` variable: \n\
    say, if there are 41 native countries in the dataset, there will be 41 input dimensions for the \n\
    neural network with a value of 0 for every 41 input node for a given person, except the \n\
    node representing the real country of the person which will have a value of 1. For continuous \n\
    values, they are normalised to a small float number.")

for i in X_train:
    if len(i) != input_dim:
        raise Exception(
            "Every person should have 105 data fields now. {} here.".format(len(i)))

Training data format example:
[ 0.36228522  1.          0.          0.          0.          0.          0.
  0.          0.          0.89212702  1.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.64496357
  1.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          1.          0.
  0.          0.          0.          0.          0.          0.          0.
  1.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          1.          1.          0.          0.
  0.          0.49477539  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  1.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          

In [32]:
# Init model

mid_dim = 12

model = Sequential()

model.add(Dense(output_dim=mid_dim, activation='sigmoid', input_dim=input_dim))
model.add(Dense(output_dim=output_dim, activation='sigmoid', input_dim=mid_dim))

model.compile(loss='binary_crossentropy', optimizer='rmsprop')


In [33]:
# Train the model

print("(training_datas, dimension):", X_train.shape)
# model.fit(new_X_train, y_train, nb_epoch=3, batch_size=16, show_accuracy=True, verbose=2)
model.fit(X_train, y_train, nb_epoch=50, batch_size=128, validation_split=0.1, show_accuracy=True, verbose=1)


(training_datas, dimension): (32561, 105)
Train on 29304 samples, validate on 3257 samples
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49


<keras.callbacks.History at 0x7fd99e316828>

In [34]:
# Evaluate training

score = model.evaluate(X_test, y_test, verbose=1, show_accuracy=True)
print("\nTest Results for {} test entries \
on which we did not trained the neural network.\n".format(len(X_test)))

print("Keras evaluation result:", score[0])
print("Percentage right: {}%.".format(score[1]*100))
print("Error: {}%.\n".format((1-score[1])*100))

def evaluate_model(model, X_test, y_test):
    confusion_matrix = np.array([
        [0, 0], 
        [0, 0]
    ])
    pred = model.predict(X_train)
    for i in range(len(pred)):
        prediction = pred[i]
        if prediction[0]>prediction[1]:
            prediction = 1
        else:
            prediction = 0

        expected = y_train[i][0]

        confusion_matrix[prediction][expected] += 1
    
    return confusion_matrix

confusion_matrix = evaluate_model(model, X_test, y_test)
confusion_matrix_interpretation = np.array([
        ["true negative", "false negative"], 
        ["false positive", "true positive"]
    ])
print("Confusion matrix:")
print(confusion_matrix)
print("Confusion matrix, percentage of data:")
print(confusion_matrix*100/sum(confusion_matrix.flatten()))
print("Confusion matrix interpretation:\n", confusion_matrix_interpretation)


Test Results for 16281 test entries on which we did not trained the neural network.

Keras evaluation result: 0.31364969079
Percentage right: 85.33259627786991%.
Error: 14.66740372213009%.

Confusion matrix:
[[22911  2904]
 [ 1809  4937]]
Confusion matrix, percentage of data:
[[ 70.36331808   8.918645  ]
 [  5.55572618  15.16231074]]
Confusion matrix interpretation:
 [['true negative' 'false negative']
 ['false positive' 'true positive']]
