First attempt at neural network classification on PRISM housefolds data. Following official TensorFlow tutorial [here](https://www.tensorflow.org/tutorials/structured_data/feature_columns)

#Load Data

In [1]:
pip install -q sklearn

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [5]:
target_name = 'mobile_phone'
indicator_column_names = [
				'air_bricks',
				'animal_drawn_cart',
				'bank_account',
				'bed',
				'bike',
				'motor_boat',
				'motorless_boat',
				'car_truck',
				'cassette_player',
				'chair',
				'clock',
				'cooking_fuel',
				'cupboard',
				'drinking_water_source',
				'dwelling_type',
				'eaves',
				'electricity',
				'floor_material',
				'food_problems_weekly',
				'wealth_index_cat',
				'landline_phone',
				'lighting_source',
				'mobile_phone',
				'motocycle_scooter',
				'radio',
				'refrigerator',
				'roof_material',
				'sofa',
				'subcountry',
				'table',
				'tv',
				'wall_material',
				'watch'
]

In [6]:
df = pd.read_csv('household_categorical_encoded.csv')
df.drop('Unnamed: 0', axis=1)
df.columns = ['index', 'Household_Id']+indicator_column_names
df = df.drop(['index','Household_Id'], axis=1)
df.head()

Unnamed: 0,air_bricks,animal_drawn_cart,bank_account,bed,bike,motor_boat,motorless_boat,car_truck,cassette_player,chair,clock,cooking_fuel,cupboard,drinking_water_source,dwelling_type,eaves,electricity,floor_material,food_problems_weekly,wealth_index_cat,landline_phone,lighting_source,mobile_phone,motocycle_scooter,radio,refrigerator,roof_material,sofa,subcountry,table,tv,wall_material,watch
0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1
1,1,0,0,1,1,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,1,0,1,0,0,1,0,1,0
2,2,0,1,1,1,0,0,0,0,1,0,0,0,2,1,1,1,1,1,1,0,2,1,0,0,0,1,0,0,1,0,1,0
3,0,0,0,1,1,0,0,0,0,1,0,0,0,3,0,1,0,0,2,0,0,0,1,0,1,0,1,0,0,1,0,0,0
4,1,0,0,1,1,0,0,0,0,1,0,0,0,4,0,1,0,0,3,1,0,0,0,0,0,0,1,0,0,1,0,0,0


In [7]:
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

211 train examples
53 validation examples
67 test examples


#Input Pipeline

In [8]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, target_name, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop(target_name)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [13]:
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, target_name, batch_size=batch_size)
val_ds = df_to_dataset(val, target_name, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, target_name, shuffle=False, batch_size=batch_size)

In [14]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of electricity:', feature_batch['electricity'])
  print('A batch of targets:', label_batch )

Every feature: ['air_bricks', 'animal_drawn_cart', 'bank_account', 'bed', 'bike', 'motor_boat', 'motorless_boat', 'car_truck', 'cassette_player', 'chair', 'clock', 'cooking_fuel', 'cupboard', 'drinking_water_source', 'dwelling_type', 'eaves', 'electricity', 'floor_material', 'food_problems_weekly', 'wealth_index_cat', 'landline_phone', 'lighting_source', 'motocycle_scooter', 'radio', 'refrigerator', 'roof_material', 'sofa', 'subcountry', 'table', 'tv', 'wall_material', 'watch']
A batch of electricity: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int64)
A batch of targets: tf.Tensor([1 1 1 0 1], shape=(5,), dtype=int64)


#One-Hot Encoding

In [8]:
# animal_type = feature_column.categorical_column_with_vocabulary_list(
#       'Type', ['Cat', 'Dog'])

# animal_type_one_hot = feature_column.indicator_column(animal_type)
# demo(animal_type_one_hot)

In [9]:
# wealth_index = feature_column.categorical_column_with_vocabulary_list('Household wealth index, categorical [EUPATH_0000143]', ['Middle', 'Poorest', 'Least poor'])
# wealth_index_one_hot = feature_column.indicator_column(wealth_index)
# print(wealth_index_one_hot)

In [10]:
# def household_categorical_one_hot(df_cat):
# 	columns = df_cat.columns.tolist()
# 	columns.remove('Household_Id') ## experiment doesn't mess with identifier
# 	# print(type(columns))
# 	# print(columns)

# 	key = pd.Series(index=columns)

# 	for col in columns:
# 		values = list(df_cat[col].unique())
# 		## DEALING WITH CASES
# 		if values == ['Yes', 'No'] or values == ['No', 'Yes']:  ## Just ensuring Y/N is sensibly replaced
# 			df_cat[col] = df_cat[col].apply(lambda x: 1 if x=='Yes' else 0)
# 			key[col] = ['No', 'Yes']
# 		# could add bank account case here
# 		else: 
# 			df_cat[col] = df_cat[col].apply(lambda x: values.index(x))
# 			key[col] = values
# 	return df_cat, key

#Feature Columns

In [9]:
feature_columns = []
indicator_column_names.remove(target_name)
for col_name in indicator_column_names:
  categorical_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, df[col_name].unique())
  indicator_column = feature_column.indicator_column(categorical_column)
  feature_columns.append(indicator_column)

#Create Feature Layer

In [10]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

batch_size = 32
train_ds = df_to_dataset(train, target_name, batch_size=batch_size)
val_ds = df_to_dataset(val, target_name, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, target_name, shuffle=False, batch_size=batch_size)

#Create, Compile, Train

In [11]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(1024, activation='linear'),
  layers.Dense(1024, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4069703588>

In [12]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.8208954930305481


In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features (DenseFeature multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  98304     
_________________________________________________________________
dense_1 (Dense)              multiple                  1049600   
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense_2 (Dense)              multiple                  1025      
Total params: 1,148,929
Trainable params: 1,148,929
Non-trainable params: 0
_________________________________________________________________
