# Tutorial 2 - Tensorflow - DNNClassifier

We will predict the ocean proximity (`ocean_proximity` column) of Californian districts, given a number of features from these districts.

**The unit of analysis is a DISTRICT**

In [1]:
# Common imports
import numpy as np
import pandas as pd
import tensorflow as tf

# Get the data

In [2]:
housing = pd.read_csv("housing.csv")
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
#Drop the missing values
housing.dropna(axis=0, inplace=True)

# Let's also reset the index
housing.reset_index(inplace=True, drop=True)


housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


# Prepare the data for Machine Learning algorithms

In [4]:
#Set the training and test data sets
housing_num = housing.drop("ocean_proximity", axis=1) # drop labels 

#Select the label
housing_target = housing[["ocean_proximity"]]

### Standardize the data

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

housing_num_std = scaler.fit_transform(housing_num)

In [6]:
housing_num_std

array([[-1.32731375,  1.05171726,  0.98216331, ..., -0.97683327,
         2.34516291,  2.12881864],
       [-1.32232256,  1.04235526, -0.60621017, ...,  1.67037262,
         2.33263161,  1.31362603],
       [-1.33230494,  1.03767426,  1.85576873, ..., -0.84342665,
         1.78293943,  1.25818254],
       ...,
       [-0.82320322,  1.77727236, -0.92388486, ..., -0.17377773,
        -1.14317103, -0.99247676],
       [-0.87311515,  1.77727236, -0.84446619, ..., -0.39350628,
        -1.05513604, -1.05831591],
       [-0.83318561,  1.74918635, -1.00330353, ...,  0.07995643,
        -0.78060586, -1.01759959]])

In [7]:
housing_num_std.shape

(20433, 9)

### Convert back to a dataframe

In [8]:
housing_num_std_df = pd.DataFrame(housing_num_std, columns=housing_num.columns)

housing_num_std_df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-1.327314,1.051717,0.982163,-0.803813,-0.970325,-0.97332,-0.976833,2.345163,2.128819
1,-1.322323,1.042355,-0.60621,2.04213,1.348276,0.861339,1.670373,2.332632,1.313626
2,-1.332305,1.037674,1.855769,-0.535189,-0.825561,-0.819769,-0.843427,1.782939,1.258183
3,-1.337296,1.037674,1.855769,-0.62351,-0.718768,-0.765056,-0.733562,0.93297,1.164622
4,-1.337296,1.037674,1.855769,-0.46197,-0.611974,-0.758879,-0.62893,-0.013143,1.172418


### Create the label column

Tensorflow wants the labels in integer form. So, we need to do Ordinal Encoding, then convert the numbers to integers.

In [9]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

housing_labels_ord = ordinal_encoder.fit_transform(housing_target)

housing_labels_ord[:10]

array([[3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.]])

In [10]:
# Data type is float. It needs to be integer
housing_labels_ord.dtype

dtype('float64')

In [11]:
#Convert to integer

housing_labels_int = housing_labels_ord.astype(int)

housing_labels_int.dtype

dtype('int64')

In [12]:
#We need to convert this to a 1-D array using RAVEL

housing_labels_int_1d = np.ravel(housing_labels_int)

housing_labels_int_1d

array([3, 3, 3, ..., 1, 1, 1])

# Split data (train/test)

In [13]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(housing_num_std_df, housing_labels_int_1d, test_size=0.3)

# Create feature columns

TensorFlow needs "feature columns" as a bridge between the data and the estimator. These involve columns names and data types only. 

In [14]:
longitude1= tf.feature_column.numeric_column('longitude')
latitude1= tf.feature_column.numeric_column('latitude')
housing_median_age1= tf.feature_column.numeric_column('housing_median_age')
total_rooms1= tf.feature_column.numeric_column('total_rooms')
total_bedrooms1= tf.feature_column.numeric_column('total_bedrooms')
population1= tf.feature_column.numeric_column('population')
households1= tf.feature_column.numeric_column('households')
median_income1= tf.feature_column.numeric_column('median_income')
median_house_value1= tf.feature_column.numeric_column('median_house_value')

feat_columns = [longitude1, 
                latitude1,
               housing_median_age1,
               total_rooms1,
               total_bedrooms1,
               population1,
               households1,
               median_income1,
               median_house_value1
               ]

In [15]:
feat_columns

[NumericColumn(key='longitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='latitude', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='housing_median_age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='total_rooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='total_bedrooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='population', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='households', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='median_income', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='median_house_value', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

## More sophisticated feature columns:

You can define feature columns using different ways, depending on whether they are categorical, binned, one-hot-encoding, crossed, etc. <br>Refer to: https://www.tensorflow.org/guide/feature_columns

# Multiclass classification



In [16]:
#Define the model

tf.reset_default_graph()

dnn_clf = tf.estimator.DNNClassifier(hidden_units=[50, 25, 10], 
                                     n_classes=5, 
                                     feature_columns=feat_columns) 


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yr/h7yx6m314m76y6r1650t9xkh0000gp/T/tmpxnxccr4v', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2b32f320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [17]:
#Read data from tensor into memory

def training_input_fn ():
    dataset = tf.data.Dataset.from_tensor_slices((dict(train_x), train_y))
    dataset = dataset.shuffle(500).repeat().batch(100)
    dataset_iterator = dataset.make_one_shot_iterator()   
    return dataset_iterator.get_next()

# create testing input function
def valid_input_fn ():
    dataset = tf.data.Dataset.from_tensor_slices((dict(test_x), test_y))
    dataset = dataset.batch(100)
    dataset_iterator = dataset.make_one_shot_iterator()   
    return dataset_iterator.get_next()


In [18]:
#Train the model

dnn_clf.train(input_fn = lambda:training_input_fn(),steps=10000)

Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/yr/h7yx6m314m76y6r1650t9xkh0000gp/T/tmpxnxccr4v/model.ckpt.
INFO:tensorflow:loss = 171.32, step = 1
INFO:tensorflow:global_step/sec: 286.518
INFO:tensorflow:loss = 51.594604, step = 101 (0.350 sec)
INFO:tensorflow:global_step/sec: 460.035
INFO:tensorflow:loss = 43.19963, step = 201 (0.217 sec)
INFO:tensorflow:global_step/sec: 455.529
INFO:tensorflow:loss = 31.037985, step = 301 (0.220 sec)
INFO:tensorflow:global_step/sec: 465.735
INFO:tensorflow:loss = 26.55597, step = 401 (0.215 sec)
INFO:tensorflow:global_step/sec: 436.79
INFO:tensorflow:loss = 31.695572, step = 501 (0.229 se

INFO:tensorflow:loss = 22.342915, step = 7401 (0.273 sec)
INFO:tensorflow:global_step/sec: 364.215
INFO:tensorflow:loss = 16.823505, step = 7501 (0.274 sec)
INFO:tensorflow:global_step/sec: 392.651
INFO:tensorflow:loss = 19.227583, step = 7601 (0.255 sec)
INFO:tensorflow:global_step/sec: 396.489
INFO:tensorflow:loss = 18.692703, step = 7701 (0.252 sec)
INFO:tensorflow:global_step/sec: 391.854
INFO:tensorflow:loss = 14.808048, step = 7801 (0.255 sec)
INFO:tensorflow:global_step/sec: 393.194
INFO:tensorflow:loss = 28.748566, step = 7901 (0.254 sec)
INFO:tensorflow:global_step/sec: 402.434
INFO:tensorflow:loss = 21.649973, step = 8001 (0.248 sec)
INFO:tensorflow:global_step/sec: 392.09
INFO:tensorflow:loss = 21.891212, step = 8101 (0.255 sec)
INFO:tensorflow:global_step/sec: 393.159
INFO:tensorflow:loss = 14.768515, step = 8201 (0.254 sec)
INFO:tensorflow:global_step/sec: 404.253
INFO:tensorflow:loss = 16.421707, step = 8301 (0.247 sec)
INFO:tensorflow:global_step/sec: 410.371
INFO:tensor

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x1a2b313f28>

In [19]:
#Evaluate the model

dnn_clf.evaluate(input_fn = lambda:valid_input_fn())

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-29T05:23:09Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /var/folders/yr/h7yx6m314m76y6r1650t9xkh0000gp/T/tmpxnxccr4v/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-10-29-05:23:09
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.9309951, average_loss = 0.17250624, global_step = 10000, loss = 17.055859
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10000: /var/folders/yr/h7yx6m314m76y6r1650t9xkh0000gp/T/tmpxnxccr4v/model.ckpt-10000


{'accuracy': 0.9309951,
 'average_loss': 0.17250624,
 'global_step': 10000,
 'loss': 17.055859}

# Classification Matrix

In [20]:
y_train_pred = list(dnn_clf.predict(input_fn = valid_input_fn))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/yr/h7yx6m314m76y6r1650t9xkh0000gp/T/tmpxnxccr4v/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [21]:
#y_train_pred is a dictionary. We only need the "classes" from it

y_train_pred

predictions = [p['class_ids'][0] for p in y_train_pred]

In [22]:
confusion_matrix = tf.confusion_matrix(list(test_y), predictions)


Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [23]:
with tf.Session():
    print('\nConfusion Matrix:\n', tf.Tensor.eval(confusion_matrix,feed_dict=None, session=None))


Confusion Matrix:
 [[2565   54    0   10   78]
 [  59 1921    0    7    0]
 [   0    0    0    0    1]
 [  22    8    0  631    3]
 [ 138    0    0   43  590]]


# Optimizers, Learning rate, Dropout & Activation functions

In [24]:
# Select from one of the below optimizers

tf.reset_default_graph()

# Set the learning rate:
lr = 0.001

# Set the dropout rate:
# Note: in this case, dropout is reducing the accuracy
dropout = 0.1


# Available optimizers:
momentum = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
nesterov = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9, use_nesterov=True)
adagrad =  tf.train.AdagradOptimizer(learning_rate=lr)
adam = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999, epsilon=1e-08)
gd = tf.train.GradientDescentOptimizer(learning_rate=lr)
rmsprop = tf.train.RMSPropOptimizer(learning_rate=lr, decay=0.9, momentum=0.9, epsilon=1e-10)


# Activation functions:
relu = tf.nn.relu
leaky_relu = tf.nn.leaky_relu
elu = tf.nn.elu
tanh = tf.math.tanh
sigmoid = tf.math.sigmoid


#Select an the optimizer and activation function in the definition below:
dnn_clf = tf.estimator.DNNClassifier(hidden_units = [50, 25, 10], 
                                     n_classes = 5, 
                                     feature_columns = feat_columns,
                                     optimizer = nesterov,
                                     activation_fn = elu,
                                     dropout = dropout) 

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yr/h7yx6m314m76y6r1650t9xkh0000gp/T/tmpif9wmp_s', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2bbc2f28>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [25]:
#Train the model

dnn_clf.train(input_fn = lambda:training_input_fn(),steps=10000)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/yr/h7yx6m314m76y6r1650t9xkh0000gp/T/tmpif9wmp_s/model.ckpt.
INFO:tensorflow:loss = 159.63231, step = 1
INFO:tensorflow:global_step/sec: 254.735
INFO:tensorflow:loss = 63.059113, step = 101 (0.394 sec)
INFO:tensorflow:global_step/sec: 450.732
INFO:tensorflow:loss = 49.529594, step = 201 (0.222 sec)
INFO:tensorflow:global_step/sec: 423.912
INFO:tensorflow:loss = 49.416534, step = 301 (0.236 sec)
INFO:tensorflow:global_step/sec: 452.225
INFO:tensorflow:loss = 30.10559, step = 401 (0.221 sec)
INFO:tensorflow:global_step/sec: 440.674
INFO:tensorflow:loss = 42.458904, step = 501 (0.227 se

INFO:tensorflow:global_step/sec: 394.999
INFO:tensorflow:loss = 20.124397, step = 7701 (0.252 sec)
INFO:tensorflow:global_step/sec: 428.701
INFO:tensorflow:loss = 27.03337, step = 7801 (0.233 sec)
INFO:tensorflow:global_step/sec: 429.721
INFO:tensorflow:loss = 30.521267, step = 7901 (0.233 sec)
INFO:tensorflow:global_step/sec: 411.191
INFO:tensorflow:loss = 16.50904, step = 8001 (0.243 sec)
INFO:tensorflow:global_step/sec: 437.346
INFO:tensorflow:loss = 29.18869, step = 8101 (0.228 sec)
INFO:tensorflow:global_step/sec: 447.319
INFO:tensorflow:loss = 14.210391, step = 8201 (0.224 sec)
INFO:tensorflow:global_step/sec: 441.698
INFO:tensorflow:loss = 18.879475, step = 8301 (0.226 sec)
INFO:tensorflow:global_step/sec: 445.7
INFO:tensorflow:loss = 26.960052, step = 8401 (0.224 sec)
INFO:tensorflow:global_step/sec: 448.922
INFO:tensorflow:loss = 21.480227, step = 8501 (0.222 sec)
INFO:tensorflow:global_step/sec: 460.704
INFO:tensorflow:loss = 20.215006, step = 8601 (0.218 sec)
INFO:tensorflow

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x1a2bbc2da0>

In [None]:
dnn_clf.evaluate(input_fn = lambda:valid_input_fn())