In [0]:
# - Load a CSV file using Pandas
# - Build an input pipeline to batch and shuffle the rows using tf.data
# - Map from columns in the CSV to features used to train the model using feature column
# - Build, train, and evaluate a model using Keras.

# We use Cleveland Clinic Foundation for Heart Disease.
# Each row describeas a patient
# Each column describes an attribute.
# There are both numeric and categorical columns

In [0]:
!pip install -q sklearn

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
import pandas as pd
from tensorflow import feature_column
from tensorflow.keras import layers

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.test.is_gpu_available() else "NOT AVAILABLE")


TensorFlow 2.x selected.
Version:  2.0.0-rc0
Eager mode:  True
Hub version:  0.6.0
GPU is available


In [4]:
# Use pandas to create a dataframe
# Pandas is a Python library with many helpful utilities for loading and working with
# structured data. We will use Pandas to download the dataset from a URL, and load it into dataframe.

URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [5]:
from sklearn.model_selection import train_test_split

# Split the dataframe into train, validation, and test
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

print(len(train), 'train examples')
print(len(val), 'validaiton examples')
print(len(test), 'test examples')

193 train examples
49 validaiton examples
61 test examples


In [6]:
train.shape

(193, 14)

In [7]:
print(type(train))

<class 'pandas.core.frame.DataFrame'>


In [8]:
train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
78,62,0,4,124,209,0,0,163,0,0.0,1,0,normal,0
111,54,0,3,110,214,0,0,158,0,1.6,2,0,normal,0
273,46,0,3,142,177,0,2,160,1,1.4,3,0,normal,0
146,59,1,1,160,273,0,2,125,0,0.0,1,0,normal,0
258,51,1,3,110,175,0,0,123,0,0.6,1,0,normal,0


In [9]:
dict(dataframe)

{'age': 0      63
 1      67
 2      67
 3      37
 4      41
 5      56
 6      62
 7      57
 8      63
 9      53
 10     57
 11     56
 12     56
 13     44
 14     52
 15     57
 16     48
 17     54
 18     48
 19     49
 20     64
 21     58
 22     58
 23     58
 24     60
 25     50
 26     58
 27     66
 28     43
 29     40
        ..
 273    46
 274    58
 275    54
 276    54
 277    60
 278    60
 279    54
 280    59
 281    46
 282    59
 283    60
 284    52
 285    48
 286    45
 287    34
 288    57
 289    71
 290    49
 291    54
 292    59
 293    57
 294    61
 295    39
 296    61
 297    56
 298    52
 299    43
 300    65
 301    48
 302    63
 Name: age, Length: 303, dtype: int64, 'ca': 0      0
 1      3
 2      2
 3      0
 4      0
 5      0
 6      2
 7      0
 8      1
 9      0
 10     0
 11     0
 12     1
 13     0
 14     0
 15     0
 16     0
 17     0
 18     0
 19     0
 20     0
 21     0
 22     0
 23     2
 24     2
 25     0
 26     0
 27     

In [0]:
# Create an input pipeline using tf.data
# Wrap the dataframes with tf.data
# This will enable to sue feature columns as a bridge to map from the columns in the Pandas dataframe
# to features used to train the model. If we were working with a very large CSV file
# so large that it does not fit into memory, use tf.data to read it from dist directly.

# - Enable to use feature columns as a bridge to map from columns in the Pandas dataframe 
# to features used to train the model.
# - Enable working with large CSV file directly disk.

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [0]:
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [0]:
# Demonstrate several types of feature column
# how transfomr a column from the dataframe.
# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]

In [13]:
example_batch

{'age': <tf.Tensor: id=108, shape=(5,), dtype=int32, numpy=array([57, 53, 65, 58, 66], dtype=int32)>,
 'ca': <tf.Tensor: id=109, shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 1], dtype=int32)>,
 'chol': <tf.Tensor: id=110, shape=(5,), dtype=int32, numpy=array([335, 234, 269, 283, 278], dtype=int32)>,
 'cp': <tf.Tensor: id=111, shape=(5,), dtype=int32, numpy=array([4, 4, 3, 1, 3], dtype=int32)>,
 'exang': <tf.Tensor: id=112, shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 0], dtype=int32)>,
 'fbs': <tf.Tensor: id=113, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 1, 0], dtype=int32)>,
 'oldpeak': <tf.Tensor: id=114, shape=(5,), dtype=float64, numpy=array([3. , 0. , 0.8, 1. , 0. ])>,
 'restecg': <tf.Tensor: id=115, shape=(5,), dtype=int32, numpy=array([0, 2, 0, 2, 2], dtype=int32)>,
 'sex': <tf.Tensor: id=116, shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 0], dtype=int32)>,
 'slope': <tf.Tensor: id=117, shape=(5,), dtype=int32, numpy=array([2, 1, 1, 1, 2], dtype=int32)>,
 'tha

In [0]:
# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

In [15]:
age = feature_column.numeric_column('age')
demo(age)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[57.]
 [53.]
 [65.]
 [58.]
 [66.]]


In [16]:
# Bucketized columns
# Split its values into different categories based on numerical ranges.
age_buckets = feature_column.bucketized_column(age, 
                                               boundaries=[
                                                   18, 25, 30, 35, 40, 45, 50, 
                                                   55, 60, 65])
demo(age_buckets)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [17]:
# Categorical columns
# thla is represented as a string(e.g. 'fixed', 'normal', or 'reversible').
# We cannot feed strings directly to a model. instead, We must first map them to
# numerical values.
# The categorical vocabulary columns provide a way to represent strings as a
# one-hot vector.
thal= feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [18]:
# Embedding columns
# Suppose instead of having just a few possible strings,
# we have thousands value per category.
# It becomes infeasible to train a neural network using one-hot encoding.
# embedding column represents that data as a lower-dimensional, dense vector
# in which each cell can contain any number not just o or 1.

thal_embeding = feature_column.embedding_column(thal, dimension=8)
demo(thal_embeding)

Instructions for updating:
Please use `layer.add_weight` method instead.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
[[ 0.3046827   0.07038603  0.27071437 -0.11826431 -0.2586297  -0.08367944
   0.4283494  -0.45772415]
 [ 0.40438667 -0.34478182  0.53307986  0.4064149   0.02908956 -0.02926443
   0.31119844 -0.21635504]
 [ 0.40438667 -0.34478182  0.53307986  0.4064149   0.02908956 -0.02926443
   0.31119844 -0.21635504]
 [ 0.40438667 -0.34478182  0.53307986  0.4064149   0.02908956 -0.02926443
   0.31119844 -0.21635504]
 [ 0.40438667 -0.34478182  0.53307986  0.4064149   0.02908956 -0.02926443
   0.31119844 -0.21635504]]


In [19]:
# Hashed feature columns
# Another way to represent a categorical column with a large number of values
#This calculates a hash value of the input, then selects one of the hash_bucket_size
# buckets to encode string.
# Not need to provide the vocabulary.
# There may be collisions in which different strings are mappe to the same bucket.

thal_hashed = feature_column.categorical_column_with_hash_bucket(
        'thal', hash_bucket_size=1000)
demo(feature_column.indicator_column(thal_hashed))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [20]:
# Crossed feature column
# Combining features into a single featue known as feature cross.
# Note that crossed_column does not build the full table of all possible
# combinations (which could be very large).
# Instead, it is backed by a hashed_column, so you can choose how large
# table is.
crossed_feature = feature_column.crossed_column([age_buckets, thal], 
                                                hash_bucket_size=1000)
demo(feature_column.indicator_column(crossed_feature))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [0]:
# Now we use feature columns to train a model.

# key point: If the aim is to build an accurate model, try a large dataaset, and
# think carefully about which features are the most meaningful to include, and
# how they should be represented

feature_columns = []

# numeric cols
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# bucketized cols
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# indicator cols
thal = feature_column.categorical_column_with_vocabulary_list(
      'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# embedding cols
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# crossed cols
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

In [0]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [0]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [0]:
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

In [0]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'],
              run_eagerly=True)

In [26]:
model.fit(train_ds,
          validation_data=val_ds,
          epochs=5)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f49557baac8>

In [27]:
loss, accuracy = model.evaluate(test_ds)
print('Accuracy', accuracy)

Accuracy 0.6393443


In [0]:
# When working with a small dataset, recommended using a decision tree or
# ranodm forest as a string baseline.