In [0]:
## Classify structured data (tabular data in CSV)

# Load a CSV file using Pandas.
# Build an input pipeline to batch and shuffle the rows using tf.data.
# Map from columns in the CSV to features used to train the model using feature columns.
# Build, train, and evaluate a model using Keras.

# Predict whether a patient has heart disease

In [0]:
!pip install -q sklearn

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd

!pip install -q tensorflow==2.0.0-alpha0
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

[K     |████████████████████████████████| 79.9MB 458kB/s 
[K     |████████████████████████████████| 61kB 23.9MB/s 
[K     |████████████████████████████████| 3.0MB 31.5MB/s 
[K     |████████████████████████████████| 419kB 53.6MB/s 
[?25h

In [9]:
# Read the heart CSV straight from its URL into a DataFrame, then show
# its (rows, columns) shape and a preview of the first rows.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(filepath_or_buffer=URL)
print(dataframe.shape)
dataframe.head()

(303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [10]:
## Split the dataframe into train, val, test

# Fix the seed so the three partitions (and every metric derived from them)
# are identical on each Restart & Run All.
SEED = 42

# Hold out 20% of all rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataframe, test_size=0.2, random_state=SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SEED)

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

193 train examples
49 validation examples
61 test examples


In [0]:
## Create an input pipeline using tf.data

# Wrap the dataframes with tf.data.
# This lets us use feature columns as a bridge to map
# from the columns in the Pandas dataframe to the features
# used to train the model.

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a Pandas DataFrame into a batched tf.data.Dataset.

    The caller's frame is not modified: a copy is taken and the 'target'
    column is popped from that copy to serve as the labels.

    Args:
        dataframe: frame holding the feature columns plus a 'target' column.
        shuffle: when truthy, shuffle the rows using a buffer as large as
            the frame before batching.
        batch_size: number of rows per emitted batch.

    Returns:
        A dataset of (features, labels) batches, where features is a dict
        keyed by column name.
    """
    features = dataframe.copy()
    targets = features.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))

    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [0]:
# Build the three datasets in batches of 5 rows.
# Only the training split is shuffled; validation and test keep their row order.
batch_size = 5
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

In [18]:
## Understand the input pipelines

# take(1) limits iteration to a single batch: its argument is a count of
# elements to take, not an index.
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature: ', list(feature_batch))
    print('A batch of ages: ', feature_batch['age'])
    print('A batch of target: ', label_batch)

# Each element of the dataset is a pair: a dictionary keyed by the dataframe's
# column names, whose values are batches of that column's row values, and a
# batch of the matching labels.

Every feature:  ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages:  tf.Tensor([41 56 66 67 62], shape=(5,), dtype=int32)
A batch of target:  tf.Tensor([0 0 1 1 1], shape=(5,), dtype=int32)


In [0]:
## Demonstrate several types of feature column

# Pull one (features, labels) batch from the training dataset and keep only
# the features dict (index 0). This batch is what `demo` feeds through each
# feature column in the cells that follow.
example_batch = next(iter(train_ds))[0]

In [0]:
# A utility method to create a feature column
# and to transform a batch of data

def demo(feature_column):
    """Print what a feature column produces for the shared example batch.

    Wraps the given column in a DenseFeatures layer, applies it to the
    notebook-level `example_batch`, and prints the result as a NumPy array.

    Note: inside this function the parameter name shadows the imported
    `feature_column` module.

    Args:
        feature_column: the feature column to demonstrate.
    """
    transformed = layers.DenseFeatures(feature_column)(example_batch)
    print(transformed.numpy())

In [21]:
# Represent 'age' as a numeric feature column.
# Whatever a feature column outputs is what the model receives as input.
age = feature_column.numeric_column(key='age')
demo(age)

W0512 11:16:53.085058 139800794097536 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:2758: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


[[41.]
 [56.]
 [66.]
 [67.]
 [62.]]


In [27]:
# Rather than feeding age to the model as a single number, a bucketized
# column one-hot encodes which age range (delimited by `boundaries`) each
# value falls into.
boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
age_buckets = feature_column.bucketized_column(
    source_column=age, boundaries=boundaries)
demo(age_buckets)

W0512 11:20:04.100256 139800794097536 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:2902: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]


In [28]:
# In this dataset, thal holds strings (e.g. 'fixed', 'normal', or 'reversible'),
# and strings cannot be fed directly to a model.
# A vocabulary-list categorical column wrapped in an indicator column
# represents each string as a one-hot vector.
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal', vocabulary_list=['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(categorical_column=thal)
demo(thal_one_hot)

W0512 11:22:35.660559 139800794097536 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4307: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0512 11:22:35.661881 139800794097536 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4362: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [29]:
## Embedding columns
# Suppose that, instead of just a few possible strings, a category has
# thousands (or more) of values. For a number of reasons, as the number of
# categories grows large, it becomes infeasible to train a neural network
# using one-hot encodings.
# An embedding column instead represents that data as a lower-dimensional,
# dense vector in which each cell can contain any number, not just 0 or 1.

thal_embedding = feature_column.embedding_column(
    categorical_column=thal, dimension=8)
demo(thal_embedding)

[[-0.19893801  0.28358492  0.14529227 -0.23582691  0.26606533 -0.05954123
   0.17855044 -0.20925531]
 [-0.01416214  0.4585956  -0.03341193  0.26013473 -0.285252    0.14049746
  -0.56033534 -0.47582766]
 [-0.01416214  0.4585956  -0.03341193  0.26013473 -0.285252    0.14049746
  -0.56033534 -0.47582766]
 [-0.2588154   0.05537464 -0.01283331 -0.0233634   0.21912993 -0.5541235
  -0.09415239 -0.19263671]
 [-0.19893801  0.28358492  0.14529227 -0.23582691  0.26606533 -0.05954123
   0.17855044 -0.20925531]]


In [31]:
## Hashed feature columns
# A categorical_column_with_hash_bucket is another way to represent a
# categorical column that has a large number of values.
# The column computes a hash of the input and uses it to pick one of the
# hash_bucket_size buckets to encode the string.
# Key point: the important downside is that collisions can occur, i.e.
# different strings may be mapped to the same bucket.
# In practice, this can still work well for some datasets.

thal_hashed = feature_column.categorical_column_with_hash_bucket(
    key='thal', hash_bucket_size=1000)
demo(feature_column.indicator_column(categorical_column=thal_hashed))

W0512 11:30:36.917069 139800794097536 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4362: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [32]:
## Crossed feature columns
# A feature cross combines several features into a single feature, which
# lets a model learn separate weights for each combination of features.
# Here the new feature is the cross of age (bucketized) and thal.
# Note that crossed_column does not build the full table of all possible
# combinations (which could be very large). Instead, it is backed by a
# hashed column, so hash_bucket_size controls how large the table is.
crossed_feature = feature_column.crossed_column(
    keys=[age_buckets, thal], hash_bucket_size=1000)
demo(feature_column.indicator_column(categorical_column=crossed_feature))

W0512 11:33:23.722586 139800794097536 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/feature_column/feature_column_v2.py:4362: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [0]:
## Choose which columns to use
# The list is filled in this order: numeric columns, bucketized age,
# one-hot thal, embedded thal, and finally the age x thal cross.
feature_columns = []

# Numeric columns: one per listed header.
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# Bucketized column: reuses the numeric `age` column defined in an earlier cell.
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# Categorical thal column, exposed both as a one-hot indicator and as an
# 8-dimensional embedding.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
thal_embedding = feature_column.embedding_column(thal, dimension=8)

# Crossed column: age bucket x thal hashed into 1000 buckets, then wrapped
# in an indicator column so it can be fed to a dense layer.
crossed_feature = feature_column.indicator_column(
    feature_column.crossed_column([age_buckets, thal],
                                  hash_bucket_size=1000))

feature_columns.extend(
    [age_buckets, thal_one_hot, thal_embedding, crossed_feature])

In [0]:
# Create a feature layer
# With the feature columns defined, a DenseFeatures layer is what feeds
# them into the Keras model.
feature_layer = layers.DenseFeatures(feature_columns)

In [0]:
# Rebuild the three datasets in batches of 32 rows for training.
# Only the training split is shuffled; validation and test keep their row order.
batch_size = 32
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size, shuffle=False)
test_ds = df_to_dataset(test, batch_size=batch_size, shuffle=False)

In [0]:
# Feature layer, then two 128-unit ReLU hidden layers, then a single
# sigmoid unit for the binary target.
model = tf.keras.Sequential(
    [
        feature_layer,
        layers.Dense(units=128, activation='relu'),
        layers.Dense(units=128, activation='relu'),
        layers.Dense(units=1, activation='sigmoid'),
    ]
)

In [0]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [40]:
# Train for 5 epochs on the training batches, evaluating on the validation
# batches after each epoch.
model.fit(train_ds, validation_data=val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f25a946f550>

In [41]:
# Score the trained model on the held-out test batches; with a single
# compiled metric, evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=test_ds)
print('Accuracy', accuracy)

Accuracy 0.78688526
