In [1]:
# How to classify structured data
# 1/ Load CSV using Pandas
# 2/ Pipeline to batch and shuffle examples
# 3/ Map columns to features used to train the model
# 4/ Build, train, evaluate the model

# Dataset: Heart Disease from Cleveland Clinic
# Predict whether a patient has heart disease (binary classification)

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Helper libraries
import numpy as np
import pandas as pd # For loading, working with structured data

# TF imports
import tensorflow as tf
from tensorflow import feature_column # Map CSV column to features for the model
from tensorflow.keras import layers

# import module from sklearn
from sklearn.model_selection import train_test_split

In [4]:
### Load the dataset in a dataframe

URL = 'https://storage.googleapis.com/applied-dl/heart.csv'

dataframe = pd.read_csv(URL)
print('Dataframe shape: ', dataframe.shape)
# Display the first 5 rows of the dataframe
dataframe.head()

Dataframe shape:  (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [6]:
# Split data into train/dev/test sets

# First, get the training/test set (80/20 split)
train, test = train_test_split(dataframe, test_size=0.2) #Split the dataframe into random train and test subsets
print('Test set shape: ', test.shape)
# Next, split the training set into training/dev set(80/20 split)
train, dev = train_test_split(train, test_size=0.2)
print('Dev set shape: ', dev.shape)
print('Training set shape: ', train.shape)

Test set shape:  (61, 14)
Dev set shape:  (49, 14)
Training set shape:  (193, 14)


In [11]:
# Let's create a tf.Dataset from the dataframes

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    ''' Convert a dataframe into a tf.Dataset
    '''
    # Work on a copy of the dataframe
    dataframe = dataframe.copy()
    
    # Get the output/target column and drop it from the dataframe
    labels = dataframe.pop('target')
    
    # Perform the conversion
    dataset = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size = len(dataframe))
    # Add a batch size dimension to the dataset
    dataset = dataset.batch(batch_size)
    
    return dataset
    

In [21]:
# Let's convert the dataframes to datasets

batch_size = 5

train_dataset = df_to_dataset(train, batch_size=batch_size)
dev_dataset = df_to_dataset(dev, shuffle=False, batch_size=batch_size)
test_dataset = df_to_dataset(test, shuffle=False, batch_size=batch_size)
# The above datasets are tuples

print('1st mini-batch in training dataset: ')
mini_batch = next(iter(train_dataset))
mini_batch_x = mini_batch[0]
mini_batch_y = mini_batch[1]

print('Input: ')
for key in mini_batch_x:
    print('%r: %r' %(key, mini_batch_x[key].numpy()))
print('Target: ')
for key in mini_batch_y:
    print('%r: %r' %(key, mini_batch_y[key].numpy()))

1st mini-batch in training dataset: 
Input: 
'age': array([49, 51, 38, 66, 55], dtype=int32)
'sex': array([1, 0, 1, 1, 1], dtype=int32)
'cp': array([3, 3, 1, 4, 4], dtype=int32)
'trestbps': array([120, 140, 120, 112, 160], dtype=int32)
'chol': array([188, 308, 231, 212, 289], dtype=int32)
'fbs': array([0, 0, 0, 0, 0], dtype=int32)
'restecg': array([0, 2, 0, 2, 2], dtype=int32)
'thalach': array([139, 142, 182, 132, 145], dtype=int32)
'exang': array([0, 0, 1, 1, 1], dtype=int32)
'oldpeak': array([2. , 1.5, 3.8, 0.1, 0.8])
'slope': array([2, 1, 2, 1, 2], dtype=int32)
'ca': array([3, 1, 0, 1, 1], dtype=int32)
'thal': array([b'reversible', b'normal', b'reversible', b'normal', b'reversible'],
      dtype=object)
Target: 
<tf.Tensor: id=997, shape=(), dtype=int32, numpy=1>: 0
<tf.Tensor: id=1007, shape=(), dtype=int32, numpy=0>: 1
<tf.Tensor: id=1017, shape=(), dtype=int32, numpy=1>: 0
<tf.Tensor: id=1027, shape=(), dtype=int32, numpy=1>: 0
<tf.Tensor: id=1037, shape=(), dtype=int32, numpy=1>

In [22]:
# Let's explore the datasets

#.take(count) : Creates a Dataset with at most count elements from this dataset.
for features, labels in train_dataset.take(1):
    print('Features: ', list(features.keys()))
    print('1st mini-batch of ages: ', features['age'])
    print('1st mini-batch outputs: ', labels)

Features:  ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
1st mini-batch of ages:  tf.Tensor([42 55 62 50 51], shape=(5,), dtype=int32)
1st mini-batch outputs:  tf.Tensor([0 1 0 0 1], shape=(5,), dtype=int32)


In [39]:
# Let's create several type of feature column

# Think of feature columns as the intermediaries between raw data and Estimators.
# Feature columns are very rich, enabling you to transform a diverse range of raw data
# into formats that Estimators can use, allowing easy experimentation.

example_batch = next(iter(train_dataset))[0] # dict of (feature, values) pairs

def demo(feature_column):
    ''' Given a feature_column, create a appropriate layer to transform the raw data.
        Perform these transformation on an batch example.
    '''
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())
    return feature_layer(example_batch).numpy()
    

In [46]:
# The output of a feature_column becomes the input to the model

### Numeric column
# Simplest type of column, the model will receive the same column value from the dataframe unchanged

# Create a numeric column called 'age'
age = feature_column.numeric_column('age')
demo(age)
print('-'*30)

### Bucketized column
#Usually you don't want to pass the numbers directly to the model, but instead
# split its value into different categories based on numerical ranges

# boundaries defines the numerical ranges
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
demo(age_buckets) # one-hot representation
print('-'*30)

### Categorical column
# We cannot feed a string to a model. Map them to numeric values using a one-hot vector
# (A specific dictionnary can be passed in desired)

thal = feature_column.categorical_column_with_vocabulary_list('thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)
print('-'*30)

### Embedding columns
# 'Improved' categorical column: What is the dictionary size is huge -> One-hot representation is too sparse
# -> Use embedding column (dense vector)

# Use the same categorical column as before
thal_embedding = feature_column.embedding_column(thal, dimension=8) # dimension is the size of the embedding vector
demo(thal_embedding)
print('-'*30)


### Hashed feature columns
# Another way to represent categorical column(e.g. string) with a large number of values
# Compute the hash of the input, then encode the string using the 'hash_bucket_size'-dimensional one hot representation
# No need to provide the dictionary, (careful collision).
thal_bis = feature_column.categorical_column_with_hash_bucket('thal', hash_bucket_size = 1000)
thal_hashed = feature_column.indicator_column(thal_bis)
np.set_printoptions(threshold=10)
demo(thal_hashed)
print('-'*30)


### Crossed feature columns
# Combine features into a single feature -> Model can learn weight for each combination
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size = 1000)
demo(feature_column.indicator_column(crossed_feature))
print('-'*30)

W0906 12:28:00.533187 140735803462528 base_layer.py:1772] Layer dense_features_65 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

W0906 12:28:00.541524 140735803462528 base_layer.py:1772] Layer dense_features_66 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the laye

[[60.]
 [58.]
 [59.]
 [58.]
 [46.]]
------------------------------
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
------------------------------
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
------------------------------
[[-0.05341416 -0.34452724  0.02919222 -0.07340377  0.06715149 -0.34583968
   0.34234476  0.22445856]
 [-0.05341416 -0.34452724  0.02919222 -0.07340377  0.06715149 -0.34583968
   0.34234476  0.22445856]
 [-0.05341416 -0.34452724  0.02919222 -0.07340377  0.06715149 -0.34583968
   0.34234476  0.22445856]
 [-0.05341416 -0.34452724  0.02919222 -0.07340377  0.06715149 -0.34583968
   0.34234476  0.22445856]
 [-0.6288425   0.27460635 -0.2318049  -0.1205133   0.08272018 -0.16477418
   0.10479602  0.03200069]]
------------------------------
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0.

In [52]:
### Choose which column to use
# Select a few columns to train the model arbitrarily
# The corresponding Keras layers will be created afterwards.

feature_columns = []

## Numerical columns
for feature in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(feature))
    
# Bucketized columns
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# Indicator columns
thal = feature_column.categorical_column_with_vocabulary_list('thal',
                                                              ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding columns
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed columns
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size = 1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

In [53]:
# Now that we have defined all our feature_columns, create the appropriate Keras layer

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Create a new input pipeline with a larger mini-batch size
batch_size = 32
train_dataset = df_to_dataset(train, batch_size = batch_size)
dev_dataset = df_to_dataset(dev, shuffle = False, batch_size = batch_size)
test_dataset = df_to_dataset(test, shuffle = False, batch_size = batch_size)

In [65]:
### Create, compile and train the while model

# FEATURE_LAYER -> FC(128) -> RELU -> FC(128) -> RELU -> FC(1) -> SIGMOID

model = tf.keras.Sequential([feature_layer,
                             layers.Dense(128, activation = 'relu'),
                             layers.Dense(128, activation = 'relu'),
                             layers.Dense(1, activation = 'sigmoid')
])

model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'],
              run_eagerly = True)

model.fit(train_dataset,
          validation_data = dev_dataset,
          epochs = 20)

W0906 12:52:17.833874 140735803462528 base_layer.py:1772] Layer sequential_6 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1313fab70>

In [66]:
# Evaluate the model on the test set

test_loss, test_accuracy = model.evaluate(test_dataset)
print('Test Loss: ', test_loss)
print('Test accuracy: ', test_accuracy)

Test Loss:  4.9449591636657715
Test accuracy:  0.6721311
