In [69]:
### LOAD A Pandas dataframe INTO A tf.Dataset

# Data: Heart Disease from Cleveland clinic

In [70]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import tensorflow as tf

In [71]:
# Download the CSV file

csv_file = tf.keras.utils.get_file('heart.csv', 
                                   'https://storage.googleapis.com/applied-dl/heart.csv')
csv_file

'/Users/nicolas/.keras/datasets/heart.csv'

In [72]:
# Load the csv file in a dataframe

df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [73]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.594059,0.676568,3.108911,131.785479,246.547855,0.148515,0.990099,149.194719,0.326733,1.057756,1.590759,0.683168,0.273927
std,9.01637,0.46856,1.028414,17.748338,52.175933,0.356198,0.988293,23.173368,0.469794,1.165025,0.617767,0.937773,0.44671
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,0.0
25%,48.0,0.0,2.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,152.0,0.0,0.8,2.0,0.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,165.5,1.0,1.6,2.0,1.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,1.0


In [74]:
# Check the columns type
print(df.dtypes)
print('-'*30)

# thal need to be converted to a numerical value
df['thal'] = pd.Categorical(df['thal'])
print(df.dtypes)
print('-'*30)

# Use Pandas method for converting feature types
# Get the thal column with is of type: category.
# Then, replace the category by its corresponding numerical code value
df['thal'] = df.thal.cat.codes
print(df.dtypes)
print('-'*30)
print(df.head())

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal         object
target        int64
dtype: object
------------------------------
age            int64
sex            int64
cp             int64
trestbps       int64
chol           int64
fbs            int64
restecg        int64
thalach        int64
exang          int64
oldpeak      float64
slope          int64
ca             int64
thal        category
target         int64
dtype: object
------------------------------
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal           int8
target        int64
dtype: object
------------------------------
   age  sex

In [75]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,2,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,4,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [76]:
# Now convert the dataframe into a tf.Dataset

# tf.Dataset allows you to write simple and efficient datapipeline(map(), reduce(), feature_column)

# Extract the output/target column from the dataframe
target_column = df.pop('target')

# .values -> Only the values in the DataFrame will be returned, the axes labels will be removed.
dataset = tf.data.Dataset.from_tensor_slices((df.values, target_column.values))

In [89]:
print(dataset) # (nb_features, m)
print('-'*30)
for example in dataset.batch(16).take(1):
    print(example)

<TensorSliceDataset shapes: ((13,), ()), types: (tf.float64, tf.int64)>
------------------------------
(<tf.Tensor: id=73740, shape=(16, 13), dtype=float64, numpy=
array([[ 63. ,   1. ,   1. , 145. , 233. ,   1. ,   2. , 150. ,   0. ,
          2.3,   3. ,   0. ,   2. ],
       [ 67. ,   1. ,   4. , 160. , 286. ,   0. ,   2. , 108. ,   1. ,
          1.5,   2. ,   3. ,   3. ],
       [ 67. ,   1. ,   4. , 120. , 229. ,   0. ,   2. , 129. ,   1. ,
          2.6,   2. ,   2. ,   4. ],
       [ 37. ,   1. ,   3. , 130. , 250. ,   0. ,   0. , 187. ,   0. ,
          3.5,   3. ,   0. ,   3. ],
       [ 41. ,   0. ,   2. , 130. , 204. ,   0. ,   2. , 172. ,   0. ,
          1.4,   1. ,   0. ,   3. ],
       [ 56. ,   1. ,   2. , 120. , 236. ,   0. ,   0. , 178. ,   0. ,
          0.8,   1. ,   0. ,   3. ],
       [ 62. ,   0. ,   4. , 140. , 268. ,   0. ,   2. , 160. ,   0. ,
          3.6,   3. ,   2. ,   3. ],
       [ 57. ,   0. ,   4. , 120. , 354. ,   0. ,   0. , 163. ,   1. ,
         

In [78]:
# Let's look at the first 5 entries in the dataset

# .take(count) -> Creates a Dataset with at most count elements from this dataset.
for features, target in dataset.take(5):
    print('Features: {}, Target: {}'.format(features, target))

Features: [ 63.    1.    1.  145.  233.    1.    2.  150.    0.    2.3   3.    0.
   2. ], Target: 0
Features: [ 67.    1.    4.  160.  286.    0.    2.  108.    1.    1.5   2.    3.
   3. ], Target: 1
Features: [ 67.    1.    4.  120.  229.    0.    2.  129.    1.    2.6   2.    2.
   4. ], Target: 0
Features: [ 37.    1.    3.  130.  250.    0.    0.  187.    0.    3.5   3.    0.
   3. ], Target: 0
Features: [ 41.    0.    2.  130.  204.    0.    2.  172.    0.    1.4   1.    0.
   3. ], Target: 0


In [79]:
# Note that since pd.Series implements __array__ protocol, pd.Series can be a substitute to np.array

tf.constant(df['thal'])

<tf.Tensor: id=58905, shape=(303,), dtype=int32, numpy=
array([2, 3, 4, 3, 3, 3, 3, 3, 4, 4, 2, 3, 2, 4, 4, 3, 4, 3, 3, 3, 3, 3,
       3, 4, 4, 3, 3, 3, 3, 4, 3, 4, 3, 4, 3, 3, 4, 2, 4, 3, 4, 3, 4, 4,
       2, 3, 3, 4, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 4,
       4, 2, 3, 3, 4, 3, 4, 3, 3, 4, 4, 3, 3, 4, 4, 3, 3, 3, 3, 4, 4, 4,
       3, 3, 4, 3, 4, 4, 3, 4, 3, 3, 3, 4, 3, 4, 4, 3, 3, 4, 4, 4, 4, 4,
       3, 3, 3, 3, 4, 3, 4, 3, 4, 4, 3, 3, 2, 4, 4, 2, 3, 3, 4, 4, 3, 4,
       3, 3, 4, 2, 4, 4, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 3, 4, 3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3, 2,
       4, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 2, 4, 3, 4, 2, 4, 3,
       3, 4, 3, 3, 3, 3, 4, 3, 4, 3, 4, 2, 2, 4, 3, 4, 3, 2, 4, 3, 3, 2,
       4, 4, 4, 4, 3, 0, 3, 3, 3, 3, 1, 4, 3, 3, 3, 4, 3, 4, 3, 3, 3, 4,
       3, 3, 4, 4, 4, 4, 3, 3, 4, 3, 4, 3, 4, 4, 3, 4, 4, 3, 4, 4, 3

In [80]:
# Shuffle and batch the dataset

train_set = dataset.shuffle(len(df)).batch(1)
print(train_set)

<BatchDataset shapes: ((None, 13), (None,)), types: (tf.float64, tf.int64)>


In [81]:
### Build and train a model

def get_compiled_model():
    ''' Create and compile a model(optimizer, loss, metrics)
        FC(10) -> RELU -> FC(10) -> RELU -> FC(1) -> SIGMOID
    '''
    model = tf.keras.Sequential([tf.keras.layers.Dense(10, activation = 'relu'),
                                 tf.keras.layers.Dense(10, activation = 'relu'),
                                 tf.keras.layers.Dense(1, activation = 'sigmoid')])
    
    model.compile(optimizer = 'adam',
                  loss = 'binary_crossentropy',
                  metrics = ['accuracy'])
    
    return model

In [82]:
# Construct and train the model

model = get_compiled_model()
model.fit(train_set, epochs = 15)

W0908 14:51:02.991415 140735803462528 base_layer.py:1772] Layer sequential_4 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x12de84518>

In [83]:
# So far, we used feature_column to preprocess the features.
# An alternative to feature_column is to pass a dictionary

# inputs is a dictionnary of (feature, Input layer) pairs
inputs = {key: tf.keras.layers.Input(shape = (), name = key) for key in df.keys()}

# Take all the Input layers add stack them into one tensor
x = tf.stack(list(inputs.values()), axis = -1)
print(x)

# Forward pass x to a FC(10) -> RELU layer
x = tf.keras.layers.Dense(10, activation = 'relu')(x)

# Forward pass x to a FC(1) -> SIGMOID layer
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(x)

# Create the resulting Keras model
model_func = tf.keras.Model(inputs = inputs, outputs = output)

# Define optimizer, loss and metrics
model_func.compile(optimizer = 'adam',
                   loss = 'binary_crossentropy',
                   metrics = ['accuracy'])

Tensor("stack_6:0", shape=(None, 13), dtype=float32)


In [84]:
# We have used dataset = tf.data.Dataset.from_tensor_slices((df.values, target_column.values))
# to create a tf.Dataset from a dataframe

# The best way to preserve the column structure of the dataframe is to convert the dataframe to a dict

# ('list') argument -> dict like {column -> [values]}
dataset_from_dict = tf.data.Dataset.from_tensor_slices((df.to_dict('list'), target_column.values)).batch(16)


In [87]:
# Let's explore the dataset_from_dict
print(dataset)
print('-'*30)
for example in dataset_from_dict.take(1):
    print(example) # Preserve feature name

<TensorSliceDataset shapes: ((13,), ()), types: (tf.float64, tf.int64)>
------------------------------
({'age': <tf.Tensor: id=73704, shape=(16,), dtype=int32, numpy=
array([63, 67, 67, 37, 41, 56, 62, 57, 63, 53, 57, 56, 56, 44, 52, 57],
      dtype=int32)>, 'sex': <tf.Tensor: id=73712, shape=(16,), dtype=int32, numpy=array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int32)>, 'cp': <tf.Tensor: id=73707, shape=(16,), dtype=int32, numpy=array([1, 4, 4, 3, 2, 2, 4, 4, 4, 4, 4, 2, 3, 2, 3, 3], dtype=int32)>, 'trestbps': <tf.Tensor: id=73716, shape=(16,), dtype=int32, numpy=
array([145, 160, 120, 130, 130, 120, 140, 120, 130, 140, 140, 140, 130,
       120, 172, 150], dtype=int32)>, 'chol': <tf.Tensor: id=73706, shape=(16,), dtype=int32, numpy=
array([233, 286, 229, 250, 204, 236, 268, 354, 254, 203, 192, 294, 256,
       263, 199, 168], dtype=int32)>, 'fbs': <tf.Tensor: id=73709, shape=(16,), dtype=int32, numpy=array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0], dtype=int3

In [91]:
# Train the model

model_func.fit(dataset_from_dict, epochs = 15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x12e4f8f60>