In [2]:
### LOAD A CSV FILE INTO A tf.Dataset

# Data: Titanic passenger list

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Library for higher-order function (function that acts on other function)
import functools

import numpy as np
np.set_printoptions(precision = 3, suppress = True)
import tensorflow as tf

print(tf.__version__)

2.0.0-rc0


In [4]:
# Load the data

TRAIN_DATASET_URL = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
TEST_DATASET_URL = 'https://storage.googleapis.com/tf-datasets/titanic/eval.csv'

# Downloads a file from a URL 
# Return the path to the downloaded file

train_dataset_path = tf.keras.utils.get_file('train.csv', TRAIN_DATASET_URL)
test_dataset_path = tf.keras.utils.get_file('test.csv', TEST_DATASET_URL)

In [5]:
# Let's look at the beginning of the CSV file (training set)

!head {train_dataset_path}

survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


In [6]:
# We can either load the datasets using pandas, and pass the numpy array to TF.
# Or, for large file, it is better to use make_csv_dataset method from TF.

# Define the target column
LABEL_COLUMN = 'survived'
LABELS = [0, 1] # 1 if the passenger survived

In [7]:
# Create a dataset from the CSV file

def get_dataset(file_path, **kwargs):
    ''' Given a file_path, convert it and return the corresponding tf.Dataset
    '''
    dataset = tf.data.experimental.make_csv_dataset(file_path,
                                                    batch_size = 5, # Keep it small to make examples easy to show
                                                    label_name = LABEL_COLUMN,
                                                    na_value = '?', # Additional string to recognize as NA
                                                    num_epochs = 1, # Number of times this dataset is repeated
                                                    ignore_errors = True,
                                                    **kwargs)
    return dataset

# Get the training/test datasets
training_set = get_dataset(train_dataset_path)
test_set = get_dataset(test_dataset_path)

W0908 09:47:52.227013 140735803462528 deprecation.py:323] From /Users/nicolas/anaconda/lib/python3.6/site-packages/tensorflow_core/python/data/experimental/ops/readers.py:521: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.


In [8]:
# Create a method to display the dataset

# Each item in the dataset is a batch, represented as a tuple of (m, nb_features)

def display_dataset(dataset):
    ''' Print the first batch in the dataset
    '''
    
    for batch, label in dataset.take(1):
        # Batch is a dictionary of pairs: (feature, Tensor(batch_size x 1))
        # Label is a Tensor(batch_size x 1)
        for key, value in batch.items():
            print('{:20s}: {}'.format(key, value.numpy()))

In [9]:
display_dataset(training_set)

sex                 : [b'male' b'male' b'male' b'female' b'female']
age                 : [28. 36. 57. 40. 28.]
n_siblings_spouses  : [1 1 0 0 0]
parch               : [0 2 0 0 0]
fare                : [ 15.85  120.     12.35   13.      7.879]
class               : [b'Third' b'First' b'Second' b'Second' b'Third']
deck                : [b'unknown' b'B' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Queenstown' b'Southampton' b'Queenstown']
alone               : [b'n' b'n' b'y' b'y' b'y']


In [10]:
# Note that make_csv_dataset pick the column/feature names automatically
# It is also possible to pass manually the a list of feature names

CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses',
               'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']

temp_dataset = get_dataset(train_dataset_path, column_names = CSV_COLUMNS)
display_dataset(temp_dataset)

sex                 : [b'male' b'male' b'female' b'male' b'female']
age                 : [16. 28. 38. 16. 25.]
n_siblings_spouses  : [1 0 0 0 1]
parch               : [3 0 0 0 1]
fare                : [ 34.375   7.225 227.525   8.05   30.   ]
class               : [b'Third' b'Third' b'First' b'Third' b'Second']
deck                : [b'unknown' b'unknown' b'C' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Cherbourg' b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'n' b'y' b'y' b'y' b'n']


In [11]:
# It is also possible to ignore column in the CSV

# Specify the column you want to keep
SELECTED_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class',
                    'deck', 'alone']

temp_dataset = get_dataset(train_dataset_path, select_columns = SELECTED_COLUMNS)
display_dataset(temp_dataset)

age                 : [28.  9. 44. 23. 25.]
n_siblings_spouses  : [0 3 0 2 0]
class               : [b'Third' b'Third' b'First' b'Second' b'Third']
deck                : [b'unknown' b'unknown' b'B' b'unknown' b'unknown']
alone               : [b'y' b'n' b'n' b'n' b'y']


In [12]:
### Data preprocessing

# It is needed to convert the data types in the Dataset into data type suitable for the model
# Use tf.feature_column
# The advantage of doing the preprocessing inside the model is that, when you export your model,
# it will still include the preprocessing. This way you can pass the raw data directly to the model

### CONTINUOUS DATA 
# Just pack the data into a vector before passing it to the model

SELECTED_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0.0, 0.0, 0.0, 0.0, 0.0]

temp_dataset = get_dataset(train_dataset_path,
                           select_columns = SELECTED_COLUMNS,
                           column_defaults = DEFAULTS)

display_dataset(temp_dataset)

age                 : [29. 40. 28. 19. 38.]
n_siblings_spouses  : [0. 0. 0. 0. 0.]
parch               : [0. 0. 0. 0. 0.]
fare                : [211.337  13.      7.787   8.158 227.525]


In [13]:
features_next_batch, label_next_batch = next(iter(temp_dataset))
print('FEATURES NEXT BATCH: ', features_next_batch)
print('-'*30)
print('LABEL NEXT BATCH: ', label_next_batch)

FEATURES NEXT BATCH:  OrderedDict([('age', <tf.Tensor: id=434, shape=(5,), dtype=float32, numpy=array([36., 58., 24., 20., 25.], dtype=float32)>), ('n_siblings_spouses', <tf.Tensor: id=436, shape=(5,), dtype=float32, numpy=array([1., 0., 0., 0., 1.], dtype=float32)>), ('parch', <tf.Tensor: id=437, shape=(5,), dtype=float32, numpy=array([0., 0., 3., 0., 0.], dtype=float32)>), ('fare', <tf.Tensor: id=435, shape=(5,), dtype=float32, numpy=array([78.85 , 29.7  , 19.258,  7.854, 17.8  ], dtype=float32)>)])
------------------------------
LABEL NEXT BATCH:  tf.Tensor([0. 0. 1. 0. 0.], shape=(5,), dtype=float32)


In [14]:
# We need to method to transform the mini-batches in a suitable shape for the model

def pack(features, label):
    ''' Return a Tensor by stacking all the features, as well as a Tensor for the target
        The Tensor output size will be (m x nb_features)
    '''
    # features.values() return a the list of all features(nb_features x m)
    return tf.stack(list(features.values()), axis = -1), label

In [15]:
# Apply the pack method to the dataset
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
    print(features.numpy())
    print('-'*30)
    print(labels.numpy())

[[48.     1.     2.    65.   ]
 [35.     0.     0.     8.05 ]
 [28.     1.     2.    23.45 ]
 [30.     0.     0.     7.225]
 [20.     0.     0.     7.854]]
------------------------------
[1. 0. 0. 0. 0.]


In [16]:
display_dataset(training_set)

sex                 : [b'female' b'female' b'male' b'male' b'female']
age                 : [40. 17.  1. 17. 19.]
n_siblings_spouses  : [1 1 5 0 1]
parch               : [1 0 2 0 0]
fare                : [134.5   108.9    46.9     8.663  26.   ]
class               : [b'First' b'First' b'Third' b'Third' b'Second']
deck                : [b'E' b'C' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Cherbourg' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton']
alone               : [b'n' b'n' b'n' b'y' b'n']


In [17]:
# Take the next mini-batch from the training set
batch_features, batch_feature = next(iter(temp_dataset))

In [18]:
# Define a class to preprocess all the numeric features and pack them into a a single column

class PackNumericFeatures():
    def __init__(self, names):
        ''' Initialize the object
        names        list of features names  
        '''
        self.names = names
        
    def __call__(self, features, labels):
        ''' 
        features     dictionary of the features (feature_name, mini-batch size) pairs
        labels       tensor containing the target values
        '''
        # Get a list of the numeric features, remove them from the features list
        numeric_features = [features.pop(name) for name in self.names]
        # Convert the numerical value into float32
        numeric_features = [tf.cast(feature, tf.float32) for feature in numeric_features]
        # Put all the numeric feature in a 1D Tensor
        numeric_features = tf.stack(numeric_features, axis = -1)
        # Create a new pair in the features dictionary
        features['numeric'] = numeric_features
        print(features)
        
        return features, labels

In [19]:
# Pack all the numeric features into a single feature Tensor
NUMERIC_FEATURES = ['age', 'n_siblings_spouses', 'parch', 'fare']

packed_training_dataset = training_set.map(PackNumericFeatures(NUMERIC_FEATURES))
packed_test_dataset = test_set.map(PackNumericFeatures(NUMERIC_FEATURES))

OrderedDict([('sex', <tf.Tensor 'args_8:0' shape=(None,) dtype=string>), ('class', <tf.Tensor 'args_2:0' shape=(None,) dtype=string>), ('deck', <tf.Tensor 'args_3:0' shape=(None,) dtype=string>), ('embark_town', <tf.Tensor 'args_4:0' shape=(None,) dtype=string>), ('alone', <tf.Tensor 'args_1:0' shape=(None,) dtype=string>), ('numeric', <tf.Tensor 'stack:0' shape=(None, 4) dtype=float32>)])
OrderedDict([('sex', <tf.Tensor 'args_8:0' shape=(None,) dtype=string>), ('class', <tf.Tensor 'args_2:0' shape=(None,) dtype=string>), ('deck', <tf.Tensor 'args_3:0' shape=(None,) dtype=string>), ('embark_town', <tf.Tensor 'args_4:0' shape=(None,) dtype=string>), ('alone', <tf.Tensor 'args_1:0' shape=(None,) dtype=string>), ('numeric', <tf.Tensor 'stack:0' shape=(None, 4) dtype=float32>)])


In [20]:
# Let's display the packed dataset
display_dataset(packed_training_dataset)

# Numeric is a new feature, for each example, it contains 4 values (age, n_sibling, parch and fare) -> (m x 4)

sex                 : [b'male' b'male' b'male' b'male' b'male']
class               : [b'Third' b'Second' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Cherbourg' b'Southampton' b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'y' b'y' b'n' b'y' b'y']
numeric             : [[30.     0.     0.     7.229]
 [39.     0.     0.    13.   ]
 [28.     1.     1.    15.246]
 [32.     0.     0.     7.925]
 [45.     0.     0.     6.975]]


In [21]:
# It is a good practice to normalize the numeric features (speed-up learning)

import pandas as pd

desc = pd.read_csv(train_dataset_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [22]:
# Get the mean and standard deviation for each numerical features

MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

In [23]:
def normalize_numeric_data(data, mean, std):
    ''' Normalize the data given their mean and standard deviation
    '''
    
    return (data - mean)/std
    

In [24]:
# Bind the normalize_numeric_data function with the MEAN and STD arrays
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

# Create the feature_column for the model
# (Is it really necessary for numeric features ?)
numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn = normalizer, shape = [len(NUMERIC_FEATURES)])
numeric_column

NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x12d4abb70>, mean=array([29.631,  0.545,  0.38 , 34.385]), std=array([12.512,  1.151,  0.793, 54.598])))

In [25]:
# Let's look at a mini-batch

batch_features, batch_labels = next(iter(packed_training_dataset))
print('Numeric feature before normalization: ')
print(batch_features['numeric'])
print('-'*30)

# Now apply the normalization through Keras layer
numeric_layer = tf.keras.layers.DenseFeatures([numeric_column])
print('Numeric feature after normalization: ')
print(numeric_layer(batch_features).numpy())

# Note that the feature_column required to know the means and std of each numeric column ahead

Numeric feature before normalization: 
tf.Tensor(
[[ 6.     0.     1.    33.   ]
 [23.     0.     0.    13.   ]
 [ 5.     2.     1.    19.258]
 [28.     1.     0.    15.5  ]
 [18.     0.     2.    13.   ]], shape=(5, 4), dtype=float32)
------------------------------
Numeric feature after normalization: 
[[-1.889 -0.474  0.782 -0.025]
 [-0.53  -0.474 -0.479 -0.392]
 [-1.969  1.264  0.782 -0.277]
 [-0.13   0.395 -0.479 -0.346]
 [-0.93  -0.474  2.043 -0.392]]


In [26]:
### Categorical data

# We'll use a feature_column to deal with categorical data


CATEGORIES = {'sex': ['name', 'female'], # (features_name, domain) pairs
              'class': ['First', 'Second', 'Third'],
              'deck': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
              'embark_town': ['Cherbourg', 'Southhampton', 'Queenstown'],
              'alone': ['y', 'n']}


In [27]:
# Define the feature_column

categorical_columns = []

for feature, values in CATEGORIES.items():
    # For each categorical feature(stored as (feature, domain) pair)
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key = feature,
                                                                        vocabulary_list = values)
    # Represents multi-hot representation of given categorical column
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))
    
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('name', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [28]:
# Let's create the Keras categorical layer and apply it to the current mini-batch

categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(batch_features).numpy())

# Rows: Mini-batch size
# Columns: Concatenation of the one-hot vectors

W0908 09:47:53.833486 140735803462528 deprecation.py:323] From /Users/nicolas/anaconda/lib/python3.6/site-packages/tensorflow_core/python/feature_column/feature_column_v2.py:4273: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0908 09:47:53.835386 140735803462528 deprecation.py:323] From /Users/nicolas/anaconda/lib/python3.6/site-packages/tensorflow_core/python/feature_column/feature_column_v2.py:4328: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


[[0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [29]:
# Combine the feature_column of the numerical and categorical features into a single Keras layer

preprocessing_layer = tf.keras.layers.DenseFeatures([numeric_column] + categorical_columns)

In [30]:
# Test it
print(preprocessing_layer(batch_features))

# Output shape: (mini_batch_size, features_representation)

tf.Tensor(
[[ 0.     1.     0.     1.     0.     0.     0.     0.     0.     0.
   0.     0.     0.     0.     0.     0.     0.     0.    -1.889 -0.474
   0.782 -0.025  0.     1.   ]
 [ 1.     0.     0.     1.     0.     0.     0.     0.     0.     0.
   0.     0.     0.     0.     0.     0.     0.     0.    -0.53  -0.474
  -0.479 -0.392  0.     0.   ]
 [ 0.     1.     0.     0.     1.     0.     0.     0.     0.     0.
   0.     0.     0.     0.     0.     1.     0.     0.    -1.969  1.264
   0.782 -0.277  0.     1.   ]
 [ 0.     1.     0.     0.     1.     0.     0.     0.     0.     0.
   0.     0.     0.     0.     0.     0.     0.     1.    -0.13   0.395
  -0.479 -0.346  0.     1.   ]
 [ 0.     1.     0.     1.     0.     0.     0.     0.     0.     0.
   0.     0.     0.     0.     0.     0.     0.     0.    -0.93  -0.474
   2.043 -0.392  0.     1.   ]], shape=(5, 24), dtype=float32)


In [31]:
### Build the model
# Each feature is ready to be fed into the model

# PREPROCESS -> FC(128) -> RELU -> FC(128) -> RELU -> FC(1) -> SIGMOID

model = tf.keras.Sequential([preprocessing_layer,
                             tf.keras.layers.Dense(128, activation = 'relu'),
                             tf.keras.layers.Dense(128, activation = 'relu'),
                             tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

In [46]:
### Train the model

# Why shuffling the training set if there is no validation set ?
training_set = packed_training_dataset.shuffle(500)
test_set = packed_test_dataset

In [47]:
model.fit(training_set,
          epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x12ce2b898>

In [48]:
# Evalute the model on the test set

test_loss, test_accuracy = model.evaluate(test_set)
print('TEST SET LOSS: ', test_loss)
print('TEST ACCURACY: ', test_accuracy)

TEST SET LOSS:  0.5382412241174365
TEST ACCURACY:  0.82575756


In [75]:
# Let's look at the predictions on the test set
predictions = model.predict(test_set)

for mini_batch in range(len(list(test_set))):
    for prediction, survived in zip(predictions, list(test_set)[mini_batch][1]):
        print('Predicted survival: {:.2%}'.format(prediction[0]), ' | Actual outcome: ', ('SURVIVED' if bool(survived) else 'DIED'))

Predicted survival: 0.17%  | Actual outcome:  SURVIVED
Predicted survival: 86.18%  | Actual outcome:  SURVIVED
Predicted survival: 9.52%  | Actual outcome:  DIED
Predicted survival: 78.36%  | Actual outcome:  SURVIVED
Predicted survival: 97.35%  | Actual outcome:  DIED
Predicted survival: 0.17%  | Actual outcome:  DIED
Predicted survival: 86.18%  | Actual outcome:  SURVIVED
Predicted survival: 9.52%  | Actual outcome:  DIED
Predicted survival: 78.36%  | Actual outcome:  DIED
Predicted survival: 97.35%  | Actual outcome:  DIED
Predicted survival: 0.17%  | Actual outcome:  SURVIVED
Predicted survival: 86.18%  | Actual outcome:  SURVIVED
Predicted survival: 9.52%  | Actual outcome:  DIED
Predicted survival: 78.36%  | Actual outcome:  DIED
Predicted survival: 97.35%  | Actual outcome:  DIED
Predicted survival: 0.17%  | Actual outcome:  SURVIVED
Predicted survival: 86.18%  | Actual outcome:  SURVIVED
Predicted survival: 9.52%  | Actual outcome:  SURVIVED
Predicted survival: 78.36%  | Actual