## Setup

In [1]:
import functools
import numpy as np
import tensorflow as tf

In [2]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

In [3]:
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

train_file_path, test_file_path

('/home/nxhuy/.keras/datasets/train.csv',
 '/home/nxhuy/.keras/datasets/eval.csv')

In [4]:
np.set_printoptions(precision=3, suppress=True)

## Load the data

In [5]:
!head {train_file_path}

survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


In [6]:
LABEL_COLUMN = 'survived'
LABELS = [0, 1]

<div class="alert alert-info">
    <b>Note</b>: here we'll use <code>tf.data.experimental.make_csv_dataset</code>
</div>

In [7]:
def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=5,
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs
    )
    return dataset

In [8]:
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

type(raw_train_data), type(raw_test_data)

(tensorflow.python.data.ops.dataset_ops.PrefetchDataset,
 tensorflow.python.data.ops.dataset_ops.PrefetchDataset)

In [9]:
for batch, label in raw_train_data.take(1):
    print(label)
    print()
    print(batch)

tf.Tensor([1 0 1 1 1], shape=(5,), dtype=int32)

OrderedDict([('sex', <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'female', b'male', b'female', b'female', b'female'], dtype=object)>), ('age', <tf.Tensor: shape=(5,), dtype=float32, numpy=array([30., 28., 28., 33., 50.], dtype=float32)>), ('n_siblings_spouses', <tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 1, 0, 0], dtype=int32)>), ('parch', <tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 0, 2, 1], dtype=int32)>), ('fare', <tf.Tensor: shape=(5,), dtype=float32, numpy=array([106.425,   7.775,  51.862,  26.   , 247.521], dtype=float32)>), ('class', <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'First', b'Third', b'First', b'Second', b'First'], dtype=object)>), ('deck', <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'unknown', b'unknown', b'D', b'unknown', b'B'], dtype=object)>), ('embark_town', <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'Cherbourg', b'Southampton', b'Southampton', b'Southampto

<div class="alert alert-info">
    <b>Note</b>: we can use <code>PrefetchDataset.take()</code> function (returns (instances, labels)) to have a view on data.
</div>

In [10]:
def show_batch(dataset):
    for batch, label in dataset.take(1):
        for key, value in batch.items():
            print("{:20s}: {}".format(key,value.numpy()))

Each item in the dataset is a batch, represented as a tuple of **(many examples, many labels)**. The data from the examples is organized in column-based tensors (rather than row-based tensors), each with as many elements as the batch size (**5 in this case**).

In [11]:
# View data
show_batch(raw_train_data)

sex                 : [b'female' b'male' b'male' b'male' b'male']
age                 : [28. 23. 20. 60. 28.]
n_siblings_spouses  : [0 0 0 0 0]
parch               : [0 0 0 0 0]
fare                : [13.    15.046  9.5   26.55   7.896]
class               : [b'Second' b'Second' b'Third' b'First' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton']
alone               : [b'y' b'y' b'y' b'y' b'y']


In [12]:
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 
               'fare', 'class', 'deck', 'embark_town', 'alone']

temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

sex                 : [b'male' b'female' b'female' b'male' b'male']
age                 : [25. 28. 21. 36. 21.]
n_siblings_spouses  : [1 0 2 0 0]
parch               : [0 0 2 0 0]
fare                : [  7.775   8.05  262.375   7.496  73.5  ]
class               : [b'Third' b'Third' b'First' b'Third' b'Second']
deck                : [b'unknown' b'unknown' b'B' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'n' b'y' b'n' b'y' b'y']


If we need to omit some columns from the dataset, create a list of just the columns you plan to use, and pass it into the (optional) `select_columns` argument of the constructor.

In [13]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']

temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)

show_batch(temp_dataset)

age                 : [47. 28. 26. 37. 33.]
n_siblings_spouses  : [0 0 0 0 1]
class               : [b'Third' b'Third' b'Third' b'Third' b'First']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'E']
alone               : [b'y' b'y' b'y' b'y' b'n']


## Data preprocessing

### Continuous data

In [14]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]

temp_dataset = get_dataset(train_file_path,
                           select_columns=SELECT_COLUMNS,
                           column_defaults=DEFAULTS)

show_batch(temp_dataset)

age                 : [29. 44. 28. 19. 19.]
n_siblings_spouses  : [0. 0. 0. 0. 0.]
parch               : [0. 1. 0. 0. 0.]
fare                : [ 7.896 16.1   13.     6.75   7.65 ]


In [15]:
example_batch, labels_batch = next(iter(temp_dataset))

In [16]:
example_batch

OrderedDict([('age',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([38., 34., 39., 25., 28.], dtype=float32)>),
             ('n_siblings_spouses',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 1., 0., 0.], dtype=float32)>),
             ('parch',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 1., 0., 0.], dtype=float32)>),
             ('fare',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([71.283,  8.05 , 83.158,  7.742,  7.229], dtype=float32)>)])

In [17]:
list(example_batch.values())

[<tf.Tensor: shape=(5,), dtype=float32, numpy=array([38., 34., 39., 25., 28.], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 1., 0., 0.], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 1., 0., 0.], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=array([71.283,  8.05 , 83.158,  7.742,  7.229], dtype=float32)>]

In [18]:
labels_batch

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 0, 1, 0, 1], dtype=int32)>

In [19]:
def pack(features, label):
    return tf.stack(list(features.values()), axis=-1), label

In [20]:
# Test pack
pack(example_batch, labels_batch)

(<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
 array([[38.   ,  1.   ,  0.   , 71.283],
        [34.   ,  0.   ,  0.   ,  8.05 ],
        [39.   ,  1.   ,  1.   , 83.158],
        [25.   ,  0.   ,  0.   ,  7.742],
        [28.   ,  0.   ,  0.   ,  7.229]], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 0, 1, 0, 1], dtype=int32)>)

In [21]:
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
    print(features.numpy())
    print()
    print(labels.numpy())

[[24.     0.     0.     7.896]
 [32.5    1.     0.    30.071]
 [48.     1.     0.    52.   ]
 [26.     0.     0.    78.85 ]
 [22.     0.     0.     7.796]]

[0 0 1 1 0]


In [22]:
class PackNumericFeatures(object):
    def __init__(self, names):
        self.names = names
        
    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis=-1)
        features['numeric'] = numeric_features
        
        return features, labels

In [23]:
NUMERIC_FEATURES = ['age','n_siblings_spouses','parch', 'fare']

packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

In [24]:
show_batch(packed_train_data)

sex                 : [b'male' b'female' b'female' b'male' b'female']
class               : [b'First' b'Second' b'Second' b'First' b'Third']
deck                : [b'D' b'unknown' b'unknown' b'C' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Southampton' b'Cherbourg' b'Southampton']
alone               : [b'n' b'n' b'n' b'n' b'y']
numeric             : [[ 21.      0.      1.     77.287]
 [ 29.      1.      0.     26.   ]
 [ 30.      3.      0.     21.   ]
 [ 18.      1.      0.    108.9  ]
 [ 28.      0.      0.      8.05 ]]


In [25]:
example_batch, labels_batch = next(iter(packed_train_data)) 

In [26]:
example_batch

OrderedDict([('sex',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'male', b'male', b'female', b'male', b'male'], dtype=object)>),
             ('class',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Third', b'Third', b'First', b'Third', b'First'], dtype=object)>),
             ('deck',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'unknown', b'unknown', b'B', b'unknown', b'A'], dtype=object)>),
             ('embark_town',
              <tf.Tensor: shape=(5,), dtype=string, numpy=
              array([b'Southampton', b'Southampton', b'Cherbourg', b'Southampton',
                     b'Southampton'], dtype=object)>),
             ('alone',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'y', b'y', b'y', b'y', b'y'], dtype=object)>),
             ('numeric',
              <tf.Tensor: shape=(5, 4), dtype=float32, numpy=
              array([[26.   ,  0.   ,  0.   ,  8.05 ],
                     [16.   ,  0.   

In [27]:
labels_batch

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 1, 0, 1], dtype=int32)>

### Data normalization

In [29]:
import pandas as pd

desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [30]:
MEAN = np.array(desc.T['mean']) # .T for transpose
STD  = np.array(desc.T['std'])

In [31]:
def normalize_numeric_data(data, mean, std):
    # Center the data
    return (data - mean) / std

Now create a numeric column. The `tf.feature_columns.numeric_column` API accepts a `normalizer_fn` argument, which will be run on each batch.

Bind the MEAN and STD to the `normalizer fn` using `functools.partial`.

In [38]:
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_column = tf.feature_column.numeric_column('numeric', 
                                                  normalizer_fn=normalizer, 
                                                  shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column

NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x7f4f381e6d90>, mean=array([29.631,  0.545,  0.38 , 34.385]), std=array([12.512,  1.151,  0.793, 54.598])))

In [39]:
numeric_columns

[NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x7f4f381e6d90>, mean=array([29.631,  0.545,  0.38 , 34.385]), std=array([12.512,  1.151,  0.793, 54.598])))]

In [33]:
example_batch['numeric']

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[26.   ,  0.   ,  0.   ,  8.05 ],
       [16.   ,  0.   ,  0.   ,  9.217],
       [44.   ,  0.   ,  0.   , 27.721],
       [42.   ,  0.   ,  0.   ,  7.55 ],
       [28.   ,  0.   ,  0.   , 35.5  ]], dtype=float32)>

In [34]:
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()

array([[-0.29 , -0.474, -0.479, -0.482],
       [-1.089, -0.474, -0.479, -0.461],
       [ 1.148, -0.474, -0.479, -0.122],
       [ 0.989, -0.474, -0.479, -0.492],
       [-0.13 , -0.474, -0.479,  0.02 ]], dtype=float32)

### Categorical data

In [35]:
CATEGORIES = {
    'sex': ['male', 'female'],
    'class': ['First', 'Second', 'Third'],
    'deck': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town': ['Cherbourg', 'Southhampton', 'Queenstown'],
    'alone': ['y', 'n']
}

In [36]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=vocab)
    categorical_columns.append(
        tf.feature_column.indicator_column(cat_col)
    )
    
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [37]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
categorical_layer(example_batch).numpy()

array([[1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0.]], dtype=float32)

### Combined preprocessing layer

In [40]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)
preprocessing_layer(example_batch).numpy()

array([[ 1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   , -0.29 , -0.474, -0.479, -0.482,  1.   ,  0.   ],
       [ 1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   , -1.089, -0.474, -0.479, -0.461,  1.   ,  0.   ],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         0.   ,  0.   ,  1.148, -0.474, -0.479, -0.122,  0.   ,  1.   ],
       [ 1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.989, -0.474, -0.479, -0.492,  1.   ,  0.   ],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0. 

## Build the model

In [43]:
model = tf.keras.Sequential([
    preprocessing_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy']
)

## Train, evaluate and predict

In [44]:
train_data = packed_train_data.shuffle(500)
test_data = packed_test_data

In [45]:
train_data

<ShuffleDataset shapes: (OrderedDict([(sex, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,)), (numeric, (None, 4))]), (None,)), types: (OrderedDict([(sex, tf.string), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string), (numeric, tf.float32)]), tf.int32)>

In [46]:
model.fit(train_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4eeacae198>

In [47]:
test_loss, test_accuracy = model.evaluate(test_data)
print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))



Test Loss 0.4578329622745514, Test Accuracy 0.8446969985961914


In [48]:
predictions = model.predict(test_data)

for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
    prediction = tf.sigmoid(prediction).numpy()
    print("Predicted survival: {:.2%}".format(prediction[0]),
        " | Actual outcome: ",
        ("SURVIVED" if bool(survived) else "DIED"))

Predicted survival: 84.17%  | Actual outcome:  DIED
Predicted survival: 9.27%  | Actual outcome:  DIED
Predicted survival: 8.95%  | Actual outcome:  DIED
Predicted survival: 9.31%  | Actual outcome:  SURVIVED
Predicted survival: 92.82%  | Actual outcome:  SURVIVED


## References
- https://www.tensorflow.org/tutorials/load_data/csv