In [5]:
# https://www.tensorflow.org/tutorials/load_data/csv

In [6]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0-preview is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [7]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
LABEL_COLUMN_TRAIN = 'Survived'

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
train_file_path

'/home/oonisim/.keras/datasets/train.csv'

In [8]:
# PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
!head {train_file_path}

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S


In [9]:
np.set_printoptions(precision=3, suppress=True)

# Load data

In [10]:
LABELS = [0, 1]

In [11]:
def get_dataset(file_path, label_column_name, **kwargs):
  dataset = tf.data.experimental.make_csv_dataset(
      file_path,
      batch_size=5, # Artificially small to make examples easier to show.
      label_name=label_column_name,
      na_value="?",
      num_epochs=1,
      ignore_errors=True, 
      header=True,
      **kwargs
  )
  return dataset

In [None]:
dataset = tf.data.experimental.make_csv_dataset(
    file_path,
    batch_size=5, # Artificially small to make examples easier to show.
    label_name=label_column_name,
    na_value="?",
    num_epochs=1,
    ignore_errors=True, 
    header=True,
    **kwargs
)

In [13]:
raw_train_data = get_dataset(train_file_path, LABEL_COLUMN_TRAIN)

In [22]:
print(type(raw_train_data))
raw_train_data

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>


<PrefetchDataset shapes: (OrderedDict([(PassengerId, (None,)), (Pclass, (None,)), (Name, (None,)), (Sex, (None,)), (Age, (None,)), (SibSp, (None,)), (Parch, (None,)), (Ticket, (None,)), (Fare, (None,)), (Cabin, (None,)), (Embarked, (None,))]), (None,)), types: (OrderedDict([(PassengerId, tf.int32), (Pclass, tf.int32), (Name, tf.string), (Sex, tf.string), (Age, tf.float32), (SibSp, tf.int32), (Parch, tf.int32), (Ticket, tf.string), (Fare, tf.float32), (Cabin, tf.string), (Embarked, tf.string)]), tf.int32)>

In [163]:
def show_batch(dataset):
  for batch, label in dataset.take(1):
    for key, value in batch.items():
      print("{:20s}: {}".format(key,value.numpy()))

In [164]:
show_batch(raw_train_data)

PassengerId         : [309 426 516 223 848]
Pclass              : [2 3 1 3 3]
Name                : [b'Abelson, Mr. Samuel' b'Wiseman, Mr. Phillippe'
 b'Walker, Mr. William Anderson' b'Green, Mr. George Henry'
 b'Markoff, Mr. Marin']
Sex                 : [b'male' b'male' b'male' b'male' b'male']
Age                 : [30.  0. 47. 51. 35.]
SibSp               : [1 0 0 0 0]
Parch               : [0 0 0 0 0]
Ticket              : [b'P/PP 3381' b'A/4. 34244' b'36967' b'21440' b'349213']
Fare                : [24.     7.25  34.021  8.05   7.896]
Cabin               : [b'' b'' b'D46' b'' b'']
Embarked            : [b'C' b'S' b'S' b'S' b'C']


# Select contiuous numeric features

In [165]:
NUMERIC_FEATURES = ['Survived', 'Age', 'SibSp', 'Fare']
DEFAULTS = [0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(
    train_file_path, 
    label_column_name=LABEL_COLUMN_TRAIN, 
    select_columns=NUMERIC_FEATURES,
    column_defaults=DEFAULTS
)
show_batch(temp_dataset)

Age                 : [34. 28.  0.  0. 31.]
SibSp               : [0. 0. 0. 0. 0.]
Fare                : [10.5   26.55   7.733  0.    13.   ]


## Convert the numeric features into a vector (packing)

In [166]:
def pack(features, label):
  return tf.stack(list(features.values()), axis=-1), label

In [167]:
packed_dataset = temp_dataset.map(pack)

In [168]:
for features, labels in packed_dataset.take(1):
  print(features.numpy())
  print(labels.numpy())

[[ 0.     0.     7.229]
 [40.     0.     7.225]
 [18.     0.    79.65 ]
 [16.     0.    26.   ]
 [57.     0.    12.35 ]]
[0 0 1 0 0]


In [170]:
packed_dataset.take(1)

<TakeDataset shapes: ((None, 3), (None,)), types: (tf.float32, tf.int32)>

In [176]:
class PackNumericFeatures(object):
  def __init__(self, names):
    self.names = names

  def __call__(self, features, labels):
    numeric_features = [features.pop(name) for name in self.names]
    numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
    numeric_features = tf.stack(numeric_features, axis=-1)
    features['numeric'] = numeric_features

    return features, labels

In [177]:
packed_train_data = temp_dataset.map(
    PackNumericFeatures(NUMERIC_FEATURES)
)

KeyError: in converted code:

    <ipython-input-134-85ea56f80c91>:6 __call__  *
        numeric_features = [features.pop(name) for name in self.names]
    /home/oonisim/conda/envs/handson_ml2/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py:416 converted_call
        return py_builtins.overload_of(f)(*args)

    KeyError: 1


In [178]:
show_batch(packed_train_data)

AttributeError: 'PackNumericFeatures' object has no attribute 'take'