
# TensorFlow 2.0 Example for Creating CSV Data Pipelines


### Install TensorFlow 2.0

In [1]:
# Best-effort environment setup: any exception raised by the magics below is
# swallowed so the rest of the notebook can still run.
try:
  # %tensorflow_version only exists in Colab; it selects the TensorFlow 2.x runtime.
  %tensorflow_version 2.x
  # Load the TensorBoard notebook extension.
  %load_ext tensorboard
except Exception:
  pass

In [2]:
# !pip install tensorflow==2.0.0

In [3]:
import tensorflow as tf
# Short alias for the Keras API that ships inside TensorFlow.
keras = tf.keras
# Print the TensorFlow version in use.
print(tf.__version__)

2.0.0-rc1


## CSV Pipeline

In [4]:
import pandas as pd

# Location of the preprocessed cancer table; the dataset builders in later cells read it too.
csv_path_cancer = 'Datasets/cancer-preprocessed.csv'

# Load the complete table into memory. Later cells only use it for its row count (len(df)).
df = pd.read_csv(
    csv_path_cancer,
    sep=',',
)

# Preview the first rows.
df.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,-1.182427,-1.14936,-1.1513,-1.023273,0.049985,-0.375109,-0.796449,-0.878524,-0.724524,0.993557,...,-1.094477,-1.177144,-1.003439,0.015001,-0.6757,-1.073233,-1.280839,-0.797506,-0.036883,0
1,-0.541214,-0.555931,-0.570787,-0.582594,0.339947,-0.645157,-0.842845,-0.861506,-0.463235,-0.432539,...,-0.83607,-0.683043,-0.653427,0.120781,-0.928913,-0.933412,-1.056059,-0.413157,-0.874145,0
2,-0.599748,-0.759851,-0.509071,-0.617157,-0.218121,0.699846,0.62161,-0.195596,0.860864,1.440521,...,-0.847236,-0.430027,-0.64344,-0.800389,0.772509,0.850482,0.094025,0.408503,0.932248,0
3,0.009537,1.304548,-0.051989,-0.114684,-1.120794,-0.726443,-0.733791,-0.637834,0.006379,-0.857155,...,0.948849,-0.16425,-0.247522,-1.307253,-0.830972,-0.661409,-0.26933,-0.361708,-0.838562,0
4,-1.203712,-0.503233,-1.192958,-1.045529,0.580368,-0.493786,-1.013242,-0.908185,0.253544,0.858071,...,-0.558523,-1.148014,-0.973641,0.217747,-0.792751,-1.100387,-1.060031,0.142182,0.124813,0


### High-Level Approach (make_csv_dataset)


In [5]:
def create_csv_dataset_hl(csv_path, label_name, epochs=1, batch_size=8, buffer_size=10000, train=True):
    """Build a batched ``tf.data`` pipeline directly from a CSV file.

    Thin wrapper around ``tf.data.experimental.make_csv_dataset``.

    Args:
        csv_path: Path (or file pattern) of the comma-separated file(s) to read.
        label_name: Header name of the column that is returned as the label.
        epochs: Number of passes over the file before the dataset is exhausted.
        batch_size: Number of rows combined into one batch.
        buffer_size: Size of the shuffle buffer; only relevant when ``train`` is true.
        train: If true, rows are shuffled; if false, they are read in file order.

    Returns:
        The dataset created by ``make_csv_dataset``. As used in this notebook, iterating
        it yields ``(features, labels)`` pairs where ``features`` maps column names to
        tensors.
    """
    # `train` and `buffer_size` were previously accepted but silently ignored because
    # shuffling was hard-coded off; they now control the shuffle behaviour.
    return tf.data.experimental.make_csv_dataset(
        csv_path,
        batch_size=batch_size,
        field_delim=',',
        label_name=label_name,
        num_epochs=epochs,
        shuffle=bool(train),
        shuffle_buffer_size=buffer_size,
    )

# Single-epoch dataset over the cancer CSV with 'diagnosis' as the label column (default batch size).
csv_ds = create_csv_dataset_hl(csv_path_cancer, 'diagnosis', epochs=1)

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.


# Look at one batch: the label values, the feature column names, and the raw feature mapping.
for features, labels in csv_ds.take(1):
    # One print call with a newline separator emits the same text as three separate prints.
    print(f'Labels: {labels.numpy()}\n', features.keys(), features, sep='\n')

## Training

### 1. Attempt
This will raise the error "<i>Passing a dictionary input to a Sequential Model which doesn't have FeatureLayer as the first layer is an error</i>".
`make_csv_dataset` doesn't return plain numerical Tensors, but a dictionary that maps feature column names to Tensors containing the corresponding column. This makes sense if we have categorical features and want to use feature_columns to encode them (see https://colab.research.google.com/github/adammichaelwood/tf-docs/blob/csv-feature-columns/site/en/r2/tutorials/load_data/csv.ipynb#scrollTo=Co7UJ7gpNADC).

In our case we only have numerical features, so this behaviour is rather inconvenient, as `keras.layers.Dense` only accepts numerical Tensors as input.

One solution is to create numerical feature columns and pass them to `tf.keras.layers.DenseFeatures` to create an input layer that can deal with this dictionary-type dataset.

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

# Plain dense network without a feature layer in front.
# The target is trained with 'binary_crossentropy', so the head is a single sigmoid
# unit (same head as the other attempts in this notebook) rather than a 10-way softmax.
model = Sequential([
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

nr_epochs = 10
batch_size = 8

# Create the datasets. Both read the same CSV; they differ only in the `train` flag.
train_ds = create_csv_dataset_hl(csv_path_cancer, 'diagnosis', epochs=nr_epochs, batch_size=batch_size)
val_ds = create_csv_dataset_hl(csv_path_cancer, 'diagnosis', epochs=nr_epochs, batch_size=batch_size, train=False)

# `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=[])


# Number of full batches in one pass over the table.
steps_per_epoch = len(df) // batch_size
# NOTE: val_ds / validation_steps are prepared here but not passed to fit().
validation_steps = len(df) // batch_size

# This attempt is expected to fail: the dataset yields a dictionary of feature tensors,
# which a Sequential model without a feature layer rejects. The error is the point of
# this cell, so it is caught and displayed instead of aborting a "Run All".
try:
    history = model.fit(train_ds,
                        epochs=nr_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=[])
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Passing a dictionary input to a Sequential Model which doesn't have FeatureLayer as the first layer is an error.

### 2. Attempt: Feature Columns
Here we get the error <i>"'concave points_mean' is not a valid scope name"</i>.
Apparently our header names are not allowed to contain whitespace.

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

nr_epochs = 10
batch_size = 8

# Pull one batch purely to discover the feature column names.
dummy_ds = create_csv_dataset_hl(csv_path_cancer, 'diagnosis', epochs=1, batch_size=batch_size)
batch, label = next(iter(dummy_ds.take(1)))

# One numeric feature column per CSV feature.
numerical_columns = [tf.feature_column.numeric_column(feature) for feature in batch.keys()]

# Input layer that turns the {column name: tensor} dictionary into one dense tensor.
preprocessing_layer = tf.keras.layers.DenseFeatures(numerical_columns)

model = Sequential([
    preprocessing_layer,
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

train_ds = create_csv_dataset_hl(csv_path_cancer, 'diagnosis', epochs=nr_epochs, batch_size=batch_size)

# `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])


# Number of full batches in one pass over the table.
steps_per_epoch = len(df) // batch_size

# Expected to fail with this CSV: a header such as 'concave points_mean' contains a
# space and is rejected as a scope name. The error is the point of this cell, so it is
# caught and displayed instead of aborting a "Run All".
try:
    history = model.fit(train_ds,
                        epochs=nr_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=[])
except ValueError as err:
    print(f'ValueError: {err}')
    

ValueError: 'concave points_mean' is not a valid scope name

Let's try the same again, but with the whitespace removed from the header names. Now it should finally work.

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

# Cancer table whose header names contain no spaces.
csv_path_cancer_no_ws = 'Datasets/cancer-preprocessed-header-without-spaces.csv'

nr_epochs = 10
batch_size = 8

# Pull one batch purely to discover the feature column names.
dummy_ds = create_csv_dataset_hl(csv_path_cancer_no_ws, 'diagnosis', epochs=1, batch_size=batch_size)
batch, label = next(iter(dummy_ds.take(1)))

# One numeric feature column per CSV feature.
numerical_columns = [tf.feature_column.numeric_column(feature) for feature in batch.keys()]

# Input layer that turns the {column name: tensor} dictionary into one dense tensor.
preprocessing_layer = tf.keras.layers.DenseFeatures(numerical_columns)

model = Sequential([
    preprocessing_layer,
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

train_ds = create_csv_dataset_hl(csv_path_cancer_no_ws, 'diagnosis', epochs=nr_epochs, batch_size=batch_size)

# `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])


# `df` was loaded from the original CSV, not from csv_path_cancer_no_ws.
# NOTE(review): assumes both files have the same number of rows - confirm.
steps_per_epoch = len(df) // batch_size

history = model.fit(train_ds,
                    epochs=nr_epochs,
                    steps_per_epoch=steps_per_epoch,
                    callbacks=[])
    

Train for 53 steps
Epoch 1/10
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### 3. Attempt
As we already have numerical features and don't need feature columns, let's try to simply convert the `collections.OrderedDict` of column Tensors into a single Tensor.

In [11]:
def preprocessing(data, label):
    """Merge the per-column feature mapping into a single tensor.

    Args:
        data: Mapping from column name to the tensor holding that column's values.
        label: Label tensor; returned unchanged.

    Returns:
        A ``(stacked, label)`` tuple where ``stacked`` has the columns laid out along
        the last axis.
    """
    column_tensors = list(data.values())  # one tensor per column
    # Stacking on the last axis is essential: it gives (None, #features) rather than
    # the (#features, None) layout a stack on the first axis would produce.
    stacked = tf.stack(column_tensors, axis=-1)
    # Show the resulting shape (this notebook's recorded output is "(None, 30)").
    print(stacked.shape)
    return stacked, label

batch_size = 8
nr_epochs = 10

# Read the file once, one row per "batch", in file order; shuffling, repeating and the
# real batching are applied explicitly below.
rows_ds = tf.data.experimental.make_csv_dataset(csv_path_cancer_no_ws,
                                                batch_size=1,
                                                field_delim=',',
                                                label_name='diagnosis',
                                                num_epochs=1,
                                                shuffle=False)

dataset = (
    rows_ds
    # {column name: tensor} -> one dense feature tensor per element.
    .map(preprocessing)
    # make_csv_dataset always emits batches, so each element still carries a leading
    # axis of length 1. Remove it; otherwise the final .batch() would produce
    # (batch_size, 1, #features) inputs instead of (batch_size, #features).
    .unbatch()
    # Cache the parsed rows so the CSV is only parsed during the first pass.
    .cache()
    .shuffle(buffer_size=10000)
    .repeat(nr_epochs)
    .batch(batch_size)
)

(None, 30)


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

# The dataset built in the previous cell already yields dense feature tensors, so the
# model needs no feature layer in front.
model = Sequential([
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid')
])


# `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])


# `batch_size`, `nr_epochs` and `dataset` come from the previous cell; `df` is the
# table loaded at the top of the notebook and is only used for its row count.
steps_per_epoch = len(df) // batch_size

history = model.fit(dataset,
                    epochs=nr_epochs,
                    steps_per_epoch=steps_per_epoch,
                    callbacks=[])

Train for 53 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
