# Preprocess tabular data

We have three datasets about taxi fares: a training file, a test file and a validation file:

In [1]:
! ls ../../datasets/Taxi

taxi-test.csv  taxi-train.csv taxi-valid.csv


In [2]:
# Notice that these files don't have a header row with the column titles.
import tensorflow as tf
import numpy as np
import csv

# Peek at the first row of the training file to check whether it has a header row.
try:
    # newline='' is how the csv module documentation recommends opening files
    # that are handed to csv.reader.
    with open('../../datasets/Taxi/taxi-train.csv', newline='') as file:
        csv_file = csv.reader(file)
        print("------> First row of tranining dataset:")
        # The None default avoids a StopIteration if the file turns out to be empty.
        print(next(csv_file, None))
except (OSError, csv.Error) as error:
    # Only I/O and CSV-parsing failures are expected here. Anything else
    # (e.g. KeyboardInterrupt) propagates instead of being silently swallowed,
    # and the underlying reason is shown so the failure can be diagnosed.
    print(f"not able to open csv: {error}")


------> First row of tranining dataset:
['11.3', '2011-01-28 20:42:59 UTC', '-73.999022', '40.739146', '-73.990369', '40.717866', '1', '0']


We are going to use the `make_csv_dataset` function, so we will need a few parameters:
- File pattern -> A string that expresses the pattern followed by the dataset file names.
- Column names -> Optional parameter that specifies the names of the columns in the CSV data.
- Batch size -> The number of rows each batch will contain.
- Defaults -> The CSV will sometimes have missing (NaN) values; when this happens we need default values that the function can substitute.
- Target column -> The name of our target label.

In [3]:
# Glob pattern for the taxi CSV files.
file_pattern = '../../datasets/Taxi/taxi-*.csv'

# One (column name, default value) pair per CSV column, kept together so the
# two derived lists below can never drift out of step with each other.
_COLUMN_SPECS = (
    ('fare_amount', 0.0),
    ('pickup_datetime', 'na'),
    ('pickup_longitude', 0.0),
    ('pickup_latitude', 0.0),
    ('dropoff_longitude', 0.0),
    ('dropoff_latitude', 0.0),
    ('passenger_count', 0.0),
    ('key', 'na'),
)

# Column names, in the same order as the specs above.
CSV_COLUMNS = [column for column, _ in _COLUMN_SPECS]

# Name of the column used as the target label.
TARGET_LABEL = 'fare_amount'

# Default values, each wrapped in its own single-item list: `DEFAULTS`
DEFAULTS = [[default] for _, default in _COLUMN_SPECS]



With these parameters defined, let's write a function that returns a dataset, and then examine the dataset's type and contents.

In [4]:
def build_dataset(file_pattern, batch_size, column_names, column_defaults, label_name = None):
    """Create a CSV-backed dataset via ``tf.data.experimental.make_csv_dataset``.

    Args:
        file_pattern: Pattern (or path) selecting the CSV files to read.
        batch_size: Number of rows per batch.
        column_names: Names assigned to the CSV columns.
        column_defaults: Default values, one entry per column.
        label_name: Optional name of the label column; forwarded unchanged
            (``None`` when omitted).

    Returns:
        The dataset object returned by ``make_csv_dataset``.
    """
    return tf.data.experimental.make_csv_dataset(
        file_pattern,
        batch_size,
        column_names=column_names,
        column_defaults=column_defaults,
        label_name=label_name,
    )

# Build a dataset without a label column, one CSV row per batch.
dataset = build_dataset(
    file_pattern,
    batch_size=1,
    column_names=CSV_COLUMNS,
    column_defaults=DEFAULTS,
)

# First look at the dataset object itself: what type is it?
print(f'--> dataset type: {type(dataset)}')

# Then look at a single element pulled from it.
# Each element turns out to be a dictionary.
dataset_iterator = iter(dataset)
first_dataset_dictionary = next(dataset_iterator)
print(f'--> Type of item dataset: {type(first_dataset_dictionary)}')
print(f'    length:{len(first_dataset_dictionary)}')




Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

--> dataset type: <class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>
--> Type of item dataset: <class 'collections.OrderedDict'>
    length:8


2021-12-02 12:17:39.652363: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-02 12:17:39.652522: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-12-02 12:17:39.692925: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-02 12:17:39.693176: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [14]:
# Examine the values of the dictionary, one "key -> value" line per column.
for key, value in first_dataset_dictionary.items():
    # '%20s' is printf syntax, which an f-string prints verbatim. The f-string
    # format spec '<20' left-aligns the key in a 20-character field instead,
    # so the values line up in a column.
    print(f'key:{key:<20}-> value:{value}')

key:fare_amount  %20s-> value:[7.]
key:pickup_datetime  %20s-> value:[b'2013-12-09 15:03:00 UTC']
key:pickup_longitude  %20s-> value:[-73.96412]
key:pickup_latitude  %20s-> value:[40.76221]
key:dropoff_longitude  %20s-> value:[-73.970695]
key:dropoff_latitude  %20s-> value:[40.764526]
key:passenger_count  %20s-> value:[1.]
key:key  %20s-> value:[b'5473']


In [43]:
# Build a second dataset, this time naming the target column so that each
# element comes back as a (features, label) pair.
dataset_feature_label = build_dataset(
    file_pattern,
    batch_size=1,
    column_names=CSV_COLUMNS,
    column_defaults=DEFAULTS,
    label_name=TARGET_LABEL,
)


def print_features_and_label(dataset):
    """Print the first (features, label) element of ``dataset``.

    Args:
        dataset: Iterable whose first element is a ``(features, label)`` pair,
            where ``features`` is a mapping of column name to value.

    Returns:
        None. The report is written to standard output.
    """
    first_features, first_label = next(iter(dataset))

    # Assemble the whole report first, then emit it in a single print call.
    report = ["Features: "]
    report.extend(
        f'key:{name}    value:{content}' for name, content in first_features.items()
    )
    report.append("Label: ")
    report.append(str(first_label))
    print("\n".join(report))

# Show the first (features, label) element of the labelled dataset.
print_features_and_label(dataset_feature_label)

    
        

Features: 
key:pickup_datetime    value:[b'2012-07-05 14:18:00 UTC']
key:pickup_longitude    value:[-73.98929]
key:pickup_latitude    value:[40.748703]
key:dropoff_longitude    value:[-73.98122]
key:dropoff_latitude    value:[40.755363]
key:passenger_count    value:[1.]
key:key    value:[b'1002']
Label: 
tf.Tensor([4.9], shape=(1,), dtype=float32)


# Remove unwanted columns

We will repeat what we did before, but this time we will do it manually. We will also remove some unwanted columns:
1. remove unwanted columns
2. Separate target label in a new dataset

In [53]:
# Columns to drop from the features.
UNWANTED_COLS = ['pickup_datetime', 'key']

# Dataset without a label column: every element is one dictionary holding all
# CSV columns, so the label and the unwanted columns can be split off by hand.
raw_ds = build_dataset(
    file_pattern,
    batch_size=1,
    column_names=CSV_COLUMNS,
    column_defaults=DEFAULTS,
)

# Let's apply these steps to a single element of the dataset first
def proccess_just_one(dataset):
    """Split the first element of ``dataset`` into features and label.

    Takes one element (a dictionary of columns) from ``dataset``, pops the
    ``TARGET_LABEL`` column out as the label and removes every column listed
    in ``UNWANTED_COLS``.

    Args:
        dataset: Object exposing ``take(n)`` that yields column dictionaries.

    Returns:
        A ``(features, label)`` tuple: ``features`` is the element's own
        dictionary with the label and unwanted columns removed.

    Raises:
        ValueError: If ``dataset`` yields no element.
        KeyError: If the label or an unwanted column is missing.
    """
    for features in dataset.take(1):
        label = features.pop(TARGET_LABEL)
        for col in UNWANTED_COLS:
            features.pop(col)
        # Return from inside the loop: only one element is ever processed.
        return (features, label)
    # Reached only when the loop body never ran. Without this, the function
    # would fail with an UnboundLocalError on the return statement.
    raise ValueError("dataset is empty: there is no element to process")


# Run the manual split on the raw dataset and show the resulting features and label.
features, label = proccess_just_one(raw_ds)
print(features)
print(label)


OrderedDict([('pickup_longitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-73.982315], dtype=float32)>), ('pickup_latitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.776146], dtype=float32)>), ('dropoff_longitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-73.98469], dtype=float32)>), ('dropoff_latitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.768337], dtype=float32)>), ('passenger_count', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>)])
tf.Tensor([4.], shape=(1,), dtype=float32)


In [74]:
# lets create a map function
def map_features_label(features, target_label, unwanted_columns):
    """Split a dictionary of columns into ``(features, label)``.

    Works on a shallow copy, so the mapping passed in is left untouched and
    the function can be applied to the same mapping more than once.

    Args:
        features: Mapping of column name to value; must provide ``copy()``.
        target_label: Name of the column to extract as the label.
        unwanted_columns: Iterable of column names to drop from the features.

    Returns:
        A ``(features, label)`` tuple: a copy of ``features`` (same mapping
        type) without the label and unwanted columns, and the label value.

    Raises:
        KeyError: If ``target_label`` or an unwanted column is not present.
    """
    # copy() keeps the mapping type (dict stays dict, OrderedDict stays OrderedDict).
    remaining = features.copy()
    label = remaining.pop(target_label)
    for col in unwanted_columns:
        remaining.pop(col)
    return (remaining, label)

raw_ds = build_dataset(file_pattern, 1, CSV_COLUMNS, DEFAULTS)

# map() calls the function once per element with that element as its only
# argument; the lambda supplies the two remaining arguments.
features_labels_ds = raw_ds.map(
    lambda features: map_features_label(
        features, target_label=TARGET_LABEL, unwanted_columns=UNWANTED_COLS
    )
)

# check everything went well
for ds in features_labels_ds.take(1):
    features, label = ds

    # Check every dropped column rather than hard-coded indexes, so the check
    # stays valid whatever the length of UNWANTED_COLS.
    for col in UNWANTED_COLS:
        assert col not in features.keys(), f'{col} was not removed'
    # The target must have moved out of the features as well.
    assert TARGET_LABEL not in features.keys(), f'{TARGET_LABEL} is still a feature'
    # The dataset was built with a batch size of 1, so the label holds one value.
    assert label.shape == [1]
    print("----OK---")
    print(features)
    print(label.shape)


    


----OK---
OrderedDict([('pickup_longitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-73.94956], dtype=float32)>), ('pickup_latitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.781025], dtype=float32)>), ('dropoff_longitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-73.953926], dtype=float32)>), ('dropoff_latitude', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.779068], dtype=float32)>), ('passenger_count', <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>)])
(1,)


# Batch and shuffle

Finally, let's create a function that receives the batch size as a parameter, divides the dataset into features and labels, and shuffles the data for training.