# Preprocessing with TensorFlow pipelines

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [7]:
# Load the raw listings; the CSV is read with the cp1252 codec and the
# default integer index (no column is used as the index).
data = pd.read_csv(
    'data/autos.csv',
    encoding='cp1252',
)

In [21]:
# Split the rows with a fixed seed so the split is reproducible.
# test_size=0.7 puts 70% of the rows in `test` and leaves 30% in `train`.
# NOTE(review): confirm this ratio is intended (rather than a 70% train split).
train, test = train_test_split(data, test_size=0.7, random_state=666)
# fill NAs with "NA"
# Only `train` is filled here; `test` keeps its missing values.
# NOTE(review): fillna('NA') is applied to every column -- presumably only
# string columns contain missing values; verify, since a numeric column with
# NaNs would receive the string 'NA' as well.
train = train.fillna('NA')

In [16]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched tf.data.Dataset of (features, label) pairs.

    Args:
        dataframe: pandas DataFrame containing a 'price' column, which is
            used as the label. The caller's frame is not modified (a copy
            is taken before the label column is popped).
        shuffle: when True, shuffle the rows with a buffer as large as the
            frame before batching.
        batch_size: number of rows per batch.

    Returns:
        A tf.data.Dataset whose elements are (dict of feature columns,
        labels) batches.
    """
    dataframe = dataframe.copy()
    labels = dataframe.pop('price')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    # Batch unconditionally: previously this line was nested under
    # `if shuffle:`, so shuffle=False returned an unbatched dataset and
    # silently ignored batch_size.
    ds = ds.batch(batch_size)
    return ds

In [23]:
batch_size = 5 # A small batch size is used for demonstration purposes
# shuffle is left at its default (True), so the rows in each batch can differ
# from run to run.
train_ds = df_to_dataset(train, batch_size=batch_size)

In [25]:
# Peek at a single (features, targets) batch to see what the dataset yields.
for features, targets in train_ds.take(1):
    print('Every feature:', list(features))
    print('A batch of vehicleType:', features['vehicleType'])
    print('A batch of targets:', targets)

Every feature: ['dateCrawled', 'name', 'seller', 'offerType', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'kilometer', 'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen']
A batch of vehicleType: tf.Tensor([b'kleinwagen' b'coupe' b'bus' b'kleinwagen' b'kleinwagen'], shape=(5,), dtype=string)
A batch of targets: tf.Tensor([3000 3340 2200 1850 3850], shape=(5,), dtype=int64)


In [32]:
# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column, batch=None):
    """Print the dense output a feature column produces for one batch.

    Args:
        feature_column: the feature column to wrap in a
            tf.keras.layers.DenseFeatures layer.
        batch: the features to feed through the layer. When omitted, the
            notebook-level `example_batch` is used, which keeps existing
            one-argument calls working.
    """
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    if batch is None:
        # Fall back to the global; it is looked up at call time, so it only
        # has to exist before demo() is first called without a batch.
        batch = example_batch
    print(feature_layer(batch).numpy())

In [34]:
# We will use this batch to demonstrate several types of feature columns
# Element 0 of the first (features, labels) pair yielded by train_ds is kept,
# i.e. the features dict; the labels are discarded.
example_batch = next(iter(train_ds))[0]

In [42]:
# Numeric feature column over the "powerPS" feature; demo() prints what the
# column produces for the example batch.
powerPS = tf.feature_column.numeric_column("powerPS")
demo(powerPS)

[[101.]
 [107.]
 [ 98.]
 [150.]
 [ 50.]]


In [46]:
# Categorical column for vehicleType whose vocabulary is every distinct value
# present in the training frame (missing values were replaced with the string
# 'NA' earlier, so 'NA' is part of the vocabulary if any were missing).
# Accessed via `tf.feature_column`: the bare name `feature_column` is never
# imported in this notebook and raises NameError on a fresh kernel.
vehicleType = tf.feature_column.categorical_column_with_vocabulary_list(
      'vehicleType', train.vehicleType.unique().tolist())

# Wrap the categorical column in an indicator column to get a one-hot encoding.
vehicleType_one_hot = tf.feature_column.indicator_column(vehicleType)
demo(vehicleType_one_hot)

[[0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [48]:
# Notice the input to the embedding column is the categorical column
# we previously created
# Accessed via `tf.feature_column`: the bare name `feature_column` is never
# imported in this notebook and raises NameError on a fresh kernel.
vehicleType_embedding = tf.feature_column.embedding_column(vehicleType, dimension=3)
demo(vehicleType_embedding)

[[-0.05821997 -0.83590156 -0.3073879 ]
 [-0.05821997 -0.83590156 -0.3073879 ]
 [ 0.02701506  0.2944842  -0.7061393 ]
 [ 0.47805908  0.01964838 -0.6644369 ]
 [ 0.8896897   0.36731052 -0.40896273]]
