In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import utils

In [2]:
# Load the three data splits via the project-local `utils` module and report
# how many rows each one holds.
df_train, df_val, df_test = utils.load_data()
print(f"len(df_train) = {len(df_train)}, len(df_val) = {len(df_val)}, len(df_test) = {len(df_test)}")

len(df_train) = 7383, len(df_val) = 1846, len(df_test) = 2308


In [3]:
# Preview the first rows of the training split to see the available columns
# (features plus the `target` label).
df_train.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
2219,Dog,8,Mixed Breed,Female,Brown,No Color,Medium,Medium,Yes,No,Minor Injury,0,2,1
2015,Dog,5,Doberman Pinscher,Female,Black,Brown,Medium,Medium,Yes,Yes,Healthy,0,5,1
4584,Cat,12,Domestic Medium Hair,Male,Yellow,White,Large,Medium,Yes,Not Sure,Healthy,0,6,0
4695,Dog,3,Mixed Breed,Female,Brown,Cream,Large,Short,Yes,No,Healthy,0,15,1
8499,Dog,2,Mixed Breed,Female,Brown,No Color,Medium,Medium,Yes,No,Healthy,0,7,1


In [4]:
# Separate the training frame into a feature table (`x`) and a label vector
# (`y`). Both are fresh objects, so `df_train` itself is left unmodified.
x = df_train.drop(columns=['target'])
y = df_train['target'].copy()

# Number of rows per batch; the example tensors in this notebook are 4 rows each.
batch_size = 4

In [5]:
# Build a tf.data pipeline from the features (as a dict keyed by column name)
# and the labels, then group the rows into batches of `batch_size`.
#
# Shuffling is left off, so the first batch lines up with the rows shown by
# `df_train.head()`. To randomise the row order, insert
# `.shuffle(buffer_size=len(x))` between `from_tensor_slices(...)` and
# `.batch(...)`.
ds = tf.data.Dataset.from_tensor_slices((dict(x), y)).batch(batch_size)

In [6]:
# Pull a single batch from the dataset and print its structure.
# `batch_x` and `batch_y` remain bound after the loop; the feature-column
# cells further down reuse `batch_x` as their example input.
for batch_x, batch_y in ds.take(1):
    # Features arrive as a dict mapping column name -> tensor.
    print(f"type(batch_x) = {type(batch_x)}")
    print(f"batch_x = {batch_x}")
    # Labels arrive as a single tensor with one entry per row of the batch.
    print(f"batch_y.shape = {batch_y.shape}")
    print(f"batch_y = {batch_y}")

type(batch_x) = <class 'dict'>
batch_x = {'Type': <tf.Tensor: shape=(4,), dtype=string, numpy=array([b'Dog', b'Dog', b'Cat', b'Dog'], dtype=object)>, 'Age': <tf.Tensor: shape=(4,), dtype=int64, numpy=array([ 8,  5, 12,  3])>, 'Breed1': <tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'Mixed Breed', b'Doberman Pinscher', b'Domestic Medium Hair',
       b'Mixed Breed'], dtype=object)>, 'Gender': <tf.Tensor: shape=(4,), dtype=string, numpy=array([b'Female', b'Female', b'Male', b'Female'], dtype=object)>, 'Color1': <tf.Tensor: shape=(4,), dtype=string, numpy=array([b'Brown', b'Black', b'Yellow', b'Brown'], dtype=object)>, 'Color2': <tf.Tensor: shape=(4,), dtype=string, numpy=array([b'No Color', b'Brown', b'White', b'Cream'], dtype=object)>, 'MaturitySize': <tf.Tensor: shape=(4,), dtype=string, numpy=array([b'Medium', b'Medium', b'Large', b'Large'], dtype=object)>, 'FurLength': <tf.Tensor: shape=(4,), dtype=string, numpy=array([b'Medium', b'Medium', b'Medium', b'Short'], dtype=object)>,

## Numeric columns

In [7]:
# Numeric column: the 'PhotoAmt' value is passed through as a dense feature.
photo_amt = tf.feature_column.numeric_column(key='PhotoAmt')

# DenseFeatures turns the column definition into a callable layer; applying it
# to the example batch yields one float32 value per row (shape (4, 1) here).
photo_amt_layer = keras.layers.DenseFeatures(feature_columns=photo_amt)
photo_amt_layer(batch_x)

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[ 2.],
       [ 5.],
       [ 6.],
       [15.]], dtype=float32)>

In [8]:
# Numeric column for 'Age'. The `age` column object is reused as the source
# of the bucketized column defined later in the notebook.
age = tf.feature_column.numeric_column(key='Age')

# Applying the layer to the example batch returns the ages as a float32
# tensor with one value per row.
age_layer = keras.layers.DenseFeatures(feature_columns=age)
age_layer(batch_x)

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[ 8.],
       [ 5.],
       [12.],
       [ 3.]], dtype=float32)>

## Bucketized columns

In [9]:
# Bucketized column: three boundaries split 'Age' into four ranges, each row
# being one-hot encoded by the range it falls into:
#   index 0: age < 1, index 1: 1 <= age < 11,
#   index 2: 11 <= age < 21, index 3: age >= 21
age_buckets = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[1, 11, 21],
)

# For the example batch (ages 8, 5, 12, 3) this gives indices 1, 1, 2, 1.
age_buckets_layer = keras.layers.DenseFeatures(feature_columns=age_buckets)
age_buckets_layer(batch_x)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]], dtype=float32)>

## Categorical columns

In [10]:
# Categorical column with a fixed vocabulary: 'Cat' -> id 0, 'Dog' -> id 1.
animal_type = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Type',
    vocabulary_list=['Cat', 'Dog'],
)

# A categorical column has to be wrapped (here: as an indicator / one-hot
# column) before DenseFeatures can consume it.
animal_type_one_hot = tf.feature_column.indicator_column(
    categorical_column=animal_type,
)

# Result for the example batch: one row per animal, two columns (Cat, Dog).
animal_type_one_hot_layer = keras.layers.DenseFeatures(
    feature_columns=animal_type_one_hot,
)
animal_type_one_hot_layer(batch_x)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)>

## Embedding columns

In [11]:
# Categorical column whose vocabulary is every distinct breed observed in the
# training split.
breed1 = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Breed1',
    vocabulary_list=df_train['Breed1'].unique(),
)

# Embedding column: each breed is represented by an 8-dimensional dense vector
# instead of a wide one-hot row.
breed1_embedding = tf.feature_column.embedding_column(
    categorical_column=breed1,
    dimension=8,
)

# No training has happened at this point, so the values returned are the
# layer's initial weights. Rows with the same breed get the same vector
# (the two 'Mixed Breed' rows of the example batch are identical).
breed1_embedding_layer = keras.layers.DenseFeatures(
    feature_columns=breed1_embedding,
)
breed1_embedding_layer(batch_x)

<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[-0.15347214, -0.60907835,  0.48046753, -0.14703505,  0.0536376 ,
        -0.36827904,  0.1981471 , -0.09632463],
       [-0.35149214,  0.05514221, -0.6010382 ,  0.09352157, -0.15485992,
         0.36382204,  0.22900997,  0.28699714],
       [-0.5057705 , -0.28656134, -0.3124    , -0.2752822 ,  0.17508066,
        -0.6928375 , -0.30052555,  0.30505255],
       [-0.15347214, -0.60907835,  0.48046753, -0.14703505,  0.0536376 ,
        -0.36827904,  0.1981471 , -0.09632463]], dtype=float32)>

## Hashed feature columns

In [12]:
# Hashed categorical column: instead of listing a vocabulary, each breed
# string is hashed into one of 10 buckets. Distinct breeds can end up in the
# same bucket when there are more breeds than buckets.
breed1_hashed = tf.feature_column.categorical_column_with_hash_bucket(
    key='Breed1',
    hash_bucket_size=10,
)

# One-hot encode the bucket id so the column can be fed to DenseFeatures.
breed1_hashed_ind = tf.feature_column.indicator_column(
    categorical_column=breed1_hashed,
)

# Result for the example batch: one row per animal, 10 columns (one per bucket).
breed1_hashed_layer = keras.layers.DenseFeatures(
    feature_columns=breed1_hashed_ind,
)
breed1_hashed_layer(batch_x)

<tf.Tensor: shape=(4, 10), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]], dtype=float32)>

## Crossed feature columns

In [13]:
# Crossed column: combines the age bucket and the animal type into a single
# categorical feature, hashing every (age bucket, type) pair into one of
# 10 buckets.
crossed_feature = tf.feature_column.crossed_column(
    keys=[age_buckets, animal_type],
    hash_bucket_size=10,
)

# One-hot encode the hashed cross so DenseFeatures can consume it.
crossed_feature_ind = tf.feature_column.indicator_column(
    categorical_column=crossed_feature,
)

# NOTE: in the recorded output all four example rows land in bucket 1, even
# though the third row (a Cat in the 11-20 age bucket) is a different
# combination from the other three (Dogs in the 1-10 bucket). That is a hash
# collision; a larger `hash_bucket_size` makes such collisions less likely.
crossed_feature_ind_layer = keras.layers.DenseFeatures(
    feature_columns=crossed_feature_ind,
)
crossed_feature_ind_layer(batch_x)

<tf.Tensor: shape=(4, 10), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)>