In [1]:
import tensorflow as tf
import numpy as np
import random

In [2]:
N_SAMPLES = 1000

# Numeric column

In [3]:
numeric_data1 = np.random.rand(N_SAMPLES, 10, 10)
numeric_data2 = np.random.rand(N_SAMPLES, 1)

In [4]:
features = {
    'feature1': numeric_data1,
    'feature2': numeric_data2
}

In [5]:
feature_columns = [
    tf.feature_column.numeric_column(key='feature1', shape=(10,10), 
                                     normalizer_fn=lambda x: (x - tf.reduce_mean(x)) / tf.math.reduce_std(x)),
    tf.feature_column.numeric_column(key='feature2')
]

In [6]:
# inputs = tf.feature_column.input_layer(features, feature_columns) # old feature column api

inputs = tf.keras.layers.DenseFeatures(feature_columns)(features) # new feature column api

inputs_feature1 = tf.keras.layers.DenseFeatures([feature_columns[0]])(features) # choose feature1 only

inputs_feature2 = tf.keras.layers.DenseFeatures([feature_columns[1]])(features) # choose feature2 only

In [7]:
with tf.Session() as sess:
    f, f1, f2 = sess.run([inputs, inputs_feature1, inputs_feature2])
    print('concatenated input shape: ',f.shape)
    print('feature 1 shape:', f1.shape)
    print('feature 1 value:\n', f1)
    print('feature 2 shape:', f2.shape)
    print('feature 2 value:\n', f2)

concatenated input shape:  (1000, 101)
feature 1 shape: (1000, 100)
feature 1 value:
 [[ 0.13960977  1.717567   -0.22334877 ... -0.5878275  -1.2625245
   1.2483596 ]
 [ 1.05701    -1.2229859  -0.49405834 ... -0.20993076  0.46550375
  -1.487517  ]
 [-1.0635902  -1.1714975   0.10464408 ...  1.3426212   0.83771247
   0.6696852 ]
 ...
 [ 0.95487833  1.0591125  -0.04107469 ... -0.6802772   1.2699447
   0.732014  ]
 [ 0.88459957 -1.3132019   0.35951266 ... -0.0935734   0.98929673
  -0.00619706]
 [ 0.8740767   0.591025   -1.3675158  ... -0.42972657 -0.1734219
   1.3166394 ]]
feature 2 shape: (1000, 1)
feature 2 value:
 [[6.68044150e-01]
 [1.03850132e-02]
 [3.16076577e-01]
 [5.75511217e-01]
 [4.63941127e-01]
 [5.14111519e-01]
 [1.41016111e-01]
 [2.63828933e-01]
 [7.00461715e-02]
 [5.14497459e-01]
 [3.40543091e-01]
 [7.10360110e-01]
 [4.13701117e-01]
 [5.16726792e-01]
 [3.21524203e-01]
 [3.34047943e-01]
 [9.63347673e-01]
 [9.75353658e-01]
 [3.95739794e-01]
 [3.02188009e-01]
 [8.09446350e-02]
 [

# Categorical vocabulary column

In [8]:
vocabulary1 = ['red', 'blue', 'green']
vocabulary2 = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']

In [9]:
categorical_data1 = random.choices(vocabulary1, k=N_SAMPLES)
categorical_data2 = random.choices(vocabulary2, k=N_SAMPLES)

In [10]:
features = {
    'feature1': categorical_data1,
    'feature2': categorical_data2
}

In [11]:
categorical_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(key='feature1', vocabulary_list=vocabulary1),
    tf.feature_column.categorical_column_with_vocabulary_list(key='feature2', vocabulary_list=vocabulary2)
]

Before feeding into a DenseFeatures layer, we must wrap categorical column with an indicator column (one-hot encoding) or an embedding column.

In [12]:
embedding_dimensions = round(len(vocabulary2) ** 0.25) # rule of thumb for embedding dimension

feature_columns = [
    tf.feature_column.indicator_column(categorical_columns[0]), # one-hot encoding
    tf.feature_column.embedding_column(categorical_columns[1], dimension=embedding_dimensions) # embedding
]

In [13]:
inputs_onehot = tf.keras.layers.DenseFeatures(feature_columns[0])(features)
inputs_embed = tf.keras.layers.DenseFeatures(feature_columns[1])(features)

W0718 12:15:52.724051 14436 deprecation.py:323] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:2655: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0718 12:15:52.736049 14436 deprecation.py:323] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:4207: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0718 12:15:52.737017 14436 deprecation.py:323] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:4262: VocabularyLis

Feature columns can have internal state, like layers, so they often need to be initialized. Categorical columns use tf.contrib.lookup internally and these require a separate initialization op, tf.tables_initializer.

In [14]:
var_init = tf.global_variables_initializer()
table_init = tf.tables_initializer()

In [15]:
with tf.Session() as sess:
    sess.run([var_init, table_init]) # initialize categorical columns
    f_onehot, f_embed = sess.run([inputs_onehot, inputs_embed])
    print('First 5 one-hot samples: \n', f_onehot[:5])
    print('First 5 embed samples: \n', f_embed[:5])

First 5 one-hot samples: 
 [[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
First 5 embed samples: 
 [[-0.15050048  1.2738632 ]
 [ 0.00187978  0.21390086]
 [ 0.00187978  0.21390086]
 [ 0.66565     0.09505693]
 [-0.15050048  1.2738632 ]]


# Bucketized column

In [16]:
numeric_data = np.random.randint(low=1960, high=2000, size=(N_SAMPLES, 1))

In [17]:
features = {
    'year': numeric_data
}

First, convert the raw input to a numeric column.

In [18]:
numeric_columns = [
    tf.feature_column.numeric_column(key='year', shape=(1,))
]

Then, bucketize the numeric column on the years 1970, 1980, and 1990.

In [19]:
feature_columns = [
    tf.feature_column.bucketized_column(source_column=numeric_columns[0], boundaries=[1970, 1980, 1990])
]

Note that specifying a three-element boundaries vector creates a four-element bucketized vector.

In [20]:
inputs = tf.keras.layers.DenseFeatures(feature_columns)(features)

In [21]:
with tf.Session() as sess:
    f = sess.run(inputs)
    print('Inputs shape:', f.shape)
    print('First 5 bucketized samples:\n', f[:5])
    print('First 5 raw values:\n', numeric_data[:5])

Inputs shape: (1000, 4)
First 5 bucketized samples:
 [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
First 5 raw values:
 [[1960]
 [1997]
 [1991]
 [1980]
 [1989]]


# Categorical identity column

In [37]:
vocabulary = {'cat': 0, 'dog': 1, 'bird': 2, 'fish': 3}
num_buckets = len(vocabulary)

In [38]:
categorical_data = random.choices(list(vocabulary.keys()), k=N_SAMPLES)
integer_data = np.array([vocabulary[i] for i in categorical_data]).reshape(N_SAMPLES,1)

In [39]:
features = {
    'category_id': integer_data
}

In [40]:
categorical_columns = [
    tf.feature_column.categorical_column_with_identity(key='category_id', num_buckets=num_buckets)
]

In [41]:
feature_columns = [
    tf.feature_column.indicator_column(categorical_columns[0])
]

In [42]:
inputs = tf.keras.layers.DenseFeatures(feature_columns)(features)

W0718 13:30:46.433690 14436 deprecation.py:323] From f:\anaconda3\envs\tensorflow1.14\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py:4262: IdentityCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


In [43]:
with tf.Session() as sess:
    f = sess.run(inputs)
    print('Inputs shape:', f.shape)
    print('First 5 one-hot samples:\n', f[:5])
    print('First 5 raw values:\n', categorical_data[:5])

Inputs shape: (1000, 4)
First 5 one-hot samples:
 [[0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]]
First 5 raw values:
 ['fish', 'cat', 'fish', 'fish', 'bird']
