## Sparse and variable length features to fixed-size embeddings

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Record the TensorFlow version the outputs below were produced with.
print(tf.__version__)
# An InteractiveSession installs itself as the default session, which is what
# lets the bare `.eval()` calls in the following cells run without an explicit
# session argument.
sess = tf.InteractiveSession()

1.1.0


In [2]:
# Toy interaction log: one row per (user, item) pair.
user_history = pd.DataFrame(
    {'user': [0, 0, 1], 'item': [1, 10, 3]},
    columns=['user', 'item'],
)

user_history

Unnamed: 0,user,item
0,0,1
1,0,10
2,1,3


In [3]:
from scipy import sparse

# One row per user and one column per item id; a stored 1 marks an interaction.
# No explicit shape is passed, so scipy infers it from the largest user and
# item ids present in `user_history`.
sparse_history = sparse.coo_matrix(
    (
        np.ones_like(user_history.user),
        (user_history.user, user_history.item),
    )
)
sparse_history

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in COOrdinate format>

In [4]:
# Dense view of the toy matrix, for visual inspection.
sparse_history.toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

## How to look up embeddings for our particular users?

In [5]:
# Number of users and embedding width for this toy example.
N_USER = 3
K = 4
# A fixed op-level seed makes the initial embedding values identical on every
# fresh kernel run, so the matrix printed below is reproducible.
user_embeddings = tf.Variable(
    tf.random_normal([N_USER, K], stddev=0.01, mean=0, seed=0))
sess.run(user_embeddings.initializer)

user_embeddings.eval()

array([[ 0.02118576,  0.00100403, -0.01279619, -0.0022322 ],
       [-0.00010485,  0.00948657,  0.00267715,  0.00854996],
       [-0.00351496, -0.00669028,  0.00896358,  0.00918774]], dtype=float32)

In [6]:
# No problem with fixed-size inputs: there is exactly one id per sample, so a
# plain dense lookup returns one embedding row per id.
batch_user_ids = [0, 2]
tf.nn.embedding_lookup(user_embeddings, batch_user_ids).eval()

array([[ 0.02118576,  0.00100403, -0.01279619, -0.0022322 ],
       [-0.00351496, -0.00669028,  0.00896358,  0.00918774]], dtype=float32)

## Feeding `tf.SparseTensorValue` to `tf.nn.embedding_lookup_sparse`

In [7]:
# Example: three rows holding 2, 3 and 1 elements respectively.
row_indexes = np.array([0, 0, 1, 1, 1, 2])

def intra_row_index(row_indexes):
    """Return, for every element, its position within its own row.

    Elements that share a row id are numbered 0, 1, 2, ... in their order of
    appearance. The row ids may come in any order; they do not have to be
    sorted or grouped.

    Args:
        row_indexes: 1-D array-like of non-negative integer row ids, one per
            element.

    Returns:
        1-D integer numpy array of the same length as `row_indexes`.
    """
    row_indexes = np.asarray(row_indexes)
    if row_indexes.size == 0:
        return np.arange(0)
    # Stable sort: elements of the same row keep their order of appearance.
    order = np.argsort(row_indexes, kind='mergesort')
    sorted_rows = row_indexes[order]
    count_by_row = np.bincount(sorted_rows)
    # shift_by_row[r] = number of elements belonging to rows smaller than r,
    # i.e. the offset at which row r starts once elements are grouped by row.
    shift_by_row = np.concatenate([[0], np.cumsum(count_by_row)])
    sorted_positions = np.arange(len(sorted_rows)) - shift_by_row[sorted_rows]
    # Scatter the positions back to the caller's original element order.
    positions = np.empty_like(sorted_positions)
    positions[order] = sorted_positions
    return positions

intra_row_index(row_indexes)

array([0, 1, 0, 1, 2, 0])

In [8]:
def sparse_features_to_tensor(batch_features):
    """Convert a scipy sparse matrix into a `tf.SparseTensorValue` of column ids.

    Adapted from
    https://github.com/tensorflow/tensorflow/issues/342#issuecomment-160354041

    The result is not a sparse copy of the matrix but rather a kind of jagged
    array: row `i` lists, left-aligned, the column ids stored in row `i` of
    `batch_features`, so every batch sample can carry a different number of
    features.

    Args:
        batch_features: scipy sparse matrix (anything exposing `.tocoo()`)
            with one row per batch sample and one column per feature id.

    Returns:
        tf.SparseTensorValue whose `indices` are (row, position-within-row)
        pairs, whose `values` are the stored column ids and whose
        `dense_shape` is the shape of `batch_features`.
    """
    batch_features_as_coo = batch_features.tocoo()
    # COO entries are not guaranteed to be stored row by row. Group them by row
    # with a stable sort so the positions within each row are contiguous and
    # the resulting indices come out in row-major order.
    order = np.argsort(batch_features_as_coo.row, kind='mergesort')
    rows = batch_features_as_coo.row[order]
    cols = batch_features_as_coo.col[order]
    batch_features_sparse_tensor = tf.SparseTensorValue(
        indices=np.vstack([rows, intra_row_index(rows)]).T,
        values=cols,
        dense_shape=batch_features_as_coo.shape
    )
    return batch_features_sparse_tensor

sparse_features_to_tensor(sparse_history)

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0]]), values=array([ 1, 10,  3], dtype=int32), dense_shape=(2, 11))

In [9]:
# One embedding row per item id, from 0 up to the largest id seen.
N_ITEMS = 1 + user_history.item.max()
K = 5
# Constant, easy-to-read embeddings: row i holds K*i .. K*i + K - 1, which
# makes the summed look-ups in the next cells checkable by eye.
item_embeddings = tf.constant(
    np.arange(N_ITEMS * K, dtype=np.float64).reshape(N_ITEMS, K))

item_embeddings.eval()

array([[  0.,   1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   8.,   9.],
       [ 10.,  11.,  12.,  13.,  14.],
       [ 15.,  16.,  17.,  18.,  19.],
       [ 20.,  21.,  22.,  23.,  24.],
       [ 25.,  26.,  27.,  28.,  29.],
       [ 30.,  31.,  32.,  33.,  34.],
       [ 35.,  36.,  37.,  38.,  39.],
       [ 40.,  41.,  42.,  43.,  44.],
       [ 45.,  46.,  47.,  48.,  49.],
       [ 50.,  51.,  52.,  53.,  54.]])

In [10]:
# Eager variant: unpack the SparseTensorValue fields into a graph-side
# SparseTensor, sum the embeddings of each user's items and evaluate at once.
user_history_embeddings = tf.nn.embedding_lookup_sparse(
    params=item_embeddings,
    sp_ids=tf.SparseTensor(*sparse_features_to_tensor(sparse_history)),
    sp_weights=None,
    combiner='sum',
).eval()

In [11]:
# Placeholder variant: the lookup graph is built once and each batch is fed
# lazily at evaluation time.
sparse_history_holder = tf.sparse_placeholder(dtype=tf.int32, name='user_history')


def to_feed_dict(sparse_history):
    """Build the feed dict binding one batch to `sparse_history_holder`.

    `sparse_history` is converted with `sparse_features_to_tensor` before
    being fed.
    """
    batch_value = sparse_features_to_tensor(sparse_history)
    return {sparse_history_holder: batch_value}


user_history_embeddings = tf.nn.embedding_lookup_sparse(
    params=item_embeddings,
    sp_ids=sparse_history_holder,
    sp_weights=None,
    combiner='sum',
)

user_history_embeddings.eval(feed_dict=to_feed_dict(sparse_history))

array([[ 55.,  57.,  59.,  61.,  63.],
       [ 15.,  16.,  17.,  18.,  19.]])

## Using raw string features from `pandas` together with `tf.contrib.layers` feature columns

In [12]:
# Long ("exploded") layout: one row per (item, genre) pair, indexed by item.
exploded_item_features = pd.DataFrame(
    {'item': [1, 1, 2], 'genre': ['rock', 'pop', 'classical']}
).set_index('item')

# Wide layout: one row per item holding the array of all its genres.
item_features = (
    exploded_item_features
    .reset_index()
    .groupby('item')
    .genre
    .apply(np.array)
    .to_frame('genres')
)

item_features

Unnamed: 0_level_0,genres
item,Unnamed: 1_level_1
1,"[rock, pop]"
2,[classical]


In [13]:
# Long-format genres table: one row per (item, genre) pair.
exploded_item_features

Unnamed: 0_level_0,genre
item,Unnamed: 1_level_1
1,rock
1,pop
2,classical


In [14]:
# Per-user side information, indexed by user id.
user_features = pd.DataFrame(
    {'user': [3, 4], 'age': ['young', 'old']}
).set_index('user')
user_features

Unnamed: 0_level_0,age
user,Unnamed: 1_level_1
3,young
4,old


In [15]:
# One row per training sample. `sample_index` is kept both as the index and as
# a regular column (drop=False), so it survives when the frame is re-indexed
# by another column.
batch_events = (
    pd.DataFrame(
        data=[(3, 1, True, 0),
              (4, 2, False, 1)],
        columns=['user', 'item', 'is_clicked', 'sample_index'])
    .set_index(keys='sample_index', drop=False)
)

batch_events

Unnamed: 0_level_0,user,item,is_clicked,sample_index
sample_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3,1,True,0
1,4,2,False,1


In [16]:
# Attach the genres of each event's item; a sample whose item has several
# genres is repeated, one row per genre.
# Rows are ordered by sample (stable sort) and each genre's position is counted
# within its *sample*, not within its item. That keeps
# (sample_index, intra_sample_index) valid row-major sparse indices even when
# several samples share an item or items are not in sample order.
batch_item_featurised_events = batch_events.set_index('item')\
    .join(exploded_item_features)\
    .sort_values('sample_index', kind='mergesort')\
    .assign(intra_sample_index=lambda df: intra_row_index(df['sample_index'].values))

batch_item_featurised_events[['sample_index', 'intra_sample_index', 'genre']]

Unnamed: 0_level_0,sample_index,intra_sample_index,genre
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,rock
1,0,1,pop
2,1,0,classical


In [27]:
def to_sparse_tensor(featurised_samples, column):
    """Build a `tf.SparseTensor` holding the values of `column` per sample.

    Args:
        featurised_samples: DataFrame with integer columns `sample_index` and
            `intra_sample_index` plus the feature column named by `column`;
            one row per (sample, feature value) pair.
        column: name of the column whose values fill the tensor.

    Returns:
        tf.SparseTensor with entries at (sample_index, intra_sample_index),
        listed in row-major order, and a dense shape of
        (max sample_index + 1, max intra_sample_index + 1).
    """
    indices = featurised_samples[['sample_index', 'intra_sample_index']].values
    values = featurised_samples[column].values
    # SparseTensor does not validate index ordering, but most ops assume
    # row-major order, so sort here. np.lexsort uses its LAST key as the
    # primary key: sample_index first, then intra_sample_index.
    row_major = np.lexsort((indices[:, 1], indices[:, 0]))
    sample_sparse_features = tf.SparseTensor(
        indices=indices[row_major],
        values=values[row_major],
        dense_shape=(
            featurised_samples.sample_index.max() + 1,
            featurised_samples.intra_sample_index.max() + 1)
    )
    return sample_sparse_features

to_sparse_tensor(batch_item_featurised_events, column='genre').eval()

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0]]), values=array([b'rock', b'pop', b'classical'], dtype=object), dense_shape=array([2, 2]))

In [18]:
from tensorflow.contrib import layers

# Sparse string tensor of the batch's genres (one row per sample). It is built
# here explicitly so this cell runs on a fresh kernel instead of relying on a
# variable left over from an earlier session.
sparse_genres = to_sparse_tensor(batch_item_featurised_events, column='genre')

# Vocabulary column: maps each genre string to its position in `keys`.
genres_feature = layers.sparse_column_with_keys(column_name='genres', keys=(b'pop', b'rock', b'classical'))

# Hand-picked embedding matrix (row i = [2*i, 2*i + 1]) so the printed batch
# embeddings can be checked by eye against the printed matrix.
embedding_dim = 2
embedding_matrix = np.arange(genres_feature.length * embedding_dim).reshape((genres_feature.length, -1))
genres_embedding_column = layers.embedding_column(
    genres_feature,
    dimension=embedding_dim,
    initializer=tf.constant_initializer(embedding_matrix)
)

genres_one_hot_column = layers.one_hot_column(genres_feature)

# Two dense views of the same sparse input: pooled embeddings and multi-hot.
genres_embeddings = layers.input_from_feature_columns({'genres': sparse_genres}, feature_columns=[genres_embedding_column])
genres_on_hot = layers.input_from_feature_columns({'genres': sparse_genres}, feature_columns=[genres_one_hot_column])

# Both the embedding variables and the vocabulary lookup tables must be
# initialised before anything can be evaluated.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

print('batch one-hot tensor:')
print(genres_on_hot.eval())
print('genres embeddings:')
print(embedding_matrix)
print('batch embeddings:')
print(genres_embeddings.eval())

batch one-hot tensor:
[[ 1.  1.  0.]
 [ 0.  0.  1.]]
genres embeddings:
[[0 1]
 [2 3]
 [4 5]]
batch embeddings:
[[ 1.  2.]
 [ 4.  5.]]


## Crossed (feature interaction) column

In [28]:
# Attach each event's user features.
# Rows are ordered by sample (stable sort) and positions are counted within
# each *sample*, not within each user. That keeps
# (sample_index, intra_sample_index) valid row-major sparse indices even when
# one user appears in several samples or users are not in sample order.
batch_user_featurised_events = batch_events.set_index('user')\
    .join(user_features)\
    .sort_values('sample_index', kind='mergesort')\
    .assign(intra_sample_index=lambda df: intra_row_index(df['sample_index'].values))

sparse_age = to_sparse_tensor(batch_user_featurised_events, column='age')
batch_user_featurised_events[['age', 'sample_index', 'intra_sample_index']]

Unnamed: 0_level_0,age,sample_index,intra_sample_index
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,young,0,0
4,old,1,0


In [24]:
# Vocabulary column for the user's age bucket.
age_feature = layers.sparse_column_with_keys(column_name='age', keys=('young', 'old'))

# Cross of age x genres, hashed into 10,000 buckets.
# NOTE(review): hash_key=0 presumably pins the hash for reproducible bucket
# assignment — confirm against the tf.contrib.layers documentation.
age_x_genres_feature = layers.crossed_column([age_feature, genres_feature], hash_key=0, hash_bucket_size=int(1e4))
age_x_genres_feature

_CrossedColumn(columns=(_SparseColumnKeys(column_name='age', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('young', 'old'), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string), _SparseColumnKeys(column_name='genres', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=(b'pop', b'rock', b'classical'), num_oov_buckets=0, vocab_size=3, default_value=-1), combiner='sum', dtype=tf.string)), hash_bucket_size=10000, hash_key=0, combiner='sum', ckpt_to_load_from=None, tensor_name_in_ckpt=None)

In [30]:
# Linear model with a single output over the two vocabulary columns and their
# cross, with its variables created under the 'linear' scope.
# The call returns three values, unpacked here as the logits, the per-column
# variables and the bias.
logits, var_by_cols, bias = layers.weighted_sum_from_feature_columns(
    {'genres': sparse_genres, 'age': sparse_age},
    feature_columns=[age_feature, genres_feature, age_x_genres_feature],
    num_outputs=1,
    scope='linear'
)

INFO:tensorflow:Created variable linear/bias_weight:0, with device=


In [32]:
# Initialise the variables created for the linear model (this re-initialises
# every other global variable as well).
sess.run(tf.global_variables_initializer())
# The vocabulary columns used by the linear model rely on lookup tables, which
# must be initialised too; otherwise this cell depends on table state left
# over from an earlier cell run.
sess.run(tf.tables_initializer())

logits.eval()

array([[ 0.],
       [ 0.]], dtype=float32)