## Sparse and variable-length features to fixed-size embeddings

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Interactive session shared by the whole notebook: the `.eval()` calls in
# later cells are issued without an explicit session argument and rely on it.
sess = tf.InteractiveSession()

In [34]:
# Toy interaction log, one (user, item) record per row:
# user 0 interacted with items 1 and 10, user 1 with item 3.
user_history = pd.DataFrame.from_records(
    [(0, 1), (0, 10), (1, 3)],
    columns=['user', 'item'],
)

user_history

Unnamed: 0,user,item
0,0,1
1,0,10
2,1,3


In [35]:
from scipy import sparse

# User x item indicator matrix: a stored 1 for every row of `user_history`.
# No shape is passed, so it is inferred from the largest user and item ids.
sparse_history = sparse.coo_matrix((
    np.ones_like(user_history['user']),
    (user_history['user'], user_history['item']),
))
sparse_history

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in COOrdinate format>

In [36]:
# Dense view for inspection only: rows are users, columns are items,
# with a 1 wherever the (user, item) pair occurs in `user_history`.
sparse_history.toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

## How to look up embeddings for our particular users?

In [39]:
# One K-dimensional embedding row per user, initialised with small Gaussian noise.
N_USER = 3
K = 4
user_embeddings = tf.Variable(
    tf.random_normal(shape=[N_USER, K], mean=0, stddev=0.01))
# The variable must be initialised in the session before it can be evaluated.
sess.run(user_embeddings.initializer)

user_embeddings.eval()

array([[ 0.01827744, -0.00507568,  0.00115156,  0.00355386],
       [ 0.00328594,  0.00637743,  0.01012458, -0.01201905],
       [-0.01115778, -0.00643606,  0.00215107, -0.00931697]], dtype=float32)

In [40]:
# Fixed-size input is the easy case: exactly one id per sample, so a plain
# lookup returns one embedding row per id (here rows 0 and 2 of `user_embeddings`).
batch_user_ids = [0, 2]
tf.nn.embedding_lookup(user_embeddings, batch_user_ids).eval()

array([[ 0.01827744, -0.00507568,  0.00115156,  0.00355386],
       [-0.01115778, -0.00643606,  0.00215107, -0.00931697]], dtype=float32)

## Feeding `tf.SparseTensorValue` to `tf.nn.embedding_lookup_sparse`

In [41]:
# Example row ids of six sparse entries: row 0 owns 2 entries, row 1 owns 3, row 2 owns 1.
row_indexes = np.array([0, 0, 1, 1, 1, 2])

def intra_row_index(row_indexes):
    """Number each entry by its position among the entries of the same row.

    For every element of `row_indexes`, return how many earlier elements
    carry the same row id, e.g. ``[0, 0, 1, 1, 1, 2] -> [0, 1, 0, 1, 2, 0]``.
    The row ids do not need to be sorted: ``[0, 1, 0] -> [0, 0, 1]``.

    Parameters
    ----------
    row_indexes : array-like of non-negative ints, 1-D
        Row id of each entry.

    Returns
    -------
    numpy.ndarray
        Integer array of the same length holding the 0-based position of
        each entry within its row, in input order.
    """
    row_indexes = np.asarray(row_indexes)
    if row_indexes.size == 0:
        # Nothing to number; return an empty integer index array.
        return np.zeros(0, dtype=np.intp)

    # A stable sort groups equal row ids together while preserving their
    # input order, so the "start offset" trick below is valid even when the
    # caller's row ids are interleaved.
    order = np.argsort(row_indexes, kind='mergesort')
    sorted_rows = row_indexes[order]

    count_by_row = np.bincount(sorted_rows)
    # shift_by_row[r] is the offset of row r's first entry in sorted order.
    shift_by_row = np.concatenate([[0], np.cumsum(count_by_row)])
    rank_in_sorted_order = np.arange(len(sorted_rows)) - shift_by_row[sorted_rows]

    # Scatter the ranks back to the original entry order.
    intra_index = np.empty_like(rank_in_sorted_order)
    intra_index[order] = rank_in_sorted_order
    return intra_index

# Each entry is numbered from 0 within its own row.
intra_row_index(row_indexes)

array([0, 1, 0, 1, 2, 0])

In [42]:
def sparse_features_to_tensor(batch_features):
    """Convert a scipy sparse matrix into a `tf.SparseTensorValue` of column ids.

    Adapted from
    https://github.com/tensorflow/tensorflow/issues/342#issuecomment-160354041

    The result is not really "sparse" but rather a jagged array: each batch
    row lists the column ids of its stored entries, left-packed at positions
    0, 1, 2, ... so that every sample can carry between 1 and N_FEATURES
    features. Only the positions of the stored entries are used; their
    values (`.data`) are ignored.

    Parameters
    ----------
    batch_features : scipy.sparse matrix
        Anything exposing `.tocoo()`; rows are batch samples, columns are
        feature ids.

    Returns
    -------
    tf.SparseTensorValue
        `indices` are (row, position-within-row) pairs, `values` are the
        column ids, and `shape` is the shape of `batch_features`.
    """
    batch_features_as_coo = batch_features.tocoo()

    # A COO matrix keeps its entries in whatever order they were supplied.
    # Stably sort them by row so that the intra-row numbering is valid and
    # the resulting indices are grouped row by row; already-sorted input is
    # left in exactly the same order.
    order = np.argsort(batch_features_as_coo.row, kind='mergesort')
    rows = batch_features_as_coo.row[order]
    cols = batch_features_as_coo.col[order]

    # NOTE(review): newer TensorFlow releases appear to name the `shape`
    # field `dense_shape` -- confirm against the installed version.
    batch_features_sparse_tensor = tf.SparseTensorValue(
        indices=np.vstack([rows, intra_row_index(rows)]).T,
        values=cols,
        shape=batch_features_as_coo.shape
    )
    return batch_features_sparse_tensor

# Convert the toy history: the values are item ids and the second index
# column is each item's position within its user's row.
sparse_features_to_tensor(sparse_history)

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0]]), values=array([ 1, 10,  3], dtype=int32), shape=(2, 11))

In [52]:
# Deterministic item embedding table: row i holds the K consecutive values
# i*K .. i*K + K-1, which makes the lookup results easy to verify by eye.
N_ITEMS = user_history['item'].max() + 1
K = 5
item_embeddings = tf.constant(
    np.arange(N_ITEMS * K, dtype=float).reshape(N_ITEMS, K))

item_embeddings.eval()

array([[  0.,   1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   8.,   9.],
       [ 10.,  11.,  12.,  13.,  14.],
       [ 15.,  16.,  17.,  18.,  19.],
       [ 20.,  21.,  22.,  23.,  24.],
       [ 25.,  26.,  27.,  28.,  29.],
       [ 30.,  31.,  32.,  33.,  34.],
       [ 35.,  36.,  37.,  38.,  39.],
       [ 40.,  41.,  42.,  43.,  44.],
       [ 45.,  46.,  47.,  48.,  49.],
       [ 50.,  51.,  52.,  53.,  54.]])

In [51]:
# Sum the embeddings of all items in each user's history:
#   user 0 -> item_embeddings[1] + item_embeddings[10]
#   user 1 -> item_embeddings[3]
# The converter defined above is `sparse_features_to_tensor`; the previous
# name used here was never defined and failed on a fresh kernel.
tf.nn.embedding_lookup_sparse(
    item_embeddings,
    sp_ids=tf.SparseTensor(*sparse_features_to_tensor(sparse_history)),
    sp_weights=None,
    combiner='sum').eval()

array([[ 55.,  57.,  59.,  61.,  63.],
       [ 15.,  16.,  17.,  18.,  19.]])