**Understanding data processing across one model API(Here DeepFM for example) pipeline** using varied datasets.
* Movielens dataset

In [69]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [72]:
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat,get_fixlen_feature_names

In [240]:
dataframe = pd.read_csv("./movielens_sample.txt")
dataframe.head(7)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009
5,2996,2799,1,972769867,Problem Child 2 (1991),Comedy,M,18,0,63011
6,3087,837,5,969738869,Matilda (1996),Children's|Comedy,F,1,1,90802


In [76]:
sparse_features = ["movie_id", "user_id","gender", "age", "occupation", "zip"]
y = ['rating']

In [82]:
dataframe[sparse_features].head(5)#before ordinal encoding of sparse features

Unnamed: 0,movie_id,user_id,gender,age,occupation,zip
0,235,3299,F,25,4,19119
1,3256,3630,M,18,4,77005
2,105,517,F,25,14,55408
3,2115,785,M,18,19,29307
4,909,5848,M,50,20,20009


In [84]:
for feat in sparse_features:#performs an ordinal encoding
    lbe = LabelEncoder()
    dataframe[feat]= lbe.fit_transform(dataframe[feat])
dataframe[sparse_features].head(5)#After ordinal encoding of sparse features

Unnamed: 0,movie_id,user_id,gender,age,occupation,zip
0,12,107,0,2,4,35
1,169,123,1,1,4,118
2,6,12,0,2,13,99
3,112,21,1,1,18,55
4,45,187,1,5,19,41


In [86]:
#This intakes the unique numb of features in each field & returns a named tuple
#containing params for every sparse field
fixlen_feature_columns = [SparseFeat(feat, dataframe[feat].nunique())
                          for feat in sparse_features]

fixlen_feature_columns

[SparseFeat(name='movie_id', dimension=187, use_hash=False, dtype='int32', embedding_name='movie_id', embedding=True),
 SparseFeat(name='user_id', dimension=193, use_hash=False, dtype='int32', embedding_name='user_id', embedding=True),
 SparseFeat(name='gender', dimension=2, use_hash=False, dtype='int32', embedding_name='gender', embedding=True),
 SparseFeat(name='age', dimension=7, use_hash=False, dtype='int32', embedding_name='age', embedding=True),
 SparseFeat(name='occupation', dimension=20, use_hash=False, dtype='int32', embedding_name='occupation', embedding=True),
 SparseFeat(name='zip', dimension=188, use_hash=False, dtype='int32', embedding_name='zip', embedding=True)]

In [96]:
??SparseFeat

In [94]:
import inspect
inspect.getargspec(SparseFeat)

  


ArgSpec(args=['cls', 'name', 'dimension', 'use_hash', 'dtype', 'embedding_name', 'embedding'], varargs=None, keywords=None, defaults=(False, 'int32', None, True))

* Following is analogous to class `SpareseFeat`;
  But Assigned diff. names to class & namedtuple just for test.

In [150]:
from collections import namedtuple


class SpFt(namedtuple('SparFt', ['name', 'dimension', 'use_hash', 'dtype','embedding_name','embedding'])):
    __slots__ = ()

    def __new__(cls, name, dimension, use_hash=False, dtype="int32", embedding_name=None,embedding=True):
        if embedding and embedding_name is None:
            embedding_name = name
        return super(SpFt, cls).__new__(cls, name, dimension, use_hash, dtype, embedding_name,embedding)

SpFt(sparse_features[0],dataframe[sparse_features[0]].nunique()) 

SpFt(name='movie_id', dimension=187, use_hash=False, dtype='int32', embedding_name='movie_id', embedding=True)

In [152]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

#basically same feature column is added twice under different column names(linear & dnn)

fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
print(fixlen_feature_names)

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip']


In [151]:
??get_fixlen_feature_names

* `get_fixlen_feature_names(fixlen_feature_columns)` internally calls `build_input_features(feature_cols)` and Creates an
`Input()` layer shaped (None,1) for each `sparse_features` but just returns the name of those layers which is equivalent to:

`[fixlen_feature_columns[idx].name for idx in len(fixlen_feature_columns)]`


* Now `fixlen_feature_columns` is passed to `DeepFM(fixlen_feature_columns)` to create `model` for training: Piecewise executing DeepFM() code


In [155]:
??DeepFM
#DeepFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, use_fm=True,
#dnn_hidden_units=(128, 128), l2_reg_linear=0.00001, l2_reg_embedding=0.00001,
#l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0, dnn_activation='relu',
#dnn_use_bn=False, task='binary'):

* **`DeepFM(. . .)` line 1 & 2**

In [158]:
from deepctr.inputs import *

In [165]:
features = build_input_features(linear_feature_columns + dnn_feature_columns)
features

OrderedDict([('movie_id',
              <tf.Tensor 'movie_id_7:0' shape=(None, 1) dtype=int32>),
             ('user_id',
              <tf.Tensor 'user_id_7:0' shape=(None, 1) dtype=int32>),
             ('gender', <tf.Tensor 'gender_7:0' shape=(None, 1) dtype=int32>),
             ('age', <tf.Tensor 'age_7:0' shape=(None, 1) dtype=int32>),
             ('occupation',
              <tf.Tensor 'occupation_7:0' shape=(None, 1) dtype=int32>),
             ('zip', <tf.Tensor 'zip_7:0' shape=(None, 1) dtype=int32>)])

In [166]:
list(features.values())

[<tf.Tensor 'movie_id_7:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'user_id_7:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'gender_7:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'age_7:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'occupation_7:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'zip_7:0' shape=(None, 1) dtype=int32>]

* **`DeepFM(. . .)` line 3**

`deepctr.inputs.input_from_feature_columns(. . .)`
takes input :
* `features`: OrderDict containing `Input layers` from  `build_input_features(. . .)` line 1.
* `fixlen_features_Columns` namedtuple of each SparseFeat object.
* embedding_size= 8
* l2_reg_embedding=0.00001
* init_std=0.0001

In [167]:
??input_from_feature_columns

In [171]:
sparse_embedding_list, dense_value_list = input_from_feature_columns(features, dnn_feature_columns,
            8,0.00001, 0.0001, 1024)
sparse_embedding_list

[<tf.Tensor 'sparse_emb_movie_id/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse_emb_user_id/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse_emb_gender/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse_emb_age/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse_emb_occupation/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse_emb_zip/Identity:0' shape=(None, 1, 8) dtype=float32>]

* input_from_feature_columns(. . .)  >  `create_embedding_matrix( )`


* **Note:** Output from `create_embedding_matrix( )` is same as `create_embedding_dict( )` i.e., can be directly called inside `input_from_feature_columns( )`

In [186]:
#create_embedding_matrix( ) is called inside input_from_feature_columns( )
#create_embedding_matrix(feature_columns,l2_reg,init_std,seed,embedding_size, prefix=prefix,seq_mask_zero=seq_mask_zero)


create_embedding_matrix(dnn_feature_columns, 0.00001,0.0001,1024,8, prefix='',seq_mask_zero=True)

{'movie_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed01a8a58>,
 'user_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed01a87f0>,
 'gender': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed01a85f8>,
 'age': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed01a8390>,
 'occupation': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed01a8080>,
 'zip': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed019f860>}

* input_from_feature_columns(. . .) > create_embedding_matrix(. . .) > `create_embedding_dict(. . .)`

.
* Requires input: `sparse_feature_columns`, `embedding_size`


.
* **Note**:  `Sparse_Feature_columns` is computed by filtering `SparseFeat` objects from `VarLensFeat` Objects, since here for `movielens_Sample`, Only `Sparse_features` were used; Therefore **`sparse_feature_columns`= `fixlen_feature_columns`**
and can be used interchangeably


In [188]:
sparse_feature_columns = list(filter(lambda x:isinstance(x,SparseFeat),fixlen_feature_columns)) if fixlen_feature_columns else []
#same as fixlen_feature_columns

[SparseFeat(name='movie_id', dimension=187, use_hash=False, dtype='int32', embedding_name='movie_id', embedding=True),
 SparseFeat(name='user_id', dimension=193, use_hash=False, dtype='int32', embedding_name='user_id', embedding=True),
 SparseFeat(name='gender', dimension=2, use_hash=False, dtype='int32', embedding_name='gender', embedding=True),
 SparseFeat(name='age', dimension=7, use_hash=False, dtype='int32', embedding_name='age', embedding=True),
 SparseFeat(name='occupation', dimension=20, use_hash=False, dtype='int32', embedding_name='occupation', embedding=True),
 SparseFeat(name='zip', dimension=188, use_hash=False, dtype='int32', embedding_name='zip', embedding=True)]

In [192]:
from tensorflow.python.keras.initializers import RandomNormal
from tensorflow.python.keras.layers import  Embedding
from tensorflow.python.keras.regularizers import l2

sparse_embedding = {feat.embedding_name: Embedding(feat.dimension, 8,
                                                 embeddings_initializer=RandomNormal(
                                                     mean=0.0, stddev=0.0001, seed=1024),
                                                 embeddings_regularizer=l2(
                                                     0.00001),
                                                 name='sparse_' + '_emb_'  + feat.name) for feat in
                            fixlen_feature_columns}

sparse_embedding

{'movie_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed0195940>,
 'user_id': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed0195d30>,
 'gender': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fecf4cf0f0>,
 'age': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fecff5f278>,
 'occupation': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fed0185b70>,
 'zip': <tensorflow.python.keras.layers.embeddings.Embedding at 0x1fecfed5048>}

In [382]:
??Embedding

* input_from_feature_columns(. . .) > `embedding_lookup(. . .)`
.


* Takes input:
* `embedding_dict`: `sparse_embedding` (i.e., output of `create_embedding_matrix()` )
* `features`: OrderedDict from `build_input_features (. . .)`
* `sparse_feature_columns` : `fixlen_feature_columns`

In [212]:
??embedding_lookup

In [194]:
sparse_embedding_list = embedding_lookup(sparse_embedding, features, fixlen_feature_columns)
sparse_embedding_list

[<tf.Tensor 'sparse__emb_movie_id_1/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse__emb_user_id_1/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse__emb_gender_1/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse__emb_age_1/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse__emb_occupation_1/Identity:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'sparse__emb_zip_1/Identity:0' shape=(None, 1, 8) dtype=float32>]

* `lookup_idx` in `embedding_lookup()` pertains to `Input_layers` in `OrderedDict` obtained from `build_input_features(. . .)` in line 1

In [219]:
features#First Input layers

OrderedDict([('movie_id',
              <tf.Tensor 'movie_id_7:0' shape=(None, 1) dtype=int32>),
             ('user_id',
              <tf.Tensor 'user_id_7:0' shape=(None, 1) dtype=int32>),
             ('gender', <tf.Tensor 'gender_7:0' shape=(None, 1) dtype=int32>),
             ('age', <tf.Tensor 'age_7:0' shape=(None, 1) dtype=int32>),
             ('occupation',
              <tf.Tensor 'occupation_7:0' shape=(None, 1) dtype=int32>),
             ('zip', <tf.Tensor 'zip_7:0' shape=(None, 1) dtype=int32>)])

In [220]:
lookup_idx = features[fixlen_feature_columns[0].name]
#Same as
#feature_name  = fc.name
#lookup_idx = sparse_input_dict[feature_name]

lookup_idx

<tf.Tensor 'movie_id_7:0' shape=(None, 1) dtype=int32>

* `lookup_idx` serves to lookup in `sparse_embedding` dictionary obtained from `create_embedding_dict(. . .)`

In [221]:
sparse_embedding[fixlen_feature_columns[0].embedding_name](lookup_idx)

#same as
#embedding_name = fc.embedding_name
#sparse_embedding_dict[embedding_name](lookup_idx)

<tf.Tensor 'sparse__emb_movie_id_3/Identity:0' shape=(None, 1, 8) dtype=float32>

In [222]:
model = DeepFM(linear_feature_columns, dnn_feature_columns,task='regression')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
______________________________________________________________________________________________

In [223]:
train, test = train_test_split(dataframe, test_size=0.2)
train_model_input = [train[name].values for name in fixlen_feature_names]
test_model_input = [test[name].values for name in fixlen_feature_names]

In [229]:
len(train_model_input)

6

In [259]:
model.compile("adam", "mse", metrics=['mse'], )

In [268]:
model.fit(train_model_input, train[y].values,
                        batch_size=256, epochs=10, verbose=0, validation_split=0.2, )

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


<tensorflow.python.keras.callbacks.History at 0x1fed0477588>

In [270]:
pred_ans = model.predict(test_model_input, batch_size=256)
pred_ans[:4]

array([[0.25272667],
       [0.21650271],
       [0.21586153],
       [0.21676101]], dtype=float32)

In [265]:
test.head(4)

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
19,99,183,3,972413564,F/X (1986),Action|Crime|Thriller,1,1,0,108
24,103,127,3,969324381,Cocoon (1985),Comedy|Sci-Fi,1,2,2,135
112,155,62,4,996034747,Apocalypse Now (1979),Drama|War,1,6,1,71
152,178,170,2,959833154,Mission to Mars (2000),Sci-Fi,0,2,6,8


In [275]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
______________________________________________________________________________________________

In [432]:
test_model_input[0]

array([183, 127,  62, 170,  74, 150,  22,  61, 104,  84, 177, 113, 173,
       179, 143, 135,  16,  34,   5,  54,  48, 101,  19,  97, 139, 167,
       155,  28,   9,  56,  91, 138, 111,  89, 147, 126,  44,  24, 182,
        39], dtype=int64)

* **Following results in embedded output after a test value passes from `movie_id (InputLayer)` to `sparse_emb_movie_id (Embedding)`**

In [316]:
from tensorflow.keras import backend as K
sparse_movie_layer= K.function([model.layers[0].input], [model.layers[6].output])
sparse_movie_output = sparse_movie_layer([test_model_input[0][0]])

print('Input layer name:', model.layers[0].name,'\nIntercepted layer name:', model.layers[6].name, '\nEmbedded Input value:',test_model_input[0][0],
      '\nSparse_emb_movie_id weights matrix shape:',model.layers[6].get_weights()[0].shape, '\nEmbedding output shape:',sparse_movie_output[0].shape)

Input layer name: movie_id 
Intercepted layer name: sparse_emb_movie_id 
Embedded Input value: 183 
Sparse_emb_movie_id weights matrix shape: (187, 8) 
Embedding output shape: (8,)


In [338]:
#movie_id : 183, is embedding as follows at sparse_emb_movie_id layer(layer 6)
print('Intermediate output at Sparse_emb_movie_id layer with input value: {}\n\nis as:\n{}'.format(test_model_input[0][0],
                                                                                             sparse_movie_output[0]))

Intermediate output at Sparse_emb_movie_id layer with input value: 183

is as:
[ 4.89938393e-05  1.42261624e-05 -1.21795136e-04 -5.80309752e-05
  1.34360107e-06  7.94252230e-07 -6.24333770e-05  4.32524612e-05]


* **Inferring equivalence with above `sparse_movie_output` embeddings by a vector product between the same test value(but one-hot encoded) and weight matrix that follows between `movie_id (InputLayer)` & `sparse_emb_movie_id (Embedding)`**:

In [329]:
input_to_sparse_emb_movie_wt = model.layers[6].get_weights()[0]
print('Weights shape for sparse_emb_movie_id layer:', input_to_sparse_emb_movie_wt.shape)

Weights shape for sparse_emb_movie_id layer: (187, 8)


In [433]:
from tensorflow.keras.utils import to_categorical

test_onehot_rep = to_categorical(test_model_input[0][0], num_classes=dataframe['movie_id'].nunique())
print('Embedded Input value: {} has a shape of: {}'.format(test_model_input[0][0], test_onehot_rep.shape))

Embedded Input value: 183 has a shape of: (187,)


In [339]:
print('Vector product output with Sparse_emb_movie_id layer weights and one-hot encoded input value:{}\n\nis as:\n{}'.format(test_model_input[0][0],
                                            np.dot(test_onehot_rep,input_to_sparse_emb_movie_wt)))

Vector product output with Sparse_emb_movie_id layer weights and one-hot encoded input value:183

is as:
[ 4.89938393e-05  1.42261624e-05 -1.21795136e-04 -5.80309752e-05
  1.34360107e-06  7.94252230e-07 -6.24333770e-05  4.32524612e-05]


* **`DeepFM()` line 4**

`deepctr.inputs.get_linear_logit(. . .)`
takes input :
* `features`: OrderDict containing `Input layers` from  `build_input_features(. . .)` line 1.
* `linear_feature_columns`:`fixlen_features_Columns` namedtuple of each SparseFeat object.
* l2_reg_linear=0.00001
* init_std=0.0001

In [392]:
linear_logit = get_linear_logit(features, linear_feature_columns, l2_reg=0.00001, init_std=0.0001,
                 seed=1024, prefix='linear')
linear_logit

<tf.Tensor 'linear_1/Identity:0' shape=(None, 1, 1) dtype=float32>

* line 1 in `get_linear_logit(. . .)` function, produces Embeddings with size 1
.
Intakes:
* `features`: OrderDict containing `Input layers` from  `build_input_features(. . .)` line 1.
* `linear_feature_columns`:`fixlen_features_Columns` namedtuple of each SparseFeat object.

Outputs list of Embeddings Shaped `(None,1,1)` like `sparse_embedding_list` of shape `(None,1,8)` in line 3 `DeepFM`

In [402]:
units=1
linear_emb_list = [input_from_feature_columns(features,linear_feature_columns,1,0.00001,0.0001,1024,prefix='linear'+str(i))[0] for i in range(units)]
linear_emb_list

[[<tf.Tensor 'linear0sparse_emb_movie_id_2/Identity:0' shape=(None, 1, 1) dtype=float32>,
  <tf.Tensor 'linear0sparse_emb_user_id_2/Identity:0' shape=(None, 1, 1) dtype=float32>,
  <tf.Tensor 'linear0sparse_emb_gender_2/Identity:0' shape=(None, 1, 1) dtype=float32>,
  <tf.Tensor 'linear0sparse_emb_age_2/Identity:0' shape=(None, 1, 1) dtype=float32>,
  <tf.Tensor 'linear0sparse_emb_occupation_2/Identity:0' shape=(None, 1, 1) dtype=float32>,
  <tf.Tensor 'linear0sparse_emb_zip_2/Identity:0' shape=(None, 1, 1) dtype=float32>]]

In [414]:
_, dense_input_list = input_from_feature_columns(features, linear_feature_columns, 1,0.00001,0.0001,1024,prefix='linear')
dense_input_list#No Dense Features

[]

In [417]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
______________________________________________________________________________________________

In [416]:
from deepctr.layers.utils import concat_fun,Linear
concat_fun(linear_emb_list[0])#correspondes to concatenate[0][0] connected to linear (Linear) layer  

<tf.Tensor 'concatenate_4/Identity:0' shape=(None, 1, 6) dtype=float32>