### Import packages and data

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names

In [4]:
# Read the sample file into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./data/criteo_sample.txt')
data.head(n=5)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,,3,260.0,,17668.0,,,33.0,,...,e5ba7672,87c6f83c,,,0429f84b,,3a171ecb,c0d61a5c,,
1,0,,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,,,5155d8a3,,be7c41b4,ded4aac9,,
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,,,2e01979f,,bcdee96c,6d5d1302,,
3,0,,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,,,e587c466,,32c7478e,3b183c5c,,
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,,32c7478e,0d4a6d1a,001f3601,92c878de


In [5]:
# Column groups used throughout the notebook: categorical fields C1..C26,
# numerical fields I1..I13, and the label column.
sparse_features = ['C' + suffix for suffix in map(str, range(1, 27))]
dense_features = ['I' + suffix for suffix in map(str, range(1, 14))]
target = ['label']

# Fill missing values: the string '-1' for categorical fields and the
# number 0 for numerical fields.
data[sparse_features] = data[sparse_features].fillna(value='-1')
data[dense_features] = data[dense_features].fillna(value=0)

### Simple preprocessing
There are two common ways to encode sparse categorical features for embedding.

* Label Encoding: map each feature's values to integers in the range 0 to (number of unique values - 1)

In [7]:
# Label-encode every sparse column in place, fitting a fresh encoder per column.
for feat in sparse_features:
    lbe = LabelEncoder()
    encoded_column = lbe.fit_transform(data[feat])
    data[feat] = encoded_column

* Hash Encoding: map the features to a fixed range, e.g. 0 to 9999. There are two ways to do that:

    * Do feature hashing before training

In [9]:
# Sketch only (kept commented out): hash each sparse column before training.
# NOTE(review): `HashEncoder` is not imported in the visible cells -- treat it
# as a placeholder for the hashing transformer of your choice and confirm the
# class and its API before uncommenting.
# for feat in sparse_features:
#     lbe = HashEncoder()
#     data[feat] = lbe.transform(data[feat])

* * Do feature hashing on the fly during the training process

We can do feature hashing by setting `use_hash=True` in `SparseFeat` or `VarlenSparseFeat` in next step.

Dense numerical features are usually discretized into buckets; here we instead normalize them to the range [0, 1].

In [10]:
# Rescale every dense column to the [0, 1] range: learn the per-column
# min/max first, then apply the scaling to the same columns.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [11]:
# Preview the first five rows after encoding and scaling.
data.head(n=5)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,0.001332,0.092362,0.0,0.034825,0.0,0.0,0.673469,0.0,...,8,66,0,0,3,0,1,96,0,0
1,0,0.0,0.0,0.00675,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,...,7,52,0,0,47,0,7,112,0,0
2,0,0.0,0.000333,0.00071,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,...,8,49,0,0,25,0,6,53,0,0
3,0,0.0,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,...,8,37,0,0,156,0,0,32,0,0
4,0,0.0,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,...,8,14,5,3,9,0,0,5,1,47


### Generate feature columns
For sparse features, we transform them into dense vectors using embedding techniques. For dense numerical features, we concatenate them to the input tensors of the fully connected layer.

For variable-length (multi-valued) sparse features, you can use `VarlenSparseFeat`.

* Label Encoding

In [53]:
# Feature columns for the label-encoded data.
# Each sparse column holds integer ids starting at 0, so its vocabulary size
# is set to the largest id + 1; every sparse field uses a 4-dimensional
# embedding, and every dense field is declared with dimension 1.
fixlen_feature_columns = (
    [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
     for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

In [14]:
# Show the first five feature columns.
fixlen_feature_columns[0:5]

[SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001918FD62D48>, embedding_name='C1', group_name='default_group', trainable=True),
 SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001918FD62348>, embedding_name='C2', group_name='default_group', trainable=True),
 SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000001918ECC45C8>, embedding_name='C3', group_name='default_group', trainable=True),
 SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, vocabulary_path=None, 

* Feature Hashing on the fly

In [38]:
# Alternative feature columns: let the model hash raw string values into
# int(1e6) buckets on the fly instead of relying on label-encoded ids.
# NOTE(review): this rebinds `fixlen_feature_columns` and declares
# dtype='string', while the label-encoding cell earlier in the notebook has
# already replaced the sparse columns with integer ids. On a top-to-bottom
# run the later cells would pick up these columns -- run only one of the two
# alternatives, and confirm the sparse columns are still strings for this one.
fixlen_feature_columns = (
    [SparseFeat(feat, vocabulary_size=int(1e6), embedding_dim=4,
                use_hash=True, dtype='string')  # the input is string
     for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

In [39]:
# Show the first five feature columns.
fixlen_feature_columns[0:5]

[SparseFeat(name='C1', vocabulary_size=1000000, embedding_dim=4, use_hash=True, vocabulary_path=None, dtype='string', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000191900E0488>, embedding_name='C1', group_name='default_group', trainable=True),
 SparseFeat(name='C2', vocabulary_size=1000000, embedding_dim=4, use_hash=True, vocabulary_path=None, dtype='string', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000191900E0F88>, embedding_name='C2', group_name='default_group', trainable=True),
 SparseFeat(name='C3', vocabulary_size=1000000, embedding_dim=4, use_hash=True, vocabulary_path=None, dtype='string', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000191900E0FC8>, embedding_name='C3', group_name='default_group', trainable=True),
 SparseFeat(name='C4', vocabulary_size=1000000, embedding_dim=4, use_hash=True, vocab

* Generate feature columns

In [54]:
# The linear part and the DNN part of the model share the same feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

# Collect the input names for the combined set of columns.
all_feature_columns = linear_feature_columns + dnn_feature_columns
feature_names = get_feature_names(all_feature_columns)

### Generate the training samples and train the model

In [55]:
# Hold out 20% of the rows for testing. A fixed random_state makes the
# shuffle deterministic, so the same rows land in train/test on every run.
train, test = train_test_split(data, test_size=0.2, random_state=2020)

In [56]:
# Preview the first three rows of the training split.
train.head(n=3)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
168,1,0.027027,0.000999,0.026998,0.045977,0.0,0.001899,0.003322,0.081633,0.003868,...,8,23,0,0,80,0,6,24,0,0
130,1,0.0,0.000666,0.003552,0.068966,0.022993,0.0,0.0,0.204082,0.005803,...,1,27,0,0,134,0,0,12,0,0
13,0,0.216216,0.0,0.0,0.0,0.001443,0.00095,0.07309,0.040816,0.001934,...,8,75,0,0,0,0,0,0,0,0


In [57]:
# Build the model inputs: one entry per feature name, mapping the name to
# that column's values from the train and test splits respectively.
train_model_input, test_model_input = {}, {}
for name in feature_names:
    train_model_input[name] = train[name].values
    test_model_input[name] = test[name].values

In [58]:
# Inspect the type of the first 'C1' value that will be fed to the model.
first_c1_value = train_model_input['C1'][0]
type(first_c1_value)

numpy.int32

In [59]:
# Build a DeepFM model for the binary task, using the same columns for its
# linear and DNN parts.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')

# Optimize with Adam on binary cross-entropy and report the same quantity
# as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [60]:
# Train for 10 epochs with batches of 256, holding out 20% of the training
# samples for validation; verbose=2 prints one line per epoch.
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/10
1/1 - 6s - loss: 0.6466 - binary_crossentropy: 0.6466 - val_loss: 0.6674 - val_binary_crossentropy: 0.6673
Epoch 2/10
1/1 - 0s - loss: 0.6303 - binary_crossentropy: 0.6303 - val_loss: 0.6626 - val_binary_crossentropy: 0.6626
Epoch 3/10
1/1 - 0s - loss: 0.6152 - binary_crossentropy: 0.6151 - val_loss: 0.6583 - val_binary_crossentropy: 0.6583
Epoch 4/10
1/1 - 0s - loss: 0.6003 - binary_crossentropy: 0.6002 - val_loss: 0.6545 - val_binary_crossentropy: 0.6545
Epoch 5/10
1/1 - 0s - loss: 0.5853 - binary_crossentropy: 0.5853 - val_loss: 0.6512 - val_binary_crossentropy: 0.6512
Epoch 6/10
1/1 - 0s - loss: 0.5704 - binary_crossentropy: 0.5704 - val_loss: 0.6488 - val_binary_crossentropy: 0.6487
Epoch 7/10
1/1 - 0s - loss: 0.5556 - binary_crossentropy: 0.5555 - val_loss: 0.6469 - val_binary_crossentropy: 0.6469
Epoch 8/10
1/1 - 0s - loss: 0.5405 - binary_crossentropy: 0.5405 - val_loss: 0.6459 - val_binary_crossentropy: 0.6459
Epoch 9/10
1/1 - 0s - loss: 0.5254 - binary_crossentropy