In [73]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names


def split(x, vocab=None):
    """Encode a '|'-separated string as a list of integer ids.

    Each distinct token is assigned the next free id, starting at 1, the first
    time it is seen. Id 0 is never handed out because it is reserved as the
    padding value for sequence inputs.

    Args:
        x: String of tokens joined by '|' (e.g. ``"Action|Comedy"``).
        vocab: Optional dict mapping token -> id. It is updated in place with
            any unseen tokens. When omitted, the module-level ``key2index``
            dict is used, which keeps ``map(split, ...)`` call sites working.

    Returns:
        List of integer ids, one per token, in the order the tokens appear.
    """
    if vocab is None:
        # Fall back to the notebook-wide vocabulary for backward compatibility.
        vocab = key2index
    encoded = []
    for key in x.split('|'):
        if key not in vocab:
            # Notice : input value 0 is a special "padding", so 0 is never used to encode a valid token
            vocab[key] = len(vocab) + 1
        encoded.append(vocab[key])
    return encoded


In [74]:
data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# 1. Label-encode every sparse column in place (a fresh encoder per column),
#    then turn the '|'-separated genres column into padded id sequences.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Sequence feature: `split` fills `key2index` as it encodes each row.
key2index = {}
genres_list = [split(genres) for genres in data['genres'].values]
genres_length = np.array([len(seq) for seq in genres_list])
max_len = genres_length.max()
# Notice : padding=`post`, so the zeros are appended after the real ids
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

# 2. Count the unique values of each sparse field and build the feature
#    configuration, including the one for the sequence feature.

fixlen_feature_columns = [
    SparseFeat(feat, data[feat].max() + 1, embedding_dim=4)
    for feat in sparse_features
]

# When True the genres sequence is paired with a per-position weight input
# named 'genres_weight'; when False no weight input is declared.
use_weighted_sequence = True
varlen_feature_columns = [
    VarLenSparseFeat(
        # Notice : value 0 is for padding for sequence input feature, hence the +1
        SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
        maxlen=max_len,
        combiner='mean',
        weight_name='genres_weight' if use_weighted_sequence else None,
    )
]

In [75]:
# The linear and DNN parts of the model use the same set of feature columns.
linear_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]
dnn_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
if use_weighted_sequence:
    # 'genres_weight' is not a DataFrame column, so it is dropped from the
    # name list here and supplied separately when the inputs are built.
    feature_names.remove("genres_weight")
feature_names

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip', 'genres']

In [76]:

# 3. Generate the input data for the model: one entry per feature name,
#    with the padded genre ids and the sequence weights added explicitly.
model_input = {name: data[name] for name in feature_names}
model_input.update(
    genres=genres_list,
    # NOTE(review): weights are drawn from an unseeded standard normal, so they
    # (and every output that depends on them) change on each run.
    genres_weight=np.random.randn(data.shape[0], max_len, 1),
)

# 4. Define the model, compile it and train.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

model.compile(optimizer="adam", loss="mse", metrics=['mse'])
history = model.fit(
    x=model_input,
    y=data[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/10
1/1 - 2s - loss: 14.3003 - mse: 14.3003 - val_loss: 13.3990 - val_mse: 13.3990
Epoch 2/10
1/1 - 0s - loss: 14.1769 - mse: 14.1769 - val_loss: 13.2737 - val_mse: 13.2737
Epoch 3/10
1/1 - 0s - loss: 14.0260 - mse: 14.0260 - val_loss: 13.1432 - val_mse: 13.1432
Epoch 4/10
1/1 - 0s - loss: 13.8698 - mse: 13.8698 - val_loss: 13.0046 - val_mse: 13.0046
Epoch 5/10
1/1 - 0s - loss: 13.7045 - mse: 13.7045 - val_loss: 12.8545 - val_mse: 12.8545
Epoch 6/10
1/1 - 0s - loss: 13.5260 - mse: 13.5260 - val_loss: 12.6917 - val_mse: 12.6917
Epoch 7/10
1/1 - 0s - loss: 13.3321 - mse: 13.3321 - val_loss: 12.5148 - val_mse: 12.5148
Epoch 8/10
1/1 - 0s - loss: 13.1219 - mse: 13.1219 - val_loss: 12.3214 - val_mse: 12.3214
Epoch 9/10
1/1 - 0s - loss: 12.8932 - mse: 12.8932 - val_loss: 12.1095 - val_mse: 12.1095
Epoch 10/10
1/1 - 0s - loss: 12.6437 - mse: 12.6437 - val_loss: 11.8777 - val_mse: 11.8777


### Model inference

In [77]:
# Single-sample inference for the unweighted configuration.
if not use_weighted_sequence:
    print(feature_names, end="\n\n")
    sample_data = data.iloc[0]
    # Wrap every value in a length-1 array so the model sees a batch of one;
    # the raw 'genres' entry is replaced by the padded id sequence.
    sample_input_data = {
        **{feat: np.array([sample_data[feat]]) for feat in feature_names},
        "genres": np.array([genres_list[0]]),
    }
    print(sample_input_data, end="\n\n")
    print(model.predict(sample_input_data))

In [78]:
# Take the first row of `data` as the example sample; the inference cells
# re-select this same row before building their inputs.
sample_data = data.iloc[0]


In [79]:
# Show the sequence weights of the first sample: one value per padded genre
# position, i.e. an array of shape (max_len, 1).
model_input["genres_weight"][0]

array([[-1.19243571],
       [ 2.50912772],
       [-0.37220681],
       [ 0.16265287],
       [ 0.07435371]])

In [80]:
# Single-sample inference for the weighted configuration.
if use_weighted_sequence:
    print(feature_names, end="\n\n")
    sample_data = data.iloc[0]
    print(sample_data["rating"], end="\n\n")
    # Wrap every value in a length-1 array so the model sees a batch of one;
    # the raw 'genres' entry is replaced by the padded id sequence, and the
    # weights are the ones this sample had in `model_input`.
    # (Fresh random weights of shape (1, max_len, 1) could be used instead.)
    sample_input_data = {
        **{feat: np.array([sample_data[feat]]) for feat in feature_names},
        "genres": np.array([genres_list[0]]),
        "genres_weight": np.array([model_input["genres_weight"][0]]),
    }
    print(sample_input_data, end="\n\n")
    print(model.predict(sample_input_data))

['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip', 'genres']

4

{'movie_id': array([12]), 'user_id': array([107]), 'gender': array([0]), 'age': array([2]), 'occupation': array([4]), 'zip': array([35]), 'genres': array([[1, 2, 0, 0, 0]], dtype=int32), 'genres_weight': array([[[-1.19243571],
        [ 2.50912772],
        [-0.37220681],
        [ 0.16265287],
        [ 0.07435371]]])}

[[0.27892944]]
