This is an example of how we can apply DeepFM using 'deepctr' to solve a binary classification problem.

We will utilize the frappe dataset (https://www.baltrunas.info/context-aware).




# Imports and Constants

In [4]:
!pip install deepctr==0.7.5



In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm import tqdm
import os 
from deepctr.models import *
from deepctr.inputs import  SparseFeat, DenseFeat, get_feature_names

import tensorflow as tf
import tensorflow.keras.backend as K

In [7]:

# Mount google drive
from google.colab import drive

# Mount Drive and change paths

In [8]:
#Mount Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
#Make sure you downloaded the dataset and you create such a path, alternativly - changed this path
DATA_PATH = '/content/drive/My Drive/Recommender Systems/Datasets/'
os.chdir(DATA_PATH)

# Load Data

In [13]:
data = pd.read_csv('frappe_all.csv')
data['label'] *= 1.0
data

Unnamed: 0,user,item,daytime,weekday,isweekend,homework,cost,weather,country,city,label
0,66,2471,morning,sunday,weekend,unknown,free,rainy,United States,0,1.0
1,269,116,afternoon,thursday,weekend,unknown,paid,sunny,Palestine,434,0.0
2,225,354,noon,saturday,workday,unknown,free,stormy,Hungary,347,0.0
3,108,5,sunset,wednesday,workday,home,free,drizzle,Japan,391,0.0
4,257,33,morning,thursday,workday,unknown,free,cloudy,United States,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
288604,133,28,evening,friday,weekend,unknown,free,snowy,Finland,1009,0.0
288605,583,2445,afternoon,friday,workday,unknown,free,unknown,unknown,0,1.0
288606,264,16,sunrise,tuesday,weekend,home,free,snowy,Lebanon,306,0.0
288607,70,336,sunrise,tuesday,workday,unknown,free,sleet,Japan,919,0.0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288609 entries, 0 to 288608
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user       288609 non-null  int64  
 1   item       288609 non-null  int64  
 2   daytime    288609 non-null  object 
 3   weekday    288609 non-null  object 
 4   isweekend  288609 non-null  object 
 5   homework   288609 non-null  object 
 6   cost       288609 non-null  object 
 7   weather    288609 non-null  object 
 8   country    288609 non-null  object 
 9   city       288609 non-null  int64  
 10  label      288609 non-null  float64
dtypes: float64(1), int64(3), object(7)
memory usage: 24.2+ MB


## Define dense and sparse features

In [16]:
sparse_features = ['user', 'item', 'daytime', 'weekday', 'isweekend', 'homework', 'cost',
       'weather', 'country', 'city']
dense_features = []

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

At the moment all features are defined as sparse. As a code task, use alternative data (e.g., Crieto, Avazu, etc') or enrich this data with additional dense features.


In [18]:
class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """        
        
        # for unique_item in data_list.unique():
        #     if unique_item not in self.label_encoder.classes_:
        #         new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]
        new_data_list = data_list.apply(lambda x: 'Unknown' if x not in self.label_encoder.classes_ else x)        
        return self.label_encoder.transform(new_data_list)    

In order to use encoding we first have to split the data intro train/validation/test sets. 

Motivation: we do not want to encode data of the training set utilizing data from the test set for example.

In [19]:
train, test = train_test_split(data, test_size=0.1)
train, val = train_test_split(data, test_size=0.2)

In [20]:
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in tqdm(sparse_features):
    lbe = LabelEncoderExt()
    lbe.fit(train[feat])    
    print('finished fitting')
    train[feat] = lbe.transform(train[feat])
    print('finished transform train')
    val[feat] = lbe.transform(val[feat])
    print('finished transform validation')
    test[feat] = lbe.transform(test[feat])
    print('finished transform test')




finished fitting


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished transform train


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished transform validation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
 10%|█         | 1/10 [00:02<00:25,  2.85s/it]

finished transform test
finished fitting
finished transform train
finished transform validation


 20%|██        | 2/10 [00:05<00:22,  2.85s/it]

finished transform test
finished fitting
finished transform train


 30%|███       | 3/10 [00:07<00:16,  2.41s/it]

finished transform validation
finished transform test
finished fitting
finished transform train


 40%|████      | 4/10 [00:08<00:12,  2.11s/it]

finished transform validation
finished transform test
finished fitting
finished transform train


 50%|█████     | 5/10 [00:09<00:09,  1.89s/it]

finished transform validation
finished transform test
finished fitting
finished transform train


 60%|██████    | 6/10 [00:11<00:06,  1.73s/it]

finished transform validation
finished transform test
finished fitting
finished transform train


 70%|███████   | 7/10 [00:12<00:04,  1.61s/it]

finished transform validation
finished transform test
finished fitting
finished transform train


 80%|████████  | 8/10 [00:13<00:03,  1.55s/it]

finished transform validation
finished transform test
finished fitting
finished transform train


 90%|█████████ | 9/10 [00:15<00:01,  1.59s/it]

finished transform validation
finished transform test
finished fitting
finished transform train
finished transform validation


100%|██████████| 10/10 [00:18<00:00,  1.85s/it]

finished transform test





In [26]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique()+1,embedding_dim=256)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [27]:
# 3.generate input data for model
train_model_input = {name:train[name] for name in feature_names}
val_model_input = {name:val[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [28]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [31]:
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
        dnn_dropout=0.3, l2_reg_embedding=0.2, l2_reg_dnn=0.2)
model.compile(optimizer="adam", loss=root_mean_squared_error,
              metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()], )

history = model.fit(train_model_input, train[target].values,batch_size=128, epochs=5, verbose=1, validation_data=(val_model_input, val[target].values))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

test LogLoss 0.1655
test AUC 0.9823


# Todo: Utizling DeepFM and expand it's use by applying it with different data such as Criteo or Avazu.
Notably, we did not perform k-fold and any cleansing of data. 

Implement 5-fold within this experiments and make sure to use for sparse and dense features.

More info regarding DeepFM can be found at:
* https://github.com/shenweichen/DeepCTR
* https://deepctr-doc.readthedocs.io/en/latest/