# Imports and Load data

In [13]:
# Imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, get_feature_names, DenseFeat
from tensorflow.keras.metrics import AUC

# Read the Criteo training CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='./data/criteo/criteo_train.csv')

In [3]:
# Column groups used throughout the notebook: the label column,
# 13 integer fields (int_1..int_13) and 26 categorical fields (cat_1..cat_26).
target = ['click']
dense_features = [f'int_{i}' for i in range(1, 14)]
sparse_features = [f'cat_{i}' for i in range(1, 27)]

# Missing categorical values become the literal token '-1'; the columns are
# then stored with the pandas 'string' dtype.
data[sparse_features] = (
    data[sparse_features]
    .fillna('-1')
    .astype('string')
)

# Missing integer values become 0; the columns are then stored as int64.
data[dense_features] = (
    data[dense_features]
    .fillna(0)
    .astype('int64')
)

# Print the column summary to confirm dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 40 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   click   100000 non-null  int64 
 1   int_1   100000 non-null  int64 
 2   int_2   100000 non-null  int64 
 3   int_3   100000 non-null  int64 
 4   int_4   100000 non-null  int64 
 5   int_5   100000 non-null  int64 
 6   int_6   100000 non-null  int64 
 7   int_7   100000 non-null  int64 
 8   int_8   100000 non-null  int64 
 9   int_9   100000 non-null  int64 
 10  int_10  100000 non-null  int64 
 11  int_11  100000 non-null  int64 
 12  int_12  100000 non-null  int64 
 13  int_13  100000 non-null  int64 
 14  cat_1   100000 non-null  string
 15  cat_2   100000 non-null  string
 16  cat_3   100000 non-null  string
 17  cat_4   100000 non-null  string
 18  cat_5   100000 non-null  string
 19  cat_6   100000 non-null  string
 20  cat_7   100000 non-null  string
 21  cat_8   100000 non-null  string
 2

# Simple preprocessing

In this step we scale the numerical features to the range [0, 1]. Sparse categorical features are hash-encoded on the fly in the next step.

In [4]:
# Rescale every integer feature into [0, 1]: fit the scaler, then apply it.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split made later in this notebook, so held-out rows influence the scaling.
# Confirm whether that is acceptable for the reported metrics.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

# Generate Feature Columns

In [5]:
# Hash encoding on the fly
# Categorical columns are declared as hashed string inputs (use_hash=True) with
# 100 buckets and 4-dimensional embeddings; each integer column is declared as
# a DenseFeat of dimension 1. Sparse columns come first, then dense columns.
fixlen_feature_columns = [
    *(
        SparseFeat(
            name,
            vocabulary_size=100,
            embedding_dim=4,
            use_hash=True,
            dtype='string',  # the input is string
        )
        for name in sparse_features
    ),
    *(DenseFeat(name, 1) for name in dense_features),
]

In [6]:
# Generate feature columns
# The linear part and the DNN part of the model share the same column list.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

# Names of the model inputs, used below to build the input dictionaries.
feature_names = get_feature_names([*linear_feature_columns, *dnn_feature_columns])

# Generate the training samples and train the model

In [10]:
# Generate training samples
# Hold out 20% of the rows for testing. The seed is fixed so that
# Restart & Run All reproduces the same split and the metrics stay comparable
# between runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# The model is fed a mapping from feature name to that feature's column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [14]:
# Create the model
# DeepFM configured for binary classification, built from the shared
# linear/DNN feature columns.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='binary',
)

# Train with Adam on binary cross-entropy; report the loss and AUC as metrics.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy', AUC()],
)

In [15]:
# Fit the model
# Train for 10 epochs in batches of 100 rows; validation_split=0.2 has Keras
# hold back a fifth of the training rows for validation.
# `history` keeps the object returned by fit().
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=100,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
