In [1]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

In [18]:
# Read the sample file into a DataFrame (default comma separator, first row as header).
data = pd.read_csv(filepath_or_buffer='./data/criteo_sample.txt')

In [19]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,,3,260.0,,17668.0,,,33.0,,,,0.0,,05db9164,08d6d899,9143c832,f56b7dd5,25c83c98,7e0ccccf,df5c2d18,0b153874,a73ee510,8f48ce11,a7b606c4,ae1bb660,eae197fd,b28479f6,bfef54b3,bad5ee18,e5ba7672,87c6f83c,,,0429f84b,,3a171ecb,c0d61a5c,,
1,0,,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,,1.0,,35.0,68fd1e64,04e09220,95e13fd4,a1e6a194,25c83c98,fe6b92e5,f819e175,062b5529,a73ee510,ab9456b4,6153cf57,8882c6cd,769a1844,b28479f6,69f825dd,23056e4f,d4bb7bd8,6fc84bfb,,,5155d8a3,,be7c41b4,ded4aac9,,
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,0.0,3.0,,18.0,05db9164,38a947a1,3f55fb72,5de245c7,30903e74,7e0ccccf,b72ec13d,1f89b562,a73ee510,acce978c,3547565f,a5b0521a,12880350,b28479f6,c12fc269,95a8919c,e5ba7672,675c9258,,,2e01979f,,bcdee96c,6d5d1302,,
3,0,,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,,2.0,,4.0,05db9164,8084ee93,02cf9876,c18be181,25c83c98,,e14874c9,0b153874,7cc72ec2,2462946f,636405ac,8fe001f4,31b42deb,07d13a8f,422c8577,36103458,e5ba7672,52e44668,,,e587c466,,32c7478e,3b183c5c,,
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,0.0,1.0,,27.0,05db9164,207b2d81,5d076085,862b5ba0,25c83c98,fbad5c96,17c22666,0b153874,a73ee510,534fc986,feb49a68,f24b551c,8978af5c,64c94865,32ec6582,b6d021e8,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,,32c7478e,0d4a6d1a,001f3601,92c878de


In [20]:
# Column names of the two feature groups: C1..C26 (sparse) and I1..I13 (dense).
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]

In [21]:
# Name of the label column, kept as a list so train[target] yields a 2-D selection.
target = ['label']

# Fill missing values: the string '-1' for sparse columns, the number 0 for dense columns.
data[sparse_features] = data[sparse_features].fillna(value='-1')
data[dense_features] = data[dense_features].fillna(value=0)

In [22]:
# Turn every sparse column into integer ids, then rescale the dense columns to [0, 1].
# NOTE(review): both transforms are fitted on the whole frame, i.e. before the
# train/test split done further down in this notebook - confirm that is intended.
for column in sparse_features:
    lbe = LabelEncoder()
    lbe.fit(data[column])
    data[column] = lbe.transform(data[column])

mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit(data[dense_features]).transform(data[dense_features])

In [23]:
# Describe every input field for the model.
#   - Sparse fields: vocabulary size is the largest encoded id + 1, and each
#     field gets a 4-dimensional embedding.
#   - Dense fields: one scalar value per field.
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
    for feat in sparse_features
] + [
    DenseFeat(feat, 1)
    for feat in dense_features
]

# The DNN part and the linear part are given the very same list of columns.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# Feature names collected from both column lists; they are used below as the
# keys of the model input dicts.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [24]:
# Hold out 20% of the rows for testing (fixed random_state), then build the
# feature-name -> column mappings that are passed to the model.
train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = dict((name, train[name]) for name in feature_names)
test_model_input = dict((name, test[name]) for name in feature_names)

In [26]:
# Build a DeepFM model for a binary task and compile it with the Adam optimizer;
# binary cross-entropy is both the loss and the reported metric.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [27]:
# Fit for 10 epochs with batches of 256 rows; 20% of the training rows are set
# aside by Keras for validation. verbose=2 prints one line per epoch.
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/10
1/1 - 6s - loss: 0.6464 - binary_crossentropy: 0.6464 - val_loss: 0.6284 - val_binary_crossentropy: 0.6284
Epoch 2/10
1/1 - 0s - loss: 0.6293 - binary_crossentropy: 0.6293 - val_loss: 0.6197 - val_binary_crossentropy: 0.6197
Epoch 3/10
1/1 - 0s - loss: 0.6128 - binary_crossentropy: 0.6128 - val_loss: 0.6113 - val_binary_crossentropy: 0.6112
Epoch 4/10
1/1 - 0s - loss: 0.5968 - binary_crossentropy: 0.5968 - val_loss: 0.6032 - val_binary_crossentropy: 0.6032
Epoch 5/10
1/1 - 0s - loss: 0.5810 - binary_crossentropy: 0.5810 - val_loss: 0.5960 - val_binary_crossentropy: 0.5960
Epoch 6/10
1/1 - 0s - loss: 0.5654 - binary_crossentropy: 0.5653 - val_loss: 0.5894 - val_binary_crossentropy: 0.5894
Epoch 7/10
1/1 - 0s - loss: 0.5498 - binary_crossentropy: 0.5498 - val_loss: 0.5834 - val_binary_crossentropy: 0.5834
Epoch 8/10
1/1 - 0s - loss: 0.5343 - binary_crossentropy: 0.5343 - val_loss: 0.5784 - val_binary_crossentropy: 0.5783
Epoch 9/10
1/1 - 0s - loss: 0.5189 - binary_crossentropy

In [28]:
# Run the trained model on the held-out inputs.
pred_ans = model.predict(x=test_model_input, batch_size=256)

In [30]:
# Report log loss and ROC AUC on the held-out rows, both rounded to 4 decimals.
y_true = test[target].values
test_logloss = round(log_loss(y_true, pred_ans), 4)
test_auc = round(roc_auc_score(y_true, pred_ans), 4)
print("test LogLoss:", test_logloss)
print("test AUC:", test_auc)

test LogLoss: 0.5541
test AUC: 0.4695
