In [111]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [112]:
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [113]:
# Read the sample file into a DataFrame and preview its last rows.
criteo_sample_path = 'data/criteo_sample.txt'
data = pd.read_csv(criteo_sample_path)
data.tail()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
195,0,,0,113.0,3.0,3036.0,575.0,2.0,3.0,214.0,...,07c540c4,9880032b,21ddcdc9,5840adea,34cc61bb,c9d4222a,32c7478e,e5ed7da2,ea9a246c,984e0db0
196,1,0.0,1,1.0,1.0,1607.0,12.0,1.0,12.0,15.0,...,1e88c74f,3972b4ed,,,d1aa4512,,32c7478e,9257f75f,,
197,1,1.0,0,6.0,3.0,0.0,0.0,19.0,3.0,3.0,...,3486227d,5aed7436,54591762,a458ea53,4a2c3526,,32c7478e,1793a828,e8b83407,1a02cbe1
198,0,0.0,22,6.0,22.0,203.0,153.0,80.0,18.0,508.0,...,3486227d,13145934,55dd3565,5840adea,bf647035,,32c7478e,1481ceb4,e8b83407,988b0775
199,0,1.0,-1,,,138.0,0.0,1.0,0.0,0.0,...,d4bb7bd8,908eaeb8,,,,,32c7478e,,,


In [87]:
# Column names of the categorical ("sparse") fields: 'C1' through 'C26'.
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
sparse_features

['C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'C22',
 'C23',
 'C24',
 'C25',
 'C26']

In [88]:
# Column names of the numeric ("dense") fields: 'I1' through 'I13'.
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]
dense_features

['I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'I10',
 'I11',
 'I12',
 'I13']

In [84]:
# Name of the label column, kept as a list so `frame[target]` yields a 2-D selection.
target = ['label']

# Replace missing values: the string '-1' for sparse columns, 0 for dense columns.
sparse_filled = data[sparse_features].fillna('-1')
dense_filled = data[dense_features].fillna(0)
data[sparse_features] = sparse_filled
data[dense_features] = dense_filled

In [85]:
# Display the DataFrame in its current state (the notebook renders a truncated view).
data

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.000000,0.001332,0.092362,0.000000,0.034825,0.000000,0.000000,0.673469,0.000000,...,8,66,0,0,3,0,1,96,0,0
1,0,0.000000,0.000000,0.006750,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,...,7,52,0,0,47,0,7,112,0,0
2,0,0.000000,0.000333,0.000710,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,...,8,49,0,0,25,0,6,53,0,0
3,0,0.000000,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,...,8,37,0,0,156,0,0,32,0,0
4,0,0.000000,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,...,8,14,5,3,9,0,0,5,1,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0.000000,0.000333,0.040142,0.034483,0.005984,0.273029,0.006645,0.061224,0.206963,...,0,74,5,1,30,5,0,118,17,48
196,1,0.000000,0.000666,0.000355,0.011494,0.003168,0.005698,0.003322,0.244898,0.014507,...,1,25,0,0,138,0,0,68,0,0
197,1,0.027027,0.000333,0.002131,0.034483,0.000000,0.000000,0.063123,0.061224,0.002901,...,4,40,17,2,41,0,0,12,16,11
198,0,0.000000,0.007662,0.002131,0.252874,0.000400,0.072650,0.265781,0.367347,0.491296,...,4,7,18,1,123,0,0,10,16,49


## Label Encoding

In [61]:
# Replace the values of every sparse column with integer codes, one
# LabelEncoder per column.
#
# Each fitted encoder is stored in `label_encoders`, keyed by column name, so
# any column can later be decoded with `inverse_transform` or re-applied to new
# data. Previously only the encoder of the last column survived the loop.
# `lbe` is still left bound to that last encoder for backward compatibility.
label_encoders = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
    label_encoders[feat] = lbe

In [62]:
# Decode codes 0 and 11 back to their original category strings.
# NOTE(review): `lbe` is whatever encoder the encoding loop bound last,
# presumably the one fitted on the final sparse column — confirm.
lbe.inverse_transform([0]), lbe.inverse_transform([11])

(array(['-1'], dtype=object), array(['1a02cbe1'], dtype=object))

In [63]:
# Display the DataFrame again to inspect the sparse columns after label encoding.
data

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,3,260.0,0.0,17668.0,0.0,0.0,33.0,0.0,...,8,66,0,0,3,0,1,96,0,0
1,0,0.0,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,7,52,0,0,47,0,7,112,0,0
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,8,49,0,0,25,0,6,53,0,0
3,0,0.0,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,8,37,0,0,156,0,0,32,0,0
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,8,14,5,3,9,0,0,5,1,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0.0,0,113.0,3.0,3036.0,575.0,2.0,3.0,214.0,...,0,74,5,1,30,5,0,118,17,48
196,1,0.0,1,1.0,1.0,1607.0,12.0,1.0,12.0,15.0,...,1,25,0,0,138,0,0,68,0,0
197,1,1.0,0,6.0,3.0,0.0,0.0,19.0,3.0,3.0,...,4,40,17,2,41,0,0,12,16,11
198,0,0.0,22,6.0,22.0,203.0,153.0,80.0,18.0,508.0,...,4,7,18,1,123,0,0,10,16,49


In [64]:
# Scaler that maps each column into the closed interval [0, 1].
scaled_range = (0, 1)
mms = MinMaxScaler(feature_range=scaled_range)

In [65]:
# Fit the scaler on the dense columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fit on the whole frame; the train/test split
# happens in a later cell, so test rows influence the scaling statistics.
dense_scaled = mms.fit_transform(data[dense_features])
data[dense_features] = dense_scaled
data.tail()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
195,0,0.0,0.000333,0.040142,0.034483,0.005984,0.273029,0.006645,0.061224,0.206963,...,0,74,5,1,30,5,0,118,17,48
196,1,0.0,0.000666,0.000355,0.011494,0.003168,0.005698,0.003322,0.244898,0.014507,...,1,25,0,0,138,0,0,68,0,0
197,1,0.027027,0.000333,0.002131,0.034483,0.0,0.0,0.063123,0.061224,0.002901,...,4,40,17,2,41,0,0,12,16,11
198,0,0.0,0.007662,0.002131,0.252874,0.0004,0.07265,0.265781,0.367347,0.491296,...,4,7,18,1,123,0,0,10,16,49
199,0,0.027027,0.0,0.0,0.0,0.000272,0.0,0.003322,0.0,0.0,...,7,72,0,0,0,0,0,0,0,0


In [89]:
# Describe every model input column:
#   * one SparseFeat per sparse column, with vocabulary_size set to the number
#     of distinct values in that column and an embedding of size 4;
#   * one DenseFeat per dense column, constructed with second argument 1.
# The unused `enumerate` index from the original comprehension was dropped.
fixlen_feature_columns = (
    [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
     for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

In [91]:
# Take the first feature column (built from the first sparse feature) for inspection.
d = fixlen_feature_columns[0]

In [103]:
# Show the `embedding_name` attribute of the feature column held in `d`.
d.embedding_name

'C1'

<function SparseFeat.index(value, start=0, stop=9223372036854775807, /)>

In [95]:
# Step 2: count the distinct values of each sparse field and record the name
# of each dense field.
sparse_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features]
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear part and the DNN part share the very same list of columns.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Names of all input features, derived from both column lists.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [96]:
# Hold out 20% of the rows as a test set.
# A fixed random_state makes the split, and therefore every metric reported
# further down, reproducible across kernel restarts. Without it each run
# draws a different partition.
SPLIT_SEED = 2020
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

# Build {feature name: column} dicts; these are what gets passed to the
# models' fit() / predict() calls.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [99]:
# Peek at the 'C1' column of the training inputs.
train_model_input['C1']

170    39af2607
190    68fd1e64
2      05db9164
180    8cf07265
197    05db9164
         ...   
192    5a9ed9b0
4      05db9164
14     5bfa8ab5
95     52f1e825
43     8cf07265
Name: C1, Length: 160, dtype: object

In [102]:
# Show the feature column stored at index 2 of linear_feature_columns.
linear_feature_columns[2]

SparseFeat(name='C3', vocabulary_size=171, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C3', group_name='default_group')

In [70]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [81]:
# Seed torch's global RNG before constructing the model so that parameter
# initialisation (and any torch-driven shuffling during the subsequent fit)
# is repeatable from run to run.
torch.manual_seed(2020)

# DeepFM binary classifier over the shared linear / DNN feature columns.
model_fm = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)

In [82]:
# Configure optimizer, loss and reported metrics, then train for 10 epochs
# on the training split with no validation hold-out.
model_fm.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)
train_labels = train[target].values
model_fm.fit(
    train_model_input,
    train_labels,
    batch_size=32,
    epochs=10,
    verbose=2,
    validation_split=0.0,
)

cpu
Train on 160 samples, validate on 0 samples, 5 steps per epoch
Epoch 1/10
0s - loss:  0.5945 - binary_crossentropy:  0.5945 - auc:  0.5633
Epoch 2/10
0s - loss:  0.4814 - binary_crossentropy:  0.4814 - auc:  0.9376
Epoch 3/10
0s - loss:  0.3397 - binary_crossentropy:  0.3397 - auc:  0.9951
Epoch 4/10
0s - loss:  0.1777 - binary_crossentropy:  0.1777 - auc:  0.9989
Epoch 5/10
0s - loss:  0.0866 - binary_crossentropy:  0.0866 - auc:  1.0000
Epoch 6/10
0s - loss:  0.0520 - binary_crossentropy:  0.0520 - auc:  1.0000
Epoch 7/10
0s - loss:  0.0356 - binary_crossentropy:  0.0356 - auc:  1.0000
Epoch 8/10
0s - loss:  0.0253 - binary_crossentropy:  0.0253 - auc:  1.0000
Epoch 9/10
0s - loss:  0.0196 - binary_crossentropy:  0.0196 - auc:  1.0000
Epoch 10/10
0s - loss:  0.0157 - binary_crossentropy:  0.0157 - auc:  1.0000


<tensorflow.python.keras.callbacks.History at 0x1715d3fd0>

In [83]:
# Score the held-out rows with the DeepFM model and report both metrics,
# rounded to 4 decimals.
pred_ans = model_fm.predict(test_model_input, 256)
test_labels = test[target].values
print("")
for metric_label, metric_fn in (("test LogLoss", log_loss), ("test AUC", roc_auc_score)):
    print(metric_label, round(metric_fn(test_labels, pred_ans), 4))


test LogLoss 0.6306
test AUC 0.5887


In [77]:
# Seed torch's global RNG before constructing the model so that parameter
# initialisation (and any torch-driven shuffling during the subsequent fit)
# is repeatable from run to run.
torch.manual_seed(2020)

# DCN binary classifier over the shared linear / DNN feature columns.
model_dcn = DCN(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)

In [79]:
# Configure optimizer, loss and reported metrics, then train for 10 epochs
# on the training split with no validation hold-out.
model_dcn.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)
train_labels = train[target].values
model_dcn.fit(
    train_model_input,
    train_labels,
    batch_size=32,
    epochs=10,
    verbose=2,
    validation_split=0.0,
)

cpu
Train on 160 samples, validate on 0 samples, 5 steps per epoch
Epoch 1/10
0s - loss:  0.6283 - binary_crossentropy:  0.6283 - auc:  0.4483
Epoch 2/10
0s - loss:  0.5136 - binary_crossentropy:  0.5136 - auc:  0.9935
Epoch 3/10
0s - loss:  0.4224 - binary_crossentropy:  0.4224 - auc:  0.9912
Epoch 4/10
0s - loss:  0.3009 - binary_crossentropy:  0.3009 - auc:  0.9990
Epoch 5/10
0s - loss:  0.1960 - binary_crossentropy:  0.1960 - auc:  1.0000
Epoch 6/10
0s - loss:  0.1360 - binary_crossentropy:  0.1360 - auc:  1.0000
Epoch 7/10
0s - loss:  0.0949 - binary_crossentropy:  0.0949 - auc:  1.0000
Epoch 8/10
0s - loss:  0.0683 - binary_crossentropy:  0.0683 - auc:  1.0000
Epoch 9/10
0s - loss:  0.0511 - binary_crossentropy:  0.0511 - auc:  1.0000
Epoch 10/10
0s - loss:  0.0388 - binary_crossentropy:  0.0388 - auc:  1.0000


<tensorflow.python.keras.callbacks.History at 0x17166aba8>

In [80]:
# Score the held-out rows with the DCN model and report both metrics,
# rounded to 4 decimals.
pred_ans = model_dcn.predict(test_model_input, 256)
test_labels = test[target].values
print("")
for metric_label, metric_fn in (("test LogLoss", log_loss), ("test AUC", roc_auc_score)):
    print(metric_label, round(metric_fn(test_labels, pred_ans), 4))


test LogLoss 0.5599
test AUC 0.5411
