In [29]:
import time
from tqdm import tqdm
import pickle
import scipy.stats 
import pandas as pd
import numpy as np
from datetime import datetime
from deepctr.models import xDeepFM, DeepFM
from deepctr.feature_column import  SparseFeat, DenseFeat, get_feature_names
import tensorflow
from tensorflow.python.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.python.keras.metrics import AUC
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics
import matplotlib.pyplot as plt

# Reproducibility: seed NumPy's and TensorFlow's global random generators.
seed = 123
np.random.seed(seed)
tensorflow.random.set_seed(seed)

DeepFM-Stat
=======================

Step1: Prepare data for training and testing DeepFM-Stat
--------------------------------

In [30]:
%%time

# load train users
with open('../data/a_users_s2.data', 'rb') as filehandle:
        # store the data as binary data stream
        train_users = pickle.load(filehandle)
        print(len(train_users))

# test
with open('../data/a_users_s3.data', 'rb') as filehandle:
        # store the data as binary data stream
        test_users = pickle.load(filehandle)
        print(len(test_users))
            
# test
with open('../data/a_users_s4.data', 'rb') as filehandle:
        # store the data as binary data stream
        active_test_users = pickle.load(filehandle)
        print(len(active_test_users))
        
            
# train-test to index
with open('../data/train-test-users.pkl', 'rb') as f:
        distinct_users = pickle.load(f)
        
# train_users active
train_u_active = [x for x in train_users if x in test_users]
# train users inactive
train_u_inactive = [x for x in train_users if x not in test_users]
print(len(train_users),len(train_u_active),len(train_u_inactive))

# test_users active
test_u_active = [x for x in test_users if x in active_test_users]
# test users inactive
test_u_inactive = [x for x in test_users if x not in active_test_users]
print(len(test_users),len(test_u_active),len(test_u_inactive))


######### Training data preparation
# load
with open('../data/ours_traindf_list.pkl', 'rb') as f:
    train_df = pickle.load(f)
# display(train_df)

    
### columns to consdier
columns_model = [
    'count',
    'unique',
    'daydiff',
    'edittype_entropy',
    'target_entropy',
    'dow_entropy']
columns_consider = list()
for i in range(10):
    for c in columns_model:
        columns_consider.append((i, c))
columns_consider += [
    (9,'pred_first_edit'),
    (9,'pred_last_edit')
]
train_df = train_df[columns_consider]
# display(train_df)
non_entropy_col_indices = list()
for i,c in enumerate(columns_consider):
    if 'entropy' not in c[1]:
        non_entropy_col_indices.append(i)
print(non_entropy_col_indices)
###
    
# y_train
y_labels = [x in test_users for x in train_users]
y_df = pd.DataFrame({
    'user': train_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)

train_df = pd.concat([train_df, y_df], axis=1)
train_df = train_df.fillna(0)
train_df.shape

# X_train
X_train_ = train_df.values[:,:-1]
y_train = train_df.values[:,-1 ]
y_train = 1 - y_train # 0 for active
np.nan_to_num(X_train_, copy=False, nan=0)

X_train_ = X_train_.astype(dtype=np.float32)
y_train = y_train.astype(dtype=np.int8)
X_train = np.log10(X_train_+1.)

# shuffle
X_train, y_train = shuffle(X_train, y_train, random_state=28)
# # transform
# mms = MinMaxScaler(feature_range=(0, 1))
# X_train = mms.fit_transform(X_train)
print(X_train.shape, y_train.shape)


############# Test data preparation
with open('../data/ours_testdf_list.pkl', 'rb') as filehandle:
    # store the data as binary data stream
    test_df = pickle.load(filehandle)
    
print(test_df.shape)

#### columns to consdier
columns_model = [
    'count',
    'unique',
    'daydiff',
    'edittype_entropy',
    'target_entropy',
    'dow_entropy']
columns_consider = list()
for i in range(10):
    for c in columns_model:
        columns_consider.append((i, c))
columns_consider += [
    (9,'pred_first_edit'),
    (9,'pred_last_edit')
]
test_df = test_df[columns_consider]
# display(test_df.head())
non_entropy_col_indices = list()
for i,c in enumerate(columns_consider):
    if 'entropy' not in c[1]:
        non_entropy_col_indices.append(i)
####


# y_test
y_labels = [x in active_test_users for x in test_users]
y_df = pd.DataFrame({
    'user': test_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)

test_df = pd.concat([test_df, y_df], axis=1)
test_df = test_df.fillna(0)

# display(test_df.head())

# X_test
X_test_ = test_df.values[:,:-1]
y_test = test_df.values[:,-1]
y_test = 1-y_test # 1 for inactive
np.nan_to_num(X_test_, copy=False, nan=0)

X_test = X_test_.astype(dtype=np.float32)
y_test = y_test.astype(dtype=np.int8)
X_test = np.log10(X_test+1.)

# transform
# X_test = mms.transform(X_test)
print(X_test.shape, y_test.shape)

60792
65568
76277
60792 29509 31283
65568 32068 33500
[0, 1, 2, 6, 7, 8, 12, 13, 14, 18, 19, 20, 24, 25, 26, 30, 31, 32, 36, 37, 38, 42, 43, 44, 48, 49, 50, 54, 55, 56, 60, 61]
(60792, 62) (60792,)
(65568, 82)
(65568, 62) (65568,)
CPU times: user 15min 53s, sys: 607 ms, total: 15min 54s
Wall time: 16min 25s


Step2: Training DeepFM-Stat
--------------
(Skip to Step3: Testing DeepFM-Stat to use the weights already trained for the paper)
--------------

In [28]:
%%time

model_weight_path = 'DeepFM_w_seed28_b1_all_features.h5'

X_train_df = pd.DataFrame(X_train, columns = ['f'+str(i) for i in range(X_train.shape[1])])

###
dense_features = X_train_df.columns
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
print('# of dense features', len(dense_features))
train_model_input = {name: X_train_df[name].values for name in dense_features}
###

##### define model
tensorflow.keras.backend.clear_session()
model = DeepFM(dense_feature_columns, dense_feature_columns, 
               task='binary', seed=28, 
#                dnn_dropout=0.2, 
#                dnn_hidden_units=(128,128,64)
              )
print('# of params',model.count_params())

# compiling the model
es = EarlyStopping(monitor='val_auc', mode='max', patience=10)
mc = ModelCheckpoint(model_weight_path, 
                     monitor='val_auc', mode='max', verbose=1, save_best_only=True)
model.compile("adam", "binary_crossentropy", metrics=[AUC()], )

# training the model
history = model.fit(train_model_input, y_train, batch_size=256, epochs=1000, 
                    validation_split=0.2, verbose=2, callbacks=[es, mc])

# of dense features 62
# of params 24767
Epoch 1/1000
190/190 - 13s - loss: 0.4557 - auc: 0.8611 - val_loss: 0.4371 - val_auc: 0.8757

Epoch 00001: val_auc improved from -inf to 0.87573, saving model to DeepFM_w_seed28_b1_all_features.h5
Epoch 2/1000
190/190 - 5s - loss: 0.4262 - auc: 0.8820 - val_loss: 0.4300 - val_auc: 0.8820

Epoch 00002: val_auc improved from 0.87573 to 0.88202, saving model to DeepFM_w_seed28_b1_all_features.h5
Epoch 3/1000
190/190 - 4s - loss: 0.4191 - auc: 0.8864 - val_loss: 0.4221 - val_auc: 0.8850

Epoch 00003: val_auc improved from 0.88202 to 0.88502, saving model to DeepFM_w_seed28_b1_all_features.h5
Epoch 4/1000
190/190 - 4s - loss: 0.4148 - auc: 0.8888 - val_loss: 0.4257 - val_auc: 0.8853

Epoch 00004: val_auc improved from 0.88502 to 0.88533, saving model to DeepFM_w_seed28_b1_all_features.h5
Epoch 5/1000
190/190 - 4s - loss: 0.4146 - auc: 0.8889 - val_loss: 0.4246 - val_auc: 0.8865

Epoch 00005: val_auc improved from 0.88533 to 0.88654, saving model to D

Step3: Testing DeepFM-Stat
--------------------
The trained model weights for the 5 runs reported in the main article are in the same folder; set the ```model_weight_path``` variable in the cell below to one of them.
- DeepFM_w_seed28_b1_all_features_r1.h5
- DeepFM_w_seed28_b1_all_features_r2.h5
- DeepFM_w_seed28_b1_all_features_r3.h5
- DeepFM_w_seed28_b1_all_features_r4.h5
- DeepFM_w_seed28_b1_all_features_r5.h5
  
To use newly trained weights instead, set ```model_weight_path``` to the same path used in Step2: Training DeepFM-Stat:
- DeepFM_w_seed28_b1_all_features.h5

In [31]:
%%time

model_weight_path = 'DeepFM_w_seed28_b1_all_features_r1.h5'

X_test_df = pd.DataFrame(X_test, columns = ['f'+str(i) for i in range(X_test.shape[1])])
print(X_test_df.shape)

dense_features = X_test_df.columns
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

test_model_input = {name: X_test_df[name].values for name in dense_features}
clf = DeepFM(dense_feature_columns, dense_feature_columns, 
             task='binary', seed=28, 
#              dnn_hidden_units=(128, 128, 64)
            )

clf.load_weights(model_weight_path)
y_prob = clf.predict(test_model_input, batch_size=256).ravel()
y_pred = [int(x) for x in (y_prob>=.5)]
# print(y_pred)
# print(y_prob)
print(classification_report(y_test, y_pred, digits=4))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
print('AUROC', metrics.auc(fpr, tpr))
print('Log Loss', metrics.log_loss(y_test, y_prob))

# easy to paste version
res = classification_report(y_test, y_pred, digits=4).split('\n')
label0 = [x for x in res[2].split(' ') if len(x)>1][:3]
label1 = [x for x in res[3].split(' ') if len(x)>1][:3]
acc = [x for x in res[5].split(' ') if len(x)>1][1]
res = label0+label1+[str(metrics.auc(fpr, tpr)), str(acc), str(metrics.log_loss(y_test, y_prob))]
print('\t'.join(res))

(65568, 62)
              precision    recall  f1-score   support

           0     0.8437    0.7565    0.7977     32068
           1     0.7879    0.8659    0.8250     33500

    accuracy                         0.8124     65568
   macro avg     0.8158    0.8112    0.8114     65568
weighted avg     0.8152    0.8124    0.8117     65568

AUROC 0.892900611387369
Log Loss 0.4104401888362103
0.8437	0.7565	0.7977	0.7879	0.8659	0.8250	0.892900611387369	0.8124	0.4104401888362103
CPU times: user 8.8 s, sys: 6.9 s, total: 15.7 s
Wall time: 12.2 s


DeepFM-Stat+Pattern
========================

Step1: Preparing data for training and testing DeepFM-Stat+Pattern
----------------

In [44]:
%%time

############# Training data preparation
# load train_df / test_df from SPMF
train_df_spmf = pd.read_csv('../baseline3/train_df_spmf.csv')
train_df_spmf.set_index('user',inplace=True)

# load train / test for DeepFM
with open('../data/ours_traindf_list.pkl', 'rb') as f:
    train_df = pickle.load(f)
# display(train_df)

### columns to consdier
columns_model = [
    'count',
    'unique',
    'daydiff',
    'edittype_entropy',
    'target_entropy',
    'dow_entropy'
]
columns_consider = list()
for i in range(10):
    for c in columns_model:
        columns_consider.append((i, c))
columns_consider += [
    (9,'pred_first_edit'),
    (9,'pred_last_edit')
]
train_df = train_df[columns_consider]
# display(train_df)
non_entropy_col_indices = list()
for i,c in enumerate(columns_consider):
    if 'entropy' not in c[1]:
        non_entropy_col_indices.append(i)
print(non_entropy_col_indices)
    
# y_train
y_labels = [x in test_users for x in train_users]
y_df = pd.DataFrame({
    'user': train_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)

# Concat two types of features
train_df = pd.concat([
    train_df, 
    train_df_spmf, y_df], axis=1)
train_df = train_df.fillna(0)
train_df.shape


# X_train
X_train_ = train_df.values[:,:-1]
y_train = train_df.values[:,-1 ]
y_train = 1 - y_train # 0 for active
np.nan_to_num(X_train_, copy=False, nan=0)

X_train_ = X_train_.astype(dtype=np.float32)
y_train = y_train.astype(dtype=np.int8)
# X_train = np.log10(X_train_+1.)

X_train = X_train_
X_train[:,:62] = np.log10(X_train[:,:62]+1.) # only log on non-spmf features
print(X_train.shape)

# shuffle
X_train, y_train = shuffle(X_train, y_train, random_state=28)
print(X_train.shape, y_train.shape)


########## Test data preparation
# load train_df / test_df from SPMF
test_df_spmf = pd.read_csv('../baseline3/test_df_spmf.csv')
test_df_spmf.set_index('user',inplace=True)

# load deepfm features test
with open('../data/ours_testdf_list.pkl', 'rb') as filehandle:
    # store the data as binary data stream
    test_df = pickle.load(filehandle)
print(test_df.shape)

#### columns to consdier
columns_model = [
    'count',
    'unique',
    'daydiff',
    'edittype_entropy',
    'target_entropy',
    'dow_entropy'
]
columns_consider = list()
for i in range(10):
    for c in columns_model:
        columns_consider.append((i, c))
columns_consider += [
    (9,'pred_first_edit'),
    (9,'pred_last_edit')
]
test_df = test_df[columns_consider]
non_entropy_col_indices = list()
for i,c in enumerate(columns_consider):
    if 'entropy' not in c[1]:
        non_entropy_col_indices.append(i)


# y_test
y_labels = [x in active_test_users for x in test_users]
y_df = pd.DataFrame({
    'user': test_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)

# Concat two types of features
test_df = pd.concat([
    test_df, 
    test_df_spmf, y_df], axis=1)
test_df = test_df.fillna(0)

# X_test
X_test_ = test_df.values[:,:-1]
y_test = test_df.values[:,-1]
y_test = 1-y_test # 1 for inactive
np.nan_to_num(X_test_, copy=False, nan=0)

###
X_test_ = X_test_.astype(dtype=np.float32)
y_test = y_test.astype(dtype=np.int8)
# X_train = np.log10(X_train_+1.)
X_test = X_test_
X_test[:,:62] = np.log10(X_test[:,:62]+1.) # only log on non-spmf features
print(X_test.shape)
###


[0, 1, 2, 6, 7, 8, 12, 13, 14, 18, 19, 20, 24, 25, 26, 30, 31, 32, 36, 37, 38, 42, 43, 44, 48, 49, 50, 54, 55, 56, 60, 61]
(60792, 140)
(60792, 140) (60792,)
(65568, 82)
(65568, 140)
CPU times: user 2min 27s, sys: 1.83 s, total: 2min 29s
Wall time: 2min 34s


Step2: Training DeepFM-Stat+Pattern
--------------
(Go to Step3: Testing DeepFM-Stat+Pattern to run the one already trained for the paper)
--------------

In [45]:
%%time

model_weight_path = 'DeepFM_w_seed28_b1_all_features_spmfsparse.h5'

X_train_df = pd.DataFrame(X_train, columns = ['f'+str(i) for i in range(X_train.shape[1])])

###
dense_features = X_train_df.columns
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
print('# of dense features', len(dense_features))
train_model_input = {name: X_train_df[name].values for name in dense_features}
###

### sparse features for DeepFM for patterns
sparse_features = [x for x in X_train_df.columns[62:]]
dense_features = [x for x in X_train_df.columns[:62]]
X_train_df[sparse_features] = X_train_df[sparse_features].astype(int)
# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=X_train_df[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
train_model_input = {name: X_train_df[name].values for name in feature_names}
###


##### define model
tensorflow.keras.backend.clear_session()
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', seed=28)
print('# of params',model.count_params())

# compiling the model
es = EarlyStopping(monitor='val_auc', mode='max', patience=10)
mc = ModelCheckpoint(model_weight_path, 
                     monitor='val_auc', mode='max', verbose=1, save_best_only=True)
model.compile("adam", "binary_crossentropy", metrics=[AUC()], )
# training the model
history = model.fit(train_model_input, y_train, batch_size=256, epochs=1000, 
                    validation_split=0.2, verbose=2, callbacks=[es, mc])

# of dense features 140
# of params 65483
Epoch 1/1000
190/190 - 45s - loss: 0.3643 - auc: 0.9149 - val_loss: 0.3205 - val_auc: 0.9369

Epoch 00001: val_auc improved from -inf to 0.93685, saving model to DeepFM_w_seed28_b1_all_features_spmfsparse.h5
Epoch 2/1000
190/190 - 15s - loss: 0.2923 - auc: 0.9476 - val_loss: 0.2932 - val_auc: 0.9492

Epoch 00002: val_auc improved from 0.93685 to 0.94925, saving model to DeepFM_w_seed28_b1_all_features_spmfsparse.h5
Epoch 3/1000
190/190 - 15s - loss: 0.2702 - auc: 0.9555 - val_loss: 0.2711 - val_auc: 0.9558

Epoch 00003: val_auc improved from 0.94925 to 0.95576, saving model to DeepFM_w_seed28_b1_all_features_spmfsparse.h5
Epoch 4/1000
190/190 - 15s - loss: 0.2537 - auc: 0.9608 - val_loss: 0.2620 - val_auc: 0.9584

Epoch 00004: val_auc improved from 0.95576 to 0.95839, saving model to DeepFM_w_seed28_b1_all_features_spmfsparse.h5
Epoch 5/1000
190/190 - 15s - loss: 0.2464 - auc: 0.9630 - val_loss: 0.2554 - val_auc: 0.9603

Epoch 00005: val_auc im

KeyboardInterrupt: 

Step3: Testing DeepFM-Stat+Pattern
------------------------
The trained model weights for the 5 runs reported in the main article are in the same folder; set the ```model_weight_path``` variable in the cell below to one of them.
- DeepFM_w_seed28_b1_all_features_spmfsparse_r1.h5
- DeepFM_w_seed28_b1_all_features_spmfsparse_r2.h5
- DeepFM_w_seed28_b1_all_features_spmfsparse_r3.h5
- DeepFM_w_seed28_b1_all_features_spmfsparse_r4.h5
- DeepFM_w_seed28_b1_all_features_spmfsparse_r5.h5
  
To use newly trained weights instead, set ```model_weight_path``` to the same path used in Step2: Training DeepFM-Stat+Pattern:
- DeepFM_w_seed28_b1_all_features_spmfsparse.h5

In [22]:
%%time

model_weight_path = 'DeepFM_w_seed28_b1_all_features_spmfsparse_r1.h5'

### DeepFM load and test
X_test_df = pd.DataFrame(X_test, columns = ['f'+str(i) for i in range(X_test.shape[1])])
print(X_test_df.shape)

dense_features = X_test_df.columns
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
test_model_input = {name: X_test_df[name].values for name in dense_features}
###


### sparse features for DeepFM for patterns
sparse_features = [x for x in X_test_df.columns[62:]]
dense_features = [x for x in X_test_df.columns[:62]]
X_test_df[sparse_features] = X_test_df[sparse_features].astype(int)
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=X_test_df[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
print(len(feature_names))
test_model_input = {name: X_test_df[name].values for name in feature_names}
###

clf = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', seed=28)
clf.load_weights(model_weight_path)
y_prob = clf.predict(test_model_input, batch_size=256).ravel()
y_pred = [int(x) for x in (y_prob>=.5)]
# print(y_pred)
# print(y_prob)
print(classification_report(y_test, y_pred, digits=4))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob, pos_label=1)
print('AUROC', metrics.auc(fpr, tpr))
print('Log Loss', metrics.log_loss(y_test, y_prob, eps=1e-7))

(65568, 140)
140
              precision    recall  f1-score   support

           0     0.8581    0.9178    0.8869     32068
           1     0.9157    0.8547    0.8841     33500

    accuracy                         0.8856     65568
   macro avg     0.8869    0.8862    0.8855     65568
weighted avg     0.8875    0.8856    0.8855     65568

AUROC 0.9566304545936899
Log Loss 0.2728058444851035
CPU times: user 22.4 s, sys: 10.6 s, total: 33 s
Wall time: 26.2 s


DeepFM-Pattern
========================

Step1: Preparing data for training and testing DeepFM-Pattern
----------------

In [None]:
%%time

############# Training data preparation
# load train_df / test_df from SPMF
train_df_spmf = pd.read_csv('../baseline3/train_df_spmf.csv')
train_df_spmf.set_index('user',inplace=True)

# load train / test for DeepFM
with open('../data/ours_traindf_list.pkl', 'rb') as f:
    train_df = pickle.load(f)
# display(train_df)

### columns to consdier
columns_model = [
    'count',
    'unique',
    'daydiff',
    'edittype_entropy',
    'target_entropy',
    'dow_entropy'
]
columns_consider = list()
for i in range(10):
    for c in columns_model:
        columns_consider.append((i, c))
columns_consider += [
    (9,'pred_first_edit'),
    (9,'pred_last_edit')
]
train_df = train_df[columns_consider]
# display(train_df)
non_entropy_col_indices = list()
for i,c in enumerate(columns_consider):
    if 'entropy' not in c[1]:
        non_entropy_col_indices.append(i)
print(non_entropy_col_indices)
    
# y_train
y_labels = [x in test_users for x in train_users]
y_df = pd.DataFrame({
    'user': train_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)

# Concat two types of features
train_df = pd.concat([
    train_df, 
    train_df_spmf, y_df], axis=1)
train_df = train_df.fillna(0)
train_df.shape


# X_train
X_train_ = train_df.values[:,:-1]
y_train = train_df.values[:,-1 ]
y_train = 1 - y_train # 0 for active
np.nan_to_num(X_train_, copy=False, nan=0)

X_train_ = X_train_.astype(dtype=np.float32)
y_train = y_train.astype(dtype=np.int8)
# X_train = np.log10(X_train_+1.)

X_train = X_train_
X_train[:,:62] = np.log10(X_train[:,:62]+1.) # only log on non-spmf features
print(X_train.shape)

# shuffle
X_train, y_train = shuffle(X_train, y_train, random_state=28)
print(X_train.shape, y_train.shape)


########## Test data preparation
# load train_df / test_df from SPMF
test_df_spmf = pd.read_csv('../baseline3/test_df_spmf.csv')
test_df_spmf.set_index('user',inplace=True)

# load deepfm features test
with open('../data/ours_testdf_list.pkl', 'rb') as filehandle:
    # store the data as binary data stream
    test_df = pickle.load(filehandle)
print(test_df.shape)

#### columns to consdier
columns_model = [
    'count',
    'unique',
    'daydiff',
    'edittype_entropy',
    'target_entropy',
    'dow_entropy'
]
columns_consider = list()
for i in range(10):
    for c in columns_model:
        columns_consider.append((i, c))
columns_consider += [
    (9,'pred_first_edit'),
    (9,'pred_last_edit')
]
test_df = test_df[columns_consider]
non_entropy_col_indices = list()
for i,c in enumerate(columns_consider):
    if 'entropy' not in c[1]:
        non_entropy_col_indices.append(i)


# y_test
y_labels = [x in active_test_users for x in test_users]
y_df = pd.DataFrame({
    'user': test_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)

# Concat two types of features
test_df = pd.concat([
    test_df, 
    test_df_spmf, y_df], axis=1)
test_df = test_df.fillna(0)

# X_test
X_test_ = test_df.values[:,:-1]
y_test = test_df.values[:,-1]
y_test = 1-y_test # 1 for inactive
np.nan_to_num(X_test_, copy=False, nan=0)

###
X_test_ = X_test_.astype(dtype=np.float32)
y_test = y_test.astype(dtype=np.int8)
# X_train = np.log10(X_train_+1.)
X_test = X_test_
X_test[:,:62] = np.log10(X_test[:,:62]+1.) # only log on non-spmf features
print(X_test.shape)
###

Step2: Training DeepFM-Pattern
--------------
(Go to Step3: Testing DeepFM-Pattern to run the one already trained for the paper)
--------------

In [42]:
%%time

model_weight_path = 'DeepFM_w_seed28_b1_spmfsparse_r1.h5'

X_train_df = pd.DataFrame(X_train, columns = ['f'+str(i) for i in range(X_train.shape[1])])

###
dense_features = X_train_df.columns
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
train_model_input = {name: X_train_df[name].values for name in dense_features}
###

### sparse features for DeepFM for patterns
sparse_features = [x for x in X_train_df.columns[62:]]
# don't use stat features
dense_features = [] 
print('# of dense features', len(dense_features))
X_train_df[sparse_features] = X_train_df[sparse_features].astype(int)
# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=X_train_df[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
train_model_input = {name: X_train_df[name].values for name in feature_names}
###


##### define model
tensorflow.keras.backend.clear_session()
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', seed=28)
print('# of params',model.count_params())

# compiling the model
es = EarlyStopping(monitor='val_auc', mode='max', patience=10)
mc = ModelCheckpoint(model_weight_path, 
                     monitor='val_auc', mode='max', verbose=1, save_best_only=True)
model.compile("adam", "binary_crossentropy", metrics=[AUC()], )

# training the model
history = model.fit(train_model_input, y_train, batch_size=256, epochs=1000, 
                    validation_split=0.2, verbose=2, callbacks=[es, mc])

# of dense features 0
# of params 57485
Epoch 1/1000
190/190 - 37s - loss: 0.4566 - auc: 0.8671 - val_loss: 0.4440 - val_auc: 0.8747

Epoch 00001: val_auc improved from -inf to 0.87469, saving model to DeepFM_w_seed28_b1_spmfsparse_r5.h5
Epoch 2/1000
190/190 - 13s - loss: 0.4311 - auc: 0.8809 - val_loss: 0.4385 - val_auc: 0.8763

Epoch 00002: val_auc improved from 0.87469 to 0.87626, saving model to DeepFM_w_seed28_b1_spmfsparse_r5.h5
Epoch 3/1000
190/190 - 12s - loss: 0.4288 - auc: 0.8816 - val_loss: 0.4356 - val_auc: 0.8772

Epoch 00003: val_auc improved from 0.87626 to 0.87720, saving model to DeepFM_w_seed28_b1_spmfsparse_r5.h5
Epoch 4/1000
190/190 - 13s - loss: 0.4271 - auc: 0.8824 - val_loss: 0.4372 - val_auc: 0.8773

Epoch 00004: val_auc improved from 0.87720 to 0.87733, saving model to DeepFM_w_seed28_b1_spmfsparse_r5.h5
Epoch 5/1000
190/190 - 13s - loss: 0.4265 - auc: 0.8826 - val_loss: 0.4365 - val_auc: 0.8774

Epoch 00005: val_auc improved from 0.87733 to 0.87740, saving mod

Step3: Testing DeepFM-Pattern
------------------------
The trained model weights for the 5 runs reported in the main article are in the same folder; set the ```model_weight_path``` variable in the cell below to one of them.
- DeepFM_w_seed28_b1_spmfsparse_r1.h5
- DeepFM_w_seed28_b1_spmfsparse_r2.h5
- DeepFM_w_seed28_b1_spmfsparse_r3.h5
- DeepFM_w_seed28_b1_spmfsparse_r4.h5
- DeepFM_w_seed28_b1_spmfsparse_r5.h5
  
To use newly trained weights instead, set ```model_weight_path``` to the same path used in Step2: Training DeepFM-Pattern:
- DeepFM_w_seed28_b1_spmfsparse.h5

In [43]:
%%time

model_weight_path = 'DeepFM_w_seed28_b1_spmfsparse_r5.h5'

### DeepFM load and test
X_test_df = pd.DataFrame(X_test, columns = ['f'+str(i) for i in range(X_test.shape[1])])
print(X_test_df.shape)

dense_features = X_test_df.columns
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
test_model_input = {name: X_test_df[name].values for name in dense_features}
###


### sparse features for DeepFM for patterns
sparse_features = [x for x in X_test_df.columns[62:]]
# don't use stat features
dense_features = []
X_test_df[sparse_features] = X_test_df[sparse_features].astype(int)
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=X_test_df[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
print('# of features', len(feature_names))
test_model_input = {name: X_test_df[name].values for name in feature_names}
###

clf = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', seed=28)
clf.load_weights(model_weight_path)
y_prob = clf.predict(test_model_input, batch_size=256).ravel()
y_pred = [int(x) for x in (y_prob>=.5)]
# print(y_pred)
# print(y_prob)
print(classification_report(y_test, y_pred, digits=4))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob, pos_label=1)
print('AUROC', metrics.auc(fpr, tpr))
print('Log Loss', metrics.log_loss(y_test, y_prob, eps=1e-7))

# easy to paste version
res = classification_report(y_test, y_pred, digits=4).split('\n')
label0 = [x for x in res[2].split(' ') if len(x)>1][:3]
label1 = [x for x in res[3].split(' ') if len(x)>1][:3]
acc = [x for x in res[5].split(' ') if len(x)>1][1]
res = label0+label1+[str(metrics.auc(fpr, tpr)), str(acc), str(metrics.log_loss(y_test, y_prob))]
print('\t'.join(res))

(65568, 140)
# of features 78
              precision    recall  f1-score   support

           0     0.7838    0.8104    0.7969     32068
           1     0.8124    0.7861    0.7990     33500

    accuracy                         0.7980     65568
   macro avg     0.7981    0.7982    0.7979     65568
weighted avg     0.7984    0.7980    0.7980     65568

AUROC 0.8787050637730643
Log Loss 0.4314696413724143
0.7838	0.8104	0.7969	0.8124	0.7861	0.7990	0.8787050637730643	0.7980	0.4314696413724143
CPU times: user 12.3 s, sys: 7.4 s, total: 19.7 s
Wall time: 9.25 s
