In [16]:
import time
import pickle
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from datetime import datetime
from deepctr.models import xDeepFM, DeepFM
from deepctr.feature_column import  SparseFeat, DenseFeat, get_feature_names
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.metrics import AUC
import tensorflow
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics
import matplotlib.pyplot as plt

# For reproducibility: fix one seed and apply it to NumPy's global RNG and to
# TensorFlow's global random seed.
seed = 123
np.random.seed(seed)
tensorflow.random.set_seed(seed)

Step1: Prepare data for training and testing
--------------------------------

In [17]:
%%time

# Number of seconds in one day; used to express time spans in days.
SECONDS_PER_DAY = 60.0 * 60 * 24

# Aggregated (MultiIndex) columns read from every per-window DataFrame.
SOURCE_COLUMNS = [('target', 'count'), ('target', 'nunique'), ('time', 'max'), ('time', 'min')]


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code -- only load trusted files.
    """
    with open(path, 'rb') as filehandle:
        return pickle.load(filehandle)


def _refine_window(df):
    """Reduce one aggregated DataFrame to the columns count / unique / daydiff.

    ``daydiff`` is the span between ('time', 'max') and ('time', 'min'),
    expressed in (fractional) days.
    """
    # Explicit copy so the added column is never written into a view of ``df``.
    temp_df = df[SOURCE_COLUMNS].copy()
    temp_df.columns = ['count', 'unique', 'max_time', 'min_time']
    span = pd.to_datetime(temp_df['max_time']) - pd.to_datetime(temp_df['min_time'])
    temp_df['daydiff'] = span.dt.total_seconds() / SECONDS_PER_DAY
    return temp_df[['count', 'unique', 'daydiff']]


def _prepare_dataset(df_list, users, next_active_users):
    """Build the labelled feature table and the model inputs for one stage.

    Parameters
    ----------
    df_list : list of DataFrame
        One aggregated frame per window, each providing ``SOURCE_COLUMNS``.
        Presumably indexed by user id, since the frames are joined with the
        label frame on its 'user' index -- TODO confirm.
    users : sequence
        Users of this stage; one label row is produced per user.
    next_active_users : iterable
        Users considered active in the following stage. Assumed to be a plain
        collection of hashable user ids -- TODO confirm.

    Returns
    -------
    (full_df, X, y)
        ``full_df`` is the feature table plus a trailing 'label' column
        (True for active) with missing values replaced by 0.
        ``X`` is ``log10(features + 1)`` as float32.
        ``y`` is int8 with 0 for active and 1 for inactive users.
    """
    refined_list = [_refine_window(df) for df in df_list]

    # One column group per window; keys follow the number of windows supplied.
    features = pd.concat(refined_list, axis=1, keys=list(range(len(refined_list))))
    print(features.shape)

    # Set membership is O(1) per lookup; testing against a list would make the
    # labelling step quadratic in the number of users.
    active = set(next_active_users)
    labels = pd.DataFrame({
        'user': users,
        'label': [user in active for user in users]  # True/1 for active
    }).set_index('user')

    # fillna(0) removes every NaN produced by the join, so no further NaN
    # handling is needed on the extracted arrays.
    full_df = pd.concat([features, labels], axis=1).fillna(0)

    values = full_df.values
    X = np.log10(values[:, :-1].astype(dtype=np.float32) + 1.)
    y = (1 - values[:, -1]).astype(dtype=np.int8)  # flip: 0 for active
    return full_df, X, y


# load train users
train_users = _load_pickle('../data/a_users_s2.data')
print(len(train_users))

# load test users
test_users = _load_pickle('../data/a_users_s3.data')
print(len(test_users))

# users of the following stage, used to label test users as active / inactive
s4_active_users = _load_pickle('../data/a_users_s4.data')
print(len(s4_active_users))

############### Training data preparation
train_df_list = _load_pickle('../data/baseline1_traindf_list.pkl')
train_df, X_train, y_train = _prepare_dataset(train_df_list, train_users, test_users)
print(X_train.shape, y_train.shape)


############ Test data preparation
test_df_list = _load_pickle('../data/baseline1_testdf_list.pkl')
test_df, X_test, y_test = _prepare_dataset(test_df_list, test_users, s4_active_users)
print(test_df.shape)
print(X_test.shape, y_test.shape)

60792
65568
76277
(60792, 30)
(60792, 30) (60792,)
(65568, 30)
(65568, 31)
(65568, 30) (65568,)
CPU times: user 5min 23s, sys: 1.79 s, total: 5min 25s
Wall time: 5min 37s


To train/test kNN-Zh in the main article
==============================
- Step2: Training kNN-Zh: train the model with the best parameters found by grid search
- Step3: Testing kNN-Zh: load the model already trained for the paper and reproduce the reported results

Step2: Training kNN-Zh 
--------------
(Skip to Step3: Testing kNN-Zh to use the model already trained for the paper)
--------------

In [67]:
%%time
### grid search cv (kept for reference; presumably how n_neighbors was chosen)
# clf = KNeighborsClassifier()
# k_range = [200,400,600,800,1000,1200,1400,1600,1800,2000,2200,2400,2600,2800,3000]
# param_grid = dict(n_neighbors=k_range)
# grid = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', verbose=3)
# grid.fit(X_train, y_train)
# clf = grid.best_estimator_

# direct fit with best param
clf = KNeighborsClassifier(n_neighbors=3000)
clf.fit(X_train, y_train)

# predict on train (sanity check of the fit, not a generalisation estimate)
y_pred = clf.predict(X_train)
print(classification_report(y_train,y_pred))

# store model; the context manager guarantees the file is flushed and closed
with open('tmp/baseline1-knn3000.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

KNeighborsClassifier(n_neighbors=3000)
CPU times: user 5.31 ms, sys: 2.94 ms, total: 8.26 ms
Wall time: 33.6 ms


Step3: Testing kNN-Zh
--------------

In [14]:
%%time
# change to the new model path if trained again
# NOTE: unpickling can execute arbitrary code -- only load trusted model files.
with open('baseline1-knn3000.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

# hard predictions and probability of class 1 for every test user
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred, digits=4))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob, pos_label=1)
print('AUROC', metrics.auc(fpr, tpr))
print('Log Loss', metrics.log_loss(y_test, y_prob))

              precision    recall  f1-score   support

           0     0.8090    0.7119    0.7574     32068
           1     0.7527    0.8391    0.7935     33500

    accuracy                         0.7769     65568
   macro avg     0.7808    0.7755    0.7755     65568
weighted avg     0.7802    0.7769    0.7759     65568

AUROC 0.8730792299572364
Log Loss 0.443924984563648
CPU times: user 6min 39s, sys: 1min 11s, total: 7min 51s
Wall time: 6min 41s


To train/test GBT-Zh in the main article
==============================
- Step2: Training GBT-Zh: train the model with the best parameters found by grid search
- Step3: Testing GBT-Zh: load the model already trained for the paper and reproduce the reported results

Step2: Training GBT-Zh
--------------
(Skip to Step3: Testing GBT-Zh to use the model already trained for the paper)
--------------

In [68]:
%%time

#### grid search cv (kept for reference)
# parameters = {
#     "max_depth":[2,4,6,8,10],
#     "n_estimators":[1400]
# }
# grid = GridSearchCV(GradientBoostingClassifier(random_state=28,validation_fraction=.2,n_iter_no_change=10), 
#                     parameters, cv=3, n_jobs=-1, verbose=3)
# grid.fit(X_train, y_train)
# clf = grid.best_estimator_


# as the max in the paper
# random_state is passed explicitly so that re-running this cell gives the same
# model regardless of how much of the global NumPy RNG earlier cells consumed.
clf = GradientBoostingClassifier(max_depth=4, n_estimators=1400, random_state=seed)
clf.fit(X_train, y_train)

# predict on training (sanity check of the fit, not a generalisation estimate)
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

# store model; the context manager guarantees the file is flushed and closed
with open('tmp/baseline1-gbm-e1400-maxdepth4.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

CPU times: user 5min 46s, sys: 47.8 ms, total: 5min 46s
Wall time: 5min 47s


GradientBoostingClassifier(max_depth=4, n_estimators=1400)

Step3: Testing GBT-Zh
-----------------------------

In [15]:
%%time
# change to the new model path if trained again
# NOTE: unpickling can execute arbitrary code -- only load trusted model files.
with open('baseline1-gbm-e1400-maxdepth4.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

# hard predictions and probability of class 1 for every test user
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred, digits=4))

# pos_label is stated explicitly, matching the kNN-Zh evaluation cell
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob, pos_label=1)
print('AUROC', metrics.auc(fpr, tpr))
print('Log Loss', metrics.log_loss(y_test, y_prob))

              precision    recall  f1-score   support

           0     0.8382    0.7514    0.7924     32068
           1     0.7835    0.8612    0.8205     33500

    accuracy                         0.8075     65568
   macro avg     0.8109    0.8063    0.8065     65568
weighted avg     0.8103    0.8075    0.8068     65568

AUROC 0.8890653727433682
Log Loss 0.4165479479663574
CPU times: user 583 ms, sys: 93.6 ms, total: 676 ms
Wall time: 1.13 s
