In [3]:
import time
import pickle
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics

# For reproducibility: fix the seed of NumPy's global random number generator.
# This only takes effect if this cell is executed before the cells that draw
# random numbers.
seed = 123
np.random.seed(seed)

Step1: Prepare data for training and testing
--------------------------------

In [8]:
%%time

# Training users: unpickled from the s2 user file.
with open('../data/a_users_s2.data', mode='rb') as users_file:
    train_users = pickle.load(users_file)
print(len(train_users))

# Test users: unpickled from the s3 user file.
with open('../data/a_users_s3.data', mode='rb') as users_file:
    test_users = pickle.load(users_file)
print(len(test_users))

# s4 users: used further down to decide which test users count as active.
with open('../data/a_users_s4.data', mode='rb') as users_file:
    s4_active_users = pickle.load(users_file)
print(len(s4_active_users))

############ Training data preparation
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project.
with open('../data/baseline2_traindf_list.pkl', 'rb') as filehandle:
    # read the pickled list of per-slice feature DataFrames
    train_df_list = pickle.load(filehandle)

# From every frame keep the same five feature columns, after deriving
# avg_edit_item = count / nunique (added to the frame in place).
refined_list = list()
for df_ in train_df_list:
    df_['avg_edit_item'] = df_['count'] * 1.0 / df_['nunique']
    refined_list.append(df_[['count', 'nunique', 'avg_edit_item', 'daydiff', 'entropy']])

# Place the per-frame feature blocks side by side, keyed 0..n-1.
# The number of keys follows the list length instead of a hard-coded 10.
train_df = pd.concat(refined_list, axis=1, keys=list(range(len(refined_list))))

print(train_df.shape)

# y_train: label is True when a training user also appears in test_users.
# A set is built once so each membership test is a hash lookup rather than a
# scan of the whole list for every training user.
test_user_lookup = set(test_users)
y_labels = [x in test_user_lookup for x in train_users]
y_df = pd.DataFrame({
    'user': train_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)

# Align labels with features on the user index; the label becomes the last
# column, and anything missing after the join is filled with 0.
train_df = pd.concat([train_df, y_df], axis=1)
train_df = train_df.fillna(0)

# X_train: every column except the last; y_train: the last (label) column.
X_train = train_df.values[:,:-1]
y_train = train_df.values[:,-1]
# Flip the label so that 1 marks users that are NOT in test_users.
y_train = 1-y_train
# In-place NaN cleanup; fillna(0) above has already replaced missing values.
np.nan_to_num(X_train, copy=False, nan=0.0)

X_train = X_train.astype(dtype=np.float32)
y_train = y_train.astype(dtype=np.int8)
# Compress the feature range with log10(x + 1).
X_train = np.log10(X_train+1.)
print(X_train.shape, y_train.shape)


########## Test data preparation
# y_test: label is True when a test user also appears in s4_active_users.
# A set is built once so each membership test is a hash lookup rather than a
# scan of the whole collection for every test user.
s4_active_lookup = set(s4_active_users)
y_labels = [x in s4_active_lookup for x in test_users]
y_df = pd.DataFrame({
    'user': test_users,
    'label': y_labels # 1 for active
})
y_df.set_index('user', inplace=True)
print("y df shape", y_df.shape)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project.
with open('../data/baseline2_testdf_list.pkl', 'rb') as filehandle:
    # read the pickled list of per-slice feature DataFrames
    test_df_list = pickle.load(filehandle)

# From every frame keep the same five feature columns, after deriving
# avg_edit_item = count / nunique (added to the frame in place).
refined_list = list()
for df_ in test_df_list:
    df_['avg_edit_item'] = df_['count'] * 1.0 / df_['nunique']
    refined_list.append(df_[['count', 'nunique', 'avg_edit_item', 'daydiff', 'entropy']])

# Place the per-frame feature blocks side by side, keyed 0..n-1, and keep
# only the rows whose index is one of the labelled test users.
test_df = pd.concat(refined_list, axis=1, keys=list(range(len(refined_list))))
test_df = test_df[test_df.index.isin(y_df.index)]
print("test df shape", test_df.shape)

# Align labels with features on the user index; the label becomes the last
# column, and users without feature rows get 0 for every feature.
test_df = pd.concat([test_df, y_df], axis=1)
test_df = test_df.fillna(0)
print(test_df.shape)

# X_test: every column except the last; y_test: the last (label) column.
X_test = test_df.values[:,:-1]
y_test = test_df.values[:,-1]
# Flip the label so that 1 marks users that are NOT in s4_active_users.
y_test = 1-y_test
# In-place NaN cleanup; fillna(0) above has already replaced missing values.
np.nan_to_num(X_test, copy=False, nan=0.0)

X_test = X_test.astype(dtype=np.float32)
y_test = y_test.astype(dtype=np.int8)
# Compress the feature range with log10(x + 1).
X_test = np.log10(X_test+1.)
print(X_test.shape, y_test.shape)

60792
65568
76277
(60792, 50)
(60792, 50) (60792,)
y df shape (65568, 1)
test df shape (29509, 50)
(65568, 51)
(65568, 50) (65568,)
CPU times: user 2min 46s, sys: 214 ms, total: 2min 46s
Wall time: 2min 46s


To train/test RF-Sa in the main article
==============================
- Step2: Training RF-Sa: trains the model with the best hyperparameters found by grid search
- Step3: Testing RF-Sa: loads the model that was already trained for the paper and reproduces the results reported there


Step2: Training RF-Sa
--------------------
(Skip to Step3: Testing RF-Sa to evaluate the model that was already trained for the paper)
--------------

In [7]:
%%time
### grid search cv
# Kept for reference: the search that produced the hyperparameters used below.
# clf = RandomForestClassifier(n_estimators=1400)
# param_grid = {'max_depth':[2,4,6,8,10]}
# grid = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', verbose=3)
# grid.fit(X_train, y_train)
# clf = grid.best_estimator_
# print(clf)

# direct fit with best hyperparam
clf = RandomForestClassifier(n_estimators=1400,max_depth=4)
clf.fit(X_train, y_train)

# predict on train (a fit sanity check, not a generalisation estimate)
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

# store model
# The context manager closes the file even if pickling fails.
# NOTE: open() does not create directories, so tmp/ must already exist.
# NOTE(review): Step3 loads 'baseline2-rf1400.sav' (no tmp/ prefix), i.e. not
# the file written here.
with open('tmp/baseline2-rf1400.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END ....................................max_depth=2; total time=  22.2s
[CV 2/3] END ....................................max_depth=2; total time=  25.5s
[CV 3/3] END ....................................max_depth=2; total time=  28.0s
[CV 1/3] END ....................................max_depth=4; total time=  30.6s
[CV 2/3] END ....................................max_depth=4; total time=  36.8s
[CV 3/3] END ....................................max_depth=4; total time=  41.5s
[CV 1/3] END ....................................max_depth=6; total time=  35.6s
[CV 2/3] END ....................................max_depth=6; total time=  48.0s
[CV 3/3] END ....................................max_depth=6; total time=  53.9s
[CV 1/3] END ....................................max_depth=8; total time=  40.5s
[CV 2/3] END ....................................max_depth=8; total time=  59.1s
[CV 3/3] END ....................................

Step3: Testing RF-Sa
--------------

In [6]:
# Load the stored random-forest model; the context manager closes the file
# handle as soon as unpickling is done.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files produced by this project.
with open('baseline2-rf1400.sav', 'rb') as model_file:
    clf = pickle.load(model_file)
print(clf)

# Hard predictions for the classification report; the second predict_proba
# column (clf.classes_[1]) is the score used for AUROC and log loss.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred, digits=4))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
print('AUROC', metrics.auc(fpr, tpr))
print('Log Loss', metrics.log_loss(y_test, y_prob))

RandomForestClassifier(max_depth=4, n_estimators=1400)
              precision    recall  f1-score   support

           0     0.9274    0.4562    0.6116     32068
           1     0.6498    0.9658    0.7769     33500

    accuracy                         0.7166     65568
   macro avg     0.7886    0.7110    0.6942     65568
weighted avg     0.7856    0.7166    0.6960     65568

AUROC 0.7645887437888517
Log Loss 0.5509609374382396


To train/test LR-Sa in the main article
==============================
- Step2: Training LR-Sa: trains the model with the best hyperparameters found by grid search
- Step3: Testing LR-Sa: loads the model that was already trained for the paper and reproduces the results reported there

Step2: Training LR-Sa
--------------------
(Skip to Step3: Testing LR-Sa to evaluate the model that was already trained for the paper)
--------------

In [6]:
%%time

### grid search cv
# Kept for reference: the search that produced the hyperparameters used below.
# clf = LogisticRegression(max_iter=10000)
# param_grid = {'C':[.1,1.,10.,100.,1000.]}
# grid = GridSearchCV(clf, param_grid, cv=3, scoring='accuracy', verbose=3)
# grid.fit(X_train, y_train)
# clf = grid.best_estimator_
# print(clf)

# direct fit with best hyperparam
clf = LogisticRegression(max_iter=10000,C=1000)
clf.fit(X_train, y_train)

# predict on train (a fit sanity check, not a generalisation estimate)
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

# store model
# The context manager closes the file even if pickling fails.
# NOTE: open() does not create directories, so tmp/ must already exist.
# NOTE(review): Step3 loads 'baseline2-lr.sav' (no tmp/ prefix), i.e. not the
# file written here.
with open('tmp/baseline2-lr.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

CPU times: user 10.9 s, sys: 0 ns, total: 10.9 s
Wall time: 3.08 s


LogisticRegression(C=1000, max_iter=10000)

Step3: Testing LR-Sa
---------------------

In [7]:
# Load the stored logistic-regression model; the context manager closes the
# file handle as soon as unpickling is done.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files produced by this project.
with open('baseline2-lr.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

# Hard predictions for the classification report; the second predict_proba
# column (clf.classes_[1]) is the score used for AUROC and log loss.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred, digits=4))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
print('AUROC', metrics.auc(fpr, tpr))
print('Log Loss', metrics.log_loss(y_test, y_prob))

              precision    recall  f1-score   support

           0     0.9004    0.4948    0.6386     32068
           1     0.6621    0.9476    0.7795     33500

    accuracy                         0.7261     65568
   macro avg     0.7812    0.7212    0.7091     65568
weighted avg     0.7786    0.7261    0.7106     65568

AUROC 0.7656331824723209
Log Loss 0.5885759913840601
