In [13]:
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC 
from joblib import dump, load
from sklearn.metrics import classification_report
from sklearn import metrics

# For reproducibility: seed NumPy's global random number generator once,
# right after the imports, and keep the value in `seed` for reuse.
seed = 123
np.random.seed(seed)

Step1: Prepare data for training and testing
--------------------------------
For SVM-Ar, the shared ```data``` folder only contains the processed ```X_train```, ```y_train```, ```X_test```, and ```y_test``` due to the upload size limit of the ISWC submission. 

In [11]:
%%time

######### Extracted pattern list
# The 78 patterns used as feature columns, in column order. Each tuple holds
# the item ids of one pattern; the string key is produced below by writing
# every item followed by ' -1' (presumably the SPMF itemset terminator --
# verify against the SPMF input files).
_P78_ITEM_SEQUENCES = [
    # one-item patterns
    (8,), (17,), (15,), (29,), (6,), (4,), (22,), (25,), (12,), (5,), (3,),
    (16,), (14,),
    # two-item patterns
    (6, 8), (8, 8), (8, 6), (17, 8), (8, 17), (6, 6), (18, 8), (17, 6),
    (6, 17), (8, 18), (15, 8), (18, 6), (17, 17),
    # three-item patterns
    (6, 8, 8), (6, 6, 8), (8, 8, 8), (8, 8, 6), (8, 6, 8), (6, 8, 6),
    (8, 6, 6), (6, 6, 6), (17, 8, 8), (18, 8, 8), (17, 6, 8), (18, 6, 8),
    (6, 17, 8),
    # remaining patterns (one two-item pattern followed by three-item ones)
    (1, 24),
    (11, 11, 13), (2, 13, 26), (17, 9, 13), (24, 23, 22), (25, 24, 17), (2, 11, 13),
    (24, 11, 15), (24, 15, 11), (20, 17, 24), (29, 3, 24), (21, 24, 22), (21, 24, 5),
    (24, 21, 26), (24, 23, 26), (7, 2, 25), (3, 14, 24), (24, 2, 2), (15, 3, 24),
    (13, 2, 14), (3, 4, 24), (4, 3, 24), (13, 2, 13), (24, 15, 2), (21, 24, 26),
    (24, 14, 28), (4, 24, 13), (9, 17, 13), (24, 21, 2), (2, 13, 14), (25, 24, 14),
    (23, 7, 2), (3, 29, 24), (21, 24, 2), (1, 27, 17), (24, 23, 2), (27, 11, 14),
    (12, 3, 24), (2, 27, 20),
]

# Render every item sequence as its '<item> -1 <item> -1 ...' key string.
p78_list = [' '.join('{} -1'.format(item) for item in sequence)
            for sequence in _P78_ITEM_SEQUENCES]


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(review): unpickling can execute arbitrary code; only load data files
    that come from a trusted source.
    """
    with open(path, 'rb') as filehandle:
        return pickle.load(filehandle)


def _split_by_membership(users, reference):
    """Split `users` into (members, non_members) relative to `reference`.

    The order of `users` is preserved in both returned lists. Membership is
    tested against a set built from `reference` so each lookup is O(1) instead
    of a linear scan; if the elements are not hashable the original container
    is used unchanged.
    """
    try:
        lookup = set(reference)
    except TypeError:
        lookup = reference
    members = [x for x in users if x in lookup]
    non_members = [x for x in users if x not in lookup]
    return members, non_members


# load train users
train_users = _read_pickle('../data/a_users_s2.data')
print(len(train_users))

# load test users
test_users = _read_pickle('../data/a_users_s3.data')
print(len(test_users))

# users that decide whether a test user counts as active (see split below)
active_test_users = _read_pickle('../data/a_users_s4.data')
print(len(active_test_users))

# train-test to index
distinct_users = _read_pickle('../data/train-test-users.pkl')

# train users are "active" when they also appear in test_users
train_u_active, train_u_inactive = _split_by_membership(train_users, test_users)
print(len(train_users),len(train_u_active),len(train_u_inactive))

# test users are "active" when they also appear in active_test_users
test_u_active, test_u_inactive = _split_by_membership(test_users, active_test_users)
print(len(test_users),len(test_u_active),len(test_u_inactive))


######### Feature / label preparation for the training and test splits

# Directory holding the SPMF output files. Defaults to the location used when
# the notebook was written; set the SPMF_OUTPUT_DIR environment variable to
# read them from somewhere else.
SPMF_OUTPUT_DIR = os.environ.get('SPMF_OUTPUT_DIR',
                                 '/media/parklize/Elements/dataset/train-spmf')


def _parse_spmf_output(path):
    """Parse an SPMF output file with lines '<pattern> #SUP: <n> #SID: <ids>'.

    Returns a pair of dicts keyed by the pattern string:
    (support count as int, list of sequence ids as ints).
    Blank lines are skipped. Raises ValueError for a line that lacks the
    '#SUP:' or '#SID:' marker.
    """
    supports = dict()
    sid_lists = dict()
    with open(path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            pattern, sup_marker, tail = line.partition('#SUP:')
            support, sid_marker, sids = tail.partition('#SID:')
            if not sup_marker or not sid_marker:
                raise ValueError('%s:%d: expected "<pattern> #SUP: <n> #SID: <ids>", got %r'
                                 % (path, line_no, line))
            pattern = pattern.strip()
            supports[pattern] = int(support)
            # split() without argument tolerates repeated whitespace
            sid_lists[pattern] = [int(sid) for sid in sids.split()]
    return supports, sid_lists


def _pattern_indicator_matrix(sid_lists, n_rows, patterns):
    """Return an (n_rows, len(patterns)) float matrix of 0/1 indicators.

    Entry [r, c] is 1 when sequence id r is listed for patterns[c] in
    `sid_lists`; patterns missing from `sid_lists` leave an all-zero column.
    """
    matrix = np.zeros(shape=(n_rows, len(patterns)))
    for col, pattern in enumerate(patterns):
        rows = sid_lists.get(pattern)
        if rows:
            matrix[rows, col] = 1.
    return matrix


def _append_label(matrix, label):
    """Return `matrix` with one extra last column filled with `label`."""
    labels = np.full(shape=(matrix.shape[0], 1), fill_value=float(label))
    return np.concatenate([matrix, labels], axis=1)


def _build_labelled_split(active_sids, inactive_sids, n_active, n_inactive, patterns):
    """Build the labelled matrices for one split.

    Returns (act_np, inact_np, combined), where each matrix has
    len(patterns) indicator columns plus a final label column
    (inactive 1, active 0) and `combined` stacks active rows above
    inactive rows. Prints the shapes at each stage.
    """
    act_np = _pattern_indicator_matrix(active_sids, n_active, patterns)
    inact_np = _pattern_indicator_matrix(inactive_sids, n_inactive, patterns)
    print(act_np.shape, inact_np.shape)

    # attach ylabels (inactive 1, active 0)
    act_np = _append_label(act_np, 0)
    inact_np = _append_label(inact_np, 1)
    print(act_np.shape, inact_np.shape)

    # combine
    combined = np.concatenate([act_np, inact_np], axis=0)
    print(combined.shape)
    return act_np, inact_np, combined


def _load_cached(path):
    """Load a cached array written earlier by this notebook.

    NOTE(review): pickle is only safe for files from a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def _save_cached(obj, path):
    """Pickle `obj` to `path`."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# Number of feature columns: one per extracted pattern.
n_features = len(p78_list)

######### Training data preparation
# Use the cache only when both files are present; otherwise rebuild both.
if os.path.exists('X_train.pkl') and os.path.exists('y_train.pkl'):
    X_train = _load_cached('X_train.pkl')
    y_train = _load_cached('y_train.pkl')
else:
    act_p_dict, act_p_ind_dict = _parse_spmf_output(
        os.path.join(SPMF_OUTPUT_DIR, 'train-active-spmf-output.txt'))
    inact_p_dict, inact_p_ind_dict = _parse_spmf_output(
        os.path.join(SPMF_OUTPUT_DIR, 'train-inactive-spmf-output.txt'))
    all_patterns = list(set(act_p_ind_dict) | set(inact_p_ind_dict))

    #::::: inactive/active # * 78 + 1 label for training
    # One row per train user of each group (29509 active / 31283 inactive in
    # the recorded run). Original note: the last 8 inactive rows are single
    # records without pattern.
    act_np, inact_np, train_np = _build_labelled_split(
        act_p_ind_dict, inact_p_ind_dict,
        len(train_u_active), len(train_u_inactive), p78_list)

    X_train = train_np[:, :n_features]
    y_train = train_np[:, -1]

    print('dumping X_train, y_train')
    _save_cached(X_train, 'X_train.pkl')
    _save_cached(y_train, 'y_train.pkl')


############## Test data preparation
if os.path.exists('X_test.pkl') and os.path.exists('y_test.pkl'):
    X_test = _load_cached('X_test.pkl')
    y_test = _load_cached('y_test.pkl')
else:
    act_p_dict, act_p_ind_dict = _parse_spmf_output(
        os.path.join(SPMF_OUTPUT_DIR, 'test-active-spmf-output.txt'))
    inact_p_dict, inact_p_ind_dict = _parse_spmf_output(
        os.path.join(SPMF_OUTPUT_DIR, 'test-inactive-spmf-output.txt'))

    #::::: inactive/active # * 78  for testing
    # One row per test user of each group (32068 active / 33500 inactive in
    # the recorded run). Original note: the last 3 inactive rows are single
    # records without pattern.
    act_np, inact_np, test_np = _build_labelled_split(
        act_p_ind_dict, inact_p_ind_dict,
        len(test_u_active), len(test_u_inactive), p78_list)

    X_test = test_np[:, :n_features]
    y_test = test_np[:, -1]

    print('dumping X_test, y_test')
    _save_cached(X_test, 'X_test.pkl')
    _save_cached(y_test, 'y_test.pkl')

60792
65568
76277
60792 29509 31283
65568 32068 33500
(29509, 78) (31283, 78)
(29509, 79) (31283, 79)
(60792, 79)
dumping X_train, y_train
(32068, 78) (33500, 78)
(32068, 79) (33500, 79)
(65568, 79)
CPU times: user 7min 4s, sys: 7.77 s, total: 7min 12s
Wall time: 8min 3s


Step2: Training SVM-Ar
--------------
(Go to Step3: Testing SVM-Ar to run the one already trained for the paper)
--------------

In [29]:
%%time

### Gridsearch CV (kept for reference; this is how C and gamma were chosen)
# # defining parameter range 
# param_grid = {'C': [0.1, 1, 10, 100, 1000],  
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
#               'kernel': ['rbf']}  
  
# grid = GridSearchCV(SVC(probability=True), param_grid, refit = True, verbose=3) 
  
# # fitting the model for grid search 
# grid.fit(X_train, y_train) 

# clf = grid.best_estimator_
# print(clf) # SVC(C=0.1, gamma=0.1)

# Create the output directory before training, so the dump at the end cannot
# fail on a missing folder after the long fit.
os.makedirs('tmp', exist_ok=True)

# use gridsearch result directly for retraining
# random_state pins the random number generation SVC uses for its probability
# estimates, so the fitted model does not depend on what other cells drew
# from NumPy's global generator beforehand.
clf = SVC(probability=True, C=0.1, gamma=0.1, random_state=seed)
clf.fit(X_train, y_train)

# Saved under tmp/ (the evaluation cell loads 'svm.joblib' from the working
# directory instead).
dump(clf, 'tmp/svm.joblib') 

CPU times: user 35min 7s, sys: 1.1 s, total: 35min 8s
Wall time: 35min 10s


SVC(C=0.1, gamma=0.1, probability=True)

Step3: Testing SVM-Ar
--------------

In [14]:
%%time

# Restore the stored classifier from the working directory.
# NOTE(review): joblib files are pickle-based; only load trusted model files.
clf = load('svm.joblib')
print('loaded trained SVM model...')

# Hard class predictions for the per-class report.
y_pred = clf.predict(X_test)

# Probability scores: keep the second column of predict_proba as the score
# used by the ROC and log-loss metrics below.
class_probabilities = clf.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

report = classification_report(y_test, y_pred, digits=4)
print(report)

# Area under the ROC curve, computed from the curve points.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
auroc = metrics.auc(fpr, tpr)
print('AUROC', auroc)

log_loss_value = metrics.log_loss(y_test, y_prob)
print('Log Loss', log_loss_value)

loaded trained SVM model...
              precision    recall  f1-score   support

         0.0     0.7994    0.7785    0.7888     32068
         1.0     0.7931    0.8130    0.8029     33500

    accuracy                         0.7961     65568
   macro avg     0.7963    0.7957    0.7959     65568
weighted avg     0.7962    0.7961    0.7960     65568

AUROC 0.8396467166785506
Log Loss 0.488075968530016
CPU times: user 12min 45s, sys: 817 ms, total: 12min 46s
Wall time: 13min 3s
