In [1]:
import sys
import numpy as np
import pandas as pd 
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV
sys.path.append("/home/mindy/Documents/projects/creditCardFraud/scripts/")

In [2]:
# Import the project helpers by name instead of with a wildcard, so it is
# clear where each one comes from and nothing else in util can silently
# shadow a notebook variable.
from util import (
    CVResultsOutput,
    makeCustomSplits,
    make_custom_score,
    model_results,
    train_test_dfs,
)

In [3]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so changes to project code are picked up live.
%load_ext autoreload
%autoreload 2

### Import data

In [4]:
# Read the three pre-processed CSV files (training, dev, and hold-out) into
# DataFrames, in that order.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../processedData/TrainingData_normal.csv",
        "../processedData/DevData_normal.csv",
        "../processedData/hold_outset_moreFraud.csv",
    )
)

### Generate train, test, and normal data

In [5]:
# Derive the modelling frames with the project helper from util.
training, norm, test_data, y_test = train_test_dfs(
    train,
    dev,
    test,
    "Class",  # label column
    0.2,      # NOTE(review): presumably a split fraction — confirm in util
    1988,     # NOTE(review): presumably the random seed — confirm in util
)

### One-Class SVMs:
  * Training on the normal class only performs better

In [6]:
# Fraction of training rows whose "Class" label is 1; reused below as `nu`.
outlier_ratio = training["Class"].value_counts(normalize=True).loc[1]

In [7]:
# RBF one-class SVM fitted on `norm` (the normal-class frame, per the section
# heading), with `nu` set to the observed outlier share.
svm_norm = OneClassSVM(nu=outlier_ratio, kernel="rbf", gamma="scale")
svm_norm.fit(norm)

OneClassSVM(nu=0.0017248568105510324)

In [8]:
# Same configuration as svm_norm, but fitted on every training row; the label
# column is removed so only features reach the model.
svm_all = OneClassSVM(nu=outlier_ratio, kernel="rbf", gamma="scale")
svm_all.fit(training.drop(columns="Class"))

OneClassSVM(nu=0.0017248568105510324)

In [9]:
# Score the hold-out features with both fitted models (norm-only first, then
# all-rows), keeping the raw decision-function values.
test_score_norm, test_score_all = (
    fitted.decision_function(test_data) for fitted in (svm_norm, svm_all)
)

In [10]:
# OneClassSVM.decision_function is positive for inliers and negative for
# outliers, so the score is negated to make larger values mean "more anomalous".
# NOTE(review): the trailing 0 is presumably the decision threshold — confirm
# against util.model_results.
model_results(y_test, -test_score_norm, 0)

[[56751   112]
 [   42    57]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.34      0.58      0.43        99

    accuracy                           1.00     56962
   macro avg       0.67      0.79      0.71     56962
weighted avg       1.00      1.00      1.00     56962



In [11]:
# Same evaluation for the model fitted on all training rows; the score is
# negated because decision_function is negative for outliers.
# NOTE(review): the trailing 0 is presumably the decision threshold — confirm
# against util.model_results.
model_results(y_test, -test_score_all, 0)

[[56774    89]
 [   78    21]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.19      0.21      0.20        99

    accuracy                           1.00     56962
   macro avg       0.59      0.61      0.60     56962
weighted avg       1.00      1.00      1.00     56962



### The SVM works better when trained on the normal class only

### Define custom score

In [12]:
# Scoring mapping from the project helper. Its keys are used below as the
# metric names, and it is passed to GridSearchCV as `scoring`.
custom_score = make_custom_score()

### Define custom train and test splits

In [13]:
# Build the cross-validation folds plus the matching features and labels with
# the project helper.
# NOTE(review): 5 and 2018 are presumably the fold count and the random seed —
# confirm against util.makeCustomSplits.
cvSplits, X_train, y_train = makeCustomSplits(training, "Class", 5, 2018, outlier_ratio)

# makeCustomSplits hands back a generator, which can only be walked once.
# Materialise it so the very same folds can be reused if the grid search is
# fitted again, instead of silently running out of splits.
cvSplits = list(cvSplits)

In [14]:
# Hyper-parameter grid for the one-class SVM search.
# `nu` is explored at 75 %, 100 % and 125 % of the observed outlier share.
# NOTE(review): `gamma` is only used by the rbf/poly/sigmoid kernels, so the
# linear-kernel candidates are refitted once per gamma value with identical
# results.
params = dict(
    kernel=["rbf", "linear"],
    gamma=[0.0001, 0.00001, "scale"],
    nu=[outlier_ratio * factor for factor in (0.75, 1.0, 1.25)],
)



In [15]:
# Exhaustive search over `params`, evaluated on the custom folds with the
# custom multi-metric scoring.
CV = GridSearchCV(
    estimator=OneClassSVM(),
    param_grid=params,
    cv=cvSplits,
    scoring=custom_score,
    # No single "best" estimator is refitted; the winner is picked by hand
    # from the results table.
    refit=False,
    n_jobs=20,
    verbose=1,
)

In [32]:
# Run every parameter combination over the custom folds; the labels are passed
# alongside the features so they are available to the scorers.
CV.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   29.5s
[Parallel(n_jobs=20)]: Done  90 out of  90 | elapsed:  2.7min finished


GridSearchCV(cv=<generator object makeCustomSplits.<locals>.<genexpr> at 0x7f3a54d985c8>,
             estimator=OneClassSVM(), n_jobs=20,
             param_grid={'gamma': [0.0001, 1e-05, 'scale'],
                         'kernel': ['rbf', 'linear'],
                         'nu': [0.0012936426079132742, 0.0017248568105510324,
                                0.0021560710131887906]},
             refit=False,
             scoring={'f1_f': make_scorer(f1_f), 'fn': make_scorer(fn),
                      'fp': make_scorer(fp), 'prec_f': make_scorer(prec_f),
                      'recall_f': make_scorer(recall_f),
                      'tp': make_scorer(tp)},
             verbose=1)

In [33]:
# Condense the raw cv_results_ into a table with the project helper, passing
# the metric names (the keys of the custom scoring mapping).
df_output = CVResultsOutput(CV.cv_results_,custom_score.keys())

In [34]:
# Show the grid-search summary table.
df_output

Unnamed: 0,gamma,kernel,nu,tp,fp,fn,f1_f,prec_f,recall_f
0,0.0001,rbf,0.001294,33.2,60.0,45.8,0.385545,0.356508,0.420253
1,0.0001,rbf,0.001725,36.2,78.2,42.8,0.374382,0.31654,0.458228
2,0.0001,rbf,0.002156,38.2,99.0,40.8,0.353397,0.278681,0.483544
3,0.0001,linear,0.001294,1.4,63.4,77.6,0.018048,0.018387,0.017722
4,0.0001,linear,0.001725,6.0,83.8,73.0,0.071278,0.067443,0.075949
5,0.0001,linear,0.002156,6.0,104.6,73.0,0.063534,0.05488,0.075949
6,1e-05,rbf,0.001294,32.6,60.0,46.4,0.379784,0.352201,0.412658
7,1e-05,rbf,0.001725,35.8,77.4,43.2,0.372585,0.31645,0.453165
8,1e-05,rbf,0.002156,38.2,98.6,40.8,0.353968,0.279322,0.483544
9,1e-05,linear,0.001294,1.4,63.4,77.6,0.018048,0.018387,0.017722


### Instead of letting `refit` select the best estimator, pick the parameters with the best overall tp, fp, and fn from the results table

In [7]:
# Configuration chosen by hand from the grid-search table: RBF kernel,
# gamma="scale", and nu at 75 % of the observed outlier share.
best_model = OneClassSVM(nu=0.75 * outlier_ratio, kernel="rbf", gamma="scale")

In [8]:
# Fit on `norm`, the same frame svm_norm was fitted on.
best_model.fit(norm)

OneClassSVM(nu=0.0012936426079132742)

In [9]:
# Raw decision-function values on the hold-out features (negative values are
# on the outlier side of the boundary).
best_pred_score = best_model.decision_function(test_data)

In [10]:
# Evaluate the hand-picked model; the score is negated because
# decision_function is negative for outliers.
# NOTE(review): the trailing 0 is presumably the decision threshold — confirm
# against util.model_results.
model_results(y_test,-best_pred_score,0)

[[56777    86]
 [   42    57]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.40      0.58      0.47        99

    accuracy                           1.00     56962
   macro avg       0.70      0.79      0.73     56962
weighted avg       1.00      1.00      1.00     56962



### It appears that using GridSearchCV in this manner is more efficient than an orthogonal search