---
### Import necessary libraries

In [1]:
import gc
import os
import sys
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV
from sklearn.svm import OneClassSVM

In [2]:
# Root every project path at the current user's home directory.
home = os.path.expanduser("~")

# Directory with the project's helper scripts (presumably where `util` lives).
scripts_dir = f"{home}/Documents/projects/creditCardFraud/AnomalyDetection/scripts/"

# Guard the append so re-running this cell does not pile up duplicate
# entries on sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [3]:
# Import the project helpers by name rather than with a wildcard, so it is
# clear where each function used below comes from. GridSearchCV is imported
# directly from scikit-learn in the top import cell.
from util import (
    CVResultsOutput,
    makeCustomSplits,
    make_custom_score,
    model_results,
    train_test_dfs,
)

In [4]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded before each cell executes and no kernel restart is needed.
%load_ext autoreload
%autoreload 2

### Garbage collect

In [5]:
# Run a garbage-collection pass, then flush IPython's output cache
# so cached cell results do not keep objects alive.
gc.collect()
%reset -f out

Flushing output cache (0 entries)


### Import training, dev, and hold-out test data

In [6]:
# Read the three processed CSVs in one pass; the unpacking order matches the
# order of the paths (training, dev, hold-out test).
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{home}/Documents/projects/creditCardFraud/processedData/TrainingData_normal.csv",
        f"{home}/Documents/projects/creditCardFraud/processedData/DevData_normal.csv",
        f"{home}/Documents/projects/creditCardFraud/processedData/hold_outset_moreFraud.csv",
    )
)

### Generate train, test, and normal data 

In [7]:
# Derive the modelling frames from the three raw tables. "Class" is the label
# column; 0.2 and 1988 are presumably a split fraction and a random seed —
# confirm against `train_test_dfs` in util.
(training, norm, test_data, y_test) = train_test_dfs(
    train,
    dev,
    test,
    "Class",
    0.2,
    1988,
)

-----
### One-Class SVMs:
  * Training on the normal class only performs better

In [8]:
# Share of rows labelled 1 in the training frame; used below as the `nu`
# parameter of the one-class SVMs.
class_shares = training["Class"].value_counts(normalize=True)
outlier_ratio = class_shares.loc[1]

In [9]:
# One-class SVM fitted on `norm` only (per the section heading, the
# normal-class data), with nu set to the observed outlier ratio.
svm_norm = OneClassSVM(
    nu=outlier_ratio,
    gamma="scale",
    kernel="rbf",
)
svm_norm.fit(norm)

OneClassSVM(nu=0.0017248568105510324)

In [10]:
# Same configuration as above, but fitted on every training row; the label
# column is dropped so only features are passed to the model.
svm_all = OneClassSVM(
    nu=outlier_ratio,
    gamma="scale",
    kernel="rbf",
)
svm_all.fit(training.drop(columns="Class"))

OneClassSVM(nu=0.0017248568105510324)

In [11]:
# Score the hold-out features with both fitted models, in the same order as
# the target names on the left.
test_score_norm, test_score_all = (
    fitted_svm.decision_function(test_data)
    for fitted_svm in (svm_norm, svm_all)
)

In [12]:
# OneClassSVM.decision_function is positive for inliers and negative for
# outliers, so the sign is flipped before evaluating against a threshold of 0.
output = model_results(
    y_test,
    -test_score_norm,
    threshold=0,
    ifprint=True,
)

[[56751   112]
 [   42    57]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.34      0.58      0.43        99

    accuracy                           1.00     56962
   macro avg       0.67      0.79      0.71     56962
weighted avg       1.00      1.00      1.00     56962



In [13]:
# Evaluate the model trained on all rows the same way as the normal-only
# model: negated decision scores against a threshold of 0.
output = model_results(
    y_test,
    -test_score_all,
    threshold=0,
    ifprint=True,
)

[[56774    89]
 [   78    21]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.19      0.21      0.20        99

    accuracy                           1.00     56962
   macro avg       0.59      0.61      0.60     56962
weighted avg       1.00      1.00      1.00     56962



### SVM works better using normal class only 


### Garbage collect

In [14]:
# Free memory before the grid search: collect garbage and flush IPython's
# output cache.
gc.collect()
%reset -f out

Flushing output cache (2 entries)


---
### Define custom score for gridsearch

In [15]:
# Scoring mapping passed to GridSearchCV below; the search repr shows it holds
# make_scorer entries keyed f1_f, fn, fp, prec_f, recall_f and tp.
custom_score = make_custom_score()

### Define custom train and test splits

In [16]:
# Build the cross-validation folds plus the feature matrix / labels they index
# into. The grid-search log reports 5 folds, matching the third argument;
# 2018 is presumably the random seed — confirm in util.
cvSplits, X_train, y_train = makeCustomSplits(
    training,
    "Class",
    5,
    2018,
    outlier_ratio,
)

# makeCustomSplits hands back a one-shot generator. Materialise it so the folds
# survive being consumed: GridSearchCV exhausts the iterable on fit, and a
# second fit would otherwise see zero splits.
cvSplits = list(cvSplits)

In [17]:
# Search grid: nu brackets the observed outlier ratio at 75%, 100% and 125%.
# NOTE(review): gamma is a kernel coefficient that the linear kernel does not
# use, so the linear candidates are fitted once per gamma value redundantly.
params = {
    "kernel": ["rbf", "linear"],
    "gamma": [0.0001, 0.00001, "scale"],
    "nu": [outlier_ratio * scale for scale in (0.75, 1.0, 1.25)],
}



In [18]:
# Grid search over the one-class SVM settings using the precomputed splits.
# refit is disabled: scoring is a multi-metric mapping, and the final model is
# chosen and refitted by hand further down.
# NOTE(review): n_jobs=20 is tuned to the original machine.
CV = GridSearchCV(
    estimator=OneClassSVM(),
    param_grid=params,
    scoring=custom_score,
    cv=cvSplits,
    refit=False,
    n_jobs=20,
    verbose=1,
)

In [19]:
# Run the grid search (about 2.7 minutes in the recorded run). As the cell's
# last expression, the fitted search object's repr is displayed.
CV.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   30.5s
[Parallel(n_jobs=20)]: Done  90 out of  90 | elapsed:  2.7min finished


GridSearchCV(cv=<generator object makeCustomSplits.<locals>.<genexpr> at 0x7f159324fa40>,
             estimator=OneClassSVM(), n_jobs=20,
             param_grid={'gamma': [0.0001, 1e-05, 'scale'],
                         'kernel': ['rbf', 'linear'],
                         'nu': [0.0012936426079132742, 0.0017248568105510324,
                                0.0021560710131887906]},
             refit=False,
             scoring={'f1_f': make_scorer(f1_f), 'fn': make_scorer(fn),
                      'fp': make_scorer(fp), 'prec_f': make_scorer(prec_f),
                      'recall_f': make_scorer(recall_f),
                      'tp': make_scorer(tp)},
             verbose=1)

### Garbage collect

In [20]:
# Release memory held after the grid search: collect garbage and flush
# IPython's output cache.
gc.collect()
%reset -f out

Flushing output cache (1 entries)


In [21]:
# Summarise the raw cv_results_ for the custom metric names into a table
# (the displayed result has one row per parameter combination).
df_output = CVResultsOutput(CV.cv_results_,custom_score.keys())

In [22]:
# Five best parameter combinations, ranked by f1_f (highest first).
df_output.sort_values(by="f1_f", ascending=False).head(5)

Unnamed: 0,gamma,kernel,nu,tp,fp,fn,f1_f,prec_f,recall_f
12,scale,rbf,0.001294,42.4,76.0,36.6,0.4297,0.358832,0.536709
13,scale,rbf,0.001725,44.0,88.4,35.0,0.416325,0.332564,0.556962
14,scale,rbf,0.002156,45.0,104.6,34.0,0.393856,0.301078,0.56962
0,0.0001,rbf,0.001294,29.4,60.0,49.6,0.349133,0.329175,0.372152
6,1e-05,rbf,0.001294,29.4,60.0,49.6,0.349114,0.329135,0.372152


### Instead of letting `refit` select the best estimator, manually refit using the parameters with the best overall F1 score

In [23]:
# Parameters copied by hand from the top row of the ranking above:
# rbf kernel, gamma="scale", nu at 75% of the outlier ratio.
best_model = OneClassSVM(
    nu=outlier_ratio * 0.75,
    gamma="scale",
    kernel="rbf",
)

In [24]:
# Fit the selected configuration on `norm`, the same data used for svm_norm
# earlier (not the X_train matrix used during the grid search).
best_model.fit(norm)

OneClassSVM(nu=0.0012936426079132742)

In [25]:
# Signed decision scores for the hold-out features; negated in the next cell
# before evaluation.
best_pred_score = best_model.decision_function(test_data)

In [26]:
# Evaluate the tuned model exactly like the baselines: negated decision
# scores against a threshold of 0, with the report printed.
output = model_results(
    y_test,
    -best_pred_score,
    threshold=0,
    ifprint=True,
)

[[56777    86]
 [   42    57]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.40      0.58      0.47        99

    accuracy                           1.00     56962
   macro avg       0.70      0.79      0.73     56962
weighted avg       1.00      1.00      1.00     56962



### It appears that running GridSearchCV in this manner is more efficient than an orthogonal search
---