In [1]:
import sys
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score
# Put the project's scripts directory on the import path so the `util` module imported below can be found
# (presumably util.py lives in this directory — confirm).
# NOTE(review): this is an absolute, machine-specific path and will not resolve on another checkout;
# consider a path relative to the notebook or a configurable location.
sys.path.append("/home/mindy/Documents/projects/creditCardFraud/scripts/")

In [2]:
# Import the project helpers by name rather than with a wildcard, so it is
# obvious where each function used in this notebook is defined:
#   train_test_dfs - builds the training / normal-only / test frames
#   model_results  - prints the confusion matrix and classification report
from util import model_results, train_test_dfs

In [3]:
# Enable IPython's autoreload extension in mode 2, which re-imports edited
# modules before each cell executes, so changes to helper scripts are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Import data

In [4]:
# Read the three pre-processed CSV files (training, dev, and hold-out test) in
# order; the paths are relative to the notebook's working directory.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../processedData/TrainingData_normal.csv",
        "../processedData/DevData_normal.csv",
        "../processedData/hold_outset_moreFraud.csv",
    )
)

### Generate the training, normal-only, and test data

In [5]:
# Build the modelling frames with the project helper from util.
# The positional arguments after the three frames are the label column name and
# two numeric settings — presumably the split fraction and the random seed;
# TODO confirm against util.train_test_dfs.
label_column = "Class"
split_fraction, split_seed = 0.2, 1988
training, norm, test_data, y_test = train_test_dfs(
    train, dev, test, label_column, split_fraction, split_seed
)

### One-Class SVMs:
  * Training on the normal class only performs better

In [6]:
# Share of training rows whose Class label is 1; reused below as the `nu`
# parameter of every One-Class SVM.
class_shares = training["Class"].value_counts(normalize=True)
outlier_ratio = class_shares.loc[1]

In [7]:
# One-Class SVM fitted on `norm` only (the normal-class rows, per the section heading).
# nu is the observed outlier share; kernel and gamma are written out explicitly.
svm_norm = OneClassSVM(nu=outlier_ratio, kernel="rbf", gamma="scale").fit(norm)
svm_norm  # display the fitted estimator as the cell output

OneClassSVM(nu=0.0017248568105510324)

In [8]:
# Same One-Class SVM configuration, but fitted on every training row
# with the Class label column removed from the features.
features_all = training.drop(columns="Class")
svm_all = OneClassSVM(nu=outlier_ratio, kernel="rbf", gamma="scale").fit(features_all)
svm_all  # display the fitted estimator as the cell output

OneClassSVM(nu=0.0017248568105510324)

In [9]:
# Signed decision scores on the hold-out features for both fitted models
# (scikit-learn convention: positive = inlier, negative = outlier).
test_score_norm, test_score_all = (
    fitted_svm.decision_function(test_data) for fitted_svm in (svm_norm, svm_all)
)

In [10]:
# Flip the sign so larger values mean "more anomalous", then report results for
# the normal-only model. The trailing 0 is presumably the decision threshold —
# confirm against util.model_results.
anomaly_score_norm = np.negative(test_score_norm)
model_results(y_test, anomaly_score_norm, 0)

[[56751   112]
 [   42    57]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.34      0.58      0.43        99

    accuracy                           1.00     56962
   macro avg       0.67      0.79      0.71     56962
weighted avg       1.00      1.00      1.00     56962



In [11]:
# Flip the sign so larger values mean "more anomalous", then report results for
# the model fitted on all training rows. The trailing 0 is presumably the
# decision threshold — confirm against util.model_results.
anomaly_score_all = np.negative(test_score_all)
model_results(y_test, anomaly_score_all, 0)

[[56774    89]
 [   78    21]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.19      0.21      0.20        99

    accuracy                           1.00     56962
   macro avg       0.59      0.61      0.60     56962
weighted avg       1.00      1.00      1.00     56962



### The One-Class SVM works better when trained on the normal class only (fraud-class F1 of 0.43 vs. 0.20)

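### GridSearchCV:
* Labels follow the One-Class SVM convention (+1 = inlier, -1 = outlier); because the model is fitted on the normal-only data, every row passed to the grid search is labelled +1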

In [12]:
# One +1 label per row of `norm`; +1 is the value OneClassSVM.predict returns for inliers.
y_true = np.ones(len(norm))

In [13]:
# Search grid: only gamma varies; nu stays pinned to the observed outlier share.
params = dict(
    gamma=[0.001, "scale", "auto"],
    nu=[outlier_ratio],
)

In [14]:
# 3-fold grid search over `params` for an RBF One-Class SVM.
# NOTE(review): this search is fitted against all-ones labels (y_true), so
# micro-averaged F1 reduces to the fraction of held-out normal rows predicted as
# inliers; it does not measure how well fraud is detected.
CV = GridSearchCV(
    estimator=OneClassSVM(kernel="rbf"),
    param_grid=params,
    cv=3,
    scoring="f1_micro",
    n_jobs=20,  # NOTE(review): hard-coded worker count, sized for the author's machine
    verbose=1,
)

In [15]:
# Run the grid search on the normal-only rows, scoring against the all-ones labels.
CV.fit(X=norm, y=y_true)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   9 out of   9 | elapsed:  1.4min finished


GridSearchCV(cv=3, estimator=OneClassSVM(), n_jobs=20,
             param_grid={'gamma': [0.001, 'scale', 'auto'],
                         'nu': [0.0017248568105510324]},
             scoring='f1_micro', verbose=1)

In [16]:
# Winning grid point and its mean cross-validated score, displayed together as a tuple.
(CV.best_params_, CV.best_score_)

({'gamma': 0.001, 'nu': 0.0017248568105510324}, 0.9982281977523018)

In [17]:
# Score the hold-out features with the best estimator that the grid search
# refitted on the full training input (refit is left at its default).
test_score = CV.best_estimator_.decision_function(test_data)

In [18]:
# Flip the sign so larger values mean "more anomalous", then report results for
# the grid-searched model. The trailing 0 is presumably the decision threshold —
# confirm against util.model_results.
anomaly_score_cv = np.negative(test_score)
model_results(y_test, anomaly_score_cv, 0)

[[56761   102]
 [   54    45]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       0.31      0.45      0.37        99

    accuracy                           1.00     56962
   macro avg       0.65      0.73      0.68     56962
weighted avg       1.00      1.00      1.00     56962



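### It appears that the manually chosen setting (gamma="scale", fraud-class F1 of 0.43) performs better on the test set than the setting picked by GridSearchCV (gamma=0.001, fraud-class F1 of 0.37)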