In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

from pycaret.anomaly import AnomalyExperiment
from pycaret.classification import ClassificationExperiment

import os

save_path = os.path.join(os.getenv('HOME'), 'models', 'creditcard_anomaly')
data_path = '../../data/creditcard.csv'

### Data ingestion

In [2]:
data = pd.read_csv(data_path)
print(data.shape)
data.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
print(data.isna().values.sum()) # no nas
print(data.isnull().values.sum()) # no null

0
0


In [4]:
label = 'Class'
data[label].value_counts()

0    284315
1       492
Name: Class, dtype: int64

### split data into training and test set

In [5]:
#split data
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=21)
test_df = train_df = None
for train_idx, test_idx in strat_split.split(data, data[label]):
    test_df = data.loc[test_idx]
    train_df = data.loc[train_idx]

In [6]:
print(train_df.shape)
print(test_df.shape)
print(train_df[label].value_counts())
print(test_df[label].value_counts())

(227845, 31)
(56962, 31)
0    227451
1       394
Name: Class, dtype: int64
0    56864
1       98
Name: Class, dtype: int64


### Data analysis

### AutoML anomaly detection

In [7]:
anom_exp = AnomalyExperiment()
anom_exp.setup(train_df, session_id=100)

Unnamed: 0,Description,Value
0,Session id,100
1,Original data shape,"(227845, 31)"
2,Transformed data shape,"(227845, 31)"
3,Numeric features,31
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


<pycaret.anomaly.oop.AnomalyExperiment at 0x7f9e6c7be910>

In [8]:
anom_exp.models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [9]:
# # I will try six random models from the above to try this training
lof_model = anom_exp.create_model('lof')
lof_result = anom_exp.assign_model(lof_model)
# lof_pred = anom_exp.predict_model(lof_model, data=test_df)
lof_result.head()

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Anomaly,Anomaly_Score
1865,1441.0,-0.568089,0.223634,2.784809,1.398554,-0.976417,1.181462,-0.350213,0.544987,0.18484,...,0.025451,0.259867,-0.494201,-0.114422,0.237891,0.169842,82.739998,0,0,1.135107
47141,43099.0,1.005389,-0.382836,1.280514,1.682977,-0.950041,0.752944,-0.764272,0.535072,1.297392,...,0.115125,0.161388,0.293618,-0.532962,0.072516,0.017359,18.58,0,0,1.060798
155271,104967.0,-3.633303,-1.481175,0.09403,-0.287921,-0.630724,0.531253,1.667692,-0.881678,3.188153,...,-0.853519,0.725182,-0.153942,-0.000339,-0.042111,1.383515,500.0,0,1,2.423973
250359,154865.0,-0.111852,0.255704,1.589792,-0.007662,-0.400043,0.183738,-0.123587,-0.053031,1.720502,...,-0.068752,0.039753,-0.629515,0.449097,-0.340204,-0.168181,14.84,0,0,0.988771
234143,147846.0,-3.063601,2.554789,-0.294903,-1.092867,-0.770817,-0.987801,-0.270569,0.883005,1.513483,...,0.222549,-0.183433,0.065646,-0.250951,0.383166,0.480414,1.54,0,0,1.020228


In [10]:
iforest_model = anom_exp.create_model('iforest')
iforest_result = anom_exp.assign_model(iforest_model)
iforest_result.head()

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Anomaly,Anomaly_Score
1865,1441.0,-0.568089,0.223634,2.784809,1.398554,-0.976417,1.181462,-0.350213,0.544987,0.18484,...,0.025451,0.259867,-0.494201,-0.114422,0.237891,0.169842,82.739998,0,0,-0.071976
47141,43099.0,1.005389,-0.382836,1.280514,1.682977,-0.950041,0.752944,-0.764272,0.535072,1.297392,...,0.115125,0.161388,0.293618,-0.532962,0.072516,0.017359,18.58,0,0,-0.07147
155271,104967.0,-3.633303,-1.481175,0.09403,-0.287921,-0.630724,0.531253,1.667692,-0.881678,3.188153,...,-0.853519,0.725182,-0.153942,-0.000339,-0.042111,1.383515,500.0,0,0,-0.0066
250359,154865.0,-0.111852,0.255704,1.589792,-0.007662,-0.400043,0.183738,-0.123587,-0.053031,1.720502,...,-0.068752,0.039753,-0.629515,0.449097,-0.340204,-0.168181,14.84,0,0,-0.081356
234143,147846.0,-3.063601,2.554789,-0.294903,-1.092867,-0.770817,-0.987801,-0.270569,0.883005,1.513483,...,0.222549,-0.183433,0.065646,-0.250951,0.383166,0.480414,1.54,0,0,-0.078585


In [12]:
pca_model = anom_exp.create_model('pca')
pca_result = anom_exp.assign_model(pca_model)
pca_result.head()

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Anomaly,Anomaly_Score
1865,1441.0,-0.568089,0.223634,2.784809,1.398554,-0.976417,1.181462,-0.350213,0.544987,0.18484,...,0.025451,0.259867,-0.494201,-0.114422,0.237891,0.169842,82.739998,0,0,7858.21476
47141,43099.0,1.005389,-0.382836,1.280514,1.682977,-0.950041,0.752944,-0.764272,0.535072,1.297392,...,0.115125,0.161388,0.293618,-0.532962,0.072516,0.017359,18.58,0,0,8865.373294
155271,104967.0,-3.633303,-1.481175,0.09403,-0.287921,-0.630724,0.531253,1.667692,-0.881678,3.188153,...,-0.853519,0.725182,-0.153942,-0.000339,-0.042111,1.383515,500.0,0,0,13045.626513
250359,154865.0,-0.111852,0.255704,1.589792,-0.007662,-0.400043,0.183738,-0.123587,-0.053031,1.720502,...,-0.068752,0.039753,-0.629515,0.449097,-0.340204,-0.168181,14.84,0,0,9682.652361
234143,147846.0,-3.063601,2.554789,-0.294903,-1.092867,-0.770817,-0.987801,-0.270569,0.883005,1.513483,...,0.222549,-0.183433,0.065646,-0.250951,0.383166,0.480414,1.54,0,0,7910.310346


In [13]:
cluster_model = anom_exp.create_model('cluster')
cluster_result = anom_exp.assign_model(cluster_model)
cluster_result.head()

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Anomaly,Anomaly_Score
1865,1441.0,-0.568089,0.223634,2.784809,1.398554,-0.976417,1.181462,-0.350213,0.544987,0.18484,...,0.025451,0.259867,-0.494201,-0.114422,0.237891,0.169842,82.739998,0,1,34446.522592
47141,43099.0,1.005389,-0.382836,1.280514,1.682977,-0.950041,0.752944,-0.764272,0.535072,1.297392,...,0.115125,0.161388,0.293618,-0.532962,0.072516,0.017359,18.58,0,0,7211.995117
155271,104967.0,-3.633303,-1.481175,0.09403,-0.287921,-0.630724,0.531253,1.667692,-0.881678,3.188153,...,-0.853519,0.725182,-0.153942,-0.000339,-0.042111,1.383515,500.0,0,1,16626.568359
250359,154865.0,-0.111852,0.255704,1.589792,-0.007662,-0.400043,0.183738,-0.123587,-0.053031,1.720502,...,-0.068752,0.039753,-0.629515,0.449097,-0.340204,-0.168181,14.84,0,0,6297.356445
234143,147846.0,-3.063601,2.554789,-0.294903,-1.092867,-0.770817,-0.987801,-0.270569,0.883005,1.513483,...,0.222549,-0.183433,0.065646,-0.250951,0.383166,0.480414,1.54,0,0,6174.733887


## Using classification algorithm

In [15]:
classify_exp = ClassificationExperiment()
classify_exp.setup()

classify_model = classify_exp.compare_models()

ValueError: One and only one of data and data_func must be set