In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

from pycaret.anomaly import AnomalyExperiment

import os

save_path = os.path.join(os.getenv('HOME'), 'models', 'creditcard_anomaly')
data_path = '../../data/creditcard.csv'

### Data ingestion

In [2]:
data = pd.read_csv(data_path)
print(data.shape)
data.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
print(data.isna().values.sum()) # no nas
print(data.isnull().values.sum()) # no null

0
0


In [4]:
label = 'Class'
data[label].value_counts()

0    284315
1       492
Name: Class, dtype: int64

### split data into training and test set

In [5]:
#split data
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=21)
test_df = train_df = None
for train_idx, test_idx in strat_split.split(data, data[label]):
    test_df = data.loc[test_idx]
    train_df = data.loc[train_idx]

In [6]:
print(train_df.shape)
print(test_df.shape)
print(train_df[label].value_counts())
print(test_df[label].value_counts())

(227845, 31)
(56962, 31)
0    227451
1       394
Name: Class, dtype: int64
0    56864
1       98
Name: Class, dtype: int64


### Data analysis

### AutoML anomaly detection

In [7]:
anom_exp = AnomalyExperiment()
anom_exp.setup(train_df, session_id=100)

Unnamed: 0,Description,Value
0,Session id,100
1,Original data shape,"(227845, 31)"
2,Transformed data shape,"(227845, 31)"
3,Numeric features,31
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


<pycaret.anomaly.oop.AnomalyExperiment at 0x7f9ab4f8adc0>

In [8]:
anom_exp.models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [None]:
# I will try six random models from the above to try this training
model_ids = ['sos', 'abod', 'iforest', 'mcd', 'cluster', 'lof', 'svm']
d = []
for m_id in model_ids:
    d.append(anom_exp.create_model(m_id))
    
model = anom_exp.create_model('lof')

Processing:   0%|          | 0/3 [00:00<?, ?it/s]