# Baseline Anomaly Detection

This notebook implements a baseline method for anomaly detection. It is a first experiment with the approach that we are trying to integrate.

## Load Dependencies

In [1]:
from pycaret.datasets import get_data
from pycaret.anomaly import *
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# One seed shared by the train/test split and the PyCaret session so reruns are reproducible.
SEED: int = 42

## Loading Data

In [3]:
# Read the Adult income CSV and discard the `fnlwgt` column before any modelling.
# PyCaret's bundled demo data (`get_data('anomaly')`) is an alternative source, left disabled.
df = pd.read_csv('../../datasets/income/adult.csv').drop(columns=['fnlwgt'])

In [4]:
# Hold out 5% of the rows for evaluation; SEED keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.05, random_state=SEED)

# Hand-crafted record with implausible values (age 200, 170 hours per week, sex 'Male'
# combined with relationship 'wife'), presumably meant as an obvious anomaly.
# NOTE(review): 'wife' is lowercase here — confirm it matches the spelling used in the CSV.
new_record = {
    'age': 200,
    'workclass': 'State-gov',
    'education': 'Some-college',
    'education.num': 13,
    'marital.status': 'Married-civ-spouse',
    'occupation': 'Tech-support',
    'relationship': 'wife',
    'race': 'Black',
    'sex': 'Male',
    'capital.gain': 90000,
    'capital.loss': 100,
    'hours.per.week': 170,
    'native.country': 'United-States',
    'income': '<=50K',
}

# Second synthetic record: identical to the first except for the age (12 instead of 200).
# Overriding an existing key keeps its position, so the key order is unchanged.
new_record1 = {**new_record, 'age': 12}

# Append both synthetic rows to the held-out set. ignore_index=True renumbers the rows,
# so the injected records become the last two entries of test_df.
injected_rows = pd.DataFrame([new_record, new_record1])
test_df = pd.concat([test_df, injected_rows], ignore_index=True)

## Setup Pipeline

In [5]:
# Initialise the PyCaret anomaly-detection experiment on the training split only.
# session_id reuses SEED. max_encoding_ohe=0 presumably keeps categorical columns away
# from one-hot encoding — TODO confirm against the PyCaret documentation.
s = setup(
    train_df,
    session_id=SEED,
    max_encoding_ohe=0,
)

## Creating a Model

In [6]:
# List the available anomaly detectors (ID, name, reference class) to choose a model ID from.
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pycaret.internal.patches.pyod.CBLOFForceToDouble
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


Models (by ID from the table above) that are semi-working so far:
- cluster
- knn
- lof
- svm (only kind of, and it takes much longer to run)
- pca
In [69]:
# Fit the PCA-based detector (model ID 'pca'). `fraction` is forwarded to create_model;
# presumably it is the expected share of outliers in the training data — TODO confirm.
pca_detector = create_model('pca', fraction=1e-3)

# Backward-compatible alias: other cells refer to the fitted model as `iforest`.
# The name is misleading (presumably a leftover from an Isolation Forest run) —
# it holds the PCA detector created above, NOT an Isolation Forest.
iforest = pca_detector

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [70]:
# Score the held-out rows (including the two injected records) with the fitted detector.
# NOTE: despite its name, `iforest` is the detector that was created with model ID 'pca'.
# The returned frame carries the `Anomaly` and `Anomaly_Score` columns used for ranking.
predictions = predict_model(
    iforest,
    data=test_df,
)

In [71]:
# Rank the scored rows: flagged anomalies first, then the highest anomaly score first.
# A scalar `ascending=False` applies descending order to every sort key.
sorted_predictions = predictions.sort_values(
    by=['Anomaly', 'Anomaly_Score'],
    ascending=False,
)

In [72]:
# Display the ranked predictions. In the recorded run the two injected records
# (index 1629 and 1630) are the only rows flagged and sit at the top.
sorted_predictions

Unnamed: 0,age,workclass_0,workclass_1,education_0,education_1,education.num,marital.status_0,marital.status_1,occupation_0,occupation_1,...,sex,capital.gain,capital.loss,hours.per.week,native.country_0,native.country_1,native.country_2,income,Anomaly,Anomaly_Score
1629,200.0,0.0,2.0,0.0,3.0,13.0,0.0,1.0,0.0,2.0,...,1.0,90000.0,100.0,170.0,0.0,0.0,1.0,0.0,1,16295.577252
1630,12.0,0.0,2.0,0.0,3.0,13.0,0.0,1.0,0.0,2.0,...,1.0,90000.0,100.0,170.0,0.0,0.0,1.0,0.0,1,13292.176419
8,28.0,0.0,1.0,1.0,2.0,4.0,0.0,1.0,2.0,2.0,...,1.0,0.0,2179.0,40.0,0.0,2.0,4.0,0.0,0,11817.081949
532,52.0,0.0,1.0,0.0,2.0,9.0,0.0,1.0,1.0,3.0,...,1.0,99999.0,0.0,40.0,0.0,2.0,1.0,1.0,0,11725.439188
1439,38.0,0.0,1.0,2.0,4.0,15.0,0.0,1.0,0.0,1.0,...,1.0,99999.0,0.0,70.0,0.0,0.0,1.0,1.0,0,11618.036580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,46.0,0.0,1.0,0.0,2.0,9.0,0.0,1.0,1.0,3.0,...,1.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,0,1842.538405
1351,37.0,0.0,1.0,0.0,3.0,10.0,0.0,1.0,1.0,2.0,...,1.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,0,1807.696330
1575,29.0,0.0,1.0,0.0,2.0,9.0,0.0,1.0,1.0,2.0,...,1.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,0,1806.824217
1109,41.0,0.0,1.0,0.0,2.0,9.0,0.0,1.0,1.0,3.0,...,1.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,0,1799.024523


In [73]:
# Alternative view ordered by age (youngest first); within the same age, flagged rows
# come first and higher anomaly scores come before lower ones.
sorted_predictions1 = predictions.sort_values(
    by=['age', 'Anomaly', 'Anomaly_Score'],
    ascending=[True, False, False],
)
sorted_predictions1

Unnamed: 0,age,workclass_0,workclass_1,education_0,education_1,education.num,marital.status_0,marital.status_1,occupation_0,occupation_1,...,sex,capital.gain,capital.loss,hours.per.week,native.country_0,native.country_1,native.country_2,income,Anomaly,Anomaly_Score
1630,12.0,0.0,2.0,0.0,3.0,13.0,0.0,1.0,0.0,2.0,...,1.0,90000.0,100.0,170.0,0.0,0.0,1.0,0.0,1,13292.176419
1372,17.0,0.0,1.0,1.0,0.0,7.0,0.0,4.0,1.0,2.0,...,1.0,0.0,1721.0,15.0,0.0,0.0,1.0,0.0,0,5015.218119
1116,17.0,0.0,3.0,3.0,1.0,8.0,0.0,4.0,1.0,0.0,...,0.0,0.0,0.0,25.0,0.0,0.0,1.0,0.0,0,4902.576798
1306,17.0,1.0,0.0,2.0,2.0,6.0,0.0,4.0,0.0,4.0,...,1.0,0.0,0.0,12.0,0.0,0.0,1.0,0.0,0,4867.224053
1408,17.0,0.0,1.0,3.0,1.0,8.0,0.0,4.0,1.0,2.0,...,1.0,0.0,0.0,16.0,0.0,0.0,1.0,0.0,0,4543.627224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,83.0,0.0,4.0,0.0,2.0,9.0,0.0,2.0,1.0,3.0,...,1.0,0.0,0.0,8.0,0.0,0.0,1.0,0.0,0,4546.402620
834,84.0,1.0,0.0,0.0,3.0,10.0,0.0,1.0,0.0,4.0,...,1.0,0.0,0.0,35.0,0.0,0.0,1.0,1.0,0,4496.845092
362,90.0,0.0,1.0,0.0,2.0,9.0,0.0,2.0,1.0,4.0,...,1.0,0.0,0.0,99.0,0.0,0.0,1.0,0.0,0,5269.078130
1093,90.0,0.0,4.0,0.0,2.0,9.0,0.0,4.0,1.0,3.0,...,1.0,2964.0,0.0,12.0,0.0,0.0,1.0,0.0,0,4814.952240
