In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split

from notebooks.anomaly_detection.models.autoencoder_model import AutoencoderModel
from notebooks.anomaly_detection.models.dbscan_clustering_model import DBSCANClusteringModel
from notebooks.anomaly_detection.models.isolated_forest_model import IsolatedForestModel

In [2]:
SEED = 42

## Loading the Data
First we'll load in the dataset that we want to use. 
For this use-case we'll use the adult income consensus dataset.

In [3]:
df = pd.read_csv('../../datasets/income/adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


After inspection, we decided to remove the `fnlwgt` column from the data, as there is no concrete description of what these values indicate.
It might translate to `Final Weight` as a weight used to indicate some metric of the corresponding instance. However, as we do not know what it is exactly, how it is computed (if at all), we decided to remove it.

In [4]:
df = df.drop(['fnlwgt'], axis=1)

We have some categorical data that needs to be encoded into such a way that the models can understand it.
For this approach, we'll use simple one hot encoding.

In [5]:
df = pd.get_dummies(df)

In [6]:
df.head()

Unnamed: 0,age,education.num,capital.gain,capital.loss,hours.per.week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_<=50K,income_>50K
0,90,9,0,4356,40,True,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
1,82,9,0,4356,18,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
2,66,10,0,4356,40,True,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
3,54,4,0,3900,40,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
4,41,10,0,3900,40,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False


We can see that due to the amount of categorical options, we data becomes slightly sparse from 15 initial columns to 109 columns.

## Split the data into Train/Test

For testing the data, we'll use a 95/5 train/test split.

In [7]:
train_df, test_df = train_test_split(df, test_size=0.05, random_state=SEED)

As we assume all data samples from the income dataset to be "normal", we'll introduce some obviously wrong relationships.

We added two instances from outside of the age range, with clearly invalid relationships (both are male but are also a wife)

In [8]:
new_record = {
    'age': 200,
    'workclass': 'State-gov',
    'education': 'Some-college',
    'education.num': 13,
    'marital.status': 'Married-civ-spouse',
    'occupation': 'Tech-support',
    'relationship': 'wife',
    'race': 'Black',
    'sex': 'Male',
    'capital.gain': 90000,
    'capital.loss': 100,
    'hours.per.week': 170,
    'native.country': 'United-States',
    'income': '<=50K'
}

new_record1 = {
    'age': 12,
    'workclass': 'State-gov',
    'education': 'Some-college',
    'education.num': 13,
    'marital.status': 'Married-civ-spouse',
    'occupation': 'Tech-support',
    'relationship': 'wife',
    'race': 'Black',
    'sex': 'Male',
    'capital.gain': 90000,
    'capital.loss': 100,
    'hours.per.week': 170,
    'native.country': 'United-States',
    'income': '<=50K'
}

test_df = pd.concat([test_df, pd.DataFrame([new_record, new_record1])], ignore_index=True)

In [9]:
test_df = pd.get_dummies(test_df)

## Isolated Forests

In [13]:
if_model = IsolatedForestModel(train_df=train_df, test_df=test_df, n_estimators=100, contamination='auto', random_state=SEED)
if_model.fit()

In [14]:
if_model.train_df[if_model.predict(if_model.train_df) == -1]

Unnamed: 0,age,education.num,capital.gain,capital.loss,hours.per.week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_<=50K,income_>50K
20440,27,13,0,0,35,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
742,40,4,0,1887,40,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
214,42,14,0,2201,60,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1723,59,16,25236,0,45,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
3208,35,15,4787,0,45,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,52,16,0,1887,70,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
5342,42,12,0,0,40,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
18384,39,16,0,0,40,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
12183,75,6,0,0,1,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


## DBScan Clustering

In [28]:
db_model = DBSCANClusteringModel(train_df=train_df, test_df=test_df, random_state=SEED, epsilon=.1, min_samples=50)

In [29]:
db_model.predict(db_model.train_df)

array([-1, -1, -1, ..., -1, -1, -1], dtype=int64)

## Autoencoder

In [33]:
ae_model = AutoencoderModel(train_df=train_df, test_df=test_df, random_state=SEED)
ae_model.fit()

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [ ]:
ae_model.predict(ae_model.train_df)