In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from pyod.models.ecod import ECOD

In [2]:
dataset = load_iris()
X = dataset["data"]
X_train, X_test = train_test_split(X)

In [3]:
detector = ECOD()
detector.fit(X_train)

ECOD(contamination=0.1, n_jobs=1)

In [4]:
# detector.decision_function(X_test)
detector.predict(X_test)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [5]:
fake_var_1 = np.random.randn(1000)
fake_var_2 = np.random.randn(1000)
fake_var_3 = [x for _ in range(500) for x in ("cat1", "cat2")]

# contaminate with outliers
fake_var_1[10] = 100
fake_var_2[100] = -100

fake_data = pd.DataFrame(
    {
        "var_1": fake_var_1,
        "var_2": fake_var_2,
        "var_3": fake_var_3
    }
)

In [6]:
fake_data_dummy = pd.get_dummies(fake_data)

detector = ECOD()
detector.fit(fake_data_dummy)
pred = detector.predict(fake_data_dummy)
print(f"10: {pred[10]} | 100: {pred[100]}")


10: 1 | 100: 1


In [7]:
fake_data_2 = fake_data.copy()
fake_data_2.loc[200:210, "var_3"] = "cat3"

fake_data_dummy = pd.get_dummies(fake_data_2)

detector = ECOD()
detector.fit(fake_data_dummy)
pred = detector.predict(fake_data_dummy)
for i in range(10):
    idx = 200 + i
    print(f"-> {idx} = {pred[idx]}")

-> 200 = 1
-> 201 = 1
-> 202 = 1
-> 203 = 1
-> 204 = 1
-> 205 = 1
-> 206 = 1
-> 207 = 1
-> 208 = 1
-> 209 = 1
