In [1]:
import pandas as pd
from sklearn.datasets import fetch_covtype
import numpy as np
from tqdm import tqdm
import copy

from river.datasets import synth
from src.models.networks import get_mlp
from src.data.datasets import get_dataset, river2np_dataset
from src.models.networks import get_mlp, get_autoencoder
import torchvision.transforms as transforms
from torchvision.datasets import MNIST
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt

from torchvision.datasets import MNIST, FashionMNIST

from sklearn.ensemble import IsolationForest
from river.anomaly import HalfSpaceTrees

In [11]:
anom_type = "ood"
anom_proba = 0.05
anom_len = 2
anom_class = 3
device = "cuda:0"
ood_type = "classes"

id_classes = [1, 2]
ood_classes = [4]

In [6]:
def get_anom_idcs(total_len, anom_len=6, anom_proba=0.05):
    anom_insert_idcs = []
    current_is_anom = False

    adj_anom_proba = anom_proba / (1 - anom_proba)
    p_anom_normal = 1 / anom_len
    p_normal_anom = adj_anom_proba / (anom_len * (1 - adj_anom_proba))

    exit_probas = {False: p_normal_anom, True: p_anom_normal}
    for idx, p in enumerate(np.random.rand(total_len)):
        if p < exit_probas[current_is_anom]:
            current_is_anom = not current_is_anom
        if current_is_anom:
            anom_insert_idcs.append(idx)
    return anom_insert_idcs[::-1]

In [12]:
np.random.seed(42)
torch.manual_seed(42)

# mnist_data = MNIST(root="./data", train=True, download=True)
# xs, ys = mnist_data.data, mnist_data.targets
# xs = xs / 255
# xs, ys = xs.numpy(), ys.numpy()
# xs = xs.reshape(len(ys), -1)

# fmnist_data = FashionMNIST(root="./data", train=True, download=True)
# xs_ood = fmnist_data.data
# xs_ood = xs_ood / 255
# xs_ood = xs_ood.numpy()
# xs_ood = xs_ood.reshape(len(ys), -1)

xs_all, ys_all = get_dataset("Covertype")

id_mask = np.isin(ys_all, id_classes)
xs, ys = xs_all[id_mask], ys_all[id_mask]


n_normal = len(xs)
anom_insert_idcs = get_anom_idcs(len(xs), anom_len, anom_proba)

# anom_insert_idcs = np.sort(
#     np.random.choice(np.arange(n_normal), size=round(anom_proba/(1-anom_proba)), replace=False)
# )[::-1]

n_anom = len(anom_insert_idcs)

y_unique = np.unique(ys)


if ood_type == "other":
    ood_stream = synth.RandomRBF(
        seed_model=42, seed_sample=42, n_classes=10, n_features=xs.shape[-1]
    ).take(n_anom)
    xs_ood, _ = river2np_dataset(ood_stream)
elif ood_type == "classes":
    xs_ood = xs_all[np.isin(ys_all, ood_classes)]


is_anom = np.zeros(len(xs)).astype("bool")
if anom_type == "label_flip":
    x_contam, y_contam = xs.copy(), ys.copy()
    y_contam[anom_insert_idcs] = np.random.choice(y_unique, size=n_anom)
    # y_contam[anom_insert_idcs] = anom_class
    is_anom[anom_insert_idcs] = True
elif anom_type == "feature_swap":
    swap_ratio = 0.1
    n_swap = round(swap_ratio * xs.shape[-1])
    swap_idcs = np.random

    x_contam, y_contam = xs.copy(), ys.copy()
    y_contam[anom_insert_idcs] = np.random.choice(y_unique, size=n_anom)
    # y_contam[anom_insert_idcs] = anom_class
    is_anom[anom_insert_idcs] = True
else:
    anom_idcs = np.random.choice(len(xs_ood), size=n_anom, replace=True)
    anoms = xs_ood[anom_idcs]
    x_contam, y_contam, is_anom = xs.tolist(), ys.tolist(), is_anom.tolist()
    for idx, anom in zip(anom_insert_idcs, anoms):
        x_contam.insert(idx, anom)
        y_contam.insert(idx, anom_class)
        # y_contam.insert(idx, np.random.choice(y_unique))

        is_anom.insert(idx, True)

    x_contam = np.array(x_contam)
    y_contam = np.array(y_contam)

label_enc = LabelEncoder()
y_contam = label_enc.fit_transform(y_contam)
is_anom = np.array(is_anom)

data_contam = TensorDataset(
    torch.tensor(x_contam, dtype=torch.float),
    torch.tensor(y_contam, dtype=torch.long),
    torch.tensor(is_anom),
)
loader_contam = DataLoader(data_contam, batch_size=1, shuffle=False)

In [15]:
torch.manual_seed(42)

accuracies = []
for skip_anomalies in [True, False]:
    model = get_mlp(
        in_features=x_contam.shape[-1],
        out_features=y_contam.max() + 1,
        n_hidden_units=128,
        n_hidden_layers=1,
    )
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)

    all_preds = []
    for x, y, is_anom_i in tqdm(loader_contam):
        x, y = x.to(device), y.to(device)

        logits = model(x)
        pred = torch.argmax(logits, dim=-1)
        all_preds.append(pred.detach().cpu().item())

        if skip_anomalies and is_anom_i:
            continue

        loss = F.cross_entropy(logits, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    all_preds = np.array(all_preds)

    accuracies.append((all_preds == y_contam)[~is_anom].mean())
print(accuracies[0] - accuracies[1])

  0%|          | 0/93415 [00:00<?, ?it/s]

100%|██████████| 93415/93415 [00:55<00:00, 1679.27it/s]
100%|██████████| 93415/93415 [00:53<00:00, 1759.19it/s]

0.0069162057740180805





In [None]:
def run(clf_params):
    model = get_mlp(
        in_features=x_contam.shape[-1],
        out_features=y_contam.max() + 1,
        n_hidden_units=clf_params['n_hidden_units'],
        n_hidden_layers=clf_params['n_hidden_layers'],
    )
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)

    all_preds = []
    for x, y, is_anom_i in tqdm(loader_contam):
        x, y = x.to(device), y.to(device)

        logits = model(x)
        pred = torch.argmax(logits, dim=-1)
        all_preds.append(pred.detach().cpu().item())

        if skip_anomalies and is_anom_i:
            continue

        loss = F.cross_entropy(logits, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    all_preds = np.array(all_preds)

    accuracies.append((all_preds == y_contam)[~is_anom].mean())

In [9]:
accuracies

[np.float64(0.8334591166630997), np.float64(0.8251799452560911)]

In [14]:
autoencoder = get_autoencoder(
    in_features=x_contam.shape[-1], compression=.5, dropout=0
)
autoencoder = autoencoder.to(device)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-4)

recon_errors = []
for x, y, is_anom_i in tqdm(loader_contam):
    x, y = x.to(device), y.to(device)
    autoencoder.eval()
    with torch.inference_mode():
        x_recon = autoencoder(x)
        error = F.mse_loss(x_recon, x, reduction="none").mean().item()
        recon_errors.append(error)

    autoencoder.train()
    optimizer.zero_grad()
    x_recon = autoencoder(x)
    loss = F.mse_loss(x_recon, x)
    loss.backward()
    optimizer.step()


recon_errors = np.array(recon_errors)

# ROC AUC for anomaly detection
roc_auc_score(is_anom, recon_errors)


  0%|          | 0/93415 [00:00<?, ?it/s]

100%|██████████| 93415/93415 [01:29<00:00, 1040.46it/s]


0.9530651603404363

In [30]:
threshold = np.quantile(recon_errors, 0.95)
print(
    "Recall: ",
    (recon_errors > threshold)[is_anom].mean(),
    "Precision: ",
    is_anom[recon_errors > threshold].mean(),
)

Recall:  0.5825786977145321 Precision:  0.5784628559195033
