In [120]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import torch

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier

from src import TransactionDatasetNewData, Conv1dAutoEncoder

In [13]:
dataset_normal = TransactionDatasetNewData('data/new_data/normal/')
dataset_anomaly = TransactionDatasetNewData('data/new_data/anomaly/')

In [23]:
len(dataset_normal), len(dataset_anomaly)

(351060, 99870)

In [6]:
model = Conv1dAutoEncoder.load_from_checkpoint(
    'lightning_logs/cae/version_0/checkpoints/epoch=19-step=87780.ckpt',
    in_channels=1,
    n_latent_features=8
)

In [43]:
with torch.no_grad():
    info_normal = model.predict_step(dataset_normal[100].unsqueeze(0))
    info_anomaly = model.predict_step(dataset_anomaly[431].unsqueeze(0))

In [36]:
info_normal['loss'].item(), info_anomaly['loss']

(0.3152805268764496, tensor(0.3941))

In [48]:
info_normal['latent'].squeeze().mean(1)

tensor([-0.0637, -0.0894,  0.0334,  0.0933, -0.2125, -0.0226, -0.1591,  0.2788])

In [64]:
losses_normal = np.zeros(len(dataset_normal), dtype=np.float16)
losses_anomaly = np.zeros(len(dataset_anomaly), dtype=np.float16)

features_normal = np.zeros((len(dataset_normal), 8), dtype=np.float16)
features_anomaly = np.zeros((len(dataset_anomaly), 8), dtype=np.float16)

with torch.no_grad():
    for i, normal_sample in tqdm(enumerate(dataset_normal), total=len(dataset_normal)):
        model_sample = model.predict_step(normal_sample.unsqueeze(0))
        losses_normal[i] = model_sample['loss'].item()
        features_normal[i] = model_sample['latent'].squeeze().mean(1).numpy()

    for i, anomaly_sample in tqdm(enumerate(dataset_anomaly), total=len(dataset_anomaly)):
        model_sample = model.predict_step(anomaly_sample.unsqueeze(0))
        losses_anomaly[i] = model_sample['loss'].item()
        features_anomaly[i] = model_sample['latent'].squeeze().mean(1).numpy()

100%|██████████| 351060/351060 [47:32<00:00, 123.05it/s] 


FileNotFoundError: [Errno 2] No such file or directory: 'data/new_data/normal/351060.csv'

In [65]:
with torch.no_grad():
    for i, anomaly_sample in tqdm(enumerate(dataset_anomaly), total=len(dataset_anomaly)):
        model_sample = model.predict_step(anomaly_sample.unsqueeze(0))
        losses_anomaly[i] = model_sample['loss'].item()
        features_anomaly[i] = model_sample['latent'].squeeze().mean(1).numpy()

100%|██████████| 99870/99870 [14:37<00:00, 113.77it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'data/new_data/anomaly/99870.csv'

In [77]:
losses_anomaly = losses_anomaly.reshape(-1, 1)
losses_normal = losses_normal.reshape(-1, 1)

In [86]:
anomaly_df = np.concatenate((features_anomaly, losses_anomaly, np.ones(losses_anomaly.shape)), axis=1)
normal_df = np.concatenate((features_normal, losses_normal, np.zeros(losses_normal.shape)), axis=1)

In [91]:
df_learn = np.concatenate((normal_df, anomaly_df), axis=0)
df_learn = pd.DataFrame(df_learn)
df_learn[9] = df_learn[9].astype(np.int32)
df_learn.rename(columns={9: 'target'}, inplace=True)

In [116]:
X_train, X_test, y_train, y_test = train_test_split(
    df_learn[[0, 1, 2, 3, 4, 5, 6, 7, 8]],
    df_learn['target'],
    test_size=.8,
    shuffle=True)

In [122]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

print(f'f1 for train - {f1_score(y_train, log_reg.predict(X_train))}')
print(f'f1 for test - {f1_score(y_test, log_reg.predict(X_test))}')

print(f'roc_auc for train - {roc_auc_score(y_train, log_reg.predict(X_train))}')
print(f'roc_auc for test - {roc_auc_score(y_test, log_reg.predict(X_test))}')

f1 for train - 0.1423217550274223
f1 for test - 0.14400324972072712
roc_auc for train - 0.5355523230582288
roc_auc for test - 0.5359898280538264


In [124]:
xgcl = XGBClassifier(n_estimators=600)
xgcl.fit(X_train, y_train)

print(f'f1 for train - {f1_score(y_train, xgcl.predict(X_train))}')
print(f'f1 for test - {f1_score(y_test, xgcl.predict(X_test))}')

print(f'roc_auc for train - {roc_auc_score(y_train, xgcl.predict(X_train))}')
print(f'roc_auc for test - {roc_auc_score(y_test, xgcl.predict(X_test))}')

f1 for train - 0.631274729055565
f1 for test - 0.19199821642108678
roc_auc for train - 0.7310663588510085
roc_auc for test - 0.5428910102026354
