In [1]:
import random 
from types import SimpleNamespace

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
import sys
import os
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# `ml.*` imports below can resolve (presumably the package lives there).
parent = Path(os.path.abspath("")).resolve().parents[0]
# sys.path stores plain strings; comparing the Path object itself never
# matches, which would prepend a duplicate entry on every re-run of this cell.
if str(parent) not in sys.path:
    sys.path.insert(0, str(parent))
    
from ml.utils.data_utils import TorchDataset
from ml.utils.metrics import get_classification_metrics, get_probability_measures, get_lift_demotion_scores

from ml.models.autoencoder import AutoEncoder
from ml.models.mlp import MLP
from ml.utils.losses import FocalLoss, CustomHingeLoss, CustomBCELoss, ModifiedHuberLoss, FbetaLoss

In [3]:
# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed every random number generator used in this notebook with 0:
# Python's `random`, NumPy's legacy global RNG, and PyTorch (CPU + all GPUs).
torch.cuda.manual_seed_all(0)
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

In [4]:
def merge_strings_and_integers(val):
    """Coerce ``val`` to an ``int``, collapsing anything unconvertible to ``'other'``.

    Parameters
    ----------
    val : object
        Any value accepted by ``int()`` (numbers, numeric strings, ...).

    Returns
    -------
    int or str
        ``int(val)`` when the conversion succeeds, otherwise the literal
        string ``'other'``.
    """
    try:
        return int(val)
    # Only swallow genuine conversion failures (non-numeric strings, None,
    # NaN, infinities). A bare `except:` would also hide KeyboardInterrupt
    # and SystemExit while this runs inside a long `.apply`.
    except (TypeError, ValueError, OverflowError):
        return 'other'

In [5]:
# Load the feature table and clean it in one top-to-bottom pipeline.
# Every step returns a new frame, so nothing is mutated in place.
df = (
    pd.read_csv("final_autof.csv")
    # Remove three MODE(...) columns that are not kept as features.
    .drop(columns=[
        'MODE(consumptions.MS_METER_NBR)',
        'MODE(representations.SUPPLIER)',
        'MODE(representations.SUPPLIER_TO)',
    ])
    # Rows without a `number_of_zeros` value are discarded.
    .dropna(subset=['number_of_zeros'])
    # Normalise BS_RATE: integer-like values become ints, the rest 'other'.
    .assign(**{
        'MODE(consumptions.BS_RATE)':
            lambda frame: frame['MODE(consumptions.BS_RATE)'].apply(merge_strings_and_integers),
    })
    # Drop the record id before de-duplicating the remaining columns.
    .drop(columns=['rec_id'])
    .drop_duplicates()
    # A REQUEST_TYPE of 0 is relabelled as the category 'unknown'.
    .assign(**{
        'MODE(requests.REQUEST_TYPE)':
            lambda frame: frame['MODE(requests.REQUEST_TYPE)'].replace(0, 'unknown'),
    })
    # One-hot encode the two categorical columns (REQUEST_TYPE first, then BS_RATE).
    .pipe(pd.get_dummies,
          columns=['MODE(requests.REQUEST_TYPE)'],
          prefix='MODE(requests.REQUEST_TYPE)')
    .pipe(pd.get_dummies,
          columns=['MODE(consumptions.BS_RATE)'],
          prefix='MODE(consumptions.BS_RATE)')
    .drop(columns=['voltage'])
    # Any value still missing is filled with 0.
    .fillna(0)
)
df.head()

  df = pd.read_csv("final_autof.csv")


Unnamed: 0,COUNT(consumptions),MAX(consumptions.CSS_MS_HS_USE),MEAN(consumptions.CSS_MS_HS_USE),MIN(consumptions.CSS_MS_HS_USE),NUM_UNIQUE(consumptions.BS_RATE),NUM_UNIQUE(consumptions.MS_METER_NBR),SKEW(consumptions.CSS_MS_HS_USE),STD(consumptions.CSS_MS_HS_USE),SUM(consumptions.CSS_MS_HS_USE),MODE(consumptions.DAY(MEASUREMENT_DATE)),...,MODE(consumptions.BS_RATE)_40,MODE(consumptions.BS_RATE)_41,MODE(consumptions.BS_RATE)_42,MODE(consumptions.BS_RATE)_43,MODE(consumptions.BS_RATE)_44,MODE(consumptions.BS_RATE)_52,MODE(consumptions.BS_RATE)_53,MODE(consumptions.BS_RATE)_54,MODE(consumptions.BS_RATE)_55,MODE(consumptions.BS_RATE)_other
0,4.0,135.0,64.25,12.0,1.0,1.0,0.758461,54.389797,257.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,10.0,127.0,51.9,0.0,1.0,2.0,0.783315,43.072162,519.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,12.0,6068.0,3924.166667,2446.0,1.0,1.0,0.574519,1012.516472,47090.0,28.0,...,0,0,0,0,0,0,0,0,0,0
3,6.0,1061.0,479.333333,7.0,2.0,1.0,0.020392,409.340282,2876.0,4.0,...,0,0,0,0,0,0,0,0,0,0
4,24.0,1247.0,536.833333,0.0,2.0,1.0,0.537625,335.364392,12884.0,4.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# De-duplicate once more, then report how many rows carry target == 1.
df = df.drop_duplicates()
print(len(df[df['target'] == 1]))
df.head()

1552


Unnamed: 0,COUNT(consumptions),MAX(consumptions.CSS_MS_HS_USE),MEAN(consumptions.CSS_MS_HS_USE),MIN(consumptions.CSS_MS_HS_USE),NUM_UNIQUE(consumptions.BS_RATE),NUM_UNIQUE(consumptions.MS_METER_NBR),SKEW(consumptions.CSS_MS_HS_USE),STD(consumptions.CSS_MS_HS_USE),SUM(consumptions.CSS_MS_HS_USE),MODE(consumptions.DAY(MEASUREMENT_DATE)),...,MODE(consumptions.BS_RATE)_40,MODE(consumptions.BS_RATE)_41,MODE(consumptions.BS_RATE)_42,MODE(consumptions.BS_RATE)_43,MODE(consumptions.BS_RATE)_44,MODE(consumptions.BS_RATE)_52,MODE(consumptions.BS_RATE)_53,MODE(consumptions.BS_RATE)_54,MODE(consumptions.BS_RATE)_55,MODE(consumptions.BS_RATE)_other
0,4.0,135.0,64.25,12.0,1.0,1.0,0.758461,54.389797,257.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,10.0,127.0,51.9,0.0,1.0,2.0,0.783315,43.072162,519.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,12.0,6068.0,3924.166667,2446.0,1.0,1.0,0.574519,1012.516472,47090.0,28.0,...,0,0,0,0,0,0,0,0,0,0
3,6.0,1061.0,479.333333,7.0,2.0,1.0,0.020392,409.340282,2876.0,4.0,...,0,0,0,0,0,0,0,0,0,0
4,24.0,1247.0,536.833333,0.0,2.0,1.0,0.537625,335.364392,12884.0,4.0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Class balance of the full data set.
print("Bincount of y:", df.target.value_counts())

Bincount of y: 0.0    698967
1.0      1552
Name: target, dtype: int64


In [8]:
# 80/20 shuffled split, stratified on the target so both parts keep the
# same class ratio; fixed random_state for repeatability.
train, val = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    stratify=df['target'].values,
    random_state=42,
)

In [9]:
# Per-class row counts of the training and validation parts.
tuple(split.target.value_counts() for split in (train, val))

(0.0    559173
 1.0      1242
 Name: target, dtype: int64,
 0.0    139794
 1.0       310
 Name: target, dtype: int64)

In [10]:
# Separate the feature columns from the `target` label for each split.
X_train, y_train = train.drop(columns=['target']), train['target']
X_val, y_val = val.drop(columns=['target']), val['target']

In [11]:
# Fit the min-max scaler on the training features only, then apply the
# same transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [12]:
# Wrap the scaled features and the target labels as float32 tensors.
train_dataset = TorchDataset(
    X=torch.tensor(X_train, dtype=torch.float32),
    y=torch.tensor(y_train.to_numpy(), dtype=torch.float32),
)
val_dataset = TorchDataset(
    X=torch.tensor(X_val, dtype=torch.float32),
    y=torch.tensor(y_val.to_numpy(), dtype=torch.float32),
)

In [13]:
# Occurrences of each label value in the training and validation targets.
tuple(np.bincount(labels) for labels in (y_train, y_val))

(array([559173,   1242]), array([139794,    310]))

In [14]:
# Batches of 512 rows, served in dataset order for both splits.
# NOTE(review): the training loader does not reshuffle between epochs
# (shuffle=False) -- confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False)

In [15]:
# Encoder: input features -> 64 hidden units -> 32-dimensional code (plain, non-VAE).
encoder = MLP(in_size=X_train.shape[1], layer_units=[64], out_size=32, vae=False)

# Decoder: 32-dimensional code -> 64 hidden units -> input width.
decoder = MLP(in_size=32, layer_units=[64], out_size=X_train.shape[1])

# Classifier head with a single output unit.
# NOTE(review): its in_size is the raw feature width, not the 32-dim code --
# confirm against how AutoEncoder wires the classifier.
classifier = MLP(
    in_size=X_train.shape[1],
    layer_units=[32, 64],
    out_size=1,
    init_weights=False,
)

# Passing a classifier makes this an AutoEncoder-based classifier.
model = AutoEncoder(encoder, decoder, classifier=classifier)
model

AutoEncoder(
  (encoder): MLP(
    (MLP): Sequential(
      (0): Linear(in_features=88, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=32, bias=True)
    )
  )
  (decoder): MLP(
    (MLP): Sequential(
      (0): Linear(in_features=32, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=88, bias=True)
    )
  )
  (classifier): MLP(
    (MLP): Sequential(
      (0): Linear(in_features=88, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=1, bias=True)
    )
  )
)

In [16]:
# Ratio of label-0 to label-1 rows in the training targets.
# NOTE(review): `alpha` is not referenced by any later cell visible here
# (FbetaLoss() is constructed without arguments) -- confirm whether it is still needed.
train_bins = np.bincount(y_train)
alpha = train_bins[0] / train_bins[1]
alpha

450.219806763285

In [17]:
# Train on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = FbetaLoss()  # previously tried: torch.nn.BCEWithLogitsLoss()

# Settings handed to `model.fit` in the next cell.
args = {
    'model': model,
    'train_loader': train_loader,
    'test_loader': val_loader,
    'epochs': 100,
    'optimizer': optimizer,
    'criterion': criterion,
    'reconstruction': False,
    'vae': False,
    'device': device,
    'verbose': True,
    'return_best': True,
    'plot_history': True,
    'num_test_samples': 100,
}

cuda


In [18]:
# Convert the settings dict to an attribute-style namespace before fitting.
# Guarded so the cell can be re-run (e.g. after an interrupted training run):
# once `args` is already a SimpleNamespace, `**args` would raise TypeError.
if isinstance(args, dict):
    args = SimpleNamespace(**args)
model = model.fit(args)

Epoch: 1/100
[Train]	Loss=0.9957063997717208, Accuracy: 0.0022162147694119536, Precision: 0.0022162147694119536, Recall: 1.0, F1: 0.004422628045230452
		Brier score loss: 0.6583152773986325, ROC-AUC: 0.744742505389537, PR-AUC: 0.006793969079924019
		Lift: 0.02, Demotion: 1.0, Weighted Score: 0.314
[Test]	Loss=0.9939190149307251, Accuracy: 0.0022126420373436875, Precision: 0.0022126420373436875, Recall: 1.0, F1: 0.004415514122523396
		Brier score loss: 0.6583155309501787, ROC-AUC: 0.7331931847183436, PR-AUC: 0.007179540272748152
		Lift: 0.0, Demotion: 1.0, Weighted Score: 0.3

Epoch: 2/100
[Train]	Loss=0.9940776368254396, Accuracy: 0.0022162147694119536, Precision: 0.0022162147694119536, Recall: 1.0, F1: 0.004422628045230452
		Brier score loss: 0.7376924379944031, ROC-AUC: 0.7933679386420076, PR-AUC: 0.009392615594654104
		Lift: 0.02, Demotion: 1.0, Weighted Score: 0.314
[Test]	Loss=0.9846575856208801, Accuracy: 0.0022126420373436875, Precision: 0.0022126420373436875, Recall: 1.0, F1: 0

KeyboardInterrupt: 

In [None]:
# Preview the cleaned frame before extracting latent features.
df.head()

In [None]:
# Work on a copy so the cells below do not modify `df`.
tmp_df = df.copy()

In [None]:
def get_loss_and_hidden(row, model):
    """Return the classification loss and the latent code for one data row.

    Parameters
    ----------
    row : pandas.Series
        A single row holding the feature values plus a ``target`` entry.
    model : torch.nn.Module
        Model exposing ``encode(tensor)`` and a forward call; it is switched
        to eval mode and moved to the CPU as a side effect.

    Returns
    -------
    tuple[float, list]
        The binary-cross-entropy-with-logits loss between the first element
        of the model output and the row's target, and the output of
        ``model.encode`` converted to a (nested) Python list.
    """
    model.eval()
    model.to('cpu')  # inference below runs on CPU tensors

    features = row.drop('target')
    input_tensor = torch.tensor(features.values, dtype=torch.float32)
    target = torch.tensor(row.target, dtype=torch.float32)

    # Pure inference: disable autograd so no computation graph is built
    # (and kept alive) for every row this function is applied to.
    with torch.no_grad():
        encoded = model.encode(input_tensor)
        decoded = model(input_tensor)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(decoded[0], target)

    return loss.item(), encoded.detach().numpy().tolist()

In [None]:
# Rebuild a frame of scaled features (using the scaler fitted earlier)
# with the unscaled target appended as the last column.
cols = df.columns.tolist()
cols.remove("target")

X = scaler.transform(tmp_df.drop(columns=['target']))
y = tmp_df['target'].values

tmp = pd.DataFrame(X, columns=cols).assign(target=y)

In [None]:
# Attach the encoder's latent code to every row as a `hidden` list column.
# The rows are encoded in batches: the previous row-wise `apply` ran two
# forward passes per row and computed a loss that was dropped immediately.
model.eval()
model.to('cpu')

# Dropping `hidden` (if present) keeps the cell re-runnable.
feature_tensor = torch.tensor(
    tmp.drop(columns=['target', 'hidden'], errors='ignore').to_numpy(),
    dtype=torch.float32,
)

with torch.no_grad():
    # Chunked to bound peak memory; the chunk size does not affect the values.
    latent = torch.cat([model.encode(chunk) for chunk in feature_tensor.split(4096)])

tmp['hidden'] = pd.Series(latent.numpy().tolist(), index=tmp.index)

In [None]:
# Preview the frame with the new `hidden` column.
tmp.head()

In [None]:
# Number of rows that received a latent code.
len(tmp)

In [None]:
# NOTE(review): repeats the preview from two cells above -- candidate for removal.
tmp.head()

In [None]:
# Expand the `hidden` list column into one numeric column per latent dimension
# (t_hidden_1 .. t_hidden_N). Building the frame from the list of lists avoids
# constructing a pd.Series per row, and the width is read from the data
# instead of being hard-coded.
latent_dim = len(tmp['hidden'].iloc[0])
tmp2 = pd.DataFrame(
    tmp['hidden'].tolist(),
    columns=[f't_hidden_{i}' for i in range(1, latent_dim + 1)],
)

In [None]:
# Place the latent columns next to the scaled features, aligned on the index.
tmp3 = pd.concat([tmp, tmp2], axis="columns")

In [None]:
# Preview the combined frame (features, target, `hidden`, latent columns).
tmp3.head()

In [None]:
# The list-valued `hidden` column is redundant once it has been expanded.
# errors='ignore' keeps the cell re-runnable: an in-place drop raises
# KeyError the second time because the column is already gone.
tmp3 = tmp3.drop(columns=['hidden'], errors='ignore')

In [None]:
# Preview again after removing the `hidden` column.
tmp3.head()

In [None]:
# Keep only the 32 latent columns t_hidden_1 .. t_hidden_32, in order.
final = tmp3[[f't_hidden_{i}' for i in range(1, 33)]]

In [None]:
# Sanity check: the latent table should have one row per row of `df`.
len(df), len(final)

In [None]:
#final.to_csv("autoencoder_classifier_loss_encoded.csv", index=False)