# DO NOT RUN THIS NOTEBOOK ON A LOCAL MACHINE!

In [None]:
import random 
from types import SimpleNamespace

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import optuna
import itertools

import warnings
warnings.filterwarnings("ignore")

In [None]:
import sys
import os
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# `ml.*` imports below can be resolved from there.
parent = Path(os.path.abspath("")).resolve().parents[0]
# sys.path stores plain strings, so the membership test must use the string
# form: a Path object never compares equal to a str, which would make the
# check always succeed and add a duplicate entry each time the cell is re-run.
if str(parent) not in sys.path:
    sys.path.insert(0, str(parent))
    
from ml.utils.data_utils import TorchDataset
from ml.utils.metrics import get_classification_metrics, get_probability_measures, get_lift_demotion_scores

from ml.models.autoencoder import AutoEncoder
from ml.models.mlp import MLP

In [None]:
# Seed every random number generator this notebook touches (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value, 0.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(0)

# cuDNN: disable the benchmark autotuner and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
def merge_strings_and_integers(val):
    """Coerce ``val`` to ``int``, collapsing unconvertible values into 'other'.

    Args:
        val: Any value; typically one cell of a column that mixes integer-like
            entries with free-form strings or missing values.

    Returns:
        ``int(val)`` when the conversion succeeds, otherwise the string
        ``'other'`` (e.g. for non-numeric strings, ``None``, NaN or infinity).
    """
    try:
        return int(val)
    # Only conversion failures map to 'other'. A bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return 'other'

In [None]:
# Load the feature table, discard three MODE(...) columns that are not used,
# and keep only rows that have a value for `number_of_zeros`.
df = (
    pd.read_csv("final_autof.csv")
    .drop(columns=[
        'MODE(consumptions.MS_METER_NBR)',
        'MODE(representations.SUPPLIER)',
        'MODE(representations.SUPPLIER_TO)',
    ])
    .dropna(subset=['number_of_zeros'])
)

# BS_RATE entries that cannot be converted to int are collapsed into 'other'.
df['MODE(consumptions.BS_RATE)'] = df['MODE(consumptions.BS_RATE)'].apply(merge_strings_and_integers)

# Remove the record id first, then drop rows that are exact duplicates.
df = df.drop(columns=['rec_id']).drop_duplicates()

# A REQUEST_TYPE of 0 is relabelled as the category 'unknown'.
df['MODE(requests.REQUEST_TYPE)'] = df['MODE(requests.REQUEST_TYPE)'].replace(0, 'unknown')

# One-hot encode the two categorical columns, one after the other, using the
# column name itself as the dummy-column prefix.
for _categorical in ('MODE(requests.REQUEST_TYPE)', 'MODE(consumptions.BS_RATE)'):
    df = pd.get_dummies(df, columns=[_categorical], prefix=_categorical)

# Drop `voltage` and replace every remaining missing value with 0.
df = df.drop(columns=['voltage']).fillna(0)
df.head()

In [None]:
# De-duplicate again on the current columns, then print how many rows have
# target == 1.
df = df.drop_duplicates()
print(int((df['target'] == 1).sum()))
df.head()

In [None]:
# Show how many rows fall into each target class.
print("Bincount of y:", df.target.value_counts())

In [None]:
# 80/20 shuffled split with a fixed seed, stratified on the target column so
# both parts keep the same class proportions.
train, val = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=df['target'].to_numpy(),
)

In [None]:
# Per-class row counts of the training and validation parts.
train['target'].value_counts(), val['target'].value_counts()

In [None]:
# Separate the label column from the feature columns for both parts.
y_train, y_val = train['target'], val['target']
X_train, X_val = train.drop(columns='target'), val.drop(columns='target')

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted transform to both the training and the validation features.
scaler = MinMaxScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [None]:
# Wrap the scaled features and the labels as float tensors in the project's
# TorchDataset, once for training and once for validation.
train_dataset = TorchDataset(
    X=torch.tensor(X_train, dtype=torch.float32),
    y=torch.tensor(y_train.to_numpy(), dtype=torch.float32),
)
val_dataset = TorchDataset(
    X=torch.tensor(X_val, dtype=torch.float32),
    y=torch.tensor(y_val.to_numpy(), dtype=torch.float32),
)

In [None]:
# Occurrences of each label value in the training and validation labels.
tuple(np.bincount(labels) for labels in (y_train, y_val))

In [None]:
# Both loaders use the same settings: batches of 512 samples, no shuffling.
train_loader, val_loader = (
    DataLoader(split, batch_size=512, shuffle=False)
    for split in (train_dataset, val_dataset)
)

In [None]:
# Candidate layer widths for the encoder search space: every non-empty
# subset of `iterable`, kept in ascending order and grouped by subset size.
iterable = [4, 8, 16, 32, 64]
combinations = [
    list(subset)
    for size in range(1, len(iterable) + 1)
    for subset in itertools.combinations(iterable, size)
]
print(combinations)

In [None]:
# Candidate layer widths for the classifier search space: every non-empty
# subset of `iterable2`, kept in ascending order and grouped by subset size.
iterable2 = [16, 32, 64, 128]
combinations2 = [
    list(subset)
    for size in range(1, len(iterable2) + 1)
    for subset in itertools.combinations(iterable2, size)
]
print(combinations2)

In [None]:
def objective(trial):
    """Optuna objective: build, train and score one AutoEncoder-based classifier.

    Draws the encoder widths (``units``), the classifier widths (``units2``),
    the learning rate and the weight decay from ``trial``, trains the model on
    ``train_loader`` and evaluates it on ``val_loader``.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The last element of what ``model.predict`` returns on the validation
        loader; this notebook treats it as the weighted score to optimise.
    """
    # Hyper-parameters are sampled in a fixed order: units, units2, lr, decay.
    enc_widths = trial.suggest_categorical('units', choices=combinations)
    clf_widths = trial.suggest_categorical('units2', choices=combinations2)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)

    n_features = X_train.shape[1]

    # Encoder: all sampled widths but the last are hidden layers; the last
    # one is the encoder's output size.
    encoder = MLP(
        in_size=n_features,
        layer_units=enc_widths[:-1],
        out_size=enc_widths[-1],
        vae=False,
    )

    # Decoder mirrors the encoder. Slicing builds a reversed copy, so the
    # list handed out by Optuna is never modified.
    dec_widths = enc_widths[::-1]
    decoder = MLP(
        in_size=dec_widths[0],
        layer_units=dec_widths[1:],
        out_size=n_features,
    )

    # Single-logit classifier head (paired with BCEWithLogitsLoss below).
    classifier = MLP(
        in_size=n_features,
        layer_units=clf_widths,
        out_size=1,
        init_weights=False,
    )
    model = AutoEncoder(encoder, decoder, classifier=classifier)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=lr, weight_decay=weight_decay
    )
    criterion = torch.nn.BCEWithLogitsLoss()
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    # Training configuration passed to the model's own fit routine.
    fit_args = SimpleNamespace(
        model=model,
        train_loader=train_loader,
        test_loader=val_loader,
        epochs=100,
        optimizer=optimizer,
        criterion=criterion,
        reconstruction=False,
        vae=False,
        device=device,
        verbose=True,
        return_best=True,
        plot_history=True,
        num_test_samples=100,
    )
    fitted = model.fit(fit_args)

    # Evaluate the model returned by fit on the validation loader.
    predict_args = SimpleNamespace(
        model=fitted,
        data_loader=val_loader,
        criterion=criterion,
        reconstruction=False,
        num_samples=100,
        device=device,
    )
    outputs = fitted.predict(predict_args)

    # The final entry of the prediction outputs is used as the trial's score.
    return outputs[-1]

In [None]:
# Run a 50-trial search that maximises the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=50)
print("  Number of finished trials: ", len(study.trials))

In [None]:
# Print the best trial's objective value and each sampled hyper-parameter.
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key in trial.params:
    value = trial.params[key]
    print("    {}: {}".format(key, value))