## Imports and dataset

In [1]:
from src.models.conjugate_bayes_lin_reg import NormalInverseGammaPriorLinearRegression

from src.attacks.distr_attacks import mlmc_attack, kl_div, kl_to_appd

import numpy as np
import torch
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from joblib import Parallel, delayed

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling: seaborn theme and context first, then explicit
# matplotlib rcParams overrides applied on top of them.
sns.set_theme(style="whitegrid", palette="muted", font="serif")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

rc_overrides = {
    # Title and label text
    'axes.titlesize': 18,
    'axes.titleweight': 'bold',
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # Axes frame
    'axes.edgecolor': 'black',
    'axes.linewidth': 1,
    # Grid
    'grid.alpha': 0.5,
    'grid.linestyle': '--',
    # Legend
    'legend.fontsize': 12,
    'legend.frameon': False,
    # Figure resolution
    'figure.dpi': 300,
}
plt.rcParams.update(rc_overrides)

In [None]:
# Seed the NumPy and PyTorch global random number generators with one shared
# value so that repeated runs draw the same random numbers.
# NOTE(review): work dispatched to joblib worker processes may not be covered
# by this seeding -- confirm if exact reproducibility of the attacks matters.
seed = 42
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(seed)

# Comparison of 3 datasets: wine, energy and housing

In [4]:
# Energy dataset (ENB2012), read directly from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
data = pd.read_excel(url)

# Column layout: everything except the final two columns is a covariate; the
# final two columns are the targets (Heating Load, then Cooling Load).
# Only Heating Load is used as the response here.
n_target_cols = 2
X = data.iloc[:, :-n_target_cols].to_numpy()
y = data.iloc[:, -n_target_cols].to_numpy()

# Divide the response by 4 so that it is on a scale similar to the targets of
# the other datasets.
y_energy = y / 4.0

# Min-max scale every covariate column.
scaler = MinMaxScaler()
X_energy = scaler.fit(X).transform(X)

In [5]:
# White-wine quality dataset from the UCI Machine Learning Repository:
# 11 features and 4898 samples, with the quality score (last column) as the
# response.
# NOTE(review): the original comment listed the scores as {3, 4, 5, 6, 7, 8};
# the white-wine file appears to contain score 9 as well -- confirm with
# np.unique(y_wine).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# The file is semicolon-separated; read it straight into a DataFrame.
data = pd.read_csv(url, sep=";")

# Last column is the response, all preceding columns are covariates.
feature_cols, target_col = data.columns[:-1], data.columns[-1]
X = data[feature_cols].to_numpy()
y_wine = data[target_col].to_numpy()

# Min-max scale every covariate column.
scaler = MinMaxScaler()
X_wine = scaler.fit(X).transform(X)

In [6]:
# California housing dataset, fetched through scikit-learn as pandas objects.
california_housing = fetch_california_housing(as_frame=True)

# Covariates and response as plain NumPy arrays.
X = california_housing["data"].to_numpy()
y_housing = california_housing["target"].to_numpy()

# Min-max scale every covariate column.
X_housing = MinMaxScaler().fit(X).transform(X)

In [None]:
# For all 3 datasets: fit the model on 10 random train/test splits and, for
# every epsilon, average over the evaluated test points the value returned by
# `kl_to_appd` for the (possibly attacked) input. epsilon = 0 means no attack.
# Outcome: `results[d][s][e]` is that average for dataset d, split s, epsilon e.
datasets = ['energy', 'housing', 'wine']
epsilons = [0, 0.2, 0.5]

# Explicit name -> arrays lookup; an unknown dataset name raises KeyError
# instead of silently reusing the previous dataset's arrays.
dataset_arrays = {
    'energy': (X_energy, y_energy),
    'wine': (X_wine, y_wine),
    'housing': (X_housing, y_housing),
}

n_splits = 10
# Number of test points evaluated per split. Per the original note, 88 was the
# number of parallel jobs available, so each epsilon is a single wave of tasks.
n_test_points = 88


def _attack_kl(model, x_row, epsilon):
    """Return the `kl_to_appd` value for one test point at a given epsilon.

    Args:
        model: Fitted NormalInverseGammaPriorLinearRegression.
        x_row: 1-D array with the covariates of a single test point.
        epsilon: Attack budget passed to `mlmc_attack`; 0 skips the attack and
            evaluates the unperturbed point.

    Returns:
        Python float obtained from `kl_to_appd(...).item()`.
    """
    x = torch.tensor(x_row.copy(), dtype=torch.float32).unsqueeze(1)
    std = model.sample_predictive_distribution(x, 1000).std()

    # Target distribution of the attack, defined ONCE and reused for both the
    # attack and the evaluation.
    # NOTE(review): the original code built the attack target with mean
    # `2 + x.T @ model.mu` but evaluated `kl_to_appd` against
    # `2 * x.T @ model.mu`, i.e. a different target than the one attacked.
    # The additive form is kept here; if the multiplicative one was intended,
    # change `target_mean` only.
    target_mean = 2 + x.T @ model.mu
    target_std = 2 * std
    appd = torch.distributions.normal.Normal(target_mean, target_std)

    if epsilon == 0:
        x_adv = x.clone().detach()
    else:
        x_adv, _ = mlmc_attack(model, x, appd, epsilon=epsilon, verbose=False, R=10, lr=0.01, n_iter=800)

    # Mean of the second component of 1000 posterior draws, used as sigma^2.
    sigma2 = model.sample_posterior_distribution(1000)[1].mean()
    return kl_to_appd(model.mu, model.lam, sigma2, x_adv, target_mean, target_std ** 2).item()


results = []

for dataset in datasets:
    print(f'Running dataset {dataset}')
    X, y = dataset_arrays[dataset]

    results_dataset = []
    for split_seed in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed)

        n_features = X_train.shape[1]
        model = NormalInverseGammaPriorLinearRegression(prior_params={
            'mu': torch.zeros(n_features),
            'lam': torch.eye(n_features),
            'a': torch.tensor([10]),
            'b': torch.tensor([1])}
            )
        data = {'X': torch.tensor(X_train, dtype=torch.float32), 'y': torch.tensor(y_train, dtype=torch.float32)}
        model.fit(data)

        # Only the first `n_test_points` test points are evaluated.
        X_test = X_test[:n_test_points, :]
        y_test = y_test[:n_test_points]

        res_it = []
        for epsilon in epsilons:
            # One task per test point; model and epsilon are passed explicitly
            # so the worker function does not depend on loop-scoped globals.
            kl_values = Parallel(n_jobs=-1)(
                delayed(_attack_kl)(model, x_row, epsilon) for x_row in X_test
            )
            res_it.append(sum(kl_values) / X_test.shape[0])
        results_dataset.append(res_it)
    results.append(results_dataset)

In [None]:
# Table of means: average `results` over its axis 1 (the train/test splits),
# round to 3 decimals, and show one row per dataset and one column per epsilon.
results_array = np.asarray(results).mean(axis=1).round(3)
results_df = pd.DataFrame(data=results_array, index=datasets, columns=epsilons)
results_df

In [None]:
# Table of spreads: twice the standard deviation of `results` over its axis 1
# (the train/test splits), rounded to 3 decimals, with one row per dataset and
# one column per epsilon.
results_array = (np.asarray(results).std(axis=1) * 2).round(3)
results_df = pd.DataFrame(data=results_array, index=datasets, columns=epsilons)
results_df