In [14]:
import argparse
import datetime
import os
import random
import signal
import sys
import warnings
from collections import Counter
from functools import partial
from pathlib import Path
from types import FrameType
from typing import Any

import dill
import multiprocess
import numpy as np
import pandas as pd
import torch
from synth_xai.explanations.explanation_utils import (
    evaluate_bb,
    find_top_closest_rows,
    get_test_data,
    is_explainer_supported,
    label_synthetic_data,
    load_bb,
    load_synthetic_data,
    make_predictions,
    prepare_neighbours,
    setup_wandb,
    transform_input_data,
)
from loguru import logger
from multiprocess import Pool
from scipy.stats import sem
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import (
    MinMaxScaler,
)
from synth_xai.utils import (
    prepare_adult,
    prepare_breast_cancer,
    prepare_covertype,
    prepare_diabetes,
    prepare_dutch,
    prepare_house16,
    prepare_letter,
    prepare_pima,
    prepare_shuttle,
)

from synth_xai.bb_architectures import MultiClassModel, SimpleModel
from synth_xai.explanations.explainer_model import ExplainerModel

In [15]:
# Load the explanations stored in the "logistic_tvae_100000_2500_1" artifact of
# the Dutch dataset (presumably produced by a logistic surrogate — TODO confirm).
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load artifacts that come from a trusted run.
store_path = "../artifacts/dutch/explanations/logistic_tvae_100000_2500_1.pkl"
with Path(store_path).open("rb") as f:
    logistic_explanations = dill.load(f)

In [16]:
# Display the first stored entry. The recorded output is a 2-tuple: a list of
# "feature: coefficient=..., value=..." strings followed by an integer
# (presumably the predicted class — TODO confirm).
logistic_explanations[0]

(['sex_binary: coefficient=-92.56249402713907, value=0',
  'edu_level: coefficient=-48.93522179546129, value=3',
  'citizenship: coefficient=17.84497095300212, value=1',
  'Marital_status: coefficient=-17.20016342561355, value=2',
  'age: coefficient=-14.963362112520459, value=7',
  'prev_residence_place: coefficient=-14.65777058946945, value=1',
  'country_birth: coefficient=8.140114941576273, value=1',
  'economic_status: coefficient=6.505416047619395, value=111',
  'household_size: coefficient=-5.8086956664258915, value=113',
  'cur_eco_activity: coefficient=-2.5422864730615435, value=138',
  'household_position: coefficient=0.5672736517644529, value=1122'],
 1)

In [17]:
# Load the explanations stored in the "dt_tvae_100000_2500_1" artifact of the
# Dutch dataset (presumably produced by a decision-tree surrogate — TODO confirm).
# Note that this rebinds `store_path`, which an earlier cell also assigns.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load artifacts that come from a trusted run.
store_path = "../artifacts/dutch/explanations/dt_tvae_100000_2500_1.pkl"
with Path(store_path).open("rb") as f:
    dt_explanations = dill.load(f)

In [18]:
# Display the first stored entry. The recorded output is a 2-tuple: a list of
# threshold-comparison strings ending in a "Leaf node ... prediction" line,
# followed by an integer (presumably the predicted class — TODO confirm).
dt_explanations[0]

(['(edu_level = 3) <= 4.5',
  '(sex_binary = 0) <= 0.5',
  '(prev_residence_place = 1) <= 1.5',
  'Leaf node 3 reached, prediction: 1'],
 1)

In [19]:
# Load the explanations stored in the "knn_tvae_100000_2500_1" artifact of the
# Dutch dataset (presumably produced by a k-nearest-neighbours surrogate — TODO confirm).
# Note that this rebinds `store_path`, which earlier cells also assign.
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# load artifacts that come from a trusted run.
store_path = "../artifacts/dutch/explanations/knn_tvae_100000_2500_1.pkl"
with Path(store_path).open("rb") as f:
    knn_explanations = dill.load(f)

In [20]:
# Display the first stored entry. The recorded output is a 2-tuple: a list of
# strings (the KNN prediction plus one "Index / distance / label / sample"
# line per neighbour) followed by an integer (presumably the predicted
# class — TODO confirm).
knn_explanations[0]

(['KNN prediction: 1',
  'Nearest neighbors (index, distance, label):',
  'Index: 134, distance: 0.0000, label: 1, sample: [   7 1122  113    1    1    1    3  111  138    2    0]',
  'Index: 25, distance: 0.0000, label: 1, sample: [   7 1122  113    1    1    1    3  111  138    2    0]',
  'Index: 40, distance: 0.0000, label: 1, sample: [   7 1122  113    1    1    1    3  111  138    2    0]'],
 1)

In [7]:
# Rebuild the Dutch train/test split. prepare_dutch returns eight values; only
# the last two (the training frame and the test frame) are needed here.
file_path = Path("./synth_xai/explanations/")
_, _, _, _, _, _, train_df, test_data = prepare_dutch(
    sweep=False, seed=42, current_path=file_path
)

# Load the black-box checkpoint through a path relative to the notebook, using
# the same "../artifacts/dutch/..." layout as the explanation-loading cells,
# rather than an absolute path inside one user's home directory.
bb = load_bb("../artifacts/dutch/bb/dutch_BB.pth")

In [8]:
# Turn the train/test frames into the three values bound to `x`, `y` and
# `scaler`, using "occupation_binary" as the outcome column.
# NOTE(review): x / y are indexed by test-row position in a later cell, so they
# presumably hold the transformed test features and labels — confirm against
# transform_input_data.
x, y, scaler = transform_input_data(train_data=train_df, test_data=test_data, outcome_variable="occupation_binary")
# Evaluate the black box on (x, y); the recorded log line reports accuracy and F1.
outputs = evaluate_bb(x, y, bb)

2025-05-26 12:32:27.515 | INFO     | synth_xai.explanations.explanation_utils:evaluate_bb:217 - Accuracy: 0.832092022509103 - F1: 0.8321849875595729


In [9]:
# Query the black box once per test row, one sample at a time.
num_samples = len(test_data)
for index in range(num_samples):
    # NOTE(review): `sample` is assigned but never read inside this loop.
    sample = test_data.iloc[[index]]
    # Wrapping the single row / label in a list gives each tensor a leading
    # dimension of size 1.
    x_sample = torch.tensor([x[index]])
    y_sample = torch.tensor([y[index]])
    # NOTE(review): this name is rebound on every iteration, so only the
    # prediction for the last row is still available after the loop.
    sample_pred_bb = make_predictions(x_sample, y_sample, bb)

In [12]:
# Show the first test row (position 0) as a {column: {index_label: value}} dict.
# Its recorded values match the `value=` fields printed for logistic_explanations[0].
test_data.iloc[[0]].to_dict()

{'age': {18273: 7},
 'household_position': {18273: 1122},
 'household_size': {18273: 113},
 'prev_residence_place': {18273: 1},
 'citizenship': {18273: 1},
 'country_birth': {18273: 1},
 'edu_level': {18273: 3},
 'economic_status': {18273: 111},
 'cur_eco_activity': {18273: 138},
 'Marital_status': {18273: 2},
 'sex_binary': {18273: 0},
 'occupation_binary': {18273: 1}}

In [21]:
# Show the test row at position 134, the first neighbour index listed in
# knn_explanations[0].
# NOTE(review): the recorded values here differ from the neighbour sample
# printed in that explanation, so the KNN "Index" presumably refers to a
# different table than test_data (e.g. the synthetic neighbours) — confirm.
test_data.iloc[[134]].to_dict()

{'age': {781: 11},
 'household_position': {781: 1121},
 'household_size': {781: 112},
 'prev_residence_place': {781: 1},
 'citizenship': {781: 1},
 'country_birth': {781: 1},
 'edu_level': {781: 3},
 'economic_status': {781: 111},
 'cur_eco_activity': {781: 135},
 'Marital_status': {781: 2},
 'sex_binary': {781: 0},
 'occupation_binary': {781: 1}}

In [None]:
"""
Example code of a simple RNN, GRU, LSTM on the MNIST dataset.

Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
*    2020-05-09 Initial coding
*    2022-12-16 Updated with more detailed comments, docstrings to functions, and checked code still functions as intended.

"""

# Imports
import torch
import torch.nn.functional as F  # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets  # Standard datasets
import torchvision.transforms as transforms  # Transformations we can perform on our dataset for augmentation
from torch import optim  # For optimizers like SGD, Adam, etc.
from torch import nn  # All neural network modules
from torch.utils.data import (
    DataLoader,
)  # Gives easier dataset managment by creating mini batches etc.
from tqdm import tqdm  # For a nice progress bar!

# Set device
# Use the GPU when CUDA is available, otherwise fall back to the CPU. The
# model, the batches and the initial recurrent states are all moved here.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters
input_size = 28  # features per time step, passed to the recurrent layers
hidden_size = 256  # width of the recurrent hidden state
num_layers = 2  # number of stacked recurrent layers
num_classes = 10  # output size of the final linear layer
sequence_length = 28  # time steps per sample; sizes the linear head (hidden_size * sequence_length)
learning_rate = 0.005  # learning rate given to the Adam optimizer
batch_size = 64  # mini-batch size for both data loaders
num_epochs = 3  # number of full passes over the training loader

# Recurrent neural network (many-to-one)
class RNN(nn.Module):
    """Vanilla RNN classifier that reads a whole sequence and emits one set of logits.

    The outputs of every time step are flattened and fed to a single linear
    layer, so the model only accepts sequences of one fixed length.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output logits.
        seq_len: Number of time steps in every input sequence. When omitted,
            the notebook-level ``sequence_length`` constant is used, which is
            what the four-argument call has always relied on.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible fallback to the notebook's global hyperparameter.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # The linear head consumes the concatenated outputs of all time steps.
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq_len, input_size)."""
        # Initial hidden state, allocated on the input's own device and dtype so
        # the module does not depend on a global ``device`` matching where the
        # model actually lives.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype
        )

        # Forward propagate the RNN; out has shape (batch, seq_len, hidden_size).
        out, _ = self.rnn(x, h0)
        # Flatten the outputs of all time steps into one feature vector per sample.
        out = out.reshape(out.shape[0], -1)

        # Map the flattened sequence features to class logits.
        return self.fc(out)


# Recurrent neural network with GRU (many-to-one)
class RNN_GRU(nn.Module):
    """GRU classifier that reads a whole sequence and emits one set of logits.

    The outputs of every time step are flattened and fed to a single linear
    layer, so the model only accepts sequences of one fixed length.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked GRU layers.
        num_classes: Number of output logits.
        seq_len: Number of time steps in every input sequence. When omitted,
            the notebook-level ``sequence_length`` constant is used, which is
            what the four-argument call has always relied on.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible fallback to the notebook's global hyperparameter.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # The linear head consumes the concatenated outputs of all time steps.
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq_len, input_size)."""
        # Initial hidden state (a GRU has no cell state), allocated on the
        # input's own device and dtype instead of a global ``device``.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype
        )

        # Forward propagate the GRU; out has shape (batch, seq_len, hidden_size).
        out, _ = self.gru(x, h0)
        # Flatten the outputs of all time steps into one feature vector per sample.
        out = out.reshape(out.shape[0], -1)

        # Map the flattened sequence features to class logits.
        return self.fc(out)


# Recurrent neural network with LSTM (many-to-one)
class RNN_LSTM(nn.Module):
    """LSTM classifier that reads a whole sequence and emits one set of logits.

    The outputs of every time step are flattened and fed to a single linear
    layer, so the model only accepts sequences of one fixed length.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        seq_len: Number of time steps in every input sequence. When omitted,
            the notebook-level ``sequence_length`` constant is used, which is
            what the four-argument call has always relied on.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible fallback to the notebook's global hyperparameter.
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # The linear head consumes the concatenated outputs of all time steps.
        self.fc = nn.Linear(hidden_size * seq_len, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq_len, input_size)."""
        # Initial hidden and cell states, allocated on the input's own device
        # and dtype so the module does not depend on a global ``device``.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        # Forward propagate the LSTM; out has shape (batch, seq_len, hidden_size).
        out, _ = self.lstm(x, (h0, c0))
        # Flatten the outputs of all time steps into one feature vector per sample.
        out = out.reshape(out.shape[0], -1)

        # Map the flattened sequence features to class logits.
        return self.fc(out)


# Load Data
# MNIST train / test splits, downloaded into "dataset/" when missing and
# converted to tensors by ToTensor.
train_dataset = datasets.MNIST(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
# Mini-batch iterators over both splits.
# NOTE(review): the test loader is shuffled too; evaluation below only sums
# correct predictions, so batch order should not change the reported accuracy.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

# Initialize network (try out just using simple RNN, or GRU, and then compare with LSTM)
# Build the LSTM variant with the hyperparameters above and move it to `device`.
model = RNN_LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
# Cross-entropy over the model's raw class scores, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train Network
# Run `num_epochs` passes over the training loader, one optimizer step per batch.
for epoch in range(num_epochs):
    # NOTE(review): `batch_idx` is not used inside the loop body.
    for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
        # Move the batch to `device` and drop dimension 1 (presumably the
        # singleton image channel, leaving (batch, 28, 28) — TODO confirm).
        data = data.to(device=device).squeeze(1)
        targets = targets.to(device=device)

        # forward: class scores and their cross-entropy loss
        scores = model(data)
        loss = criterion(scores, targets)

        # backward: clear gradients from the previous step, then backpropagate
        optimizer.zero_grad()
        loss.backward()

        # gradient descent update step/adam step
        optimizer.step()

# Check accuracy on training & test to see how good our model
def check_accuracy(loader, model, device=None):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    The model is switched to eval mode for the duration of the check and is put
    back into whichever mode (train or eval) it was in on entry, even if the
    evaluation raises.

    Args:
        loader: Iterable of ``(inputs, labels)`` batches. Dimension 1 of every
            input batch is squeezed before it is fed to the model.
        model: ``torch.nn.Module`` returning per-class scores of shape
            ``(batch, num_classes)``.
        device: Device the batches are moved to. When omitted, the device of the
            model's first parameter is used; if the model has no parameters the
            batches are left where they are.

    Returns:
        A 0-dim tensor holding ``correct / total``.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    num_correct = 0
    num_samples = 0

    # Remember the caller's mode so evaluation does not silently flip it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                if device is not None:
                    x = x.to(device=device)
                    y = y.to(device=device)
                x = x.squeeze(1)

                scores = model(x)
                # Index of the highest score per sample is the predicted class.
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        # Restore the original train/eval mode.
        model.train(was_training)

    return num_correct / num_samples


# Report accuracy as a percentage with two decimals on both splits. The
# training line previously used the format spec ":2f" (minimum width 2, six
# decimals) where ":.2f" was intended.
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
