In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from imblearn.over_sampling import SMOTE
import torch
from train_bb import SimpleModel
import random
import numpy as np
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from utils import prepare_adult
from pathlib import Path

current_script_path = Path.cwd()

_, _, _, _, _, _, real_data = prepare_adult(
    sweep=False, seed=42, current_path=current_script_path
)

real_data["income"].value_counts()

(48841, 18) (48841,)


income
0    29723
1     9349
Name: count, dtype: int64

In [2]:
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
np.random.seed(42)

In [3]:
df = pd.read_csv("data/pima/pima.csv")

sample = df.sample(1)

In [None]:
synthetic_data = pd.read_csv("data/pima/synthetic_daata.csv")
synthetic_data["Outcome"].value_counts()

Outcome
0    26382
1    23618
Name: count, dtype: int64

In [5]:
# find the 100 closest row in the synthetic data to the sample row
# using cosine similarity between the sample row and each row in the synthetic data
y_sample = sample["Outcome"]
sample = sample.drop(columns=["Outcome"])
y_synth = synthetic_data["Outcome"]
synthetic_data = synthetic_data.drop(columns=["Outcome"])

similarity = cosine_similarity(sample, synthetic_data)
similarity = similarity.flatten()

top = 5
total_top = (len(similarity) * top) // 100
top_100 = similarity.argsort()[-total_top:][::-1]

In [6]:
# load the bb
# dataset_name = "pima"
# model_name = "pima_bb"

dataset_name = "pima"
model_name = "pima_bb"

bb = torch.load(f"./artifacts/{dataset_name}/bb/{model_name}.pth")

## Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from collections import Counter
from torch.utils.data import (
    DataLoader,
    TensorDataset,
)

In [8]:
synthetic_data["Outcome"] = y_synth

In [9]:
def create_torch_loader(batch_size, x_train, y_train):
    x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

    train_dataset = TensorDataset(x_train_tensor, y_train_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size)

    return train_loader

In [10]:
from sklearn.model_selection import GridSearchCV

top_100_rows_with_outcome = synthetic_data.iloc[top_100]
y = top_100_rows_with_outcome["Outcome"]
X = top_100_rows_with_outcome.drop("Outcome", axis=1)

In [11]:
Counter(y)

Counter({0: 1376, 1: 1124})

In [12]:
old_X = copy.deepcopy(X)
X = X.values
y = y.values
print(Counter(y))
train_loader = create_torch_loader(batch_size=16, x_train=X, y_train=y)

Counter({0: 1376, 1: 1124})


In [13]:
predictions = []
with torch.no_grad():
    for data, target in train_loader:
        data, target = data.to("cuda"), target.to("cuda")
        outputs = bb(data)
        predicted = outputs.argmax(dim=1, keepdim=True)
        predictions.extend(predicted)

In [14]:
y = [item.item() for item in predictions]
Counter(y)

Counter({1: 1930, 0: 570})

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the parameter grid
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 7, 10],
    "min_samples_leaf": [1, 2, 5, 10],
    "class_weight": [None, "balanced"],
}

# Initialize the grid search
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring="accuracy"
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best estimator
clf = grid_search.best_estimator_
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy} - F1: {f1}")

Accuracy: 0.882 - F1: 0.9244558258642765


In [19]:
# Get the path of the sample row in the tree:
# Get the prediction for the sample row
sample_pred = clf.predict(sample)

# Get the decision path for the sample row
node_indicator = clf.decision_path(sample)
leave_id = clf.apply(sample)

# Get the feature and threshold used at each node
feature = clf.tree_.feature
threshold = clf.tree_.threshold

# Get the feature names
feature_names = sample.columns

# Print the path from the root to the leaf
node_index = node_indicator.indices[node_indicator.indptr[0] : node_indicator.indptr[1]]
print("Decision path for the sample row:")
for node_id in node_index:
    if leave_id[0] == node_id:
        print(f"Leaf node {node_id} reached, prediction: {sample_pred[0]}")
    else:
        if sample.iloc[0, feature[node_id]] <= threshold[node_id]:
            threshold_sign = "<="
        else:
            threshold_sign = ">"
        print(
            f"Node {node_id}: ({feature_names[feature[node_id]]} = {sample.iloc[0, feature[node_id]]}) {threshold_sign} {threshold[node_id]}"
        )


Decision path for the sample row:
Node 0: (BloodPressure = 58) <= 58.5
Node 1: (Glucose = 98) > 87.5
Node 89: (SkinThickness = 33) > 24.5
Node 109: (BloodPressure = 58) > 55.5
Node 113: (Age = 43) > 29.5
Node 125: (Glucose = 98) > 88.5
Leaf node 127 reached, prediction: 1


In [20]:
# Find the closest path that gives a different prediction
different_pred = 1 - sample_pred[0]
for i in range(len(clf.tree_.value)):
    if clf.tree_.value[i][0][different_pred] > clf.tree_.value[i][0][sample_pred[0]]:
        different_node_id = i
        break

# Print the path for the different prediction
node_indicator_diff = clf.decision_path(old_X.iloc[[different_node_id]])
node_index_diff = node_indicator_diff.indices[
    node_indicator_diff.indptr[0] : node_indicator_diff.indptr[1]
]

feature_names = sample.columns

print("\nClosest path with a different prediction:")
for node_id in node_index_diff:
    if different_node_id == node_id:
        print(f"Leaf node {node_id} reached, prediction: {different_pred}")
    else:
        if old_X.iloc[different_node_id, feature[node_id]] <= threshold[node_id]:
            threshold_sign = "<="
        else:
            threshold_sign = ">"
        print(
            f"Node {node_id}: ({feature_names[feature[node_id]]} = {old_X.iloc[different_node_id, feature[node_id]]}) {threshold_sign} {threshold[node_id]}"
        )


Closest path with a different prediction:
Leaf node 0 reached, prediction: 0
Node 128: (Age = 40) > 34.5
Node 228: (BloodPressure = 62) <= 73.5
Node 229: (Glucose = 94) <= 104.5
Node 230: (Age = 40) <= 45.5
Node 231: (BMI = 30.5) > 27.75
Node 241: (Glucose = 94) > 73.5
Node 243: (Insulin = 185) > 173.0
Node 245: (SkinThickness = 29) > 21.5
Node 247: (BloodPressure = 62) <= 70.5
Node 248: (DiabetesPedigreeFunction = 0.476) > -2.0
