In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import warnings

import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from src.calime_explainer import CALimeExplainer
from src.causal_model import get_causal_model
from src.lime_explainer import LimeExplainer

warnings.simplefilter('ignore')
import networkx as nx
import pickle

dataset_name = 'banknote'

path = os.path.join(os.path.dirname(__file__), 'experiments', dataset_name)
if not os.path.exists(path):
    os.makedirs(path)
dataset_path = os.path.join(os.path.dirname(__file__), 'data', dataset_name + '.csv')

print(f"Loading the {dataset_name} dataset...")
df = pd.read_csv(dataset_path, index_col=False)
print("Dataset loaded successfully.")
print("Generating the causal model...")

# Split the DataFrame into features (X) and target labels (y).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Create lists to store feature names and unique class values.
feature_names, class_values = list(X.columns), list(y.unique())

# Split the data into training and testing sets.
num_samples = 100
train, test = X.head(len(X) - num_samples), X.tail(num_samples)
labels_train, labels_test = y.head(len(y) - num_samples), y.tail(num_samples)

# Further split the training set into two subsets: 'train_bb' and 'train_gen'.
# This is typically done for causal modeling purposes.
train_bb, train_gen, y_bb, y_gen = train_test_split(train, labels_train, test_size=0.3, random_state=0)

# Generate a causal model ('generative_model') and a graph representation ('graph')
# using the 'train_gen' dataset, feature names, and a specified 'path'.
graph_path = os.path.join(path, 'ground_truth.gpickle')
generation_path = os.path.join(path, 'generative_model.pkl')
if os.path.exists(graph_path) and os.path.exists(generation_path):
    graph = nx.read_gpickle(graph_path)
    with open(generation_path, 'rb') as file:
        generative_model = pickle.load(file)
else:
    generative_model, graph = get_causal_model(train_gen, feature_names, path)
print("Causal model and graph generated.")

print("Fitting the Black Box...")
# Create a RandomForestClassifier instance as the base estimator.
estimator = RandomForestClassifier()
hyper_param = dict(n_estimators=[100], #, 300, 500, 1000],
                   max_depth=[5, 8, 15],
                   min_samples_split=[2], #, 5, 10, 15, 100],
                   min_samples_leaf=[1]#, 2, 5, 10]
                   )
# Perform hyperparameter tuning using RandomizedSearchCV with the specified hyperparameters.
search = RandomizedSearchCV(estimator, hyper_param, verbose=1)

search.fit(train_bb, y_bb)

# Get the best estimator (model) from the hyperparameter tuning.
best_estimator = search.best_estimator_

# Use the trained 'best_estimator' model to make predictions on the test dataset.
y_pred = best_estimator.predict(test)

# Print a classification report that includes metrics like precision, recall, and F1-score,
# comparing the predicted labels ('y_pred') to the true labels ('labels_test'),
# considering the possible class values ('class_values').
# print("Classification Report:")
# print(classification_report(labels_test, y_pred, labels=class_values))

# Reset the index of the 'test' DataFrame.
test = test.reset_index()

# Select a random row from the 'test' DataFrame.
random_row = test.sample(n=1)

# Extract the feature values from the selected row ('data_row').
data_row = random_row.iloc[:, 1:].values.flatten()

# Print the index of the selected row from the 'test' DataFrame.
print('Selected Row Index:', random_row.index[0])

# LIME explanation
lime_explainer = LimeExplainer(train_bb.values, feature_names=feature_names,
                               class_names=class_values, discretize_continuous=False)

lime_exp, lime_data, lime_neighbor_gen_time = lime_explainer.explain_instance(data_row, best_estimator.predict_proba)

# CALIME explanation
calime_explainer = CALimeExplainer(graph, generative_model, train_bb.values, feature_names=feature_names,
                                   class_names=class_values, discretize_continuous=False)

calime_exp, calime_data, calime_neighbor_gen_time = calime_explainer.explain_instance(data_row,
                                                                                      best_estimator.predict_proba)

lime_exp.show_in_notebook()

ModuleNotFoundError: No module named 'lime'

In [6]:
dataset_path = './data/banknote.csv'

In [8]:
df = pd.read_csv(dataset_path, index_col=False)

NameError: name 'pd' is not defined

In [None]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

features_names = list(X.columns)
class_values = list(y.unique())

In [None]:
num_samples = 100

In [None]:
# train and test set
train, test = X.head(len(X) - num_samples), X.tail(num_samples)
# save test 
#test.to_csv(path + '/test.csv', index_label='index')
labels_train, labels_test = y.head(len(y) - num_samples), y.tail(num_samples)

train_bb, train_gen, y_bb, y_gen = train_test_split(train, labels_train, test_size=0.3, random_state=0)

In [None]:
path = './experiments'

In [None]:
generative_model, graph = get_causal_model(train_gen, features_names, path)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
estimator = RandomForestClassifier()
hyper_param = dict(n_estimators=[100, 300, 500, 1000],
                        max_depth=[5, 8, 15],
                        min_samples_split=[2, 5, 10, 15, 100],
                        min_samples_leaf=[1, 2, 5, 10]
                       )

In [None]:
search = RandomizedSearchCV(estimator, hyper_param)
search.fit(train_bb, y_bb)
bb = search.best_estimator_

In [None]:
y_pred = bb.predict(test)
#filename = 'bb_model.sav'
#pickle.dump(bb, open(path '/' + filename, 'wb'))

In [None]:
print(classification_report(labels_test, y_pred, labels=class_values))

In [None]:
test = test.reset_index()

In [None]:
row = test.sample(n=1)
data_row = row.iloc[:, 1:].values.flatten()
    
print('index:', test.iloc[row.index[0], 0])
print('data_row: ', data_row)

## LIME

In [None]:
lime_explainer = LimeTabularExplainer(train_bb.values, feature_names=features_names,
                                      class_names=class_values, discretize_continuous=False)

lime_exp, lime_data, lime_neighbor_gen_time = lime_explainer.explain_instance(data_row, bb.predict_proba, 
                                                                              num_samples=1000)

## CALIME

In [None]:
calime_explainer = CALimeTabularExplainer(train_bb.values, feature_names=features_names,
                                          class_names=class_values, discretize_continuous=False)

calime_exp, calime_data, calime_neighbor_gen_time = calime_explainer.explain_instance(data_row,
                                                                                     bb.predict_proba,
                                                                                     graph, generative_model)

In [None]:
import sklearn.datasets
import sklearn.ensemble
import numpy as np
import lime.lime_tabular
from __future__ import print_function
np.random.seed(1)

In [None]:
iris = sklearn.datasets.load_iris()

In [None]:
train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(iris.data, iris.target, train_size=0.80)

In [None]:
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train, labels_train)

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(train_bb.values, feature_names=features_names, class_names=class_values
                                                   , discretize_continuous=True)

In [None]:
i = np.random.randint(0, test.shape[0])
exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=2, top_labels=1)

In [None]:
test[i].shape

In [None]:
train_bb.values.shape

In [7]:
from math import dist

In [11]:
def AMD (og, new): #
    
    og = og.values.tolist()
    new = new.values.tolist()
    
    min_distances = []
    
    for record in new:
        distances = []
        for og_record in og:
            d = dist (record, og_record)
            distances.append(d)

        min_distances.append(min(distances))

    return statistics.mean(min_distances)

In [12]:
df_genetic = pd.read_json("/Users/martina/Downloads/df_genetic.json")
df_fair = pd.read_json("/Users/martina/Downloads/df_fair.json")
df_adult = pd.read_json("/Users/martina/Downloads/df_adult.json")

In [13]:
AMD(df_adult, df_genetic)

NameError: name 'statistics' is not defined