#### Metrics for three interpretability techniques: LIME, ANCHOR, CIU

Metrics tested are identity, separability, fidelity, and speed.

In [230]:
import os
import time
import tqdm
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from anchor import utils
from anchor import anchor_tabular

import metrics_rules

In [None]:
# Define the path to the datasets folder
datasets_folder = "../datasets"

# Initialize empty lists to store dataframes for each file
folder_names = []
attribute_names_list = []
categorical_indicator_list = []
X_list = []
y_list = []

# Loop through each folder in the datasets folder
for folder_name in os.listdir(datasets_folder):
    folder_path = os.path.join(datasets_folder, folder_name)
    
    # Check if it's a directory
    if os.path.isdir(folder_path):
        # Construct file paths for each CSV file in the folder
        attribute_names_path = os.path.join(folder_path, "attribute_names.csv")
        categorical_indicator_path = os.path.join(folder_path, "categorical_indicator.csv")
        X_path = os.path.join(folder_path, "X.csv")
        y_path = os.path.join(folder_path, "y.csv")
        
        # Read each CSV file into a pandas dataframe
        attribute_names_df = pd.read_csv(attribute_names_path)
        categorical_indicator_df = pd.read_csv(categorical_indicator_path)
        X_df = pd.read_csv(X_path)
        y_df = pd.read_csv(y_path)
        
        # Append dataframes to the lists
        attribute_names_list.append(attribute_names_df)
        categorical_indicator_list.append(categorical_indicator_df)
        X_list.append(X_df)
        y_list.append(y_df)

        # Save folder name to list
        folder_names.append(folder_name)

X = [df.head(50) for df in X_list]
y = [df.head(50) for df in y_list]

In [None]:
# Preprocessing
def convert_to_numeric_and_impute(X_list, y_list):
    imputer = SimpleImputer(strategy='mean')
    label_encoder = LabelEncoder()

    def process_dataframe(df):
        for column in df.columns:
            if isinstance(df[column].iloc[0], csr_matrix):
                df[column] = df[column].apply(lambda x: x.toarray()[0,0] if x.shape[1] == 1 else x.toarray())

            df[column] = pd.to_numeric(df[column], errors='coerce')

            if df[column].dtype == 'object':
                # Fill NaN with a placeholder and then label encode
                df[column] = df[column].fillna('Missing')
                df[column] = label_encoder.fit_transform(df[column])
            else:
                if df[column].notna().any():
                    df[column] = imputer.fit_transform(df[[column]]).ravel()
                else:
                    df[column] = df[column].fillna(0)

        return df

    X_list = [process_dataframe(df) for df in X_list]
    y_list = [process_dataframe(df) for df in y_list]

    return X_list, y_list

X, y = convert_to_numeric_and_impute(X, y)

One dataset here, gen five later.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X[0], y[0], test_size=0.2, random_state=555)

rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
rf.fit(X_train, y_train)

accuracy = accuracy_score(y_test, rf.predict(X_test))
print(f"Dataset {folder_names[i]} - Accuracy: {accuracy}")

Dataset 1050 - Accuracy: 0.7


In [None]:
class_names = np.unique(y_train).tolist()
attribute_names = X_train.columns.tolist()

explainer = anchor_tabular.AnchorTabularExplainer(
    class_names,
    attribute_names,
    X_train.values)

def exp_fn_blk(xtest):
    exp1 = []
    for i in tqdm.tqdm(range(len(xtest))):
        start_clock = time.time()
        exp = explainer.explain_instance(X_test.values[i], rf.predict, threshold=0.95)
        end_clock = time.time()
        calc_time = end_clock - start_clock
        exp_list = [0]*len(X_train.columns)
        for j in exp.features():
            exp_list[j] = 1
        exp1.append(exp_list)
    return np.array(exp1), calc_time
    
exp1 = exp_fn_blk(X_test)
exp2 = exp_fn_blk(X_test)

print((exp1[1] + exp2[1]) / 2)
print(metrics_rules.calc_identity_rules(exp1[0], exp2[0]))
print(metrics_rules.calc_separability_rules(exp1[0]))

100%|██████████| 10/10 [00:10<00:00,  1.04s/it]
100%|██████████| 10/10 [00:09<00:00,  1.10it/s]

0.29387831687927246
(10.0, 9, 10)
(14, 10, 100, 14.0)





In [None]:
# Precision
prec = 0
for i in tqdm.tqdm(range(len(X_test))):
    exp = explainer.explain_instance(X_test.values[i], rf.predict, threshold=0.95)
    prec += exp.precision()
print(prec/len(X_test))

100%|██████████| 10/10 [00:09<00:00,  1.05it/s]

0.9819878557758586





##### ANCHOR

In [241]:
# Store scores
identity_anchor_scores = []
separability_anchor_scores = []
speed_anchor_seconds = []
precision_scores = []

# for i in range(len(X)):
for i in range(5):
    
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.2, random_state=555)

    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
    rf.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, rf.predict(X_test))
    print(f"Dataset {folder_names[i]} - Accuracy: {accuracy}")

    explainer = anchor_tabular.AnchorTabularExplainer(
    np.unique(y_train).tolist(),
    X_train.columns.tolist(),
    X_train.values)

    def exp_fn_blk(xtest):
        exp1 = []
        prec = 0
        for i in tqdm.tqdm(range(len(xtest))):
            start_clock = time.time()
            exp = explainer.explain_instance(X_test.values[i], rf.predict, threshold=0.95)
            end_clock = time.time()
            prec += exp.precision()
            calc_time = end_clock - start_clock
            exp_list = [0]*len(X_train.columns)
            for j in exp.features():
                exp_list[j] = 1
            exp1.append(exp_list)
        return np.array(exp1), calc_time, prec

    exp1 = exp_fn_blk(X_test)
    exp2 = exp_fn_blk(X_test)

    precision_scores.append(exp1[2]/len(X_test))
    speed_anchor_seconds.append((exp1[1] + exp2[1]) / 2)
    identity_anchor_scores.append(metrics_rules.calc_identity_rules(exp1[0], exp2[0]))
    separability_anchor_scores.append(metrics_rules.calc_separability_rules(exp1[0]))

Dataset 1046 - Accuracy: 0.7


100%|██████████| 10/10 [00:08<00:00,  1.19it/s]
100%|██████████| 10/10 [00:05<00:00,  1.85it/s]


Dataset 1049 - Accuracy: 1.0


100%|██████████| 10/10 [00:27<00:00,  2.77s/it]
100%|██████████| 10/10 [00:21<00:00,  2.17s/it]


Dataset 1050 - Accuracy: 0.8


100%|██████████| 10/10 [00:08<00:00,  1.19it/s]
100%|██████████| 10/10 [00:16<00:00,  1.65s/it]


Dataset 1053 - Accuracy: 1.0


100%|██████████| 10/10 [00:05<00:00,  1.77it/s]
100%|██████████| 10/10 [00:05<00:00,  1.79it/s]


Dataset 1063 - Accuracy: 1.0


100%|██████████| 10/10 [00:00<00:00, 11.96it/s]
100%|██████████| 10/10 [00:00<00:00, 14.40it/s]


In [247]:
identity_anchor_scores

[(50.0, 5, 10), (50.0, 5, 10), (70.0, 3, 10), (70.0, 3, 10), (0.0, 10, 10)]

In [248]:
separability_anchor_scores

[(14, 10, 100, 14.0),
 (22, 10, 100, 22.0),
 (30, 10, 100, 30.0),
 (12, 10, 100, 12.0),
 (90, 10, 100, 90.0)]

In [249]:
speed_anchor_seconds

[0.3924214839935303,
 1.7786080837249756,
 2.4819605350494385,
 0.44764697551727295,
 0.06706643104553223]

In [250]:
precision_scores

[0.9821836593716078,
 0.972625886011248,
 0.994059405940594,
 0.996009553655405,
 1.0]

In [245]:
df_t = pd.concat([
    pd.Series(identity_anchor_scores, name='identity_anchor_scores'),
    pd.Series(separability_anchor_scores, name='separability_anchor_scores'),
    pd.Series(speed_anchor_seconds, name='speed_anchor_seconds'),
    pd.Series(precision_scores, name='precision_scores')
], axis=1)

In [251]:
df_t.to_csv('5_records_anchor.csv')