# CIU, LIME, ANCHOR

In [96]:
from ciu import determine_ciu
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import os
import pandas as pd
import warnings
import random
import time
import lime
import lime.lime_tabular
import tqdm
from anchor import anchor_tabular
import anchor_utils
import metrics_rules

warnings.filterwarnings('ignore')

## Reading in the datasets, preprocessing

In [90]:
# Define the path to the datasets folder
#datasets_folder = "../datasets"
datasets_folder = "AutoML_project2/datasets"

# Initialize empty lists to store dataframes for each file
folder_names = []
attribute_names_list = []
categorical_indicator_list = []
X_list = []
y_list = []

# Loop through each folder in the datasets folder
for folder_name in os.listdir(datasets_folder):
    folder_path = os.path.join(datasets_folder, folder_name)
    
    # Check if it's a directory
    if os.path.isdir(folder_path):
        # Construct file paths for each CSV file in the folder
        attribute_names_path = os.path.join(folder_path, "attribute_names.csv")
        categorical_indicator_path = os.path.join(folder_path, "categorical_indicator.csv")
        X_path = os.path.join(folder_path, "X.csv")
        y_path = os.path.join(folder_path, "y.csv")
        
        # Read each CSV file into a pandas dataframe
        attribute_names_df = pd.read_csv(attribute_names_path)
        categorical_indicator_df = pd.read_csv(categorical_indicator_path)
        X_df = pd.read_csv(X_path)
        y_df = pd.read_csv(y_path)
        
        # Append dataframes to the lists
        attribute_names_list.append(attribute_names_df)
        categorical_indicator_list.append(categorical_indicator_df)
        X_list.append(X_df)
        y_list.append(y_df)

        # Save folder name to list
        folder_names.append(folder_name)

In [4]:
nr_of_rows = []

for i in range(len(X_list)):
    nr_of_rows.append(len(X_list[i]))

print(f"Min number of rows: {min(nr_of_rows)}")
print(f"Max number of rows: {max(nr_of_rows)}")

Min number of rows: 500
Max number of rows: 20000


In [60]:
# Example: Accessing the first dataset
first_attribute_names_df = attribute_names_list[1]
first_categorical_indicator_df = categorical_indicator_list[1]
first_X_df = X_list[1]
first_y_df = y_list[1]

In [61]:
first_X_df

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL
0,17,11,5,2,8,20.0,6,0.25,10,2,...,10,25.0,0.11,53.0,49.0,23.0,12,57.0,31.25,24
1,2,9,3,0,1,16.0,5,0.56,6,2,...,8,14.0,0.36,13.0,24.0,7.0,14,14.0,10.00,9
2,2,5,1,1,1,6.0,3,0.17,2,3,...,3,7.0,0.13,16.0,28.0,9.0,14,23.0,10.53,18
3,4,5,1,0,0,8.0,3,0.30,4,2,...,4,10.0,0.19,13.0,16.0,10.0,9,16.0,0.00,10
4,7,5,1,3,0,0.0,3,0.15,0,0,...,0,10.0,0.11,26.0,46.0,7.0,7,28.0,15.00,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,1,7,2,0,0,0.0,4,0.40,0,0,...,0,14.0,0.31,13.0,35.0,6.0,10,13.0,0.00,10
1454,12,3,8,0,6,4.0,2,0.11,2,2,...,2,14.0,0.05,28.0,40.0,19.0,13,39.0,24.00,19
1455,0,1,0,0,0,0.0,1,1.00,0,0,...,0,2.0,1.00,0.0,2.0,0.0,2,1.0,0.00,0
1456,8,9,0,0,10,0.0,5,0.42,0,0,...,0,10.0,0.16,20.0,38.0,9.0,20,32.0,45.45,12


In [62]:
first_y_df

Unnamed: 0,c
0,False
1,False
2,False
3,False
4,False
...,...
1453,False
1454,False
1455,False
1456,False


In [8]:
first_attribute_names_df

Unnamed: 0,0
0,id
1,start
2,end
3,event
4,size


In [9]:
first_categorical_indicator_df

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12436 entries, 12646 to 7270
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      12436 non-null  float64
 1   start   12436 non-null  float64
 2   end     12436 non-null  float64
 3   event   12436 non-null  int64  
 4   size    12436 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 582.9 KB


In [63]:
X_train, X_test, y_train, y_test = train_test_split(first_X_df, first_y_df, test_size = 0.2, random_state = 42)

In [64]:
le = LabelEncoder()
imputer_cat = SimpleImputer(strategy = 'most_frequent')
imputer_num = SimpleImputer(strategy = 'mean')
for col in X_train.columns:
    # Preprocessing categorical columns
    if X_train[col].dtype == 'object':
        le.fit(X_train[col])
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        imputer_cat.fit(X_train[col])
        X_train[col] = imputer_cat.transform(X_train[col])
        X_test[col] = imputer_cat.transform(X_test[col])
    # Preprocessing numerical columns
    else:
        imputer_num.fit(X_train[col].values.reshape(-1, 1))
        X_train[col] = imputer_num.transform(X_train[col].values.reshape(-1, 1))
        X_test[col] = imputer_num.transform(X_test[col].values.reshape(-1, 1))

In [65]:
random_state = 42
exp_iter = 10
random.seed(random_state)

test_x = X_test.values
n_classes = len(np.unique(y_train))
feat_list = X_train.columns.tolist()
X = np.vstack((X_train.values, test_x))

In [33]:
#n_classes = len(np.unique(y_train))
#feat_list = X_train.columns.tolist()

In [66]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

## CIU

In [37]:
def exp_fn_ciu(xtest):
    exp1 = []
    for i in range(len(xtest)):
        exp = determine_ciu(X_test.iloc[i:i+1], model.predict_proba, X_train.to_dict('list'), samples = 1000, prediction_index = 1)
        exp_list = [[feat_list.index(i), exp.ci[i]] for i in exp.ci]
        exp1.append(exp_list)
    return np.array(exp1)

In [68]:
exp1 = exp_fn_ciu(X_test[:10])
exp2 = exp_fn_ciu(X_test[:10])

In [58]:
def calc_identity(exp1, exp2):
    dis = np.array([np.array_equal(exp1[i], exp2[i]) for i in range(len(exp1))])
    total = dis.shape[0]
    true = np.sum(dis)
    score = (total-true)/total
    return score*100, true, total

def calc_separability(exp):
    wrong = 0
    for i in range(exp.shape[0]):
        for j in range(exp.shape[0]):
            if i == j:
                continue
            eq = np.array_equal(exp[i], exp[j])
            if eq:
                wrong = wrong + 1
    total = exp.shape[0]
    score = 100*abs(wrong)/total**2
    return wrong, total, total**2, score

def calc_stability(exp, labels):
    total = labels.shape[0]
    label_values = np.unique(labels)
    n_clusters = label_values.shape[0]
    init = np.array([[np.average(exp[np.where(labels == i)], axis = 0)] for i in label_values]).squeeze()
    ct = KMeans(n_clusters = n_clusters, random_state = 1, n_init = 10, init = init)
    ct.fit(exp)
    error = np.sum(np.abs(labels-ct.labels_))
    if error/total > 0.5:
        error = total-error
    return error, total

def enc_exp(exp, feature_num):
    enc_exp = np.zeros((len(exp),feature_num))
    for i in range(len(exp)):
        for j in range(len(exp[i])):
            enc_exp[i][int(exp[i,j,0])] = exp[i,j,1]
    return enc_exp

def normalize_test(X_train, X_test):
    X_test_norm = X_test.copy()
    for i in X_train.columns:
        scaler = MinMaxScaler()
        scaler.fit(X_train[i].values.reshape(-1,1))
        X_test_norm[i] = scaler.transform(X_test[i].values.reshape(-1,1))

    return X_test_norm

def calc_similarity(exp, X_test_norm):
    dbscan = DBSCAN(eps = 0.5, min_samples = 10)
    dbscan.fit(X_test_norm[:400])
    labels = dbscan.labels_
    mean_dist = []
    for i in np.unique(labels):
        mean_dist.append(np.mean(pairwise_distances(exp[np.where(labels == i), :, 1].squeeze(), metric = 'euclidean')))
    return np.min(mean_dist)

def calc_time(xtest):
    exp_times = []
    for i in range(len(xtest)):
        start_time = time.time()
        exp_fn_ciu(X_test[i:i+1])
        end_time = time.time()
        exp_times.append(end_time - start_time)
    average_time = np.mean(exp_times)
    return average_time

In [89]:
identity_ciu = calc_identity(exp1,exp2)
print(identity_ciu)

separability_ciu = calc_separability(test_x[:100])
print(separability_ciu)

time_ciu = calc_time(X_test[:10])
print(time_ciu)

#enc1 = enc_exp(exp1, len(feat_list))
#stability_ciu = calc_stability(enc1, y_test[:100])
#print(stability_ciu)

X_test_norm = normalize_test(X_train, X_test)
similarity_ciu = calc_similarity(exp1, X_test_norm[:10])
print(similarity_ciu)

(90.0, 1, 10)
(8, 100, 10000, 0.08)
1.2846079111099242
0.5486537206603467


## LIME

In [101]:
class_names = np.unique(y_test)
lime_explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names = feat_list, class_names=class_names, discretize_continuous=True)
exp_fn = lambda i: lime_explainer.explain_instance(X_test.iloc[i], model.predict_proba, num_features=len(X_test.columns))
def exp_fn_blk(xtest, exp_fn):
    exp1 = []
    for i in tqdm.tqdm(range(len(xtest))):
        exp = exp_fn(i)
        exp1.append(exp.as_map()[exp.available_labels()[0]])
    return np.array(exp1)
exp_fn_wrap = lambda x: np.array(exp_fn_blk(x, exp_fn))

In [102]:
exp1 = exp_fn_blk(test_x[:10], exp_fn)
exp2 = exp_fn_blk(test_x[:10], exp_fn)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.38it/s]


In [104]:
identity_lime = calc_identity(exp1,exp2)
print(identity_lime)

separability_lime = calc_separability(test_x[:100])
print(separability_lime)

time_lime = calc_time(X_test[:10])
print(time_lime)

#enc1 = enc_exp(exp1, len(feat_list))
#stability_lime = calc_stability(enc1, y_test[:100])
#print(stability_lime)

X_test_norm = normalize_test(X_train, X_test)
similarity_lime = calc_similarity(exp1, X_test_norm[:10])
print(similarity_lime)

(100.0, 0, 10)
(8, 100, 10000, 0.08)
1.0553532600402833
0.26628894764842564


## ANCHOR

In [107]:
#pip install anchor-exp

In [119]:
anchor_explainer = anchor_tabular.AnchorTabularExplainer(
    np.unique(y_train).tolist(),
    X_train.columns.tolist(),
    X_train.values
)

def exp_fn_anchor(xtest):
    exp1 = []
    for i in tqdm.tqdm(range(len(xtest))):
        exp = anchor_explainer.explain_instance(X_test.values[i], model.predict, threshold=0.95)
        exp_list = [0]*len(X_train.columns)
        for j in exp.features():
            exp_list[j] = 1
        exp1.append(exp_list)
    return np.array(exp1)

In [120]:
exp1 = exp_fn_anchor(X_test[:10])
exp2 = exp_fn_anchor(X_test[:10])

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.59it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.57it/s]


In [125]:
identity_anchor = metrics_rules.calc_identity_rules(exp1, exp2)
print(identity_anchor)

separability_anchor = metrics_rules.calc_separability_rules(exp1)
print(separability_anchor)

time_anchor = calc_time(X_test[:10])
print(time_anchor)

#stability_anchor = metrics_rules.calc_stability_rules(exp1, y_test)
#print(stability_anchor)

X_test_norm = metrics_rules.normalize_test(X_train, X_test)
similarity_anchor = metrics_rules.calc_similarity(exp1, X_test_norm[:10])
print(similarity_anchor)

(10.0, 9, 10)
(72, 10, 100, 72.0)
0.5997083187103271
0.2545584412271571
