#### ANCHOR

In [5]:
import os
import time
import tqdm
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from anchor import utils
from anchor import anchor_tabular

import metrics_rules

In [6]:
# Define the path to the datasets folder
datasets_folder = "../datasets"

# Parallel per-dataset containers: position k in every list refers to the
# same dataset folder (folder_names[k]).
folder_names = []
attribute_names_list = []
categorical_indicator_list = []
X_list = []
y_list = []

# Loop through each folder in the datasets folder.
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted to make the [:40] / [-10:] selections below reproducible.
for folder_name in sorted(os.listdir(datasets_folder)):
    folder_path = os.path.join(datasets_folder, folder_name)

    # Skip stray files; only sub-directories are datasets
    if os.path.isdir(folder_path):
        # Construct file paths for each CSV file in the folder
        attribute_names_path = os.path.join(folder_path, "attribute_names.csv")
        categorical_indicator_path = os.path.join(folder_path, "categorical_indicator.csv")
        X_path = os.path.join(folder_path, "X.csv")
        y_path = os.path.join(folder_path, "y.csv")

        # Read each CSV file into a pandas dataframe
        attribute_names_df = pd.read_csv(attribute_names_path)
        categorical_indicator_df = pd.read_csv(categorical_indicator_path)
        X_df = pd.read_csv(X_path)
        y_df = pd.read_csv(y_path)

        # Append dataframes to the lists
        attribute_names_list.append(attribute_names_df)
        categorical_indicator_list.append(categorical_indicator_df)
        X_list.append(X_df)
        y_list.append(y_df)

        # Save folder name to list
        folder_names.append(folder_name)

# Subsetting for less expensive runs: keep only the first 50 rows of each dataset
X_list = [df.head(50) for df in X_list]
y_list = [df.head(50) for df in y_list]

# For testing the techniques: the first 40 datasets (in sorted folder order)
X = X_list[:40]
y = y_list[:40]

# Names of chosen datasets (aligned index-for-index with X and y)
X_folder_names = folder_names[:40]

# For testing later: the last 10 datasets.
# NOTE: these overlap with X / y whenever fewer than 50 datasets were loaded.
X_list_test = X_list[-10:]
y_list_test = y_list[-10:]

In [7]:
# Preprocessing
def convert_to_numeric_and_impute(X_list, y_list):
    """Turn every column of every dataframe into a NaN-free numeric column.

    Each column is handled independently:

    * Cells holding ``scipy.sparse.csr_matrix`` objects are unwrapped first.
    * A column that has values but none of them parse as a number is treated
      as categorical: missing entries become the category ``'Missing'`` and
      the categories are replaced by integer codes in sorted order.
    * Any other column is coerced to numbers (unparseable entries become NaN)
      and NaNs are replaced by the column mean.
    * A column with no usable value at all is filled with 0.

    Parameters
    ----------
    X_list, y_list : list of pandas.DataFrame
        Feature and target frames. The frames are modified in place.

    Returns
    -------
    tuple of (list, list)
        New lists holding the same (now processed) dataframe objects.
    """

    def process_dataframe(df):
        for column in df.columns:
            values = df[column]

            # Unwrap sparse cells; the length guard avoids an IndexError when
            # peeking at the first row of an empty frame.
            if len(values) > 0 and isinstance(values.iloc[0], csr_matrix):
                values = values.apply(lambda x: x.toarray()[0,0] if x.shape[1] == 1 else x.toarray())

            numeric = pd.to_numeric(values, errors='coerce')
            any_numeric = numeric.notna().any()

            # The categorical test must look at the ORIGINAL values: after
            # coercion the column is always numeric, so checking the dtype of
            # the coerced column can never detect text columns.
            if values.notna().any() and not any_numeric:
                # Fill NaN with a placeholder and then label encode
                filled = values.fillna('Missing').astype(str)
                codes, _ = pd.factorize(filled, sort=True)
                df[column] = codes
            elif any_numeric:
                # Mean imputation; the result is always float
                df[column] = numeric.astype(float).fillna(numeric.mean())
            else:
                # Nothing to compute a mean from
                df[column] = numeric.fillna(0)

        return df

    X_list = [process_dataframe(df) for df in X_list]
    y_list = [process_dataframe(df) for df in y_list]

    return X_list, y_list

# Run the preprocessing on the datasets used for the experiments and on the
# held-out datasets. The dataframes are modified in place by the function and
# the returned lists are bound back to the same names.
X, y = convert_to_numeric_and_impute(X, y)
X_list_test, y_list_test = convert_to_numeric_and_impute(X_list_test, y_list_test)

In [11]:
# Store scores (one entry per dataset, all lists aligned by position)
identity_anchor_scores = []
separability_anchor_scores = []
speed_anchor_seconds = []
# precision_scores = []
accuracy_scores = []
dataset_indeces = []

# Seed shared by the train/test split and the forest so that the reported
# accuracy is reproducible. Anchor itself is left unseeded on purpose: the
# identity score compares two independent explanation passes.
RANDOM_STATE = 555

for i in range(len(X)):

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.2, random_state=RANDOM_STATE)

    # Use the class imported at the top of the notebook; the bare name
    # `sklearn` is never imported and raises NameError on a fresh kernel.
    rf = RandomForestClassifier(n_estimators=50, n_jobs=5, random_state=RANDOM_STATE)
    # Flatten the single-column target frame to the 1-D shape sklearn expects
    rf.fit(X_train, np.ravel(y_train))

    accuracy = accuracy_score(y_test, rf.predict(X_test))
    print(f"Dataset {folder_names[i]} - Accuracy: {accuracy}")
    dataset_indeces.append(folder_names[i])
    accuracy_scores.append(accuracy)

    explainer = anchor_tabular.AnchorTabularExplainer(
        np.unique(y_train).tolist(),
        X_train.columns.tolist(),
        X_train.values)

    def exp_fn_blk(xtest):
        """Explain every row of `xtest` with the current explainer and model.

        Returns a tuple of:
          * an array with one row per instance and one column per feature,
            holding 1 where the feature appears in the anchor and 0 elsewhere,
          * the total wall-clock seconds spent in `explain_instance`,
          * the sum of the anchors' precisions.
        """
        feature_masks = []
        prec = 0
        calc_time = 0.0  # accumulated over all rows (also defined for empty input)
        for row in tqdm.tqdm(xtest.values):
            start_clock = time.perf_counter()
            exp = explainer.explain_instance(row, rf.predict, threshold=0.95)
            calc_time += time.perf_counter() - start_clock
            prec += exp.precision()

            # Binary indicator of the features used by this anchor
            mask = [0]*len(X_train.columns)
            for j in exp.features():
                mask[j] = 1
            feature_masks.append(mask)
        return np.array(feature_masks), calc_time, prec

    # Two independent passes over the same instances: needed for the identity
    # score and averaged for the timing.
    exp1 = exp_fn_blk(X_test)
    exp2 = exp_fn_blk(X_test)

    # precision_scores.append(exp1[2]/len(X_test))
    speed_anchor_seconds.append((exp1[1] + exp2[1]) / 2)
    identity_anchor_scores.append(metrics_rules.calc_identity_rules(exp1[0], exp2[0]))
    separability_anchor_scores.append(metrics_rules.calc_separability_rules(exp1[0]))

# One row per dataset
df_t = pd.concat([
    pd.Series(dataset_indeces, name='dataset_indeces'),
    #pd.Series(accuracy_scores, name='accuracy_scores'),
    pd.Series(identity_anchor_scores, name='identity_anchor_scores'),
    pd.Series(separability_anchor_scores, name='separability_anchor_scores'),
    pd.Series(speed_anchor_seconds, name='speed_anchor_seconds'),
    #pd.Series(precision_scores, name='precision_scores')
], axis=1)

# df_t.to_csv('records_anchor.csv')

Dataset 1046 - Accuracy: 0.7


100%|██████████| 10/10 [00:10<00:00,  1.04s/it]
100%|██████████| 10/10 [00:11<00:00,  1.13s/it]


Dataset 1049 - Accuracy: 1.0


100%|██████████| 10/10 [00:28<00:00,  2.81s/it]
100%|██████████| 10/10 [00:28<00:00,  2.85s/it]


Dataset 1050 - Accuracy: 0.8


100%|██████████| 10/10 [00:10<00:00,  1.07s/it]
100%|██████████| 10/10 [00:17<00:00,  1.73s/it]
