#### Metrics for three interpretability techniques: LIME, ANCHOR, CIU

Metrics tested are identity, separability, fidelity, and speed.

In [16]:
import os
import time
import tqdm
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from anchor import utils
from anchor import anchor_tabular

import metrics_rules

In [22]:
# Define the path to the datasets folder
datasets_folder = "../datasets"

# Initialize empty lists to store dataframes for each file
folder_names = []
attribute_names_list = []
categorical_indicator_list = []
X_list = []
y_list = []

# Loop through each folder in the datasets folder
for folder_name in os.listdir(datasets_folder):
    folder_path = os.path.join(datasets_folder, folder_name)
    
    # Check if it's a directory
    if os.path.isdir(folder_path):
        # Construct file paths for each CSV file in the folder
        attribute_names_path = os.path.join(folder_path, "attribute_names.csv")
        categorical_indicator_path = os.path.join(folder_path, "categorical_indicator.csv")
        X_path = os.path.join(folder_path, "X.csv")
        y_path = os.path.join(folder_path, "y.csv")
        
        # Read each CSV file into a pandas dataframe
        attribute_names_df = pd.read_csv(attribute_names_path)
        categorical_indicator_df = pd.read_csv(categorical_indicator_path)
        X_df = pd.read_csv(X_path)
        y_df = pd.read_csv(y_path)
        
        # Append dataframes to the lists
        attribute_names_list.append(attribute_names_df)
        categorical_indicator_list.append(categorical_indicator_df)
        X_list.append(X_df)
        y_list.append(y_df)

        # Save folder name to list
        folder_names.append(folder_name)

# Subsetting for less expensive runs
X_list = [df.head(50) for df in X_list]
y_list = [df.head(50) for df in y_list]

# For testing the techniques
X = X_list[:40]
y = y_list[:40]

# Names of chosen datasets
X_folder_names = folder_names[:40]

# For testing later
X_list_test = X_list[-10:]
y_list_test = y_list[-10:]

# Names of testing folder names
X_folder_names_test = folder_names[-10:]

In [23]:
# Preprocessing
def convert_to_numeric_and_impute(X_list, y_list):
    imputer = SimpleImputer(strategy='mean')
    label_encoder = LabelEncoder()

    def process_dataframe(df):
        for column in df.columns:
            if isinstance(df[column].iloc[0], csr_matrix):
                df[column] = df[column].apply(lambda x: x.toarray()[0,0] if x.shape[1] == 1 else x.toarray())

            df[column] = pd.to_numeric(df[column], errors='coerce')

            if df[column].dtype == 'object':
                # Fill NaN with a placeholder and then label encode
                df[column] = df[column].fillna('Missing')
                df[column] = label_encoder.fit_transform(df[column])
            else:
                if df[column].notna().any():
                    df[column] = imputer.fit_transform(df[[column]]).ravel()
                else:
                    df[column] = df[column].fillna(0)

        return df

    X_list = [process_dataframe(df) for df in X_list]
    y_list = [process_dataframe(df) for df in y_list]

    return X_list, y_list

X, y = convert_to_numeric_and_impute(X, y)
X_list_test, y_list_test = convert_to_numeric_and_impute(X_list_test, y_list_test)

In [24]:
# Metadata generation
from pymfe.mfe import MFE

# Check all available meta-features in the package
# print(MFE.valid_metafeatures()) # <- should choose more?????

columns = ['attr_to_inst',  'cat_to_num',  'freq_class.mean',  'freq_class.sd',  'inst_to_attr',  'max.mean',  'max.sd',  'min.mean',  'min.sd',  'nr_cor_attr',  'nr_norm',  'sd.mean',  'sd.sd']

metadata_df = pd.DataFrame(columns=columns)

for i in range(len(X)):

    mfe = MFE(features=["attr_to_inst", "cat_to_num", "freq_class", "inst_to_attr", "sd", "nr_norm", "nr_cor_attr", "min", "max"])
    mfe.fit(np.array(X[i]), np.array(y[i]))
    ft = mfe.extract(
        sd={"ddof": 0},
        nr_norm={"method": "all", "failure": "hard", "threshold": 0.025},
        nr_cor_attr={"threshold": 0.6},
    )

    new = pd.DataFrame(np.array(ft[1]).reshape(1, -1), columns=ft[0])
    metadata_df = metadata_df.append(new, ignore_index=True)

# metadata_df['folder'] = X_folder_names
# metadata_df['folder'] = metadata_df['folder'].astype(int)
# metadata_df.head()

# df_t.to_csv('metadata_merged.csv')

##### Testing different techniques

In [26]:
# Training metafeatures
df_meta = pd.read_csv('metadata_merged.csv', index_col=0)
df_meta = df_meta.drop(columns= 'best_technique')

# Reading in data generated on training data
df_lime=pd.read_csv('records_lime.csv', index_col=0).drop(columns = 'lime_Dataset')
df_ciu=pd.read_csv('records_ciu.csv', index_col=0).drop(columns = 'ciu_Dataset')
df_anchor=pd.read_csv('records_anchor.csv', index_col=0).drop(columns = 'anchor_Dataset')

In [27]:
# Define metrics
common_columns = ['Fidelity', 'Identity', 'Separability', 'Speed']

for col in common_columns:
    max_value = max(df_lime['lime_' + col].max(), df_ciu['ciu_' + col].max(), df_anchor['anchor_' + col].max())
    df_lime['lime_' + col] /= max_value
    df_ciu['ciu_' + col] /= max_value
    df_anchor['anchor_' + col] /= max_value

merged_df = df_lime.merge(df_ciu, on='folder').merge(df_anchor, on='folder')

merged_df['lime_score'] = merged_df['lime_Fidelity'] + merged_df['lime_Identity'] - merged_df['lime_Separability'] - merged_df['lime_Speed']
merged_df['ciu_score'] = merged_df['ciu_Fidelity'] + merged_df['ciu_Identity'] - merged_df['ciu_Separability'] - merged_df['ciu_Speed']
merged_df['anchor_score'] = merged_df['anchor_Fidelity'] + merged_df['anchor_Identity'] - merged_df['anchor_Separability'] - merged_df['anchor_Speed']

merged_df['best_dataset'] = merged_df[['lime_score', 'ciu_score', 'anchor_score']].idxmax(axis=1)

selected_datasets = merged_df[['folder', 'best_dataset']]
selected_datasets=selected_datasets.merge(df_meta, on='folder')
selected_datasets = selected_datasets.drop(columns='folder')

selected_datasets.head()

Unnamed: 0,best_dataset,attr_to_inst,cat_to_num,freq_class.mean,freq_class.sd,inst_to_attr,max.mean,max.sd,min.mean,min.sd,nr_cor_attr,nr_norm,sd.mean,sd.sd
0,ciu_score,0.24,0.0,1.0,0.0,4.166667,0.595833,1.150537,-0.869333,1.195318,0.151515,4.0,0.408634,0.211223
1,lime_score,0.42,0.0,0.5,0.48,2.380952,2612.36381,10464.872064,0.764286,0.421704,0.595238,7.0,716.383714,2875.465961
2,ciu_score,0.32,0.0,0.1,0.026833,3.125,98.125,5.072906,0.9375,3.630922,0.125,2.0,30.379573,7.083823
3,lime_score,0.7,0.0,0.142857,0.092229,1.428571,117.528571,229.00371,30.262857,68.736894,0.065546,2.0,24.665882,56.092301
4,ciu_score,0.14,0.0,1.0,0.0,7.142857,31.64,13.351233,4.571429,8.583325,0.095238,0.0,8.264854,2.973742


#### Creating a meta model

In [28]:
# Training the metamodel
X = selected_datasets.drop(['best_dataset'], axis=1)
y = selected_datasets['best_dataset']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_classifier = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

grid_search.fit(X, y)

print("Best parameters found: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

print("Classification Report for the Best Model:")
print(classification_report(y_test, y_pred))

Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Classification Report for the Best Model:
              precision    recall  f1-score   support

   ciu_score       1.00      1.00      1.00        10
  lime_score       1.00      1.00      1.00         6

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



#### Testing

In [30]:
# Metadata generation for test data
columns = ['attr_to_inst',  'cat_to_num',  'freq_class.mean',  'freq_class.sd',  'inst_to_attr',  'max.mean',  'max.sd',  'min.mean',  'min.sd',  'nr_cor_attr',  'nr_norm',  'sd.mean',  'sd.sd']

metadata_df_test = pd.DataFrame(columns=columns)

for i in range(len(X_list_test)):

    mfe = MFE(features=["attr_to_inst", "cat_to_num", "freq_class", "inst_to_attr", "sd", "nr_norm", "nr_cor_attr", "min", "max"])
    mfe.fit(np.array(X_list_test[i]), np.array(y_list_test[i]))
    ft = mfe.extract(
        sd={"ddof": 0},
        nr_norm={"method": "all", "failure": "hard", "threshold": 0.025},
        nr_cor_attr={"threshold": 0.6},
    )

    new = pd.DataFrame(np.array(ft[1]).reshape(1, -1), columns=ft[0])
    metadata_df_test = metadata_df_test.append(new, ignore_index=True)

metadata_df_test['folder'] = X_folder_names_test
metadata_df_test['folder'] = metadata_df_test['folder'].astype(int)
metadata_df_test

Unnamed: 0,attr_to_inst,cat_to_num,freq_class.mean,freq_class.sd,inst_to_attr,max.mean,max.sd,min.mean,min.sd,nr_cor_attr,nr_norm,sd.mean,sd.sd,folder
0,0.14,0.0,0.1,0.021909,7.142857,1.0,0.0,0.0,0.0,0.0,0.0,0.461235,0.042153,40496
1,0.7,0.0,1.0,0.0,1.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42
2,0.1,0.0,1.0,0.0,10.0,39.2,51.394163,17.6,27.111621,0.0,1.0,5.666059,7.054412,451
3,0.6,0.0,0.5,0.04,1.666667,1.0,0.0,-0.966667,0.179505,0.02069,0.0,0.73125,0.222545,4534
4,0.64,0.0,1.0,0.0,1.5625,0.002553,0.003049,-0.001513,0.002409,0.076613,5.0,0.000671,0.000759,4538
5,0.08,0.0,1.0,0.0,12.5,3.25,3.418699,0.0,0.0,0.0,0.0,0.968246,1.034553,469
6,0.18,0.0,1.0,0.0,5.555556,22.444444,30.430654,10.333333,27.828842,0.0,3.0,2.740387,4.278423,470
7,0.18,0.0,1.0,0.0,5.555556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50
8,0.36,0.0,1.0,0.0,2.777778,187.444444,207.204468,82.222222,63.702918,0.372549,4.0,22.986923,36.177679,54
9,0.32,0.0,1.0,0.0,3.125,11.75,1.713914,0.9375,1.675886,0.091667,4.0,2.231966,0.401749,6


In [None]:
def predict_best_dataset(input_data):

    input_df = pd.DataFrame([input_data])
    input_df = input_df[X.columns]

    prediction = best_model.predict(input_df)

    return prediction[0][:-6]

##### Example on one test dataset and its recommendation

In [37]:
input_data = metadata_df_test.drop(columns='folder').iloc[0].to_dict()
input_data_formatted = {k: round(v, 2) for k, v in input_data.items()}

predicted_dataset = predict_best_dataset(input_data_formatted)
print("Predicted Best Model for your cause, considering your dataset is", predicted_dataset.upper())

Predicted Best Model for your cause, considering your dataset is CIU
