In [1]:
#pip install skope-rules

In [108]:
#pip install py-ciu==0.1.1

In [83]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.cluster
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import random
from ciu import determine_ciu
import six
import sys
import os
sys.modules['sklearn.externals.six'] = six
from skrules import SkopeRules
import openml
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

In [137]:
## New proposed separability

def calculate_centroids(X, labels):
    """Return the centroid (feature-wise mean) of every class.

    For each distinct value in ``labels`` the rows of ``X`` selected by the
    boolean mask ``labels == value`` are averaged along axis 0. The centroid
    represents the average position of that class in the feature space.

    Args:
        X: feature matrix indexable by a boolean row mask.
        labels: class label of every row of ``X``.

    Returns:
        dict mapping each distinct label to the mean vector of its rows.
    """
    centroids = {}
    for label in np.unique(labels):
        class_rows = X[labels == label]
        centroids[label] = class_rows.mean(axis=0)
    return centroids

def calculate_within_class_variance(X, labels, centroids):
    """Average within-class spread around the class centroids.

    For every distinct label, the squared deviations of that class's rows
    from ``centroids[label]`` are reduced with ``.mean()`` (called without an
    axis argument); the per-class results are then averaged with ``np.mean``.

    Args:
        X: feature matrix indexable by a boolean row mask.
        labels: class label of every row of ``X``.
        centroids: mapping label -> centroid, e.g. from ``calculate_centroids``.

    Returns:
        The mean of the per-class mean squared deviations.
    """
    per_class_spread = []
    for label in np.unique(labels):
        deviations = X[labels == label] - centroids[label]
        per_class_spread.append((deviations ** 2).mean())
    return np.mean(per_class_spread)

def calculate_between_class_separation(centroids):
    """Average Euclidean distance between every pair of class centroids.

    Each unordered pair of classes is visited once and the norm of the
    difference of their centroids is collected; the result is the mean of
    those distances, i.e. how far apart the classes are on average.

    Args:
        centroids: mapping label -> centroid vector.

    Returns:
        ``np.mean`` of the pairwise centroid distances.
    """
    class_keys = list(centroids.keys())
    pair_distances = [
        np.linalg.norm(centroids[first] - centroids[second])
        for position, first in enumerate(class_keys)
        for second in class_keys[position + 1:]
    ]
    return np.mean(pair_distances)

def calculate_separability(X, labels):
    """Separability score: between-class separation over within-class variance.

    Args:
        X: feature matrix.
        labels: class label of every row of ``X``.

    Returns:
        ``between / within``; ``np.inf`` when the within-class variance is
        exactly zero. Higher values indicate better separability.
    """
    class_centroids = calculate_centroids(X, labels)
    spread = calculate_within_class_variance(X, labels, class_centroids)
    distance = calculate_between_class_separation(class_centroids)

    # A zero spread would make the ratio undefined, so report infinity instead.
    return np.inf if spread == 0 else distance / spread

In [159]:
def calc_identity(exp1, exp2):
    """Compare two runs of explanations element by element.

    ``exp1[k]`` and ``exp2[k]`` are compared with ``np.array_equal`` for every
    index of ``exp1``.

    Returns:
        Tuple ``(score, n_equal, n_total)`` where ``score`` is the percentage
        of positions whose explanations are NOT equal, ``n_equal`` is the
        number of equal positions and ``n_total`` is ``len(exp1)``.
    """
    matches = np.array(
        [np.array_equal(exp1[idx], exp2[idx]) for idx in range(len(exp1))]
    )
    n_total = len(matches)
    n_equal = matches.sum()
    fraction_different = (n_total - n_equal) / n_total
    return fraction_different * 100, n_equal, n_total

def calc_stability(exp, labels):
    """Count explanations whose K-Means cluster disagrees with their class label.

    The explanations are clustered with K-Means using one cluster per distinct
    label. Cluster ``k`` is initialised at the mean explanation of the ``k``-th
    distinct label (in sorted order), so cluster index ``k`` corresponds to
    that label and the two can be compared directly.

    Args:
        exp: 2-D array-like of shape (n_samples, n_features) holding one
            encoded explanation per row.
        labels: 1-D array-like with the class label of every row of ``exp``.

    Returns:
        Tuple ``(error, total)``: the number of rows whose cluster index
        differs from their label's rank, and the number of rows.
    """
    exp = np.asarray(exp)
    # Rank every label among the sorted distinct labels so the comparison with
    # K-Means cluster indices (0..k-1) also holds for labels that are not 0/1.
    label_values, label_idx = np.unique(labels, return_inverse=True)
    label_idx = np.asarray(label_idx).reshape(-1)
    total = label_idx.shape[0]
    n_clusters = label_values.shape[0]

    # Build the (n_clusters, n_features) init matrix directly. Squeezing a
    # (k, 1, d) array collapses to 1-D when k == 1 or d == 1, which K-Means
    # rejects.
    init = np.array([exp[label_idx == k].mean(axis=0) for k in range(n_clusters)])
    init = init.reshape(n_clusters, -1)

    # With explicit initial centres only one initialisation is meaningful.
    ct = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=1, n_init=1, init=init)
    ct.fit(exp)

    error = np.sum(label_idx != ct.labels_)
    # NOTE(review): when more than half of the rows disagree the count is
    # inverted, as if the cluster numbering were swapped; this presumably only
    # makes sense for two classes - confirm for multi-class use.
    if error / total > 0.5:
        error = total - error
    return error, total

def enc_exp(exp, feature_num):
    """Scatter (feature index, value) pairs into a dense matrix.

    ``exp`` is indexed as ``exp[row, slot, 0]`` (feature index) and
    ``exp[row, slot, 1]`` (value). Each value is written to column
    ``int(feature index)`` of the output row; columns never mentioned stay 0.

    Args:
        exp: 3-D array of explanations, one row per sample.
        feature_num: number of columns of the dense output.

    Returns:
        ``np.ndarray`` of shape ``(len(exp), feature_num)``.
    """
    n_rows = len(exp)
    dense = np.zeros((n_rows, feature_num))
    for row in range(n_rows):
        for slot in range(len(exp[row])):
            column = int(exp[row, slot, 0])
            dense[row][column] = exp[row, slot, 1]
    return dense

In [176]:
def permute(x, x_dash):
    """Randomly swap entries between two equally long vectors.

    One uniform random number is drawn per position. Where it exceeds 0.5
    both outputs keep their own entry; otherwise the entries are exchanged.
    The inputs are copied first and are not modified.

    Returns:
        Two lists ``(x_new, x_dash_new)`` with the mixed entries.
    """
    first = x.copy()
    second = x_dash.copy()
    keep_own = np.random.random(first.shape[0]) > 0.5
    x_new = []
    x_dash_new = []
    for pos in range(len(first)):
        if keep_own[pos]:
            x_new.append(first[pos])
            x_dash_new.append(second[pos])
        else:
            x_new.append(second[pos])
            x_dash_new.append(first[pos])
    return x_new, x_dash_new

def calc_trust_score(test_x, exp, m, feat_list, model, top_k=6):
    """Mean recall of the explanation's leading features against sampled scores.

    For every row of ``test_x`` a per-feature score is estimated from ``m``
    random perturbations: the row is mixed with a randomly drawn row via
    ``permute`` and, for each feature position ``j``, two hybrid instances are
    built and the norm of the difference of their ``predict_proba`` outputs is
    accumulated. The ``top_k`` highest-scoring features form the reference
    set, which is compared with the feature indices in the first ``top_k``
    rows of ``exp[i]`` (column 0).

    Args:
        test_x: 2-D array of instances, one per row.
        exp: per-instance explanations; ``exp[i][:, 0]`` holds feature indices.
        m: number of random perturbations per instance.
        feat_list: list of feature names; only its length is used.
        model: object exposing ``predict_proba``.
        top_k: size of the compared feature sets (default 6, the value that
            was previously hard-coded). It is clipped to ``len(feat_list)`` so
            data with fewer features no longer makes ``np.argpartition`` fail.

    Returns:
        Mean recall over all instances.

    Raises:
        ValueError: if the effective ``top_k`` is smaller than 1.
    """
    n_features = len(feat_list)
    k = min(top_k, n_features)
    if k < 1:
        raise ValueError("top_k and the number of features must both be at least 1")

    total_recalls = []
    for i in range(len(test_x)):
        feat_score = np.zeros(n_features)
        for _ in range(m):
            x = test_x[i].copy()
            x_dash = test_x[np.random.randint(0, len(test_x))].copy()
            x_perm, x_dash_perm = permute(x, x_dash)
            for j in range(n_features):
                # NOTE(review): both hybrids take position j from x_perm, so
                # they differ in every position except j - confirm this is the
                # intended perturbation scheme.
                z = np.concatenate((x_perm[:j+1], x_dash_perm[j+1:]))
                z_dash = np.concatenate((x_dash_perm[:j], x_perm[j:]))
                p_z = model.predict_proba(z.reshape(1, -1))
                p_z_dash = model.predict_proba(z_dash.reshape(1, -1))
                feat_score[j] += np.linalg.norm(p_z - p_z_dash)
        feat_score = feat_score / m
        # Indices of the k largest scores (order within the k is irrelevant).
        gold_feat_fs = np.argpartition(feat_score, -k)[-k:]
        recall = len(set(exp[i][:k, 0]).intersection(set(gold_feat_fs))) / k
        total_recalls.append(recall)
    return np.mean(total_recalls)

In [120]:
datasets_folder = "datasets"

# Only the first rows of every dataset's X and y tables are kept.
MAX_ROWS_PER_DATASET = 50

# One entry per dataset folder; the lists stay aligned by position
folder_names = []
attribute_names_list = []
categorical_indicator_list = []
X_list = []
y_list = []

# os.listdir returns entries in arbitrary order; sorting keeps the dataset
# order (and therefore the order of every result list) deterministic.
for folder_name in sorted(os.listdir(datasets_folder)):
    folder_path = os.path.join(datasets_folder, folder_name)

    # Skip anything that is not a dataset directory
    if not os.path.isdir(folder_path):
        continue

    # Read the four CSV files that make up one dataset
    attribute_names_df = pd.read_csv(os.path.join(folder_path, "attribute_names.csv"))
    categorical_indicator_df = pd.read_csv(os.path.join(folder_path, "categorical_indicator.csv"))
    X_df = pd.read_csv(os.path.join(folder_path, "X.csv"))
    y_df = pd.read_csv(os.path.join(folder_path, "y.csv"))

    attribute_names_list.append(attribute_names_df)
    categorical_indicator_list.append(categorical_indicator_df)
    # Truncate once, when the frame is stored, instead of re-slicing every
    # previously stored frame on each iteration.
    X_list.append(X_df.head(MAX_ROWS_PER_DATASET))
    y_list.append(y_df.head(MAX_ROWS_PER_DATASET))

    # Save folder name to list
    folder_names.append(folder_name)

In [181]:
import pandas as pd
import time
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from ciu import determine_ciu


def create_pipeline(X):
    """
    Build the preprocessing ColumnTransformer for the columns of X.

    Numeric columns (any numeric dtype, not only int64/float64) are
    median-imputed and standardised; object and category columns are imputed
    with the most frequent value and one-hot encoded, ignoring categories
    unseen during fitting. Columns of any other dtype are not referenced by
    either branch.

    Args:
        X: pandas DataFrame whose column dtypes decide the routing.

    Returns:
        An unfitted ColumnTransformer (the preprocessing step only; the
        caller combines it with an estimator).
    """
    # Separate numeric and categorical columns
    numeric_features = X.select_dtypes(include='number').columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    # Create transformers for numeric and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    return preprocessor

def exp_fn_ciu(xtest, model, X_train):
    """Compute a CIU explanation for every row of ``xtest``.

    Each row is passed to ``determine_ciu`` as a one-row frame together with
    ``model.predict_proba`` and ``X_train`` converted to a dict of column
    lists. The resulting ``ci`` mapping is turned into
    ``[feature position, ci value]`` pairs, where the position is the index
    of the feature name among the columns of ``xtest``.

    The feature order is taken from ``xtest.columns`` rather than from a
    module-level ``feat_list`` variable, so the function does not depend on
    state defined in another cell.

    Args:
        xtest: DataFrame of instances to explain.
        model: fitted estimator exposing ``predict_proba``.
        X_train: DataFrame handed to ``determine_ciu`` as a dict of lists.

    Returns:
        ``np.ndarray`` built from the per-row lists of pairs.
    """
    feature_names = xtest.columns.tolist()
    explanations = []
    for row in range(len(xtest)):
        # NOTE(review): prediction_index=1 presumably selects the second
        # predict_proba column - confirm against the py-ciu documentation.
        ciu_result = determine_ciu(
            xtest.iloc[row:row + 1],
            model.predict_proba,
            X_train.to_dict('list'),
            samples=1000,
            prediction_index=1,
        )
        explanations.append(
            [[feature_names.index(name), ciu_result.ci[name]] for name in ciu_result.ci]
        )
    return np.array(explanations)

def permute(x, x_dash):
    """Randomly swap entries between two equally long vectors.

    One uniform random number is drawn per position. Where it exceeds 0.5
    both outputs keep their own entry; otherwise the entries are exchanged.
    The inputs are copied first and are not modified.

    Returns:
        Two lists ``(x_new, x_dash_new)`` with the mixed entries.
    """
    first = x.copy()
    second = x_dash.copy()
    keep_own = np.random.random(first.shape[0]) > 0.5
    x_new = []
    x_dash_new = []
    for pos in range(len(first)):
        if keep_own[pos]:
            x_new.append(first[pos])
            x_dash_new.append(second[pos])
        else:
            x_new.append(second[pos])
            x_dash_new.append(first[pos])
    return x_new, x_dash_new

def calc_trust_score(test_x, exp, m, feat_list, model, top_k=6):
    """Mean recall of the explanation's leading features against sampled scores.

    For every row of ``test_x`` a per-feature score is estimated from ``m``
    random perturbations: the row is mixed with a randomly drawn row via
    ``permute`` and, for each feature position ``j``, two hybrid instances are
    built and the norm of the difference of their ``predict_proba`` outputs is
    accumulated. The ``top_k`` highest-scoring features form the reference
    set, which is compared with the feature indices in the first ``top_k``
    rows of ``exp[i]`` (column 0).

    Args:
        test_x: 2-D array of instances, one per row.
        exp: per-instance explanations; ``exp[i][:, 0]`` holds feature indices.
        m: number of random perturbations per instance.
        feat_list: list of feature names; only its length is used.
        model: object exposing ``predict_proba``.
        top_k: size of the compared feature sets (default 6, the value that
            was previously hard-coded). It is clipped to ``len(feat_list)`` so
            data with fewer features no longer makes ``np.argpartition`` fail.

    Returns:
        Mean recall over all instances.

    Raises:
        ValueError: if the effective ``top_k`` is smaller than 1.
    """
    n_features = len(feat_list)
    k = min(top_k, n_features)
    if k < 1:
        raise ValueError("top_k and the number of features must both be at least 1")

    total_recalls = []
    for i in range(len(test_x)):
        feat_score = np.zeros(n_features)
        for _ in range(m):
            x = test_x[i].copy()
            x_dash = test_x[np.random.randint(0, len(test_x))].copy()
            x_perm, x_dash_perm = permute(x, x_dash)
            for j in range(n_features):
                # NOTE(review): both hybrids take position j from x_perm, so
                # they differ in every position except j - confirm this is the
                # intended perturbation scheme.
                z = np.concatenate((x_perm[:j+1], x_dash_perm[j+1:]))
                z_dash = np.concatenate((x_dash_perm[:j], x_perm[j:]))
                z = np.array(z).reshape(1, -1)
                z_dash = np.array(z_dash).reshape(1, -1)

                p_z = model.predict_proba(z)
                p_z_dash = model.predict_proba(z_dash)
                feat_score[j] += np.linalg.norm(p_z - p_z_dash)
        feat_score = feat_score / m
        # Indices of the k largest scores (order within the k is irrelevant).
        gold_feat_fs = np.argpartition(feat_score, -k)[-k:]
        recall = len(set(exp[i][:k, 0]).intersection(set(gold_feat_fs))) / k
        total_recalls.append(recall)
    return np.mean(total_recalls)

# Per-dataset metric accumulators; one value is appended per dataset below.
ciu_identity_scores = []
ciu_lime_separability_scores = []
ciu_speed_scores = []
ciu_fidelity_scores = []  # stays empty while the fidelity step below is commented out
for i in range(len(X_list)):
    X, y = X_list[i], y_list[i].squeeze()  # Ensure y is a 1D array

    # Convert X to numeric if not already
    X = X.apply(pd.to_numeric, errors='coerce')  # 'coerce' option converts non-numeric values to NaN

    # Calculate overall separability (on the features before any preprocessing)
    ciu_lime_separability = calculate_separability(X, y)
    ciu_lime_separability_scores.append(ciu_lime_separability)

    preprocessor = create_pipeline(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Column order of the feature matrix; also needed by the commented-out
    # fidelity and stability steps below.
    feat_list = X_test.columns.tolist()
    # Seed the forest so the fitted model, and with it the reported accuracy,
    # is reproducible between runs.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', RandomForestClassifier(random_state=42))])

    test_x = X_test.values
    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Evaluate the model
    print(f"Dataset {folder_names[i]} - Accuracy: {accuracy_score(y_test, pipeline.predict(X_test))}")

    # Applying bulk CIU; perf_counter is a monotonic, high-resolution clock
    # meant for measuring elapsed time.
    start_time = time.perf_counter()
    bulk_ciu_result = exp_fn_ciu(X_test, pipeline, X_train)
    end_time = time.perf_counter()
    ciu_speed = end_time - start_time
    ciu_speed_scores.append(ciu_speed)
    # Second, untimed run on the same inputs, used only for the identity comparison
    bulk_ciu_result2 = exp_fn_ciu(X_test, pipeline, X_train)

    # Calculating identity score
    ciu_identity = calc_identity(bulk_ciu_result, bulk_ciu_result2)
    ciu_identity_scores.append(ciu_identity[0])

    # NOTE(review): fidelity and stability are currently disabled; the reason
    # is not recorded in this notebook.
    #Calculating fidelity scores
    #ciu_fidelity = calc_trust_score(test_x, bulk_ciu_result, 5, feat_list, pipeline)
    #ciu_fidelity_scores.append(ciu_fidelity)
    # Calculating stability
    #enc1 = enc_exp(bulk_ciu_result, len(feat_list))
    #ciu_stability = calc_stability(enc1, y_test)

    print(f"Identity score is {ciu_identity[0]}")
    print(f"Separability score is {ciu_lime_separability}")
    #print(f"Fidelity score is {ciu_fidelity}")
    #print(f"Stability score is {ciu_stability}")
    print(f"Speed: {round(ciu_speed, 2)}")

Dataset 1046 - Accuracy: 0.9
Identity score is 60.0
Separability score is 6.935585161831817e-06
Speed: 1.09
Dataset 1053 - Accuracy: 1.0
Identity score is 10.0
Separability score is 9.436535017662588e-05
Speed: 5.21


In [182]:
# Assemble the per-dataset metric lists into one table, one named column per metric.
df_t = pd.concat(
    [
        pd.Series(values, name=column)
        for column, values in (
            ('Separability_scores', ciu_lime_separability_scores),
            ('CIU_identity_scores', ciu_identity_scores),
            ('CIU_speed_scores', ciu_speed_scores),
        )
    ],
    axis=1,
)

In [183]:
# Write the metrics table to a CSV file in the current working directory.
df_t.to_csv(path_or_buf='2_records_ciu.csv')