# CIU (Contextual Importance and Utility)

In [None]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.cluster
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import random
from ciu import determine_ciu
import six
import sys
import os
sys.modules['sklearn.externals.six'] = six
from skrules import SkopeRules
import openml
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
import pandas as pd
import time
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from ciu import determine_ciu
import sklearn
from sklearn.ensemble import RandomForestClassifier
import tqdm
import metrics_rules

In [None]:
# Define the path to the datasets folder
datasets_folder = "datasets"

# Parallel lists: position k in every list refers to the same dataset folder
folder_names = []
attribute_names_list = []
categorical_indicator_list = []
X_list = []
y_list = []

# os.listdir() returns entries in arbitrary order; sort them so that the
# positional splits below pick the same datasets on every run and machine.
for folder_name in sorted(os.listdir(datasets_folder)):
    folder_path = os.path.join(datasets_folder, folder_name)

    # Only sub-folders are treated as datasets; skip anything else
    if not os.path.isdir(folder_path):
        continue

    # Construct file paths for each CSV file in the folder
    attribute_names_path = os.path.join(folder_path, "attribute_names.csv")
    categorical_indicator_path = os.path.join(folder_path, "categorical_indicator.csv")
    X_path = os.path.join(folder_path, "X.csv")
    y_path = os.path.join(folder_path, "y.csv")

    # Read each CSV file into a pandas dataframe
    attribute_names_df = pd.read_csv(attribute_names_path)
    categorical_indicator_df = pd.read_csv(categorical_indicator_path)
    X_df = pd.read_csv(X_path)
    y_df = pd.read_csv(y_path)

    # Append dataframes to the lists
    attribute_names_list.append(attribute_names_df)
    categorical_indicator_list.append(categorical_indicator_df)
    X_list.append(X_df)
    y_list.append(y_df)

    # Save folder name to list
    folder_names.append(folder_name)

# Subsetting for less expensive runs: keep only the first 50 rows per dataset
X_list = [df.head(50) for df in X_list]
y_list = [df.head(50) for df in y_list]

# For testing the techniques: the first 40 datasets
X = X_list[:40]
y = y_list[:40]

# Names of chosen datasets
X_folder_names = folder_names[:40]

# For testing later: the last 10 datasets
# NOTE(review): with fewer than 50 datasets these overlap the first-40 split.
X_list_test = X_list[-10:]
y_list_test = y_list[-10:]

In [None]:
# Preprocessing
def convert_to_numeric_and_impute(X_list, y_list):
    """Make every column of every DataFrame numeric and free of missing values.

    Each column is handled according to what survives numeric coercion:

    * at least one value parses as a number: non-numeric entries become NaN
      and all NaNs are replaced by the column mean;
    * the column has values but none is numeric: it is treated as
      categorical, missing entries become the category ``'Missing'`` and the
      categories are label-encoded to integers;
    * the column is entirely missing: it is filled with 0.

    The DataFrames are modified in place, so any other reference to the same
    DataFrame objects sees the processed values as well.

    Args:
        X_list: list of feature DataFrames.
        y_list: list of target DataFrames.

    Returns:
        Tuple ``(X_list, y_list)`` of new lists holding the same (now
        processed) DataFrame objects.
    """
    imputer = SimpleImputer(strategy='mean')
    label_encoder = LabelEncoder()

    def process_dataframe(df):
        for column in df.columns:
            series = df[column]

            # Unwrap sparse-matrix cells; the length guard avoids an
            # IndexError from iloc[0] on an empty frame.
            if len(series) > 0 and isinstance(series.iloc[0], csr_matrix):
                series = series.apply(lambda x: x.toarray()[0,0] if x.shape[1] == 1 else x.toarray())

            numeric = pd.to_numeric(series, errors='coerce')

            if numeric.notna().any():
                # Numeric (possibly with unparsable entries): mean-impute
                df[column] = imputer.fit_transform(numeric.to_frame()).ravel()
            elif series.notna().any():
                # Coercion wiped out every value, so the column is purely
                # non-numeric. Encode the original values instead of
                # collapsing the column to a constant. Casting to str keeps
                # the encoder's inputs of a single type.
                labels = series.fillna('Missing').astype(str)
                df[column] = label_encoder.fit_transform(labels)
            else:
                # Nothing to learn a mean or categories from
                df[column] = numeric.fillna(0)

        return df

    X_list = [process_dataframe(df) for df in X_list]
    y_list = [process_dataframe(df) for df in y_list]

    return X_list, y_list

# Preprocess both dataset splits; the returned lists are bound back to the
# names that were passed in.
X, y = convert_to_numeric_and_impute(X_list=X, y_list=y)
X_list_test, y_list_test = convert_to_numeric_and_impute(
    X_list=X_list_test,
    y_list=y_list_test,
)

In [None]:
def interpret_ciu_as_prediction(ciu_result, threshold=0.5):
    """Turn one CIU explanation into a binary class label.

    Assumes ``ciu_result`` is an iterable of ``(feature, importance)`` pairs
    and that a larger total importance points towards class 1.

    Args:
        ciu_result: iterable of two-item ``(feature, importance)`` entries.
        threshold: total importance that must be strictly exceeded for the
            result to be 1.

    Returns:
        1 if the summed importance is greater than ``threshold``, else 0.
    """
    total = 0
    for _, weight in ciu_result:
        total += weight

    if total > threshold:
        return 1
    return 0

def calculate_fidelity_score(X_test, model, ciu_results):
    """Return the fraction of instances where CIU and the model agree.

    Each explanation in ``ciu_results`` is converted to a class label with
    ``interpret_ciu_as_prediction`` and compared, position by position, with
    ``model.predict(X_test)``.

    The score is computed over the instances that were actually compared:
    if fewer explanations than test rows are supplied (for example when only
    a prefix of the test set was explained), the rows without an explanation
    are not counted as disagreements.

    Args:
        X_test: test instances passed to ``model.predict``.
        model: fitted estimator exposing ``predict``.
        ciu_results: one CIU explanation per explained instance, in the same
            order as the rows of ``X_test``.

    Returns:
        Agreement ratio in ``[0, 1]``, or NaN when there is nothing to
        compare.
    """
    model_predictions = model.predict(X_test)
    ciu_predictions = [interpret_ciu_as_prediction(ciu_result) for ciu_result in ciu_results]

    # zip() below stops at the shorter sequence, so normalise by that length
    compared = min(len(ciu_predictions), len(model_predictions))
    if compared == 0:
        return float("nan")

    correct_predictions = sum(
        ciu_pred == model_pred
        for ciu_pred, model_pred in zip(ciu_predictions, model_predictions)
    )
    return correct_predictions / compared

In [None]:
# One result row per dataset. Rows are collected in a plain list and turned
# into a DataFrame once at the end, because DataFrame.append was removed in
# pandas 2.0.
interp_columns = ["Dataset", "Fidelity", "Identity", "Separability", "Speed"]
interp_rows = []

for i in range(len(X_list)):
    print(i)
    X, y = X_list[i], y_list[i].squeeze()  # Ensure y is a 1D array

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=555)
    feat_list = X_train.columns.tolist()
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    def exp_fn_blk(xtest):
        """Explain every row of ``xtest`` with CIU.

        Returns an array with one entry per row; each entry is a list of
        ``[feature position in feat_list, contextual importance]`` pairs.
        """
        explanations = []
        for row in range(len(xtest)):
            # Explain the rows of the argument itself rather than reaching
            # for the enclosing X_test.
            exp = determine_ciu(xtest.iloc[row:row+1], model.predict_proba, X_train.to_dict('list'), samples = 100, prediction_index = 1)
            exp_list = [[feat_list.index(name), exp.ci[name]] for name in exp.ci]
            explanations.append(exp_list)
        return np.array(explanations)

    # Speed is the wall-clock time of both explanation passes together; the
    # second pass exists so the two runs can be compared for Identity.
    start_time = time.time()
    exp1 = exp_fn_blk(X_test[:100])
    exp2 = exp_fn_blk(X_test[:100])
    end_time = time.time()
    speed = end_time - start_time

    ciu_fidelity = calculate_fidelity_score(X_test, model, exp1)

    interp_rows.append({
        "Dataset": i,
        "Fidelity": ciu_fidelity,
        # Identity / Separability are computed on the first explained instance
        "Identity": metrics_rules.calc_identity_rules(exp1[0], exp2[0])[0],
        "Separability": metrics_rules.calc_separability_rules(exp1[0])[0],
        "Speed": speed
    })

# Passing the column list keeps the expected columns even with zero rows
df_interp = pd.DataFrame(interp_rows, columns=interp_columns)

In [None]:
# Prefix every column name with "ciu_" (renaming the existing frame in place)
df_interp.rename(columns=lambda name: 'ciu_' + name, inplace=True)

df_interp

In [None]:
# Write the metrics table to a CSV file in the working directory
records_path = 'records_ciu.csv'
df_interp.to_csv(records_path)