In [1]:
import dice_ml
from dice_ml import Dice
from dice_ml.utils.exception import UserConfigValidationException

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, fetch_california_housing

from src.transition_system import transition_system, indexs_for_window, list_to_str
from src.function_store import StoreTestRun, extract_algo_name, generate_cfe, get_case_id, prepare_df_for_ml, \
    activity_n_resources, get_test_cases, get_prefix_of_activities, validate_transition, cases_with_activity_to_avoid


import pandas as pd
import os
import pickle
import random
import subprocess
from IPython.display import display
from joblib import dump, load
from math import ceil
from time import time
import warnings
from wrapt_timeout_decorator import timeout

# from multiprocessing import Process
# from timeout_decorator import timeout, TimeoutError
# import signal
from typing import Tuple, Any, List, Union
from collections import Counter
import utils
from time import sleep
# Widen pandas' display limits so the wide event-log frames render fully.
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 500)
# Step up to the parent directory only when the notebook was started from inside `src/`.
# The last path component is compared (instead of searching the whole path for "src") so that
# a checkout living under e.g. `~/src/...` is left alone and re-running this cell never
# climbs a second level.
if os.path.basename(os.getcwd()) == "src":
    os.chdir("../")

# Suppress all warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2



# Dataset: Bank Account Closure

## Setup Variables

In [2]:
# --- Experiment configuration ---
KPI = "activity_occurrence"  # activity_occurrence, ...
DICE_METHOD = "random"
RESULTS_FILE_PATH_N_NAME = "experiment_results/random-local-a01.csv"
activity_to_avoid = "Back-Office Adjustment Requested"
TOTAL_CFS = 15                        # Number of CFs DiCE algorithm should produce
TRAIN_DATA_SIZE = 164_927             # 164_927
WINDOW_SIZE = 3
REDUCED_KPI_TIME = 90

# --- Time unit conversions ---
SECONDS_TO_HOURS = 60 * 60
SECONDS_TO_DAYS = 24 * SECONDS_TO_HOURS

# --- DiCE search weights ---
# TODO: store the model configurations in the pkl file as well and retrieve them from here
proximity_weight = 0.2
sparsity_weight = 0.2
diversity_weight = 5.0

# --- Event-log column names ---
case_id_name = 'REQUEST_ID'  # The case identifier column name.
# start_date_name = 'Change_Date+Time'  # Maybe change to start_et (start event time)
activity_column_name = "ACTIVITY"
resource_column_name = "CE_UO"

## Load Data

In [3]:
data_dir = "./preprocessed_datasets/"

# File names of the pre-split train/test logs and of the pickled test cases.
train_dataset_file = "bank_acc_train.csv"
test_dataset_file = "bank_acc_test.csv"
test_pickle_dataset_file = "bank_acc_test.pkl"
# test_dataset_file = "bank_acc_test-500.csv"
# test_pickle_dataset_file = "bank_acc_test-500.pkl"

# Use the full dataset for the transition system.
df = pd.read_csv("./data/bank_account_closure.csv")
# Read the train split first, then the test split.
df_train, df_test = (
    pd.read_csv(os.path.join(data_dir, split_file))
    for split_file in (train_dataset_file, test_dataset_file)
)

### Basic Preprocessing of the dataset

In [4]:
# Replace missing values in the full log with an explicit "missing" marker.
df = df.fillna("missing")
# df_train = df_train[:TRAIN_DATA_SIZE]  # 31_066
# df_test = df_test[: 19_041]

# Distinct case counts per split (dropna=False mirrors len(Series.unique())).
train_case_count = df_train[case_id_name].nunique(dropna=False)
test_case_count = df_test[case_id_name].nunique(dropna=False)
print(f"Rows in df_train: {len(df_train):,} Cases in df_train: {train_case_count:,}")
print(f"Rows in df_test: {len(df_test):,} Cases in df_test: {test_case_count:,}")

percentage_of_train_data = 100 * train_case_count / df[case_id_name].nunique(dropna=False)
print(f"Percentage of train data: {percentage_of_train_data:.2f}%")

# Case ids in each split whose trace contains the activity we want to avoid.
case_ids_with_activity_to_avoid_train, _ = cases_with_activity_to_avoid(
    df_train, case_id_name, activity_column_name, activity_to_avoid
)
case_ids_with_activity_to_avoid_test, _ = cases_with_activity_to_avoid(
    df_test, case_id_name, activity_column_name, activity_to_avoid
)
print(f"Number of cases with activity_to_avoid in trainset: {len(case_ids_with_activity_to_avoid_train)}")
print(f"Percentage of activity_to_avoid cases in trainset: {100 * len(case_ids_with_activity_to_avoid_train) / train_case_count:.2f}%")
print(f"Number of cases with activity_to_avoid in testset: {len(case_ids_with_activity_to_avoid_test)}")
df_train.head()

Rows in df_train: 127,763 Cases in df_train: 21,172
Rows in df_test: 4,406 Cases in df_test: 715
Percentage of train data: 65.29%
Number of cases with activity_to_avoid in trainset: 2605
Percentage of activity_to_avoid cases in trainset: 12.30%
Number of cases with activity_to_avoid in testset: 715


Unnamed: 0,REQUEST_ID,START_DATE,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,END_DATE,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,time_remaining,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,20175000168,1539175692,Client Recess,1 - Client lost,Service closure Request with network responsib...,1539175786,00044,APPLICANT,0,0,46092,Wednesday,94,569699,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,20175000168,1539175786,Client Recess,1 - Client lost,Service closure Request with BO responsibility,1539602104,BOC,BACK-OFFICE,94,94,46186,Wednesday,426318,569605,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,20175000168,1539602104,Client Recess,1 - Client lost,Pending Request for Reservation Closure,1539602345,BOC,BACK-OFFICE,426412,426318,40504,Monday,241,143287,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,20175000168,1539602345,Client Recess,1 - Client lost,Pending Liquidation Request,1539745391,BOC,BACK-OFFICE,426653,241,40745,Monday,143046,143046,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,20175000642,1541672782,Bank Recess,1 - Client lost,Request created,1541672852,00624,APPLICANT,0,0,37582,Thursday,70,578503,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Total completed cases: distinct case ids in the train split plus those in the test split
sum(split[case_id_name].nunique(dropna=False) for split in (df_train, df_test))

21887

In [6]:
# # === Analysis of all the columns in the dataset
# variable_type_analysis(df, case_id_name=case_id_name, activity_name=activity_column_name)

### Prepare the Test Dataset

In [7]:
# === Load the standard (pickled) test set so that every parameter configuration is
# === evaluated on exactly the same test cases.
# NOTE(review): with load_dataset=True this presumably unpickles `path_and_filename`;
# only point it at files you created yourself, since unpickling untrusted data can execute code.
test_cases = get_test_cases(None, None, load_dataset=True, path_and_filename=os.path.join(data_dir, test_pickle_dataset_file))

# === (Uncomment) Use this instead when working with a new df_test and the test_cases must be regenerated
# test_cases = get_test_cases(df_test, case_id_name, load_dataset=False, path_and_filename=os.path.join(data_dir, test_pickle_dataset_file))
print(f"Number of cases in the test set: {len(test_cases):,}")

# # === Pickle dataset for comparison of different methods
# with open(os.path.join(data_dir, test_pickle_dataset_file), 'wb') as file:
#     pickle.dump(test_cases, file)

Number of cases in the test set: 715


In [8]:
# # === Generate a CSV of cut Test Traces
# df_cut_test_traces = pd.DataFrame(columns=df_test.columns)
# for cut_trace in test_cases:
#     df_cut_test_traces = pd.concat([df_cut_test_traces, cut_trace], axis="rows")
# df_cut_test_traces.to_csv("bank_acc_test_cut.csv", index=False)

In [9]:
# Diagnostic checks on test_cases: how long the test traces are and how many of them
# carry an outcome label of 0 on their last row.
cases_y_0 = []          # case ids whose last row has outcome 0
length_of_cases = []    # number of rows of every test trace
outcome_name = "Back-Office Adjustment Requested"

for df_test_trace in test_cases:
    length_of_cases.append(len(df_test_trace))
    query_case_id = get_case_id(df_test_trace, case_id_name)
    X_test, y_test = prepare_df_for_ml(df_test_trace, case_id_name, outcome_name, columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])
    # Record the cases whose label on the last (queried) row is 0.
    if y_test.iloc[-1] == 0:
        cases_y_0.append(query_case_id)

# Count directly instead of materialising throw-away lists.
print(f"Cases with length of 3: {sum(1 for clen in length_of_cases if clen == 3)}")
print(f"Cases with length <= 2: {sum(1 for clen in length_of_cases if clen <= 2)}")
len(cases_y_0)

Cases with length of 3: 73
Cases with length <= 2: 0


224

In [10]:
# ### Features that can vary
# option 1:
# cols_to_vary = [col for col in df_train.columns if col[0] == '#']

# DiCE may only change the activity and the resource of the queried event.
cols_to_vary = [activity_column_name, resource_column_name]

outcome_name = "Back-Office Adjustment Requested"

# Fixed seed for the random forest so that re-running the notebook trains the same model;
# without it every run yields a different classifier and therefore different counterfactuals.
RANDOM_STATE = 42

X_train, y_train = prepare_df_for_ml(df_train, case_id_name, outcome_name, columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])
# X_test, y_test = prepare_df_for_ml(df_test, outcome_name)

# Columns that are standard-scaled by the pipeline and declared as continuous to DiCE
# (time features plus the per-activity "# ACTIVITY=..." columns).
continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "activity_duration", '# ACTIVITY=Service closure Request with network responsibility',
                    '# ACTIVITY=Service closure Request with BO responsibility', '# ACTIVITY=Pending Request for Reservation Closure', '# ACTIVITY=Pending Liquidation Request',
                    '# ACTIVITY=Request completed with account closure', '# ACTIVITY=Request created', '# ACTIVITY=Authorization Requested',
                    '# ACTIVITY=Evaluating Request (NO registered letter)', '# ACTIVITY=Network Adjustment Requested', '# ACTIVITY=Pending Request for acquittance of heirs',
                    '# ACTIVITY=Request deleted', '# ACTIVITY=Back-Office Adjustment Requested', '# ACTIVITY=Evaluating Request (WITH registered letter)',
                    '# ACTIVITY=Request completed with customer recovery', '# ACTIVITY=Pending Request for Network Information',]
# Columns that are one-hot encoded by the pipeline.
categorical_features = ["CLOSURE_TYPE", "CLOSURE_REASON", "ACTIVITY", "CE_UO", "ROLE", "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore': categories unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier', BalancedRandomForestClassifier(criterion='gini',
                                                                         max_depth=None,
                                                                         max_features='sqrt',
                                                                         min_samples_leaf=1,
                                                                         min_samples_split=2,
                                                                         n_estimators=100,
                                                                         replacement=False,
                                                                         sampling_strategy='not minority',
                                                                         random_state=RANDOM_STATE,
                                                                         n_jobs=7))])
model = clf.fit(X_train, y_train)

# ## Create DiCE model
# DiCE receives the training features together with the outcome column in one frame.
data_model = dice_ml.Data(dataframe=pd.concat([X_train, y_train], axis="columns"),
                      continuous_features=continuous_features,
                      outcome_name=outcome_name)

# We provide the type of model as a parameter (model_type)
ml_backend = dice_ml.Model(model=model, backend="sklearn", model_type='classifier')
method = DICE_METHOD  # genetic, kdtree, random
explainer = Dice(data_model, ml_backend, method=method)  # Categorical features do not support features_weights argument in generate_counterfactuals()
# exp_genetic_iris = Dice(d_iris, m_iris, method="kdtree")

## Experiment with Single Queries

In [12]:
# Row window of df_train to inspect.
sidx = 50
eidx = 100

# Show only the identifying / categorical columns and the outcome label for that window.
df_train.iloc[sidx:eidx].loc[:, ["REQUEST_ID", "CLOSURE_TYPE", "CLOSURE_REASON", "ACTIVITY", "CE_UO", "ROLE", "Back-Office Adjustment Requested"]]
# df_train[sidx: eidx]

Unnamed: 0,REQUEST_ID,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,Back-Office Adjustment Requested
50,20177003282,Bank Recess,1 - Client lost,Service closure Request with network responsib...,00012,APPLICANT,0
51,20177003282,Bank Recess,1 - Client lost,Authorization Requested,00012,DIRECTOR,0
52,20177003282,Bank Recess,1 - Client lost,Service closure Request with BO responsibility,BOC,BACK-OFFICE,0
53,20177003282,Bank Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,0
54,20177003282,Bank Recess,1 - Client lost,Pending Liquidation Request,BOC,BACK-OFFICE,0
55,20177004025,Bank Recess,1 - Client lost,Request created,00573,APPLICANT,0
56,20177004025,Bank Recess,1 - Client lost,Service closure Request with network responsib...,missing,APPLICANT,0
57,20177004025,Bank Recess,1 - Client lost,Authorization Requested,00448,DIRECTOR,0
58,20177004025,Bank Recess,1 - Client lost,Service closure Request with BO responsibility,BOC,BACK-OFFICE,0
59,20177004025,Bank Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,0


In [11]:
# Position of the single training row to explain.
sidx = 94
eidx = sidx + 1  # 620

# Activities DiCE is allowed to propose. The activity to avoid
# ('Back-Office Adjustment Requested') is deliberately left out.
permitted_activities = ['Service closure Request with network responsibility',
                        'Service closure Request with BO responsibility',
                        'Pending Request for Reservation Closure', 'Pending Liquidation Request',
                        'Request created', 'Authorization Requested', 'Evaluating Request (NO registered letter)',
                        'Network Adjustment Requested', 'Evaluating Request (WITH registered letter)',
                        'Pending Request for Network Information']

# Explicit positional slice: one-row DataFrame, matching how the experiment loop builds its query.
query_instances = X_train.iloc[sidx:eidx]
# Use the notebook-wide TOTAL_CFS instead of a second hard-coded 15 so that this exploratory
# query always asks for the same number of counterfactuals as the experiment loop.
cfe = explainer.generate_counterfactuals(query_instances, total_CFs=TOTAL_CFS, features_to_vary=cols_to_vary, desired_class=0,  # desired_class="opposite",
                                         permitted_range={"ACTIVITY": permitted_activities})

cfe.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:01<00:00,  1.13s/it]

Query instance (original outcome : 0)





Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,Inheritance,missing,Service closure Request with BO responsibility,BOC,BACK-OFFICE,36990953,35264982,46992,Wednesday,597936,1,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,-,-,Request created,00571,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
1,-,-,-,00266,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
2,-,-,Request created,00474,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
3,-,-,-,00565,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
4,-,-,Pending Liquidation Request,00211,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
5,-,-,Pending Request for Reservation Closure,00539,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
6,-,-,-,SB14,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
7,-,-,-,00653,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
8,-,-,Evaluating Request (WITH registered letter),00885,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
9,-,-,Request created,00120,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-


## Apply Transition System Validation to the above Single Query Results

In [12]:
## Apply Transition System Validation to the above Single Query Results
# === Build the transition graph from the full event log `df` (not only the train split),
# === using the same WINDOW_SIZE that is later used to extract activity prefixes.
# Only the second return value (the graph) is needed here; the first is discarded.
_, transition_graph = transition_system(df, case_id_name=case_id_name, activity_column_name=activity_column_name, window_size=WINDOW_SIZE)

# === Load Activity and Resource Validation data structure
# The activity/resource column pair that validate_transition checks for each counterfactual.
resource_columns_to_validate = [activity_column_name, resource_column_name]
# NOTE(review): threshold_percentage=100 presumably keeps every observed activity/resource pair — confirm in activity_n_resources.
valid_resources = activity_n_resources(df, resource_columns_to_validate, threshold_percentage=100)
# len(valid_resources)

### Check Feature Importance score

In [13]:
# imp = explainer.local_feature_importance(query_instances, total_CFs=10, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary)
# imp.local_importance

## Experiment with Multiple Queries

In [14]:
# Run-state store for the results of this experiment run.
state_obj = StoreTestRun(save_load_path=RESULTS_FILE_PATH_N_NAME)
save_load_path = state_obj.get_save_load_path()

# Start from scratch unless a saved state exists; otherwise resume from its case counter.
if not os.path.exists(save_load_path):
    cases_done = 0
else:
    state_obj.load_state()
    cases_done = state_obj.run_state["cases_done"]

state_obj.run_state["cases_done"]

0

In [18]:
# %%capture
# Generate counterfactuals for every remaining test case, validate them against the
# transition system, and persist the outcome of each case in `state_obj`.

# Stop once this many cases have been processed in total. `cases_done` also counts the
# cases restored from a saved run, so this is a cumulative limit, not a per-run one.
MAX_CASES_DONE = 20

# Resume after the cases that were already processed.
for df_test_trace in test_cases[cases_done:]:
    test_trace_start = time()
    # print("Case number:", cases_done)
    # print(f"Case number is state_obj: {state_obj.run_state['cases_done']}")
    query_case_id = get_case_id(df_test_trace, case_id_name)

    # if 0 < len(df_test_trace) <= 2:
    #     print("too small", cases_done, query_case_id)
    #     result_value = query_case_id
    #     state_obj.add_cfe_to_results( ("cases_too_small", result_value) )
    #     cases_stored = state_obj.save_state()
    #     cases_done += 1
    #     continue

    X_test, y_test = prepare_df_for_ml(df_test_trace, case_id_name, outcome_name, columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])

    # # Check if y_test is 0 then don't generate CFE (to inform if we split at a point when the activity has already occurred. but now it doesn't matter)
    # if y_test.iloc[-1] == 0:
    #     result_value = query_case_id
    #     state_obj.add_cfe_to_results(("cases_zero_in_y", result_value))
    #     cases_stored = state_obj.save_state()
    #     cases_done += 1
    #     continue

    # Access the last row of the truncated trace to replicate the behavior of a running trace
    query_instances = X_test.iloc[-1:]

    try:
        cfe = generate_cfe(explainer, query_instances, None, total_cfs=TOTAL_CFS, features_to_vary=cols_to_vary, kpi=KPI,
                           proximity_weight=proximity_weight, sparsity_weight=sparsity_weight, diversity_weight=diversity_weight )

        # Record the raw counterfactuals before any of them is filtered out by validation.
        result_value = (query_case_id, cfe)
        state_obj.add_cfe_to_results(("cfe_before_validation", result_value))  # save after cfe validation

        prefix_of_activities = get_prefix_of_activities(df_single_trace=df_test_trace, window_size=WINDOW_SIZE, activity_column_name=activity_column_name,)
        cfe_df = validate_transition(cfe, prefix_of_activities=prefix_of_activities, transition_graph=transition_graph, valid_resources=valid_resources,
                                     activity_column_name=activity_column_name, resource_columns_to_validate=resource_columns_to_validate)
        # Only cases with at least one counterfactual left after validation are recorded as valid.
        if len(cfe_df) > 0:
            result_value = (query_case_id, cfe_df)
            state_obj.add_cfe_to_results(("cfe_after_validation", result_value))

        cases_stored = state_obj.save_state()

    except UserConfigValidationException as err:
        result_value = query_case_id
        state_obj.add_cfe_to_results(("cfe_not_found", result_value))
        cases_stored = state_obj.save_state()
    except TimeoutError as err:  # When function takes too long
        result_value = query_case_id
        print("TimeoutError caught:", err)
        state_obj.add_cfe_to_results(("cfe_not_found", result_value))
        cases_stored = state_obj.save_state()
    except ValueError:
        # print(f"Includes feature not found in training data: {get_case_id(df_test_trace)}")
        result_value = query_case_id
        state_obj.add_cfe_to_results(("cases_includes_new_data", result_value))
        cases_stored = state_obj.save_state()
    # This error has been seen when running many iterations on the server
    except AttributeError as e:
        print("AttributeError caught:", e)
        state_obj.add_cfe_to_results(("exceptions", query_case_id))
        cases_stored = state_obj.save_state()
    # except Exception as err:
    #     print(f"Broadest Exception handler invoked", err)
    #     state_obj.add_cfe_to_results(("exceptions", query_case_id))
    #     cases_stored = state_obj.save_state()

    cases_done += 1
    # Elapsed time for this case, in minutes.
    print(f"Time it took: { round( ((time() - test_trace_start) / 60 ), 3) }")
    # Sanity check: the local counter must agree with the value returned by save_state().
    assert cases_done == cases_stored
    if cases_done >= MAX_CASES_DONE:
        break

100%|██████████| 1/1 [00:00<00:00,  7.27it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec





UserConfigValidationException: No counterfactuals found for any of the query points! Kindly check your configuration.

In [19]:
# Show the stored run state as a DataFrame (one column per result category, as rendered below).
state_obj.get_run_state_df()

Unnamed: 0,cfe_before_validation,cfe_after_validation,cfe_not_found,cases_includes_new_data,cases_too_small,cases_zero_in_y,exceptions,cases_done
0,10,1,0,0,0,0,0,10


# Results:

### View Valid cases and their Counterfactual Examples (CFEs)
You can load saved configs and run them independently of the above code

In [20]:
# Optionally view results of a run stored on the lab server instead of the run above.
view_from_saved_file = True  # Set this option true and specify the different file name.
pickle_file = "random-a08.pkl"
if view_from_saved_file:
    # Copy the pickle from the remote host only when no local copy exists yet.
    if not os.path.exists( f"./experiment_results/{pickle_file}"):
        result = subprocess.run(['scp', f'labnum08:git_repos/explainable-prescriptive-analytics/experiment_results/{pickle_file}', 'experiment_results/'], capture_output=True, text=True)

        # return code of 0 means the command executed successfully
        if result.returncode != 0:
            print("There is an Error in the command")
            # stderr is captured above; surface it so the reason for the failure is visible.
            if result.stderr:
                print(result.stderr.strip())
        else:
            print("successful")
    else:
        print("File already exists")

    # The results path shares the pickle's base name but is given with a .csv extension.
    RESULTS_FILE_PATH_N_NAME = f"experiment_results/{pickle_file.split('.')[0]}.csv"
else:
    print("Using results from file used in the above code")

successful


In [81]:
# Point the state store at the selected results file and reload it.
if view_from_saved_file:
    RESULTS_FILE_PATH_N_NAME = f"experiment_results/{pickle_file.split('.')[0]}.csv"
state_obj = StoreTestRun(save_load_path=RESULTS_FILE_PATH_N_NAME)
save_load_path = state_obj.get_save_load_path()

if not os.path.exists(save_load_path):
    cases_done = 0
else:
    state_obj.load_state()
    cases_done = state_obj.run_state["cases_done"]

print("Cases tested: ", state_obj.run_state["cases_done"])


def print_results():
    """Yield (case_id, cfe_df) for every entry stored under "cfe_after_validation"."""
    for validated_entry in state_obj.run_state["cfe_after_validation"]:
        validated_case_id, validated_cfe_df = validated_entry
        yield validated_case_id, validated_cfe_df


# Iterator consumed one case at a time by the next cell.
generator = print_results()

Cases tested:  1096


In [82]:
# Advance to the next case that still has counterfactuals after validation.
case_id, cfe_df = next(generator)

print("Original Test Case:")
# === Find test_case with case_id
for df_test_trace in test_cases:
    if get_case_id(df_test_trace, case_id_name) != case_id:
        continue
    display(df_test_trace)
    df_test_trace.to_csv("temp.csv", index=False)

print("Counterfactuals for the last row:")
cfe_df.to_csv("cfe_temp.csv", index=False)
cfe_df

Original Test Case:


Unnamed: 0,REQUEST_ID,START_DATE,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,END_DATE,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,time_remaining,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,20183001089,1520245359,Bank Recess,1 - Client lost,Request created,1520245395,SB24,APPLICANT,0,0,37359,Monday,36,16585730,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,20183001089,1520245395,Bank Recess,1 - Client lost,Service closure Request with network responsib...,1524239518,missing,APPLICANT,36,36,37395,Monday,3994123,16585694,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,20183001089,1524239518,Bank Recess,1 - Client lost,Authorization Requested,1524239835,00443,DIRECTOR,3994159,3994123,57118,Friday,317,12591571,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,20183001089,1524239835,Bank Recess,1 - Client lost,Network Adjustment Requested,1532446170,00443,APPLICANT,3994476,317,57435,Friday,8206335,12591254,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1
4,20183001089,1532446170,Bank Recess,1 - Client lost,Authorization Requested,1532446231,00443,DIRECTOR,12200811,8206335,55770,Tuesday,61,4384919,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1


Counterfactuals for the last row:


Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,Bank Recess,1 - Client lost,Authorization Requested,581,DIRECTOR,12200811,8206335,55770,Tuesday,61,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0


In [83]:
### For a given case_id, view the truncated test trace and the full trace of that case

In [84]:
# First ten case ids recorded under "cfe_not_found" (no counterfactual was produced for them).
state_obj.run_state["cfe_not_found"][:10]

[20178008599,
 20181002993,
 20182009564,
 20182011180,
 20184005244,
 20184005305,
 20184006674,
 20184007422,
 20185001269,
 20185005324]

In [102]:
# Show the truncated test trace of one case, followed by the rest of its full trace.
search_case_id = 20184005305

# Reset first: otherwise an unknown case id would either raise a NameError below or,
# worse, silently reuse the trace found by an earlier run of this cell.
df_test_trace_temp = None
for df_test_trace in test_cases:
    query_case_id = get_case_id(df_test_trace, case_id_name)
    if query_case_id == search_case_id:
        df_test_trace_temp = df_test_trace
        display(df_test_trace_temp)

# Full (untruncated) trace of the same case from the test split.
df_trace = df_test[df_test[case_id_name] == search_case_id]

if df_test_trace_temp is None:
    print(f"Case {search_case_id} was not found in test_cases; showing its full trace from df_test instead.")
    display(df_trace)
else:
    # only print part of the trace that is not present in the df_test_trace
    # (-1 gives an overlap of 1 row; max() keeps the start non-negative for an empty prefix)
    display(df_trace.iloc[max(len(df_test_trace_temp) - 1, 0):])
# display(df_trace)

Unnamed: 0,REQUEST_ID,START_DATE,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,END_DATE,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,time_remaining,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,20184005305,1523960091,Client Recess,1 - Client lost,Request created,1523960173,SB11,APPLICANT,0,0,36891,Tuesday,82,24194798,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,20184005305,1523960173,Client Recess,1 - Client lost,Evaluating Request (NO registered letter),1525078738,00294,DIRECTOR,82,82,36973,Tuesday,1118565,24194716,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,20184005305,1547743086,Client Recess,1 - Client lost,Service closure Request with network responsib...,1547743112,00294,APPLICANT,23782995,23782913,59886,Thursday,26,411803,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
3,20184005305,1547743112,Client Recess,1 - Client lost,Service closure Request with BO responsibility,1547827116,BOC,BACK-OFFICE,23783021,26,59912,Thursday,84004,411777,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
4,20184005305,1547827116,Client Recess,1 - Client lost,Pending Request for Reservation Closure,1547827208,BOC,BACK-OFFICE,23867025,84004,57516,Friday,92,327773,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1


Unnamed: 0,REQUEST_ID,START_DATE,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,END_DATE,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,time_remaining,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
50,20184005305,1547827116,Client Recess,1 - Client lost,Pending Request for Reservation Closure,1547827208,BOC,BACK-OFFICE,23867025,84004,57516,Friday,92,327773,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1
51,20184005305,1547827208,Client Recess,1 - Client lost,Pending Liquidation Request,1548126085,BOC,BACK-OFFICE,23867117,92,57608,Friday,298877,327681,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1
52,20184005305,1548126085,Client Recess,1 - Client lost,Back-Office Adjustment Requested,1548154889,BOC,BACK-OFFICE,24165994,298877,10885,Tuesday,28804,28804,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0


### Run CFE on a case from df_test
This experiment checks whether the CFE generator can produce counterfactuals for a trace whose test-set prefix yielded none, by querying the same trace at a different prefix (split point).
Use `sidx` (start index) to choose the split row manually anywhere in the test set.

In [104]:
# Probe the explainer on manually chosen rows of df_test.
# `sidx` is the positional start index of the one-row query; widen the range to scan several split points.

# The feature matrix does not depend on the loop index, so it is built once up front.
X_test, y_test = prepare_df_for_ml(df_test, case_id_name, outcome_name,
                                   columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])

# Values DiCE may propose for the ACTIVITY feature ('Back-Office Adjustment Requested' is left out).
permitted_activities = ['Service closure Request with network responsibility',
                        'Service closure Request with BO responsibility',
                        'Pending Request for Reservation Closure', 'Pending Liquidation Request',
                        'Request created', 'Authorization Requested', 'Evaluating Request (NO registered letter)',
                        'Network Adjustment Requested', 'Evaluating Request (WITH registered letter)',
                        'Pending Request for Network Information']

# Reset the result so that a failed run never visualises a stale `cfe` left over from an
# earlier execution (or raises NameError on a fresh kernel).
cfe = None
for i in range(51, 52):
    sidx = i  # 58
    eidx = sidx + 1
    query_instances = X_test[sidx: eidx]
    try:
        cfe = explainer.generate_counterfactuals(query_instances, total_CFs=TOTAL_CFS, desired_class="opposite",  # desired_class=0, #
                                                 features_to_vary=cols_to_vary,
                                                 permitted_range={"ACTIVITY": permitted_activities})
    except UserConfigValidationException:
        print("Error on idx: ", i)

# Only display when at least one query produced a counterfactual object.
if cfe is not None:
    cfe.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  1.22it/s]

Only 1 (required 15)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Query instance (original outcome : 0)





Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,Client Recess,1 - Client lost,Pending Liquidation Request,BOC,BACK-OFFICE,23867117,92,57608,Friday,298877,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,-,-,-,SDCTCC,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1


In [99]:
# Show rows 45-54 (by position) of the test set, i.e. the events surrounding the trace probed above.
df_test.iloc[45:55]

Unnamed: 0,REQUEST_ID,START_DATE,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,END_DATE,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,time_remaining,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
45,20184005244,1536894288,Client Recess,1 - Client lost,Back-Office Adjustment Requested,1537173189,BOC,BACK-OFFICE,12937058,150065,11088,Friday,278901,278901,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0
46,20184005305,1523960091,Client Recess,1 - Client lost,Request created,1523960173,SB11,APPLICANT,0,0,36891,Tuesday,82,24194798,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
47,20184005305,1523960173,Client Recess,1 - Client lost,Evaluating Request (NO registered letter),1525078738,00294,DIRECTOR,82,82,36973,Tuesday,1118565,24194716,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
48,20184005305,1547743086,Client Recess,1 - Client lost,Service closure Request with network responsib...,1547743112,00294,APPLICANT,23782995,23782913,59886,Thursday,26,411803,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
49,20184005305,1547743112,Client Recess,1 - Client lost,Service closure Request with BO responsibility,1547827116,BOC,BACK-OFFICE,23783021,26,59912,Thursday,84004,411777,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
50,20184005305,1547827116,Client Recess,1 - Client lost,Pending Request for Reservation Closure,1547827208,BOC,BACK-OFFICE,23867025,84004,57516,Friday,92,327773,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1
51,20184005305,1547827208,Client Recess,1 - Client lost,Pending Liquidation Request,1548126085,BOC,BACK-OFFICE,23867117,92,57608,Friday,298877,327681,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1
52,20184005305,1548126085,Client Recess,1 - Client lost,Back-Office Adjustment Requested,1548154889,BOC,BACK-OFFICE,24165994,298877,10885,Tuesday,28804,28804,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0
53,20184006674,1524155208,Client Recess,1 - Client lost,Request created,1524155220,00409,APPLICANT,0,0,59208,Thursday,12,15438013,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
54,20184006674,1524155220,Client Recess,1 - Client lost,Evaluating Request (NO registered letter),1524156387,SB11,DIRECTOR,12,12,59220,Thursday,1167,15438001,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


## Deprecated Code

In [21]:
# Number of seconds in one day: 24 hours * 60 minutes * 60 seconds.
24 * 60 * 60

86400

### Practice subprocess module

In [20]:
# Copy a pickled results file from the remote host into the local results directory via `scp`.
pickle_file = "random-a01.pkl"
result = subprocess.run(
    ['scp', f'labnum08:git_repos/explainable-prescriptive-analytics/experiment_results/{pickle_file}', 'experiment_results/'],
    capture_output=True, text=True)

# return code of 0 means the command executed successfully
if result.returncode != 0:
    print("There is an Error in the command")
    # stderr is captured by `capture_output=True`; show it so the failure can be diagnosed.
    print(result.stderr)
else:
    print("successful")

''

In [None]:
def get_query_instance(sidx=14, eidx=16):
    """Return two consecutive one-row slices of the notebook-level `X_train`.

    Args:
        sidx: Start position; the slice `X_train[sidx: sidx+1]` is the current step.
        eidx: End position; must equal `sidx + 2`. The slice `X_train[eidx-1: eidx]`
            is the expected next step.

    Returns:
        Tuple `(current_step, expected_next_step)`.

    Raises:
        AssertionError: If `eidx - sidx` is not exactly 2.
    """
    assert eidx - sidx == 2, "One row represents the current action and the next one represents the suggested action"
    next_start = eidx - 1
    return X_train[sidx: sidx+1], X_train[next_start: eidx]
# Example usage:
# current_step, query_instances = get_query_instance(14, 16)

In [None]:
def validate_transition(cfe):
    """ Deprecated function.
    Filters the counterfactuals of a DiCE result down to rows that are valid next steps.
    Works with transition graph which has single activity as key.

    Relies on the notebook-level names `current_step`, `activity_column_name`,
    `transition_graph` and `valid_resources`.
    NOTE: running this cell rebinds the name `validate_transition` that is imported from
    `src.function_store` at the top of the notebook.

    Args:
        cfe: Dice counterfactual object; only `cfe.cf_examples_list[0].final_cfs_df` is read.

    Returns:
        A filtered copy of the counterfactual DataFrame (index reset) keeping only rows whose
        activity is contained in `transition_graph[current_activity]` and whose
        ("ACTIVITY", "Involved_ST_Function_Div") pair is contained in `valid_resources`.
    """
    candidates = cfe.cf_examples_list[0].final_cfs_df.copy()
    current_activity = current_step[activity_column_name].item()

    # === Step 1: discard suggestions that are not reachable from the current activity
    unreachable_labels = [
        label
        for label, proposed_activity in candidates[activity_column_name].items()
        if proposed_activity not in transition_graph[current_activity]
    ]
    candidates = candidates.drop(unreachable_labels, axis='index').reset_index(drop=True)

    # === Step 2: discard suggestions whose (activity, resource) pair is not known to be valid
    resource_view = candidates[["ACTIVITY", "Involved_ST_Function_Div"]]
    unknown_pair_labels = [
        label
        for label, pair in resource_view.iterrows()
        if tuple(pair) not in valid_resources
    ]
    return candidates.drop(unknown_pair_labels, axis='index').reset_index(drop=True)