In [1]:
import dice_ml
from dice_ml import Dice
from dice_ml.utils.exception import UserConfigValidationException

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from src.transition_system import transition_system, indexs_for_window, list_to_str
from src.function_store import StoreTestRun, extract_algo_name, generate_cfe, get_case_id, prepare_df_for_ml, \
    activity_n_resources, get_test_cases, get_prefix_of_activities, validate_transition

import pandas as pd
import os
import pickle
import random
import subprocess
from IPython.display import display
from math import ceil
from time import time
import warnings
from wrapt_timeout_decorator import timeout

# from multiprocessing import Process
# from timeout_decorator import timeout, TimeoutError
# import signal
from typing import Tuple, Any
from collections import Counter
import utils
from time import sleep
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 500)
if "src" in os.getcwd():
    os.chdir("../")

# Suppress all warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

# Dataset: VINST (Volvo IT Belgium)
link: https://www.win.tue.nl/bpi/doku.php?id=2013:challenge

## Setup Variables

In [2]:
KPI = "total_time"  # activity_occurrence, total_time, ...
SECONDS_TO_HOURS = 60 * 60
SECONDS_TO_DAYS = 60 * 60 * 24
WINDOW_SIZE = 3
REDUCED_KPI_TIME = 90
TOTAL_CFS = 50                        # Number of CFs DiCE algorithm should produce
TRAIN_DATA_SIZE = 39_375               # 39_375
DICE_METHOD = "random"
RESULTS_FILE_PATH_N_NAME = "experiment_results/random-t01-local.csv"
proximity_weight = 0.2
sparsity_weight = 0.2
diversity_weight = 5.0

case_id_name = 'SR_Number'  # The case identifier column name.
start_date_name = 'Change_Date+Time'  # Maybe change to start_et (start even time)
activity_column_name = "ACTIVITY"
resource_column_name = "Involved_ST"

## Load Data

In [3]:
data_dir = "./preprocessed_datasets/"

train_dataset_file = "vinst_train.csv"
test_dataset_file = "vinst_test.csv"
test_pickle_dataset_file = "vinst_test.pkl"
# test_dataset_file = "vinst_test-500.csv"
# test_pickle_dataset_file = "vinst_test-500.pkl"
df = pd.read_csv("./data/VINST cases incidents.csv")  # Use full dataset for transition systens
df_train = pd.read_csv(os.path.join(data_dir, train_dataset_file))
df_test = pd.read_csv(os.path.join(data_dir, test_dataset_file))

### Basic Preprocessing of the dataset

In [8]:
df = df.fillna("missing")

# df_train = df_train[:TRAIN_DATA_SIZE]  # 39_375
# df_test = df_test[: 18_594]
total = len(df_train) + len(df_test)
print(f"Percentage of trainset rows: {len(df_train)/total }")
print(f"Rows in df_train: {len(df_train):,} Cases in df_train: {len(df_train[case_id_name].unique()):,}")
print(f"Rows in df_test: {len(df_test):,} Cases in df_test: {len(df_test[case_id_name].unique()):,}")
percentage_of_train_data = 100 * len(df_train[case_id_name].unique()) / len(df[case_id_name].unique())
print(f"Percentage of train data: {percentage_of_train_data:.2f}%")
# df_train.info()

Percentage of trainset rows: 0.6792423536717901
Rows in df_train: 39,375 Cases in df_train: 6,166
Rows in df_test: 18,594 Cases in df_test: 1,377
Percentage of train data: 81.63%


In [10]:
len(df_train[case_id_name].unique()) + len(df_test[case_id_name].unique())

7543

### Prepare the Test Dataset

In [5]:
# === Unpickle the Standard test-set. To standardize the test across different parameters.
test_cases = get_test_cases(None, None, load_dataset=True, path_and_filename=os.path.join(data_dir, test_pickle_dataset_file))

# === Command if using new df_test and want to generate test_cases
# test_cases = get_test_cases(df_test, case_id_name, load_dataset=False, path_and_filename=os.path.join(data_dir, test_pickle_dataset_file))
print(f"Number of cases in the test set: {len(test_cases):,}")

# # === Pickle dataset for comparison of different methods
# with open(os.path.join(data_dir, test_pickle_dataset_file), 'wb') as file:
#     pickle.dump(test_cases, file)

Number of cases in the test set: 1,377


In [6]:
# # === Generate a CSV of cut Test Traces
# df_cut_test_traces = pd.DataFrame(columns=df_test.columns)
# for cut_trace in test_cases:
#     df_cut_test_traces = pd.concat([df_cut_test_traces, cut_trace], axis="rows")
# df_cut_test_traces.to_csv("vinst_test_cut.csv", index=False)

In [12]:
# Assuming you have X and y
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
outcome_name = "lead_time"
X_train, y_train = prepare_df_for_ml(df_train, case_id_name, outcome_name, columns_to_remove=["Change_Date+Time", "time_remaining"])

# Define the parameter grid
param_grid = {
    'regressor__n_estimators': [10, 50, 100, 200, 500, 1000],
    'regressor__max_features': ['auto', 'sqrt'],
    'regressor__max_depth' : [4,5,6,7,8, 16, 32],
    'regressor__criterion' :['squared_error', 'absolute_error']
}

continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "# ACTIVITY=In Progress", "# ACTIVITY=Awaiting Assignment",
                       "# ACTIVITY=Resolved", "# ACTIVITY=Assigned", "# ACTIVITY=Closed", "# ACTIVITY=Wait - User", "# ACTIVITY=Wait - Implementation", "# ACTIVITY=Wait",
                       "# ACTIVITY=Wait - Vendor", "# ACTIVITY=In Call", "# ACTIVITY=Wait - Customer", "# ACTIVITY=Unmatched", "# ACTIVITY=Cancelled"]
categorical_features = ["Status", "ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST", "SR_Latest_Impact", "Product", "Country", "Owner_Country",
                        "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Create a base model
clf = Pipeline(steps=[('preprocessor', transformations),
                           ('regressor', RandomForestRegressor(n_jobs=7))])

# rf = RandomForestRegressor(random_state=42)

# Initiate the grid search model
grid_search = GridSearchCV(estimator=clf,
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_parameters = grid_search.best_params_
print(f"Best parameters: {best_parameters}")

# Get the best estimator
best_model = grid_search.best_estimator_

# Now you can fit & predict using the best model
best_model.fit(X_train, y_train)
predictions = best_model.predict(X_train)

  warn(
  warn(


KeyboardInterrupt: 

In [9]:
# ### Features that can vary
# option 1:
# cols_to_vary = [col for col in df_train.columns if col[0] == '#']
# cols_to_vary.extend(["ACTIVITY"])

# option 2:
cols_to_vary = [activity_column_name, resource_column_name]

outcome_name = "lead_time"

X_train, y_train = prepare_df_for_ml(df_train, case_id_name, outcome_name, columns_to_remove=["Change_Date+Time", "time_remaining"])

continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "# ACTIVITY=In Progress", "# ACTIVITY=Awaiting Assignment",
                       "# ACTIVITY=Resolved", "# ACTIVITY=Assigned", "# ACTIVITY=Closed", "# ACTIVITY=Wait - User", "# ACTIVITY=Wait - Implementation", "# ACTIVITY=Wait",
                       "# ACTIVITY=Wait - Vendor", "# ACTIVITY=In Call", "# ACTIVITY=Wait - Customer", "# ACTIVITY=Unmatched", "# ACTIVITY=Cancelled"]
categorical_features = ["Status", "ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST", "SR_Latest_Impact", "Product", "Country", "Owner_Country",
                        "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', transformations),
                           ('regressor', RandomForestRegressor(n_jobs=7))])
model = clf.fit(X_train, y_train)

# ## Create DiCE model
data_model = dice_ml.Data(dataframe=pd.concat([X_train, y_train], axis="columns"),
                      continuous_features=continuous_features,
                      outcome_name=outcome_name)

# We provide the type of model as a parameter (model_type)
ml_backend = dice_ml.Model(model=model, backend="sklearn", model_type='regressor')
method = DICE_METHOD  # genetic, kdtree, random
explainer = Dice(data_model, ml_backend, method=method)  # Method random does not support features_weights argument in generate_counterfactuals()
# exp_genetic_iris = Dice(d_iris, m_iris, method="kdtree")

## Experiment with Single Queries

In [14]:
sidx = 87
eidx = 97

df_train[sidx: eidx]
# df_train[sidx: eidx]

Unnamed: 0,SR_Number,Change_Date+Time,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,time_remaining,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
87,1-506071646,1300185210,Accepted,Wait - User,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,694282,362,38010,Tuesday,36593280,20,13,0,1,0,3,0,0,0,0,0,0,0,37287562
88,1-506071646,1300189096,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,698168,3886,41896,Tuesday,36589394,20,13,0,1,0,4,0,0,0,0,0,0,0,37287562
89,1-506071646,1300196341,Accepted,Wait - User,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,705413,7245,49141,Tuesday,36582149,21,13,0,1,0,4,0,0,0,0,0,0,0,37287562
90,1-506071646,1300871608,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,1380680,675267,33208,Wednesday,35906882,21,13,0,1,0,5,0,0,0,0,0,0,0,37287562
91,1-506071646,1300871609,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,1380681,1,33209,Wednesday,35906881,22,13,0,1,0,5,0,0,0,0,0,0,0,37287562
92,1-506071646,1300871841,Accepted,Wait - User,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,1380913,232,33441,Wednesday,35906649,23,13,0,1,0,5,0,0,0,0,0,0,0,37287562
93,1-506071646,1300957938,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,1467010,86097,33138,Thursday,35820552,23,13,0,1,0,6,0,0,0,0,0,0,0,37287562
94,1-506071646,1300958005,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,POLAND,1467077,67,33205,Thursday,35820485,24,13,0,1,0,6,0,0,0,0,0,0,0,37287562
95,1-506071646,1300969210,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,1478282,11205,44410,Thursday,35809280,24,14,0,1,0,6,0,0,0,0,0,0,0,37287562
96,1-506071646,1300969246,Queued,Awaiting Assignment,A2_1,Org line A2,D8,Medium,PROD542,nl,Netherlands,1478318,36,44446,Thursday,35809244,25,14,0,1,0,6,0,0,0,0,0,0,0,37287562


In [13]:
sidx = 14
eidx = sidx + 1
total_time_upper_bound = int( y_train[sidx] * (REDUCED_KPI_TIME / 100) )  # A percentage of the original total time of the trace
query_instances = X_train[sidx: eidx]

cfe = explainer.generate_counterfactuals(query_instances, total_CFs=20, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary,
                                         proximity_weight=proximity_weight, sparsity_weight=sparsity_weight, diversity_weight=diversity_weight)

cfe.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:07<00:00,  7.40s/it]

Only 0 (required 20)  Diverse Counterfactuals found for the given configuation, perhaps  change the query instance or the features to vary...; total time taken: 00 min 00 sec





UserConfigValidationException: No counterfactuals found for any of the query points! Kindly check your configuration.

In [10]:
## Apply Transition System Validation to the above Single Query Results
# === Load the Transition Graph
_, transition_graph = transition_system(df, case_id_name=case_id_name, activity_column_name=activity_column_name, window_size=WINDOW_SIZE)

# === Load Activity and Resource Validation data structure
resource_columns_to_validate = [activity_column_name, resource_column_name, 'Country', 'Owner_Country']
valid_resources = activity_n_resources(df, resource_columns_to_validate, threshold_percentage=100)
# len(valid_resources)

In [11]:
# # Case 1: For experimenting with single query
# prefix_of_activities = get_prefix_of_activities(expected_activity_index=sidx, event_log=df_train, window_size=window_size, activity_column_name=activity_column_name)

# # Case 2: When running in test mode
# prefix_of_activities = get_prefix_of_activities(df_single_trace=df_test_trace, window_size=window_size, activity_column_name=activity_column_name)

# # Test Validate Transitions Function
# validate_transition(cfe, prefix_of_activities, transition_graph, valid_resources)
# cfe.visualize_as_dataframe(show_only_changes=True)
# print(f"Valid Counterfactual Set")

## Experiment with Multiple Queries

In [13]:
state_obj = StoreTestRun(save_load_path=RESULTS_FILE_PATH_N_NAME)
save_load_path = state_obj.get_save_load_path()

if os.path.exists(save_load_path):
    state_obj.load_state()
    cases_done = state_obj.run_state["cases_done"]
else:
    cases_done = 0

state_obj.run_state["cases_done"]

5

In [29]:
# %%capture
start_from_case = state_obj.run_state["cases_done"]
for df_test_trace in test_cases[start_from_case:]:
    test_trace_start = time()
    # print("Case number:", cases_done)
    # print(f"Case number is state_obj: {state_obj.run_state['cases_done']}")

    query_case_id = get_case_id(df_test_trace, case_id_name)

    # == temp
    if query_case_id != "1-733561941":
        continue

    print(f"length of : {len(df_test_trace)}")
    # ===
    print("for caseID: ", query_case_id)

    X_test, y_test = prepare_df_for_ml(df_test_trace, case_id_name, outcome_name, columns_to_remove=["Change_Date+Time", "time_remaining"])
    # Access the last row of the truncated trace to replicate the behavior of a running trace
    query_instances = X_test.iloc[-1:]
    total_time_upper_bound = int( y_test.iloc[-1] * (REDUCED_KPI_TIME / 100) )  # A percentage of the original total time of the trace

    try:
        cfe = generate_cfe(explainer, query_instances, total_time_upper_bound, features_to_vary=cols_to_vary,
                           total_cfs=TOTAL_CFS, kpi=KPI, proximity_weight=proximity_weight, sparsity_weight=sparsity_weight,
                           diversity_weight=diversity_weight)

        result_value = (query_case_id, cfe)
        state_obj.add_cfe_to_results(("cfe_before_validation", result_value))  # save after cfe validation

        prefix_of_activities = get_prefix_of_activities(df_single_trace=df_test_trace, window_size=WINDOW_SIZE, activity_column_name=activity_column_name)
        cfe_df = validate_transition(cfe, prefix_of_activities=prefix_of_activities, transition_graph=transition_graph, valid_resources=valid_resources,
                                     activity_column_name=activity_column_name, resource_columns_to_validate=resource_columns_to_validate)

        if len(cfe_df) > 0:
            result_value = (query_case_id, cfe_df)
            state_obj.add_cfe_to_results(("cfe_after_validation", result_value))

        cases_stored = state_obj.save_state()

    except UserConfigValidationException:
        state_obj.add_cfe_to_results(("cfe_not_found", query_case_id))
        cases_stored = state_obj.save_state()
    except TimeoutError as err:  # When function takes too long
        print("TimeoutError caught:", err)
        state_obj.add_cfe_to_results(("cfe_not_found", query_case_id))
        cases_stored = state_obj.save_state()
    except ValueError:
        # print(f"Includes feature not found in training data: {get_case_id(df_test_trace)}")
        state_obj.add_cfe_to_results(("cases_includes_new_data", query_case_id))
        cases_stored = state_obj.save_state()
    # This error is seen occurring on when running lots of loops on the server
    except AttributeError as e:
        print("AttributeError caught:", e)
        state_obj.add_cfe_to_results(("exceptions", query_case_id))
        cases_stored = state_obj.save_state()
    # except Exception as err:
    #     print(f"Broadest Exception handler invoked", err)
    #     state_obj.add_cfe_to_results(("exceptions", query_case_id))
    #     cases_stored = state_obj.save_state()

    cases_done += 1
    # Just for a sanity check
    print(f"Time it took: { round( ((time() - test_trace_start) / 60 ), 3) }")
    assert cases_done == cases_stored
    if cases_done >= 5:
        break

length of : 3
for caseID:  1-733561941


100%|██████████| 1/1 [00:00<00:00,  6.59it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Time it took: 0.003





In [67]:
state_obj.get_run_state_df()

Unnamed: 0,cfe_before_validation,cfe_after_validation,cfe_not_found,cases_includes_new_data,cases_too_small,cases_zero_in_y,exceptions,cases_done
0,3,2,0,2,0,0,0,5


# Results:

### View Valid cases and their Counterfactual Examples (CFEs)
You can load saved configs and run them independently of the above code

In [24]:
view_from_saved_file = True  # Set this option true and specify the different file name.
pickle_file = "genetic-t03.pkl"
if view_from_saved_file:
    if not os.path.exists( f"./experiment_results/{pickle_file}"):
        result = subprocess.run(['scp', f'labnum08:git_repos/explainable-prescriptive-analytics/experiment_results/{pickle_file}', 'experiment_results/'], capture_output=True, text=True)

        # return code of 0 means the command executed successfully
        if result.returncode != 0:
            print("There is an Error in the command")
        else:
            print("successful")
    else:
        print(f"File already exists")

    RESULTS_FILE_PATH_N_NAME = f"experiment_results/{pickle_file.split('.')[0]}.csv"
else:
    print(f"Using results from file used in the above code")

File already exists


In [25]:
if view_from_saved_file:
    RESULTS_FILE_PATH_N_NAME = f"experiment_results/{pickle_file.split('.')[0]}.csv"
state_obj = StoreTestRun(save_load_path=RESULTS_FILE_PATH_N_NAME)
save_load_path = state_obj.get_save_load_path()

if os.path.exists(save_load_path):
    state_obj.load_state()
    cases_done = state_obj.run_state["cases_done"]
else:
    cases_done = 0

print("Cases tested: ", state_obj.run_state["cases_done"])
def print_results():
    for case_id, cfe_df in state_obj.run_state["cfe_after_validation"]:
        yield case_id, cfe_df

generator = print_results()

Cases tested:  2493


In [26]:
case_id, cfe_df = next(generator)

print(f"Original Test Case:")
# === Find test_case with case_id
for df_test_trace in test_cases:
    if get_case_id(df_test_trace) == case_id:
        display(df_test_trace)
        df_test_trace.to_csv("temp.csv", index=False)

print(f"Counterfactuals for the last row:")
cfe_df.to_csv("cfe_temp.csv", index=False)
cfe_df

Original Test Case:


Unnamed: 0,SR_Number,Change_Date+Time,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,time_remaining,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,1-557297151,1313748596,Accepted,In Progress,A2_1,Org line C,D5,Medium,PROD542,nl,Belgium,0.0,0.0,36596.0,Friday,22684333.0,0,0,0,0,0,0,0,0,0,0,0,0,0,22684333.0
1,1-557297151,1313748671,Accepted,In Progress,A2_1,Org line C,D5,Medium,PROD542,nl,Belgium,75.0,75.0,36671.0,Friday,22684258.0,1,0,0,0,0,0,0,0,0,0,0,0,0,22684333.0
2,1-557297151,1313748760,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Belgium,164.0,89.0,36760.0,Friday,22684169.0,2,0,0,0,0,0,0,0,0,0,0,0,0,22684333.0
3,1-557297151,1313752993,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,4397.0,4233.0,40993.0,Friday,22679936.0,2,1,0,0,0,0,0,0,0,0,0,0,0,22684333.0
4,1-557297151,1313753045,Queued,Awaiting Assignment,A2_1,Org line C,D5,Medium,PROD542,nl,Netherlands,4449.0,52.0,41045.0,Friday,22679884.0,3,1,0,0,0,0,0,0,0,0,0,0,0,22684333.0
5,1-557297151,1313753046,Queued,Awaiting Assignment,A2_1,Org line C,D5,Medium,PROD542,nl,Netherlands,4450.0,1.0,41046.0,Friday,22679883.0,3,2,0,0,0,0,0,0,0,0,0,0,0,22684333.0
6,1-557297151,1313755125,Accepted,In Progress,A2_1,Org line C,D5,Medium,PROD542,nl,Belgium,6529.0,2079.0,43125.0,Friday,22677804.0,3,3,0,0,0,0,0,0,0,0,0,0,0,22684333.0
7,1-557297151,1313755132,Accepted,Assigned,A2_1,Org line C,D5,Medium,PROD542,nl,Belgium,6536.0,7.0,43132.0,Friday,22677797.0,4,3,0,0,0,0,0,0,0,0,0,0,0,22684333.0
8,1-557297151,1313762689,Accepted,In Progress,A2_1,Org line C,D5,Medium,PROD542,nl,Belgium,14093.0,7557.0,50689.0,Friday,22670240.0,4,3,0,1,0,0,0,0,0,0,0,0,0,22684333.0
9,1-557297151,1313762746,Accepted,Wait - User,A2_1,Org line C,D5,Medium,PROD542,nl,Belgium,14150.0,57.0,50746.0,Friday,22670183.0,5,3,0,1,0,0,0,0,0,0,0,0,0,22684333.0


Counterfactuals for the last row:


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Queued,Awaiting Assignment,A2_1,Org line C,D5,Medium,PROD542,nl,Netherlands,18474.0,69.0,55070.0,Friday,6.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12615722.62
1,Queued,Awaiting Assignment,A2_1,Org line C,D6,Medium,PROD542,nl,Netherlands,18474.0,69.0,55070.0,Friday,6.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19692520.48
2,Queued,Awaiting Assignment,A2_1,Org line C,D2,Medium,PROD542,nl,Netherlands,18474.0,69.0,55070.0,Friday,6.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19936662.34


### From case_id view test trace/ case and the full trace/ case

In [27]:
# state_obj.run_state["cfe_not_found"][:10]

In [30]:
search_case_id = "1-529096847"
for df_test_trace in test_cases:
    query_case_id = get_case_id(df_test_trace, case_id_name)
    if query_case_id == search_case_id:
        df_test_trace_temp = df_test_trace
        display(df_test_trace_temp)

# only print part of the trace that is not present in the df_test_trace
df_trace = df_test[df_test[case_id_name] == search_case_id]

display(df_trace.iloc[len(df_test_trace_temp) - 1:])  # -1 gives overlap of 1 row in the df_trace print

Unnamed: 0,SR_Number,Change_Date+Time,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,time_remaining,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,1-529096847,1306425713,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,0.0,0.0,57713.0,Thursday,30352776.0,0,0,0,0,0,0,0,0,0,0,0,0,0,30352776.0
1,1-529096847,1306425743,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,30.0,30.0,57743.0,Thursday,30352746.0,1,0,0,0,0,0,0,0,0,0,0,0,0,30352776.0
2,1-529096847,1306426238,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,POLAND,525.0,495.0,58238.0,Thursday,30352251.0,2,0,0,0,0,0,0,0,0,0,0,0,0,30352776.0
3,1-529096847,1306752377,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,326664.0,326139.0,38777.0,Monday,30026112.0,2,1,0,0,0,0,0,0,0,0,0,0,0,30352776.0
4,1-529096847,1306752512,Queued,Awaiting Assignment,A2_1,Org line A2,D8,Medium,PROD542,nl,Netherlands,326799.0,135.0,38912.0,Monday,30025977.0,3,1,0,0,0,0,0,0,0,0,0,0,0,30352776.0
5,1-529096847,1306752762,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,327049.0,250.0,39162.0,Monday,30025727.0,3,2,0,0,0,0,0,0,0,0,0,0,0,30352776.0
6,1-529096847,1306753070,Accepted,Wait - User,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,327357.0,308.0,39470.0,Monday,30025419.0,4,2,0,0,0,0,0,0,0,0,0,0,0,30352776.0
7,1-529096847,1306843287,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,417574.0,90217.0,43287.0,Tuesday,29935202.0,4,2,0,0,0,1,0,0,0,0,0,0,0,30352776.0
8,1-529096847,1306843410,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,POLAND,417697.0,123.0,43410.0,Tuesday,29935079.0,5,2,0,0,0,1,0,0,0,0,0,0,0,30352776.0
9,1-529096847,1306914730,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,489017.0,71320.0,28330.0,Wednesday,29863759.0,5,3,0,0,0,1,0,0,0,0,0,0,0,30352776.0


Unnamed: 0,SR_Number,Change_Date+Time,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,time_remaining,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
80,1-529096847,1314783700,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,POLAND,8357987.0,120.0,34900.0,Wednesday,21994789.0,16,6,0,3,0,7,0,1,0,0,0,0,0,30352776.0
81,1-529096847,1314794385,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,8368672.0,10685.0,45585.0,Wednesday,21984104.0,16,7,0,3,0,7,0,1,0,0,0,0,0,30352776.0
82,1-529096847,1314794402,Queued,Awaiting Assignment,A2_1,Org line A2,D8,Medium,PROD542,nl,Netherlands,8368689.0,17.0,45602.0,Wednesday,21984087.0,17,7,0,3,0,7,0,1,0,0,0,0,0,30352776.0
83,1-529096847,1314794555,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,8368842.0,153.0,45755.0,Wednesday,21983934.0,17,8,0,3,0,7,0,1,0,0,0,0,0,30352776.0
84,1-529096847,1314795087,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,POLAND,8369374.0,532.0,46287.0,Wednesday,21983402.0,18,8,0,3,0,7,0,1,0,0,0,0,0,30352776.0
85,1-529096847,1314801039,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,8375326.0,5952.0,52239.0,Wednesday,21977450.0,18,9,0,3,0,7,0,1,0,0,0,0,0,30352776.0
86,1-529096847,1314801052,Queued,Awaiting Assignment,A2_1,Org line A2,D8,Medium,PROD542,nl,Netherlands,8375339.0,13.0,52252.0,Wednesday,21977437.0,19,9,0,3,0,7,0,1,0,0,0,0,0,30352776.0
87,1-529096847,1314801334,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,8375621.0,282.0,52534.0,Wednesday,21977155.0,19,10,0,3,0,7,0,1,0,0,0,0,0,30352776.0
88,1-529096847,1314801516,Accepted,Wait - User,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,8375803.0,182.0,52716.0,Wednesday,21976973.0,20,10,0,3,0,7,0,1,0,0,0,0,0,30352776.0
89,1-529096847,1314802146,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,8376433.0,630.0,53346.0,Wednesday,21976343.0,20,10,0,3,0,8,0,1,0,0,0,0,0,30352776.0


### Run CFE on a case from df_test
This is to experiment and see if running the CFE generator can produce an output if we run on the same trace (which doesn't produce output on the prefix in the test_set)
but on a different prefix (or split point). Use `sidx` (start index) to make a split manually anywhere in the testset.

In [45]:
# df_test[70: 80]

In [33]:
# pd.concat([ X_test[70:80], y_test[70:80] ], axis=1)

In [34]:
sidx = 77
eidx = sidx + 1
X_test, y_test = prepare_df_for_ml(df_test, case_id_name, outcome_name, columns_to_remove=["Change_Date+Time", "time_remaining"])

total_time_upper_bound = int( y_test[sidx] * (REDUCED_KPI_TIME / 100) )  # A percentage of the original total time of the trace
query_instances = X_test[sidx: eidx]
print("total_time_upper_bound", total_time_upper_bound / SECONDS_TO_DAYS)

cfe = explainer.generate_counterfactuals(query_instances, total_CFs=20, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary,
                                         proximity_weight=proximity_weight, sparsity_weight=sparsity_weight, diversity_weight=diversity_weight)

# cfe = generate_cfe(explainer, query_instances, total_time_upper_bound, features_to_vary=cols_to_vary,
#                            total_cfs=TOTAL_CFS, kpi=KPI, proximity_weight=proximity_weight, sparsity_weight=sparsity_weight,
#                            diversity_weight=diversity_weight)
cfe.visualize_as_dataframe(show_only_changes=True)

total_time_upper_bound 316.1747453703704


100%|██████████| 1/1 [00:01<00:00,  1.31s/it]

Query instance (original outcome : 23300204)





Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,In Progress,missing,Org line V7,V29 2nd,Medium,PROD542,nl,Belgium,5172125.0,620508.0,45838.0,Monday,14,6,0,3,0,7,0,0,0,0,0,0,0,23300204.0



Diverse Counterfactual set (new outcome: [0, 27317498])


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,-,-,-,Org line V5,B6,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22669316.0
1,-,-,-,-,N15 2nd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22719882.0
2,-,-,-,-,A12,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22719882.0
3,-,-,E_2,-,N25 2nd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22719882.0
4,-,-,-,Org line C,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,23249636.0
5,-,-,C_5,-,S47,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22719882.0
6,-,Wait,-,Org line V3,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,23223578.0
7,-,-,-,Org line G2,S15,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22669316.0
8,-,-,-,Org line G4,M8,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22669316.0
9,-,-,E_5,-,N31 3rd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22719882.0


In [42]:
22669315.64 / SECONDS_TO_DAYS

262.37633842592595

In [None]:
# for _, cfe in cfes_list:
#     cfe.visualize_as_dataframe(show_only_changes=True)
# cfes_list[9][1]

## Observation
The output usually comes within 2 minutes

#### TODOS
- Check how many queries (rows) have valid CFEs
- How to make CFE not, ACTIVITY column not to take the value "Pending Liquidation request"
- Discover how DiCE handle constraints ( For us do it in post-processing step )
- CE_OU - resource column, you can modify it
- Modify just the resource (col: CE_UO) & Activity (col: ACTIVITY)
- build a transition system to validate activity and then resource
- Do the above for total time case
- I
- Implement activity validation and resource validation to the above
-   To do this effectively maybe figure out how the DiCE post process the constraints and implement this in that layer

In [None]:
query_instances = X_train[sidx: eidx]
cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)
cfe.visualize_as_dataframe(show_only_changes=True)

genetic_cfes = []

for idx, query_instances in X_train.iterrows():
    query_instances = query_instances.to_frame().transpose()
    # query_instances = X_train[0:1]  # an interesting query`

    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)

    genetic_cfes.append( (idx, cfe) )

-Implement post processing steps
    - Transition transition_system with full traces
    - 4 tuple comparison with country and owner country



## Deprecated Code

In [29]:
from multiprocessing import Process

def long_process_function(xval):
    print(f"Now sleeping for {xval}")
    sleep(xval)
    print("Done sleeping!!!")

def increment_50_seconds():
    print("Increment 50 seconds")
    for i in range(50):
        sleep(1)
        print(f"Counter at: {i}")


if __name__ == '__main__':
    try:
        p1 = Process(target=increment_50_seconds, name="Process_increment_50_seconds")

        p1.start()

        p1.join(timeout=5)

        print("-------------- Before long func ------------------")
        long_process_function(10)
        print("-------------- After long func ------------------")
        p1.terminate()

        if p1.exitcode is None:
            print("Time is out")

    except:
        print("Exception caught")



Increment 50 seconds
Counter at: 0
Counter at: 1
Counter at: 2
Counter at: 3
Counter at: 4
-------------- Before long func ------------------
Now sleeping for 10
Counter at: 5
Counter at: 6
Counter at: 7
Counter at: 8
Counter at: 9
Counter at: 10
Counter at: 11
Counter at: 12
Counter at: 13
Done sleeping!!!
-------------- After long func ------------------
Time is out


#### Practice `wrapt_timeout_decorator` module

In [45]:

@timeout(5)
def mytest(message):
    # this example does NOT work on windows, please check the section
    # "use with Windows" in the README.rst
    print(message)
    try:
        for i in range(1,10):
            time.sleep(1)
            print('{} seconds have passed'.format(i))

    except TimeoutError as e:
        print("TimeoutError caught:", e)
    # This error is seen occurring on when running lots of loops on the server
    except AttributeError as e:
        print("AttributeError caught:", e)

if __name__ == '__main__':
    mytest('starting')

starting
1 seconds have passed
2 seconds have passed
3 seconds have passed
4 seconds have passed
TimeoutError caught: Function mytest timed out after 5.0 seconds


================================================================

#### Pratice `timeout_decorator` module

In [18]:

class MyTimeOutError(AssertionError):
     """Thrown when a timeout occurs in `timeout_decorator`"""
     pass

def timeout_decorator(timeout_seconds):

    def timeout_wrapper(original_function):

        def _timeout_handler(signum, frame):
            raise MyTimeOutError("Function execution timed out.")

        def wrapper(*args, **kwargs):
            # Set the signal handler
            signal.signal(signal.SIGALRM, _timeout_handler)
            # Set the alarm for the specified timeout duration
            signal.alarm(timeout_seconds)

            result = original_function(*args, **kwargs)

            return result
        return wrapper
    return timeout_wrapper

In [19]:
import signal
from time import sleep

@timeout_decorator(4)
def long_running_function():
    for i in range(10):
        sleep(1)

try:
    # Call the long-running function
    long_running_function()

except MyTimeOutError as e:
    # Handle the timeout error
    print(str(e))
finally:
    # Cancel the alarm
    signal.alarm(0)


Function execution timed out.


==============================

### Practice Creating decorators

In [35]:
import signal
def deco_bar2(my_arg1):
    def deco_bar(original_function):
        def wrapper(*args, **kwargs):
            print("I'm deco bar() before")
            print(f"argument passed to decorator:{my_arg1}")
            result = original_function(*args, **kwargs)

            print("I'm deco bar() after")

            return result
        return wrapper
    return deco_bar

@deco_bar2("arg_to_deco")
def foo(valx):
    print(f"I'm foo({valx})")
    return 1

def baz(*args, **kwargs):
    print(args, kwargs)


In [36]:
foo(valx="foo_arg2")

I'm deco bar() before
argument passed to decorator:arg_to_deco
I'm foo(foo_arg2)
I'm deco bar() after


1

In [None]:
def validate_transition(cfe):
    """ Deprecated function.
    Works with transition graph which has single activity as key.
    Args:
        cfe: Dice counterfactual object.
    """
    cf_examples_df = cfe.cf_examples_list[0].final_cfs_df.copy()

    # expected_next_action = cfe.cf_examples_list[0].test_instance_df[activity_column_name].item()
    # print(f"Expected next action: {expected_next_action}")

    current_activity = current_step[activity_column_name].item()
    # === Verify the next activity
    indexes_to_drop = []
    for idx, suggested_next_activity in cf_examples_df[activity_column_name].items():
        # print(f"index: {idx}, Suggest: {suggested_next_activity}")
        if suggested_next_activity not in transition_graph[current_activity]:
            indexes_to_drop.append(idx)

    cf_examples_df = cf_examples_df.drop(indexes_to_drop, axis='index').reset_index(drop=True)

    # === Verify the associated resources
    indexes_to_drop = []
    for idx, row in cf_examples_df[ ["ACTIVITY", "Involved_ST_Function_Div"] ].iterrows():
        row_tuple = tuple(row)
        if row_tuple not in valid_resources:
            # print(f"removed row had: {row_tuple}")
            indexes_to_drop.append(idx)

    cf_examples_df = cf_examples_df.drop(indexes_to_drop, axis='index').reset_index(drop=True)
    return cf_examples_df