In [1]:
import dice_ml
from dice_ml import Dice # Version 0.9

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

import pandas as pd  # Version 1.5

## Helper Function

In [6]:
def prepare_df_for_ml(df, case_id_name, outcome_name, columns_to_remove=None) -> tuple[pd.DataFrame, pd.Series]:
    """
    Split a dataframe into model features ``X`` and the target ``y``.

    The case identifier column and the columns in ``columns_to_remove`` are
    dropped first; the outcome column is then separated from what is left.
    The input dataframe is not modified (``DataFrame.drop`` returns a new frame).

    :param pd.DataFrame df: data holding the case id, the outcome and the feature columns.
    :param str case_id_name: name of the case identifier column; removed from the features.
    :param str outcome_name: name of the target column.
    :param columns_to_remove: column label(s) that are not needed by the ML model.
        Defaults to ``["Change_Date+Time", "time_remaining"]`` when ``None``.
    :return: ``(X, y)`` where ``X`` is the feature dataframe and ``y`` is the
        outcome column as a Series.
    :raises KeyError: if a column to drop, or the outcome column, is missing.
    """
    # Before training we need to remove the columns that are not needed by the ML model.
    if columns_to_remove is None:
        columns_to_remove = ["Change_Date+Time", "time_remaining"]
    # Two separate drops so that ``columns_to_remove`` may be a list or a single label.
    model_columns = df.drop(columns=[case_id_name]).drop(columns=columns_to_remove)
    X = model_columns.drop(columns=[outcome_name])
    y = model_columns[outcome_name]
    return X, y

## Load Data

In [15]:
# Paths of the input CSV files: the training data and the test case.
train_dataset_file = "vinst_train.csv"
test_case = "vinst_test_case.csv"

# Read both files with pandas' default CSV settings, training set first.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_dataset_file, test_case))

## Train ML Model (Random Forest)

In [0]:
case_id_name = 'SR_Number'  # The case identifier column name.
start_date_name = 'Change_Date+Time'  # Maybe change to start_et (start event time)
activity_column_name = "ACTIVITY"
resource_column_name = "Involved_ST"

# Only these features may be changed when counterfactuals are generated.
cols_to_vary = [activity_column_name, resource_column_name]

outcome_name = "lead_time"

# The timestamp column and "time_remaining" are dropped before training.
X_train, y_train = prepare_df_for_ml(df_train, case_id_name, outcome_name,
                                     columns_to_remove=[start_date_name, "time_remaining"])

continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "# ACTIVITY=In Progress",
                       "# ACTIVITY=Awaiting Assignment",
                       "# ACTIVITY=Resolved", "# ACTIVITY=Assigned", "# ACTIVITY=Closed", "# ACTIVITY=Wait - User",
                       "# ACTIVITY=Wait - Implementation", "# ACTIVITY=Wait",
                       "# ACTIVITY=Wait - Vendor", "# ACTIVITY=In Call", "# ACTIVITY=Wait - Customer",
                       "# ACTIVITY=Unmatched", "# ACTIVITY=Cancelled"]
categorical_features = ["Status", "ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST",
                        "SR_Latest_Impact", "Product", "Country", "Owner_Country",
                        "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore' keeps prediction from failing on categories unseen during fit.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append the regressor to the preprocessing pipeline.
# Now we have a full prediction pipeline.
# The step keeps its original name 'classifier' so that existing lookups by step name still work.
# random_state is fixed so that Restart & Run All trains the same forest every time;
# without it each run yields a different model and different downstream results.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestRegressor(n_jobs=7, random_state=42))])
model = clf.fit(X_train, y_train)


## Create DiCE model

In [8]:
# DiCE data interface: features and outcome combined column-wise into one frame.
data_model = dice_ml.Data(
    dataframe=pd.concat([X_train, y_train], axis=1),
    outcome_name=outcome_name,
    continuous_features=continuous_features,
)

# Counterfactual search strategy passed to the explainer.
method = "genetic"

# Wrap the fitted sklearn pipeline as a regression model for DiCE.
ml_backend = dice_ml.Model(model=model, model_type='regressor', backend="sklearn")
explainer = Dice(data_model, ml_backend, method=method)



### Generate Counterfactual for the Test Case
The cell below appears to run almost indefinitely: it may eventually finish after several hours, but it had not completed after at least 10 minutes and was interrupted manually.

In [18]:
# Split the test case into features and outcome, dropping the same columns as for training.
X_test, y_test = prepare_df_for_ml(
    df_test,
    case_id_name,
    outcome_name,
    columns_to_remove=["Change_Date+Time", "time_remaining"],
)

# The last row of the truncated trace is used to replicate the behavior of a running trace.
query_instances = X_test.iloc[-1:]

# Upper bound for the desired outcome: 95% of the trace's original total time, truncated to an int.
total_time_upper_bound = int(y_test.iloc[-1] * (95 / 100))
print(f"Total time upper bound: {total_time_upper_bound}")

# Ask for 50 counterfactuals whose predicted outcome lies in [0, upper bound],
# changing only the columns listed in cols_to_vary.
cfe = explainer.generate_counterfactuals(
    query_instances,
    total_CFs=50,
    desired_range=[0, total_time_upper_bound],
    features_to_vary=cols_to_vary,
)

Total time upper bound: 17608780


  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
 

KeyboardInterrupt: 