# Generating counterfactuals for multi-class classification and regression models
This notebook will demonstrate how the DiCE library can be used for multiclass classification and regression for scikit-learn models.
You can use any method ("random", "kdtree", "genetic"); just specify it in the `method` argument in the initialization step. The rest of the code is completely identical.
For demonstration, we will be using the genetic algorithm for CFs.

http://interpret.ml/DiCE/dice_ml.explainer_interfaces.html#dice_ml.explainer_interfaces.dice_tensorflow1.DiceTensorFlow1.generate_counterfactuals

In [1]:
import dice_ml
from dice_ml import Dice

from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import pandas as pd

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Let's load the numerical results from the RANS simulations.

In [3]:
# Folder that holds the sampled RANS output tables.
pp = 'postProcessing/sampleDict/5000/'

# Read each sampled table and discard its 'y' column straight away.
Us = pd.read_csv(pp + 'XoR0.0_U.csv').drop(columns='y')
Is = pd.read_csv(pp + 'XoR0.0_I1_I2_I3_I4_I5.csv').drop(columns='y')
Ts = pd.read_csv(pp + 'XoR0.0_T1_T2_T3_T4_T5_T6_T7_T8_T9_T10_R.csv').drop(columns='y')

# Columns removed from the combined table: the U_1/U_2 components and R_0..R_5.
unused_columns = ["U_1", "U_2", "R_0", "R_1", "R_2", "R_3", "R_4", "R_5"]

# Put the three tables side by side (column-wise) and strip the unused columns.
df_h = pd.concat([Us, Is, Ts], axis=1).drop(columns=unused_columns)

In [4]:
# Display the combined table built from the U, I and T sample files.
df_h

Unnamed: 0,U_0,I1,I2,I3,I4,I5,T1_0,T1_1,T1_2,T1_3,...,T9_2,T9_3,T9_4,T9_5,T10_0,T10_1,T10_2,T10_3,T10_4,T10_5
0,0.524105,1.0,-0.985615,-0.253116,0.085059,-0.490401,-0.741625,5.036742e-14,1.830723e-20,-3.420699e-27,...,6.846288e-21,-1.279227e-27,-4.649650e-34,-1.690025e-40,-0.000334,2.269553e-17,8.249224e-24,-1.541365e-30,-5.602453e-37,-2.036343e-43
1,0.524105,1.0,-0.985615,-0.253116,0.085059,-0.490401,-0.741625,4.996395e-14,7.509881e-20,2.296661e-15,...,2.350096e-20,-3.277932e-16,4.613895e-20,-6.248649e-19,-0.000334,9.917023e-17,-1.075604e-20,6.246357e-18,-8.792140e-22,1.190729e-20
2,0.667018,1.0,-0.984068,-0.160480,0.057839,-0.477428,-0.719408,-1.009797e-01,4.407005e-03,5.913638e-01,...,1.282252e-03,-8.488930e-02,3.506808e-03,-1.618226e-04,-0.001839,2.026426e-02,-8.062894e-04,1.835213e-03,-6.121095e-05,3.498425e-06
3,0.718770,1.0,-0.828705,-0.072198,0.021992,-0.389404,-0.464212,1.515925e-01,-5.753189e-03,3.557242e-01,...,8.012341e-05,-1.536856e-01,6.450841e-03,-2.929674e-04,-0.009574,7.265065e-02,-2.903123e-03,9.556094e-03,-4.224041e-04,1.821657e-05
4,0.718446,1.0,-0.585487,-0.122418,0.026780,-0.270434,-0.237957,3.216580e-01,-1.181508e-02,5.852985e-02,...,-1.336589e-03,-1.495199e-01,6.145111e-03,-2.850266e-04,-0.014564,3.657950e-02,-1.347896e-03,1.453662e-02,-6.196640e-04,2.771084e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,0.364636,1.0,-1.000014,-0.000167,0.000056,-0.500007,0.003702,-7.064234e-01,2.021491e-06,-3.805969e-03,...,1.332522e-06,-1.663523e-01,5.562750e-07,-3.171137e-04,0.000056,2.555984e-07,-4.330389e-09,-5.553600e-05,9.413632e-07,-1.058671e-07
65,0.254645,1.0,-1.000007,-0.000346,0.000115,-0.500003,0.002500,-7.064287e-01,1.334954e-06,-2.725825e-03,...,7.301335e-07,-1.663512e-01,3.571840e-07,-3.171116e-04,0.000115,3.563416e-07,-1.869983e-09,-1.150500e-04,5.158085e-07,-2.193173e-07
66,0.131208,1.0,-0.999995,-0.000094,0.000031,-0.499998,0.001357,-7.064319e-01,6.618390e-07,-1.417055e-03,...,6.890410e-07,-1.663491e-01,1.628992e-07,-3.171075e-04,0.000031,2.173743e-07,-8.298312e-11,-3.111019e-05,4.867696e-07,-5.930467e-08
67,0.037622,1.0,-0.999986,0.001366,-0.000455,-0.499993,0.000734,-7.061365e-01,6.578105e-10,1.754679e-04,...,2.626255e-06,-1.659282e-01,-1.508398e-08,-3.163053e-04,-0.000455,6.072904e-08,1.589023e-09,4.536083e-04,1.854044e-06,8.647035e-07


In [5]:
# Name of the column used as the prediction target.
outcome_name = "U_0"
target = df_h[outcome_name]

# Every other column of df_h is treated as a continuous input feature.
continuous_features = df_h.columns.drop(outcome_name).tolist()

Let's load the experimental data. To begin, we consider only one profile; later on, all profiles should be included.

In [6]:
# Relative path to the compiled experimental measurements.
# (Leftovers kept for reference: an alternative source directory
#  'ref_data/postProcessing/sampleDict/0/' and the column pick exp_data['Ux'].)
exp_csv = '../../DATA/BoR_Data/data_Oct_2021/turbStat/BoRDataCompilation_Medium.csv'

exp_data = pd.read_csv(exp_csv)
exp_data

Unnamed: 0,x/R,r/R,y/R,Ux,Ur,<ux^2>,<ur^2>,<uxur>
0,-0.1000,0.0099,0.9901,0.9079,0.0144,14.5002,0.8891,-3.0369
1,-0.1000,0.0396,0.9604,0.9206,0.0524,12.5207,2.1417,-3.4063
2,-0.1000,0.0495,0.9505,0.9260,0.0646,11.0438,2.6117,-3.6930
3,-0.1000,0.0593,0.9407,0.9342,0.0754,9.2999,2.9268,-3.8233
4,-0.1000,0.0692,0.9308,0.9439,0.0845,7.6858,3.1250,-3.7712
...,...,...,...,...,...,...,...,...
8720,51.3667,0.0656,0.9344,1.1837,-0.0028,0.2337,0.1617,0.0091
8721,51.3667,0.0551,0.9449,1.1841,-0.0010,0.2305,0.1610,0.0057
8722,51.3667,0.0446,0.9554,1.1845,0.0007,0.2295,0.1605,0.0024
8723,51.3667,0.0341,0.9659,1.1847,0.0022,0.2291,0.1598,0.0004


# Regression

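For regression, the original DiCE example uses sklearn's California Housing dataset, which contains California house prices. More information at https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html. In this notebook, the regressor is instead trained on the RANS table `df_h` assembled above, with `U_0` as the target.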

In [7]:
# Split data into train and test (20% held out; fixed seed for a repeatable split).
datasetX = df_h.drop(outcome_name, axis=1)
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0)

# Any column not listed as continuous would be one-hot encoded. Because
# continuous_features already contains every non-outcome column of df_h,
# this set is empty here and the categorical branch has no columns to act on.
categorical_features = x_train.columns.difference(continuous_features)

# Preprocessing pipelines for numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append the regressor to the preprocessing pipeline to get a full prediction
# pipeline. The forest is seeded (same seed as the split) so that the fitted
# model, and everything derived from it, is reproducible across runs.
regr = Pipeline(steps=[('preprocessor', transformations),
                       ('regressor', RandomForestRegressor(random_state=0))])
model_ = regr.fit(x_train, y_train)

In [8]:
# Describe the dataset to DiCE: the whole table, which columns are continuous,
# and which column is the outcome.
d_ = dice_ml.Data(
    dataframe=df_h,
    continuous_features=continuous_features,
    outcome_name=outcome_name,
)

# Wrap the fitted sklearn pipeline; model_type tells DiCE it is a regressor.
m_ = dice_ml.Model(
    model=model_,
    backend="sklearn",
    model_type='regressor',
)

In [9]:
# Build the DiCE explainer from the data and model wrappers, selecting the
# "genetic" counterfactual search method.
exp_genetic = Dice(d_, m_, method="genetic")

As we can see below, all the target values will lie in the desired range

In [None]:
# Query with a single held-out row (position 2 of x_test); a slice covering
# several rows can be passed the same way.
query_instances = x_test.iloc[2:3]

# Ask for 7 counterfactuals whose predicted outcome falls inside desired_range,
# letting the search modify only the five I* columns.
# NOTE(review): the U_0 values displayed earlier in this notebook are all below 1;
# confirm that [4.0, 5.0] is a target range this model can actually reach.
genetic = exp_genetic.generate_counterfactuals(
    query_instances,
    total_CFs=7,
    desired_range=[4.0, 5.0],
    features_to_vary=["I1", "I2", "I3", "I4", "I5"],
)
genetic.visualize_as_dataframe(show_only_changes=True)

  0%|                                                                                                                                                                                 | 0/1 [00:00<?, ?it/s]