In [21]:
import numpy 
import pandas as pd
import dice_ml
import warnings
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Notebook-wide settings: let pandas render up to 500 rows and silence
# every warning category for the rest of the session.
pd.options.display.max_rows = 500
warnings.filterwarnings(action="ignore")

In [37]:
# Load the survey table; the first CSV column becomes the row index.
data = pd.read_csv(
    '../data/TotalClothingValue.csv',
    index_col=0,
)

In [38]:
# Working only on TSV now: remove the TPV, TCV and TSL columns.
# errors='ignore' keeps this cell safe to re-run; because `data` is rebound
# in place, a second plain drop would raise KeyError on the missing columns.
data = data.drop(columns=['TPV', 'TCV', 'TSL'], errors='ignore')

In [39]:
# The 'TSV' column is the prediction target; every other column is a predictor.
target = data['TSV']
features = data.drop(columns=['TSV']).columns.to_list()

In [40]:
# Predictor matrix: the full table without the 'TSV' outcome column.
datasetX = data.drop(columns=['TSV'])

In [41]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    datasetX, target, test_size=0.2, random_state=0
)

In [42]:
# NOTE(review): `features` lists every non-target column, which is exactly the
# column set of x_train, so this difference is expected to be empty and the
# 'cat' branch below receives no columns. Confirm whether coded columns such
# as School or Gender should be one-hot encoded rather than scaled.
categorical_features = x_train.columns.difference(features)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own transformer.
transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features),
        ('cat', categorical_transformer, categorical_features)])

# A fixed random_state makes the fitted forest repeatable across kernel
# restarts; 0 matches the seed used for the train/test split.
regr = Pipeline(steps=[('preprocessor', transformations),
                        ('regressor', RandomForestRegressor(random_state=0))])
model = regr.fit(x_train, y_train)

In [None]:
import pickle

# Persist the fitted pipeline. The context manager closes the file handle
# even if pickling fails, so the write is flushed and no descriptor leaks.
with open('../models/tsv.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [28]:
# Run the fitted pipeline on the held-out rows.
y_pred = model.predict(X=x_test)

In [43]:
# Wrap the table and the fitted pipeline in DiCE's data/model interfaces.
# Every predictor is declared continuous; 'TSV' is the outcome column.
d = dice_ml.Data(
    dataframe=data,
    continuous_features=features,
    outcome_name='TSV',
)
m = dice_ml.Model(
    model=model,
    backend='sklearn',
    model_type='regressor',
)

In [44]:
# Build the counterfactual explainer with DiCE's 'genetic' method.
exp = dice_ml.Dice(
    d,
    m,
    method='genetic',
)

In [45]:
# Columns that are kept fixed when searching for counterfactuals.
always_immutable = [
    'AvgMaxDailyTemp',
    'AvgMinDailyTemp',
    'School',
    'DAY',
    'StartTime',
]
# Work on a copy so extra frozen columns can be added without touching the
# base list.
freezed = list(always_immutable)

# Everything in the table that is not frozen may be varied.
features_to_vary = data.columns.difference(freezed).tolist()

In [47]:
# The outcome column must not be offered as a feature to vary.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# .remove('TSV') would raise ValueError once the entry is already gone.
features_to_vary = [name for name in features_to_vary if name != 'TSV']

In [48]:
# Take three held-out rows (positions 12, 13 and 14) as the instances to explain.
query_instances = x_test.iloc[12:15]

In [49]:
# Display the rows selected for explanation.
query_instances

Unnamed: 0_level_0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC
Sno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1479,2,4,1,1,19.6,6.6,59.6,14.8,3,8,1,1,1.69,2,4
192,2,1,0,5,20.5,5.0,57.53,15.3,5,9,0,0,1.26,2,4
1389,1,4,1,3,22.4,4.7,72.0,14.5,3,9,1,1,1.5,2,4


In [50]:
# Request 4 counterfactuals per query row with a desired outcome range of
# [0.0, 2.0], allowing changes only to the columns in features_to_vary.
cf = exp.generate_counterfactuals(
    query_instances=query_instances,
    total_CFs=4,
    desired_range=[0.0, 2.0],
    features_to_vary=features_to_vary,
)

  0%|          | 0/3 [00:57<?, ?it/s]


KeyboardInterrupt: 

In [21]:
# Render each query instance with its counterfactual set; with
# show_only_changes=True, unchanged values appear as '-' in the output.
cf.visualize_as_dataframe(show_only_changes=True)

Query instance (original outcome : -1)


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,2.0,4.0,1.0,1.0,19.6,6.6,59.599998,14.8,3.0,8.0,1.0,1.0,1.69,2.0,4.0,-0.908333



Diverse Counterfactual set (new outcome: [0.0, 2.0])


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,3.0,3.0,-,-,21.2,5.6,62.3,14.0,-,-,0.0,-,1.69,-,-,0.0
0,3.0,2.0,-,-,22.9,6.6,57.2,15.0,-,-,0.0,-,1.63,-,-,0.0
0,3.0,2.0,-,-,22.9,6.6,56.9,15.0,-,-,0.0,-,1.69,1.0,-,0.05
0,-,2.0,-,-,22.2,5.2,56.9,13.5,4.0,-,-,-,1.44,-,-,0.02


Query instance (original outcome : -1)


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,2.0,1.0,0.0,5.0,20.5,5.0,57.529999,15.3,5.0,9.0,0.0,0.0,1.26,2.0,4.0,-0.807333



Diverse Counterfactual set (new outcome: [0.0, 2.0])


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,-,-,-,-,-,-,57.5,15.3,-,11.0,-,-,1.21,-,-,0.0
0,-,-,-,-,-,-,57.5,15.3,-,11.0,-,-,1.07,-,-,0.0
0,-,-,-,-,-,-,57.5,15.3,-,10.0,1.0,-,1.07,-,-,0.015
0,-,-,-,-,-,-,57.5,15.3,-,10.0,1.0,-,0.9,1.0,-,0.34


Query instance (original outcome : 0)


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,1.0,4.0,1.0,3.0,22.4,4.7,72.0,14.5,3.0,9.0,1.0,1.0,1.5,2.0,4.0,-0.04



Diverse Counterfactual set (new outcome: [0.0, 2.0])


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,-,-,-,-,22.4,4.7,-,-,-,8.0,0.0,-,1.71,-,1.0,0.6
0,5.0,-,-,1.0,20.6,4.6,68.8,13.7,-,-,-,-,1.55,-,-,0.0
0,2.0,3.0,-,1.0,22.4,10.1,74.2,17.0,-,-,-,-,1.58,-,-,0.04
