In [11]:
import numpy 
import pandas as pd
import dice_ml
import warnings
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings("ignore") 
pd.options.display.max_rows = 500

In [12]:
# Load the dataset; the first CSV column is used as the DataFrame index.
data = pd.read_csv(
    '../data/TotalClothingValue.csv',
    index_col=0,
)

In [13]:
# Working only on TSV now: discard the TPV, TCV and TSL columns.
# NOTE: `data` is rebound here, so re-running this cell on the already
# trimmed frame raises KeyError (the columns no longer exist).
data = data.drop(
    columns=[
        'TPV',
        'TCV',
        'TSL',
    ],
)

In [14]:
# Names of every column except the TSV outcome.
features = list(data.drop(columns='TSV').columns)
# The TSV column itself is the value to predict.
target = data['TSV']

In [15]:
# Feature matrix: the full frame without the TSV outcome column.
datasetX = data.drop(columns='TSV')

In [16]:
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    datasetX,
    target,
    test_size=0.2,
    random_state=0,
)

In [17]:
# Columns of x_train that are not listed in `features`. Because `features`
# was built from every non-TSV column of the same frame, this Index is empty
# in this notebook: the 'cat' branch below receives no columns and every
# input goes through the numeric scaler.
categorical_features = x_train.columns.difference(features)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features),
        ('cat', categorical_transformer, categorical_features)])

# The forest is seeded (same value as the train/test split) so that the
# fitted model, the pickled artifact and every downstream prediction are
# reproducible across kernel restarts instead of changing on each run.
regr = Pipeline(steps=[('preprocessor', transformations),
                        ('regressor', RandomForestRegressor(random_state=0))])
# Fit preprocessing and regressor together on the training split.
model = regr.fit(x_train, y_train)

In [33]:
import pickle

# Persist the fitted pipeline. The context manager closes the file handle as
# soon as the dump finishes, so the pickle is flushed to disk and no open
# descriptor is left behind in the kernel.
with open('../models/tsv_full.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [19]:
# Predict TSV for the held-out test rows with the fitted pipeline.
y_pred = model.predict(x_test)

In [20]:
# DiCE view of the data: every feature is declared continuous, TSV is the outcome.
d = dice_ml.Data(
    dataframe=data,
    continuous_features=features,
    outcome_name='TSV',
)
# Wrap the fitted scikit-learn pipeline as a regression model for DiCE.
m = dice_ml.Model(
    model=model,
    backend='sklearn',
    model_type='regressor',
)

In [26]:
# Build the counterfactual explainer from the DiCE data and model wrappers,
# selecting the 'random' generation method.
exp = dice_ml.Dice(d, m, method='random')

In [29]:
# Columns excluded from the set that the counterfactual search may change.
always_immutable = [
    'AvgMaxDailyTemp',
    'AvgMinDailyTemp',
    'School',
    'StartTime',
]
# New list with the same entries; the empty list is presumably a placeholder
# for additional columns to freeze.
freezed = always_immutable + []

# Every remaining column of `data` may vary (this still includes 'TSV').
variable_columns = data.columns.difference(freezed)
features_to_vary = variable_columns.to_list()

In [30]:
# TSV is the outcome column, so it is taken out of the list of columns the
# counterfactual search may vary. The membership guard keeps this cell safe
# to re-run: list.remove raises ValueError once the entry is already gone.
if 'TSV' in features_to_vary:
    features_to_vary.remove('TSV')

In [22]:
# Take three test rows by position (rows 12, 13 and 14 of x_test).
query_instances = x_test.iloc[12:15]

In [23]:
# Display the selected query rows.
query_instances

Unnamed: 0_level_0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC
Sno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1479,2,4,1,1,19.6,6.6,59.6,14.8,3,8,1,1,1.69,2,4
192,2,1,0,5,20.5,5.0,57.53,15.3,5,9,0,0,1.26,2,4
1389,1,4,1,3,22.4,4.7,72.0,14.5,3,9,1,1,1.5,2,4


In [48]:
# Request 4 counterfactuals per query row with a target TSV range of
# [0.0, 2.0], changing only the columns listed in features_to_vary.
cf = exp.generate_counterfactuals(
    query_instances=query_instances,
    total_CFs=4,
    desired_range=[0.0, 2.0],
    features_to_vary=features_to_vary,
)

  0%|          | 0/3 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [00:00<00:01,  1.53it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec


100%|██████████| 3/3 [00:01<00:00,  2.04it/s]


In [49]:
# Show each query row followed by its counterfactuals, displaying only the
# values that differ from the query.
cf.visualize_as_dataframe(show_only_changes=True)

Query instance (original outcome : -1)


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,2,4,1,1,19.6,6.6,59.599998,14.8,3,8,1,1,1.69,2,4,-1.0



No counterfactuals found!
Query instance (original outcome : 0)


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,2,1,0,5,20.5,5.0,57.529999,15.3,5,9,0,0,1.26,2,4,-0.0



Diverse Counterfactual set (new outcome: [0.0, 2.0])


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,-,-,-,-,-,-,57.53,15.3,3.0,-,-,-,1.26,1.0,-,0.10833333432674408
1,-,-,-,-,-,-,57.53,15.3,-,11.0,-,-,1.21,-,-,-
2,-,-,-,-,-,-,57.53,15.3,4.0,-,-,-,1.26,1.0,-,0.0533333346247673
3,-,-,-,-,-,-,57.53,15.3,-,13.0,-,-,1.26,1.0,-,0.019999999552965164


Query instance (original outcome : 0)


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,1,4,1,3,22.4,4.7,72.0,14.5,3,9,1,1,1.5,2,4,-0.0



Diverse Counterfactual set (new outcome: [0.0, 2.0])


Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,TotalCLOwithChair,SwC,MC,TSV
0,-,-,-,-,22.4,4.7,-,-,-,-,0.0,-,1.7,-,1.0,0.2399999946355819
1,-,-,-,-,22.4,4.7,-,17.1,-,-,0.0,-,1.7,-,1.0,0.1666666716337204
2,-,-,-,-,22.4,4.7,-,-,-,12.0,-,-,1.71,-,1.0,0.0116666667163372
3,-,-,-,-,22.4,4.7,-,-,-,-,-,-,1.7,-,1.0,0.0931666642427444


In [80]:
# Inspect the counterfactual frame stored for the first query instance; in
# the recorded run this is None (no counterfactuals were found for it).
print(cf.cf_examples_list[0].final_cfs_df)

None


In [88]:
# Report the type of `final_cfs_df` for every per-query result.
# `is not None` is an identity test, so unlike `!= None` it cannot be
# affected by a custom comparison method on the result object.
# Note: the guard is on the result object itself, not on its
# `final_cfs_df`, so a query without counterfactuals still prints NoneType.
for cf_example in cf.cf_examples_list:
    if cf_example is not None:
        print(type(cf_example.final_cfs_df))

<class 'NoneType'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
