In [1]:
import warnings

import numpy
import pandas as pd
import dice_ml
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Notebook-wide settings: silence every warning and let pandas render up to
# 500 rows before truncating a displayed frame.
warnings.filterwarnings(action="ignore")
pd.set_option("display.max_rows", 500)



In [2]:
# Load the dataset (the first CSV column becomes the index) and keep only the
# rows that have no missing values.
data = pd.read_csv('../data/TotalClothingValue+3Binary.csv', index_col=0).dropna()

In [3]:
# Working only on TSV now: remove the TPV, TCV and TSL columns (presumably the
# other candidate outcomes -- confirm against the dataset description).
# `data` is rebound here, so on a second run of this cell the columns are
# already gone; errors='ignore' keeps the cell re-runnable instead of raising
# KeyError. NOTE(review): this also hides a misspelled column name.
data = data.drop(columns=['TPV', 'TCV', 'TSL'], errors='ignore')

In [4]:
# TSV is the outcome; every remaining column is used as a model input.
target = data['TSV']
features = [column for column in data.columns if column != 'TSV']

In [5]:
# Input matrix: the full frame without the TSV outcome column.
datasetX = data.drop(columns=['TSV'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    datasetX, target, test_size=0.2, random_state=0
)

In [18]:
# `features` lists every input column, so this difference is empty when the
# notebook is run top to bottom: all inputs go through the numeric branch and
# the one-hot branch receives no columns.
categorical_features = x_train.columns.difference(features)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features),
        ('cat', categorical_transformer, categorical_features)])

# Despite the `regr` name this is a classification pipeline. The forest is
# seeded so that refitting reproduces the same model and therefore the same
# accuracy downstream (the train/test split is seeded the same way).
regr = Pipeline(steps=[('preprocessor', transformations),
                        ('classifier', RandomForestClassifier(random_state=0))])
model = regr.fit(x_train, y_train)

In [29]:
# Predict the TSV class of every held-out row with the fitted pipeline.
y_pred = model.predict(x_test)

In [30]:
# Fraction of held-out rows whose predicted TSV class equals the true label.
# `accuracy_score` is imported in the notebook's top import cell.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.6127450980392157


In [8]:
import pickle
# Optional persistence of the fitted pipeline; both lines are currently disabled.
# NOTE(review): pickle.load executes arbitrary code contained in the file, so
# only load model files that come from a trusted source.
#pickle.dump(model, open('../models/tsv+3_full.pkl', 'wb'))
#model = pickle.load(open('../models/tsv+3_full.pkl', 'rb'))

In [20]:
# Wrap the dataset and the fitted sklearn pipeline in DiCE's data and model
# interfaces. Every input column is declared continuous; TSV is the outcome.
d = dice_ml.Data(
    dataframe=data,
    continuous_features=features,
    outcome_name='TSV',
)
m = dice_ml.Model(
    model=model,
    backend='sklearn',
    model_type='classifier',
)

In [21]:
# Build the counterfactual explainer from the DiCE data/model wrappers, using
# the 'random' generation method.
exp = dice_ml.Dice(d, m, method='random')

In [22]:
# Columns the counterfactual search must leave untouched in every experiment.
always_immutable = ['AvgMaxDailyTemp','AvgMinDailyTemp','School','StartTime']
# Experiment-specific frozen columns would be appended to this copy.
freezed = list(always_immutable)

# Everything that is not frozen may be changed by the search.
candidate_columns = data.columns.difference(freezed)
features_to_vary = candidate_columns.to_list()

In [23]:
# The outcome column is not an input feature, so it must not be offered to the
# counterfactual search. Filtering (rather than list.remove) keeps this cell
# safe to re-run: a second run finds no 'TSV' and simply leaves the list as is.
features_to_vary = [name for name in features_to_vary if name != 'TSV']

In [24]:
# Explain every held-out row; narrow the slice here to explain only a subset.
query_instances = x_test.iloc[:]


In [None]:
# Preview the rows that will be explained; with display.max_rows raised to 500
# a bare `query_instances` would print the entire held-out set.
query_instances.head()

In [25]:
# Ask DiCE for 4 counterfactuals per query row that the model classifies as
# class 1, changing only the columns in `features_to_vary`. The 'random'
# method samples candidate values, so the seed is fixed to make the generated
# counterfactuals reproducible between runs.
# This is the slow cell of the notebook (the recorded run took about 3.5
# minutes for 408 rows).
cf = exp.generate_counterfactuals(
    query_instances=query_instances,
    total_CFs=4,
    desired_class=1,
    features_to_vary=features_to_vary,
    random_seed=0,
)

  0%|          | 0/408 [00:00<?, ?it/s]

100%|██████████| 408/408 [03:22<00:00,  2.01it/s]


In [None]:
# Render each query row followed by its counterfactuals, showing only the
# feature values that differ from the query row.
cf.visualize_as_dataframe(show_only_changes=True)

In [26]:
# Build one long frame in which every query row is followed by the
# counterfactuals DiCE found for it (rows without any counterfactual are kept
# on their own). Iterating over the explanations themselves, and slicing the
# frame that was actually explained, keeps rows and explanations aligned even
# when `query_instances` is narrowed to a subset of the held-out set.
r = []
for i, cf_example in enumerate(cf.cf_examples_list):
    # One-row slice (not a Series) so that it concatenates as a row.
    r.append(query_instances.iloc[i:i + 1])
    if cf_example.final_cfs_df is not None:
        r.append(cf_example.final_cfs_df)

# Query rows carry no TSV column, so their TSV is NaN in the combined frame.
r2 = pd.concat(r)

In [27]:
# Display the combined frame: each query row (original index, TSV is NaN)
# followed by its counterfactual rows (indexed from 0, TSV filled in).
r2

Unnamed: 0,DAY,School,SchoolType,StartTime,AvgMaxDailyTemp,AvgMinDailyTemp,AvgIndoorRelativeHumidity,IndoorTempDuringSurvey,Grade,Age,Gender,FormalClothing,Scarf/Cap,Tie,Stockings,TotalCLOwithChair,SwC,MC,TSV
939,1.0,3.0,1.0,3.0,22.4,10.1,46.3,17.0,5.0,10.0,0.0,1.0,1.0,0.0,0.0,1.16,1.0,4.0,
0,1.0,3.0,1.0,3.0,22.4,10.1,46.3,17.0,5.0,6.0,0.0,1.0,1.0,0.0,0.0,1.75,1.0,4.0,1.0
1,1.0,3.0,1.0,3.0,22.4,10.1,65.8,17.0,5.0,6.0,0.0,1.0,1.0,0.0,0.0,1.16,1.0,4.0,1.0
2,1.0,3.0,1.0,3.0,22.4,10.1,46.3,17.0,5.0,11.0,0.0,1.0,1.0,0.0,0.0,1.16,2.0,4.0,1.0
3,1.0,3.0,1.0,3.0,22.4,10.1,46.3,17.0,5.0,6.0,0.0,0.0,1.0,0.0,0.0,1.16,1.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,2.0,4.0,1.0,1.0,19.6,6.6,59.6,13.1,3.0,7.0,1.0,1.0,1.0,0.0,0.0,1.33,2.0,4.0,
0,2.0,4.0,1.0,1.0,19.6,6.6,51.1,13.1,3.0,7.0,1.0,1.0,1.0,0.0,0.0,1.33,2.0,4.0,1.0
1,2.0,4.0,1.0,1.0,19.6,6.6,59.6,13.1,3.0,7.0,1.0,1.0,1.0,0.0,0.0,1.92,3.0,4.0,1.0
2,2.0,4.0,1.0,1.0,19.6,6.6,72.7,13.1,3.0,7.0,1.0,1.0,1.0,0.0,0.0,1.33,3.0,4.0,1.0


In [None]:
# r2.to_csv('Total+3_Humidity.csv')