In [None]:
import numpy 
import pandas as pd
import dice_ml
import xgboost as xgb
import warnings
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Silence every warning for the rest of the session and let pandas show up
# to 500 rows before it truncates a displayed frame.
warnings.simplefilter("ignore")
pd.set_option("display.max_rows", 500)

In [None]:
# Name of the outcome column: it is what the classifier predicts and what
# the counterfactuals are generated for.
TARGET = "TCV"

In [None]:
# Load the dataset (the first CSV column becomes the index) and discard
# every row that has at least one missing value.
data = pd.read_csv('../data/TotalClothingValue+3Binary.csv', index_col=0).dropna()

In [None]:
# Re-encode the TARGET values -3, -2, -1, 1, 2, 3 as the consecutive
# integer labels 0..5 (there is no entry for 0 on the original scale).
mapping = dict(zip((-3, -2, -1, 1, 2, 3), range(6)))
data[TARGET] = data[TARGET].replace(mapping)

In [None]:
# Start from the four candidate columns and take TARGET out of the list;
# whatever remains is dropped from `data` afterwards.
columns = ['TSV', 'TPV', 'TCV', 'TSL']
del columns[columns.index(TARGET)]

In [None]:
# Work only on TARGET from here on: remove the remaining columns listed
# in `columns` from the table.
data = data.drop(columns, axis=1)

In [None]:
# `features` is the list of every column name except TARGET;
# `target` is the TARGET column itself.
features = data.columns.drop(TARGET).tolist()
target = data.loc[:, TARGET]

In [None]:
# Feature matrix: the full table without the TARGET column.
datasetX = data.drop(columns=[TARGET])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    datasetX, target, test_size=0.2, random_state=0
)

In [None]:
# NOTE(review): `features` appears to list every column of x_train, in which
# case this difference is empty and no column gets one-hot encoded — confirm
# that treating all features as numeric is intended.
categorical_features = x_train.columns.difference(features)

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (categories not seen during fit are ignored at transform
# time instead of raising).
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer([
    ('num', numeric_transformer, features),
    ('cat', categorical_transformer, categorical_features),
])

# Full estimator: the preprocessing step followed by an XGBoost classifier
# with default settings. `fit` returns the fitted pipeline, so `model` and
# `regr` refer to the same object.
regr = Pipeline([
    ('preprocessor', transformations),
    ('classifier', xgb.XGBClassifier()),
])
model = regr.fit(x_train, y_train)

In [None]:
# Wrap the table and the fitted pipeline in DiCE's interfaces. Every
# feature is declared continuous and TARGET is the outcome column.
d = dice_ml.Data(
    dataframe=data,
    continuous_features=features,
    outcome_name=TARGET,
)
m = dice_ml.Model(
    model=model,
    backend='sklearn',
    model_type='classifier',
)

In [None]:
# Counterfactual explainer built on the data/model interfaces, using
# DiCE's 'random' generation method.
exp = dice_ml.Dice(
    d,
    m,
    method='random',
)

In [None]:
# Columns that are never allowed to change in a counterfactual.
always_immutable = [
    'DAY',
    'StartTime',
    'AvgMaxDailyTemp',
    'AvgMinDailyTemp',
    'School',
    'SchoolType',
]

# Full frozen set for this run: the always-immutable columns plus the
# additional ones held fixed here.
freezed = always_immutable + [
    'FormalClothing',
    'SwC',
    'MC',
    'AvgIndoorRelativeHumidity',
    'IndoorTempDuringSurvey',
    'Gender',
    'TotalCLOwithChair',
    'Grade',
    'Age',
]

# Every column of `data` that is not frozen (TARGET is still included here).
features_to_vary = data.columns.difference(freezed).tolist()

In [None]:
# TARGET is the outcome, not a feature to vary: take it out of the list in
# place (raises ValueError if it is not present).
del features_to_vary[features_to_vary.index(TARGET)]

In [None]:
# Explain every row of the held-out test set.
query_instances = x_test.iloc[:]

In [None]:
# cobj = exp.global_feature_importance(query_instances, total_CFs=10, desired_class= 2, posthoc_sparsity_param=None)
# print(cobj.summary_importance)

In [None]:
# Ask DiCE for 4 counterfactuals per query row whose predicted class is the
# encoded label 4 (original TARGET value 2 under the encoding applied
# before training), changing only the columns in `features_to_vary`.
# The explainer was built with method='random', which samples candidates
# at random; a fixed seed makes the generated counterfactuals repeatable
# across kernel restarts.
cf = exp.generate_counterfactuals(
    query_instances=query_instances,
    total_CFs=4,
    desired_class=4,
    features_to_vary=features_to_vary,
    random_seed=0,
)

In [None]:
# Interleave each test row with the counterfactuals DiCE produced for it,
# and count the rows for which no counterfactual frame exists.
r = []
not_gen = 0
for row_pos in range(len(x_test)):
    # One-row frame holding the query instance itself.
    r.append(x_test[row_pos:row_pos + 1])
    generated = cf.cf_examples_list[row_pos].final_cfs_df
    if generated is None:
        not_gen += 1
    else:
        r.append(generated)

# Single frame: each query row followed by its counterfactuals.
r2 = pd.concat(r)

In [None]:
# Decode the class labels back to the original TARGET scale. This table must
# be the exact inverse of the encoding applied before training
# (-3->0, -2->1, -1->2, 1->3, 2->4, 3->5): labels run 0..5 with no gap, so
# label 3 decodes to 1, label 4 to 2 and label 5 to 3.
mapping = {0: -3, 1: -2, 2: -1, 3: 1, 4: 2, 5: 3}
r2[TARGET] = r2[TARGET].replace(mapping)

In [None]:
# Write the combined query/counterfactual table to the results folder of
# the current TARGET.
r2.to_csv(path_or_buf=f'../results/{TARGET}/TC9.csv')

In [None]:
# Number of test rows for which no counterfactual frame was produced.
print(str(not_gen))