In [58]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
import torch
import numpy as np

from counterfactuals.datasets import LawDataset, AdultDataset
from counterfactuals.cf_methods.ppcef import PPCEF
from counterfactuals.generative_models import MaskedAutoregressiveFlow
from counterfactuals.discriminative_models import MultilayerPerceptron
from counterfactuals.losses import MulticlassDiscLoss
from counterfactuals.metrics import evaluate_cf

In [60]:
# Registry of available experiments:
#   name -> (dataset instance, classifier checkpoint path, flow checkpoint path).
# Every dataset listed here is instantiated when this cell runs, including the
# ones that are not selected below.
datasets = dict(
    adult=(AdultDataset("../data/adult.csv"), "adult_disc_model.pt", "adult_flow.pth"),
    law=(LawDataset("../data/law.csv"), "law_disc_model.pt", "law_flow.pth"),
)

# Pick the experiment to run; the two paths are used when the pretrained
# classifier and flow are loaded further down.
dataset, disc_model_path, gen_model_path = datasets["adult"]

In [61]:
# Rebind `dataset` to a fresh AdultDataset read from the Adult CSV.
# NOTE(review): `dataset` was already bound to an AdultDataset built from this
# same path when the "adult" registry entry was unpacked, so this line loads the
# file a second time — confirm whether the reload is intentional.
dataset = AdultDataset("../data/adult.csv")

In [62]:
# Inspect the raw table held by the dataset (categorical columns are still
# plain strings at this stage, as the rendered output shows).
dataset.raw_data

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,39,Government,Bachelors,Single,White-Collar,White,Male,40,0
1,50,Self-Employed,Bachelors,Married,White-Collar,White,Male,13,0
2,38,Private,HS-grad,Divorced,Blue-Collar,White,Male,40,0
3,53,Private,School,Married,Blue-Collar,Other,Male,40,0
4,28,Private,Bachelors,Married,Professional,Other,Female,40,0
...,...,...,...,...,...,...,...,...,...
1275,51,Government,Bachelors,Married,White-Collar,White,Male,40,1
1276,46,Private,Bachelors,Married,Sales,White,Male,40,1
1277,67,Private,Bachelors,Married,Service,White,Male,7,1
1278,47,Private,HS-grad,Married,Blue-Collar,White,Male,35,0


In [64]:
# Categorical columns of the raw table whose cardinality is inspected below.
categorical = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "race",
    "gender",
]

# Print, one per line, how many distinct non-missing values each column has.
# `nunique()` ignores missing values by default.
for col in categorical:
    num = dataset.raw_data[col].nunique()
    print(num)

4
8
5
6
2
2


In [65]:
# (begin, end) column ranges, one per categorical feature. The widths shown in
# the output match the per-column cardinalities printed in the previous cell,
# so these appear to be the one-hot blocks of the encoded feature matrix.
# They are passed to PPCEF as `categorical_intervals` and reused when the
# counterfactuals are snapped back to one-hot vectors.
dataset.intervals

[(2, 6), (6, 14), (14, 19), (19, 25), (25, 27), (27, 29)]

In [4]:
# dataset = AdultDataset("../data/adult.csv")
# dataset = GermanCreditDataset("../data/german_credit.csv")
# dataset = LawDataset("../data/law.csv")

In [5]:
# Classifier whose input width is taken from the test feature matrix.
# The remaining positional arguments ([512, 512] and 2) are presumably the
# hidden layer sizes and the number of classes — confirm against
# MultilayerPerceptron's signature.
n_input_features = dataset.X_test.shape[1]
disc_model = MultilayerPerceptron(n_input_features, [512, 512], 2)

# Training is disabled. Uncomment to retrain and checkpoint to `disc_model_path`:
# disc_model.fit(
#     dataset.train_dataloader(batch_size=128, shuffle=True),
#     dataset.test_dataloader(batch_size=128, shuffle=False),
#     epochs=5000,
#     patience=100,
#     lr=1e-3,
#     checkpoint_path=disc_model_path,
# )

# Restore the pretrained weights for the selected dataset
# (earlier version hard-coded the path: disc_model.load("adult_disc_model.pt")).
disc_model.load(disc_model_path)

  self.load_state_dict(torch.load(path))


In [6]:
# Predicted labels for the test split as a flat NumPy array.
y_pred = disc_model.predict(dataset.X_test).detach().numpy().flatten()

# Share of test points whose prediction equals the stored label.
test_accuracy = (y_pred == dataset.y_test).mean()
print("Test accuracy:", test_accuracy)

Test accuracy: 0.7635135135135135


In [7]:
# Replace the stored labels with the classifier's own predictions, so every
# later consumer of dataset.y_train / dataset.y_test (e.g. the evaluation call)
# works with the model's decisions instead of the original labels.
# NOTE(review): this mutates `dataset` in place. Re-running the accuracy cell
# after this one compares the predictions with themselves.
# NOTE(review): unlike the accuracy cell, the predictions are not flattened
# here — confirm that downstream code expects this shape.
dataset.y_train = disc_model.predict(dataset.X_train).detach().numpy()
dataset.y_test = disc_model.predict(dataset.X_test).detach().numpy()

In [8]:
# Flow hyper-parameters, gathered in one place and forwarded unchanged to the
# constructor. The input width follows the training feature matrix.
flow_kwargs = dict(
    features=dataset.X_train.shape[1],
    hidden_features=16,
    num_blocks_per_layer=4,
    num_layers=8,
    context_features=1,
    batch_norm_within_layers=True,
    batch_norm_between_layers=True,
    use_random_permutations=True,
)
gen_model = MaskedAutoregressiveFlow(**flow_kwargs)

# Loaders for (re)training the flow; the training loader is requested with
# noise_lvl=0.03.
train_dataloader = dataset.train_dataloader(batch_size=256, shuffle=True, noise_lvl=0.03)
test_dataloader = dataset.test_dataloader(batch_size=256, shuffle=False)

# Training is disabled. Uncomment to retrain and checkpoint to `gen_model_path`.
# NOTE(review): the disabled call passes `train_dataloader` for both loader
# arguments, and `test_dataloader` is not used by it.
# gen_model.fit(
#     train_dataloader,
#     train_dataloader,
#     learning_rate=1e-3,
#     patience=100,
#     num_epochs=500,
#     checkpoint_path=gen_model_path,
# )

# Restore the pretrained flow for the selected dataset
# (earlier version hard-coded the path: gen_model.load("adult_flow.pth")).
gen_model.load(gen_model_path)

  self.load_state_dict(torch.load(path))


In [9]:
# torch.nn.functional.softmax(torch.rand(3, 4), dim=1)

In [None]:
# Counterfactual explainer built on the pretrained flow and classifier.
cf = PPCEF(
    gen_model=gen_model,
    disc_model=disc_model,
    disc_model_criterion=MulticlassDiscLoss(),
    neptune_run=None,
)

# The whole test split is explained, in batches of 1024.
cf_dataloader = dataset.test_dataloader(batch_size=1024, shuffle=False)

# Threshold = 0.25 quantile of the flow's log-probabilities on the test split.
test_log_probs = gen_model.predict_log_prob(cf_dataloader)
log_prob_threshold = torch.quantile(test_log_probs, 0.25)

# Optimise the perturbations. `deltas` is added to `X_orig` afterwards to
# obtain the counterfactual points.
deltas, X_orig, y_orig, y_target, logs = cf.explain_dataloader(
    cf_dataloader,
    alpha=100,
    log_prob_threshold=log_prob_threshold,
    epochs=20000,
    categorical_intervals=dataset.intervals,
)

Discriminator loss: 0.5728, Prob loss: 164.3296:   2%|▏         | 468/20000 [00:06<03:36, 90.15it/s]

In [None]:
# Counterfactuals exactly as optimised (categorical blocks not yet snapped to
# one-hot vectors).
X_cf = X_orig + deltas

# Every counterfactual is flagged as returned by the method (all-ones mask).
# NOTE(review): the keyword is called `median_log_prob`, but the value passed
# is the 0.25-quantile threshold.
cf_metrics = evaluate_cf(
    disc_model=disc_model,
    gen_model=gen_model,
    X_cf=X_cf,
    model_returned=np.ones(X_cf.shape[0]),
    continuous_features=dataset.numerical_features,
    categorical_features=dataset.categorical_features,
    X_train=dataset.X_train,
    y_train=dataset.y_train,
    X_test=X_orig,
    y_test=y_orig,
    median_log_prob=log_prob_threshold,
    y_target=y_target,
)
cf_metrics

{'coverage': 1.0,
 'validity': 0.6441441441441441,
 'actionability': 0.0,
 'sparsity': 1.0,
 'proximity_categorical_hamming': 0.8697565622096709,
 'proximity_categorical_jaccard': 0.8697565622096709,
 'proximity_continuous_manhattan': 0.9260526231383366,
 'proximity_continuous_euclidean': 0.8697565622096709,
 'proximity_continuous_mad': 2.0653113809370622,
 'proximity_l2_jaccard': 0.8697565622096709,
 'proximity_mad_hamming': 2.0653113809370622,
 'prob_plausibility': 0.0,
 'log_density_cf': -39664604.0,
 'log_density_test': 13.249523,
 'lof_scores_cf': 4.313986,
 'lof_scores_test': 1.1692128,
 'isolation_forest_scores_cf': -0.0882531255878314,
 'isolation_forest_scores_test': 0.05782682392755528}

In [None]:
# torch.nn.functional.gumbel_softmax(torch.rand(4, 3), tau=0.1, dim=1)

In [None]:
# Work on a copy so the unsnapped counterfactuals in `X_cf` stay available.
X_cf = X_orig + deltas
X_cf_cat = X_cf.copy()

# Snap each categorical block to a one-hot vector: the column with the largest
# value in the block becomes 1 and every other column in the block becomes 0.
for begin, end in dataset.intervals:
    segment = X_cf_cat[:, begin:end]
    max_indices = np.argmax(segment, axis=1)
    X_cf_cat[:, begin:end] = np.eye(segment.shape[1])[max_indices]

In [None]:
# Same evaluation as for the raw counterfactuals, now on the version whose
# categorical blocks were snapped to one-hot vectors.
# NOTE(review): the keyword is called `median_log_prob`, but the value passed
# is the 0.25-quantile threshold.
cf_metrics_one_hot = evaluate_cf(
    disc_model=disc_model,
    gen_model=gen_model,
    X_cf=X_cf_cat,
    model_returned=np.ones(X_cf_cat.shape[0]),
    continuous_features=dataset.numerical_features,
    categorical_features=dataset.categorical_features,
    X_train=dataset.X_train,
    y_train=dataset.y_train,
    X_test=X_orig,
    y_test=y_orig,
    median_log_prob=log_prob_threshold,
    y_target=y_target,
)
cf_metrics_one_hot

{'coverage': 1.0,
 'validity': 0.740990990990991,
 'actionability': 0.0,
 'sparsity': 0.29902979902979904,
 'proximity_categorical_hamming': 0.19302926251184455,
 'proximity_categorical_jaccard': 0.40563934746235497,
 'proximity_continuous_manhattan': 0.46348221536762624,
 'proximity_continuous_euclidean': 0.40563934746235497,
 'proximity_continuous_mad': 1.618620157344019,
 'proximity_l2_jaccard': 0.40563934746235497,
 'proximity_mad_hamming': 1.4060100723935087,
 'prob_plausibility': 0.29954954954954954,
 'log_density_cf': 9.062393,
 'log_density_test': 13.249523,
 'lof_scores_cf': 1.8406732,
 'lof_scores_test': 1.1692128,
 'isolation_forest_scores_cf': -0.028903640399058225,
 'isolation_forest_scores_test': 0.05782682392755528}

In [None]:
from collections import defaultdict
import bisect

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

SEED = 42  # random_state of the K-fold splitter used for out-of-fold encoding


class TargetEncoderNormalizingDataCatalog:
    """Target-encode the categorical columns of a data catalog and standardise all features.

    ``data`` must expose ``raw`` (a DataFrame), ``continous`` and
    ``categoricals`` (column-name lists; they are concatenated with ``+``),
    ``target`` (name of the label column) and ``immutables``.

    NOTE: ``data.raw`` is not copied. Constructing this object overwrites the
    categorical columns of that frame with their encoded values and then
    standardises every feature column in place.
    """

    def __init__(self, data):
        """Encode and normalise ``data.raw`` in place and keep the fitted state."""
        self.data_frame = data.raw
        self.continous = data.continous
        self.categoricals = data.categoricals
        # Column order used for scaling: categoricals first, then continuous.
        self.feature_names = self.categoricals + self.continous
        self.scaler = StandardScaler()
        self.target = data.target
        self.data_catalog = data
        self.convert_to_target_encoding_form()
        self.normalize_feature()
        self.encoded_feature_name = ""
        self.immutables = data.immutables

    def convert_to_target_encoding_form(self):
        """Replace every categorical column by its out-of-fold target mean.

        Populates:
          * ``target_encoded_dict[feature]`` - Series of the full-data target
            mean per category;
          * ``cat_dict[feature]`` - the same mapping as a plain dict, used by
            ``convert_from_targetenc_to_original_forms`` for decoding.

        The values written into the frame come from a 10-fold split: each row
        is encoded with the category means of the other nine folds. A category
        that does not occur in those nine folds is encoded with their overall
        target mean instead of being left as NaN. Rows whose category is
        itself missing stay NaN.
        """
        self.cat_dict = {}
        self.target_encoded_dict = {}
        for feature in self.categoricals:
            data_tmp = pd.DataFrame(
                {
                    feature: self.data_frame[feature],
                    self.target: self.data_frame[self.target],
                }
            )
            target_mean = data_tmp.groupby(feature)[self.target].mean()
            self.target_encoded_dict[feature] = target_mean
            self.cat_dict[feature] = {
                cat: target_mean[cat] for cat in target_mean.index.tolist()
            }

            encoded = np.repeat(np.nan, self.data_frame.shape[0])
            kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
            for train_idx, holdout_idx in kf.split(self.data_frame):
                train_fold = data_tmp.iloc[train_idx]
                fold_means = train_fold.groupby(feature)[self.target].mean()
                holdout = self.data_frame[feature].iloc[holdout_idx]
                fold_encoded = holdout.map(fold_means).astype(float)
                # Categories present in the hold-out fold but absent from the
                # training folds have no mean to map to; fall back to the
                # training folds' overall target mean so they do not turn
                # into NaN features.
                unseen = fold_encoded.isna() & holdout.notna()
                if unseen.any():
                    fold_encoded = fold_encoded.mask(
                        unseen, train_fold[self.target].mean()
                    )
                encoded[holdout_idx] = fold_encoded.to_numpy()
            # Overwrite only after all folds are done so every fold reads the
            # original categories.
            self.data_frame[feature] = encoded

        self.data_frame[self.categoricals] = self.data_frame[self.categoricals].astype(
            "float"
        )

    def normalize_feature(self):
        """Fit the scaler on all feature columns and standardise them in place."""
        self.data_frame[self.feature_names] = self.scaler.fit_transform(
            self.data_frame[self.feature_names]
        )

    def denormalize_continuous_feature(self, df):
        """Undo the standardisation on ``df`` in place and return it.

        Despite the name, every column in ``feature_names`` (encoded
        categoricals as well as continuous features) is rescaled.
        """
        df[self.feature_names] = self.scaler.inverse_transform(df[self.feature_names])
        return df

    def convert_from_targetenc_to_original_forms(self, df):
        """Map target-encoded values back to category labels, in place.

        Each value is replaced by the category whose full-data target mean
        (``cat_dict``) is closest to it. Values must be on the un-normalised
        scale. When a value is exactly halfway between two means the category
        with the larger mean wins. Missing values stay missing. Categories
        that share the same mean cannot be told apart.

        Returns the same ``df`` object.
        """
        for cat in self.categoricals:
            encoding = self.cat_dict[cat]
            # Category labels and their means, both ordered by ascending mean.
            sorted_keys = sorted(encoding, key=encoding.get)
            sorted_values = [encoding[key] for key in sorted_keys]

            replace_values = []
            for val in df[cat].values:
                # A missing value has no nearest category; bisect would
                # silently place it at position 0.
                if pd.isna(val):
                    replace_values.append(np.nan)
                    continue

                # Binary search for the first mean that is >= val.
                index = bisect.bisect_left(sorted_values, val)

                # Keep the index in range and step back when the left
                # neighbour is strictly closer.
                if index == len(sorted_values):
                    index -= 1
                elif index > 0 and abs(sorted_values[index] - val) > abs(
                    sorted_values[index - 1] - val
                ):
                    index -= 1

                # Label of the category with the smallest absolute difference.
                replace_values.append(sorted_keys[index])
            df[cat] = replace_values
        return df

In [None]:
# Toy data for illustrating target encoding: one categorical column with three
# levels (1, 2, 3) and a binary target.
feature_values = [1, 1, 2, 2, 1, 3, 2]
target_values = [0, 1, 0, 1, 1, 0, 1]
df = pd.DataFrame({"feature": feature_values, "target": target_values})
df

Unnamed: 0,feature,target
0,1,0
1,1,1
2,2,0
3,2,1
4,1,1
5,3,0
6,2,1


In [None]:
# Per-category mean of the target, i.e. the number a target encoder would
# substitute for each category of `feature`.
# NOTE: categories 1 and 2 end up with the same mean in this toy example, so a
# decoder that picks the category with the nearest mean cannot tell them apart.
categorical_feature_values = df.groupby("feature")["target"].mean()
categorical_feature_values

feature
1    0.666667
2    0.666667
3    0.000000
Name: target, dtype: float64