In [1]:
import copy
import tqdm
import optuna
import pickle

import learn2clean
import learn2clean.loading.reader as rd 
import learn2clean.qlearning.qlearner as ql
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load_dataset import load
from classifier import *
from utils import *
from metrics import *  # include fairness and corresponding derivatives
from API_Design_a import MissingValueError, SamplingError, Injector
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import mutual_info_score, auc, roc_curve, roc_auc_score, f1_score
from scipy.stats import wasserstein_distance
from optuna.samplers import *

ModuleNotFoundError: No module named 'load_dataset'

In [6]:
# Silence every warning category for the rest of the session.
import warnings
warnings.simplefilter('ignore')

In [None]:
# Name of the dataset handed to load(), and the sensitive attribute column name.
dataset, sens_attr = 'adult', 'gender'

In [None]:
# Fetch the train/test features and labels for the configured dataset.
(X_train, X_test,
 y_train, y_test) = load(dataset)

In [None]:
# Work on deep copies so the frames returned by load() are left untouched.
X_train_orig, X_test_orig = copy.deepcopy(X_train), copy.deepcopy(X_test)

# Use 1/4 of training data as validation set.
# NOTE(review): `seed` is not defined in the cells visible here; presumably it
# comes from one of the star imports -- confirm.
# NOTE: y_train is overwritten by this split while X_train_orig is rebuilt from
# X_train, so re-running this cell without re-running the load cell pairs the
# full feature copy with the already-reduced labels.
X_train_orig, X_val_orig, y_train, y_val = train_test_split(
    X_train_orig, y_train, test_size=0.25, random_state=seed
)

In [None]:
# Give every split a fresh 0..n-1 index so that features and labels of the same
# split share index labels (label-based lookups such as `.loc[i]` rely on this).
X_train_orig, X_val_orig, X_test_orig = (X_train_orig.reset_index(drop=True),
                                         X_val_orig.reset_index(drop=True),
                                         X_test_orig.reset_index(drop=True))
# y_val is reset together with y_train / y_test: X_val_orig is re-indexed above,
# so leaving y_val on its shuffled pre-split index would misalign the pair.
y_train, y_val, y_test = (y_train.reset_index(drop=True),
                          y_val.reset_index(drop=True),
                          y_test.reset_index(drop=True))

## Learn2Clean with Decision Tree

In [None]:
# Display the column labels of the training feature frame.
X_train_orig.columns

In [None]:
# create pattern function given subpopulation
def create_pattern(col_list, lb_list, ub_list):
    """Build a row-selection predicate for a subpopulation given per-column ranges.

    Parameters
    ----------
    col_list : sequence of str
        Names of the columns to constrain. The special name ``'Y'`` refers to
        the label series passed to the predicate instead of a feature column.
    lb_list, ub_list : sequence
        Inclusive lower / upper bounds, one pair per entry of ``col_list``.

    Returns
    -------
    callable
        ``pattern(data_X, data_y)`` returning a 1-D integer numpy array with one
        entry per row of ``data_X``: 1 where every constraint holds, 0 otherwise.
        A value only violates a constraint when it compares strictly below the
        lower bound or strictly above the upper bound, so NaN never violates one.

    Raises
    ------
    SyntaxError
        If the three lists do not have the same length (exception type kept so
        existing callers that catch it keep working).
    """
    # Explicit check instead of `assert` (which is stripped under `python -O`)
    # wrapped in a bare `except`.
    try:
        same_length = len(col_list) == len(lb_list) == len(ub_list)
    except TypeError:
        # Unsized arguments were also reported as SyntaxError before.
        same_length = False
    if not same_length:
        raise SyntaxError(
            "col_list, lb_list and ub_list must have the same length, got "
            f"{col_list!r}, {lb_list!r}, {ub_list!r}"
        )

    def pattern(data_X, data_y):
        # Start with every row selected, then drop rows that break any bound.
        keep = np.ones(len(data_X.index), dtype=bool)
        for col, lb, ub in zip(col_list, lb_list, ub_list):
            if col == 'Y':
                # Look labels up by the feature frame's index labels, exactly
                # as a per-row `data_y.loc[i]` would.
                values = data_y.loc[data_X.index]
            else:
                values = data_X[col]
            # Written as "not out of range" (rather than "in range") so that
            # NaN, for which both comparisons are False, stays selected.
            out_of_range = (values < lb) | (values > ub)
            keep &= ~np.asarray(out_of_range, dtype=bool)
        return keep.astype(int)

    return pattern



In [None]:
import os

# Create the output directory with the standard library instead of shelling out
# to `mkdir`: this does not depend on the platform's shell, is a no-op when the
# directory already exists, and raises if the directory cannot be created
# instead of failing silently.
os.makedirs('./save/', exist_ok=True)

In [None]:
# Baseline: run Learn2Clean on the clean (un-injected) training data.
learn2clean_dataset = {
    "train": X_train_orig,
    "test": X_test_orig,
    "target": y_train,
    "target_test": y_test,
}

# file_name is only used to name the saved log and cannot be None.
# target_goal is the name of y (y_train and y_test should be pandas Series).
l2c_c1assification1 = ql.Qlearner(
    dataset=learn2clean_dataset,
    goal='CART',
    target_goal='income',
    threshold=0.6,
    target_prepare=None,
    file_name='adult',
    verbose=False,
)

# Score returned by learn2clean(); the notebook treats it as the reference AUC.
# NOTE(review): confirm which metric Learn2Clean reports for goal='CART'.
baseline_auc = l2c_c1assification1.learn2clean()
baseline_auc

In [None]:
# Upper bound on the number of values a single injection may corrupt.
budget = 4000
# Columns the searched subpopulation may constrain; 'Y' stands for the label.
col_list = [
    'education',
    'marital',
    'gender',
    'Y',
]
def objective(trial, budget=budget/2, col_id=3, precedent_injection=[]):
    """Optuna objective: corrupt a sampled subpopulation and score Learn2Clean on it.

    For every name in the module-level ``col_list`` the trial samples an
    inclusive integer range (lower bound, then width) inside the observed
    min/max of that column (``'Y'`` uses ``y_train``). Rows matching all ranges
    form the subpopulation targeted by a ``MissingValueError`` on ``col_id``.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the range bounds and to store user attributes.
    budget : number
        Cap on the number of errors; the default is evaluated once, when the
        function is defined, from the module-level ``budget``.
    col_id : int
        Passed as the first argument of ``MissingValueError``.
    precedent_injection : list
        Errors applied before the new one. The default list is never mutated:
        a new list is built with ``+``.

    Returns
    -------
    The score returned by ``learn2clean()`` on the corrupted data minus
    ``baseline_auc`` (negative when the corrupted run scores below baseline).
    NOTE(review): the study is created without an explicit direction; if
    optuna's default is to minimize, the search favours the largest loss --
    confirm.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When no training row matches the sampled ranges.
    """
    lb_list, ub_list = [], []
    for col in col_list:
        # 'Y' constrains the label series; any other name a feature column.
        values = y_train if col == 'Y' else X_train_orig[col]
        lower = trial.suggest_int(col+'_mv_lb', values.min(), values.max())
        width = trial.suggest_int(col+'_mv_int', 0, values.max() - lower)
        lb_list.append(lower)
        ub_list.append(width + lower)

    mv_pattern = create_pattern(col_list, lb_list, ub_list)
    n_matching = np.sum(mv_pattern(X_train_orig, y_train))
    if n_matching == 0:
        # Nothing to corrupt for this subpopulation.
        raise optuna.exceptions.TrialPruned()
    mv_num = min(n_matching, budget)

    # Third argument is mv_num / n_matching, i.e. at most 1.
    # NOTE(review): presumably the fraction of matching rows to corrupt -- confirm.
    mv_err = MissingValueError(col_id, mv_pattern, mv_num/n_matching)
    error_injector = Injector(error_seq = precedent_injection + [mv_err])
    dirty_X, dirty_y, _, _ = error_injector.inject(
        X_train_orig.copy(), y_train.copy(), X_train_orig, y_train
    )

    # learn2clean
    learner = ql.Qlearner(
        dataset={"train": dirty_X, "test": X_test_orig,
                 "target": dirty_y, "target_test": y_test},
        goal='CART',
        target_goal='income',
        target_prepare=None,
        file_name='adult',
        verbose=False,
    )
    dirty_score = learner.learn2clean()

    auc_drop = dirty_score - baseline_auc
    trial.set_user_attr('num_errs', mv_num)
    trial.set_user_attr('auc_drop', auc_drop)
    trial.set_user_attr('error_injector', error_injector)
    return auc_drop


# Optimize the first injection, targeting col_id=3.
# NOTE(review): the original note calls this the "3rd column: education";
# confirm which column col_id=3 refers to.
# The sampler is seeded with the budget value.
study1 = optuna.create_study(sampler=TPESampler(seed=budget))
study1.optimize(
    lambda trial: objective(trial, budget=budget, col_id=3, precedent_injection=[]),
    n_trials=50,
)

# Pull the winning configuration out of the best trial's user attributes.
num_errs_used = study1.best_trial.user_attrs['num_errs']
learned_err_injection_list = study1.best_trial.user_attrs['error_injector'].error_seq
print(f"Injected {num_errs_used} errors.")

Now we re-apply the learned injector to the clean training data and re-run Learn2Clean to verify the resulting change in score.

In [None]:
# Re-apply the best trial's injector to fresh copies of the clean training data.
learned_injector = study1.best_trial.user_attrs['error_injector']
dirty_X_train_orig, dirty_y_train, _, _ = learned_injector.inject(
    X_train_orig.copy(), y_train.copy(), X_train_orig, y_train
)

learn2clean_dataset = {
    "train": dirty_X_train_orig,
    "test": X_test_orig,
    "target": dirty_y_train,
    "target_test": y_test,
}

l2c_c1assification1 = ql.Qlearner(
    dataset=learn2clean_dataset,
    goal='CART',
    target_goal='income',
    target_prepare=None,
    file_name='adult',
    verbose=False,
)
# NOTE: this top-level name shadows the `auc` function imported from
# sklearn.metrics in the import cell.
auc = l2c_c1assification1.learn2clean()
auc

In [None]:
# Baseline score minus the score obtained on the error-injected data;
# a positive value means the injected errors lowered the score.
baseline_auc - auc