In [23]:
from tensorflow import keras
import os
import joblib
import numpy as np
from explain import  get_relevance, get_critical_neurons
import tensorflow as tf
# from tensorflow import set_random_seed
from scalelayer import  ScaleLayer
from numpy.random import seed
import itertools
import time
import copy
from preprocessing import pre_census_income
import tensorflow.keras.backend as K
import argparse

# Seed NumPy's and TensorFlow's global random generators with fixed values.
seed(1)
tf.random.set_random_seed(2)
# TF1-style session configuration with the GPU `allow_growth` option enabled
# (presumably so TensorFlow does not reserve all GPU memory up front — confirm).
config = tf.ConfigProto()  
config.gpu_options.allow_growth=True 
sess = tf.Session(config=config)

# Make the Keras backend use the session created above.
K.set_session(sess)

def my_loss_fun(y_true, y_pred):
    """Pass-through loss: ignore ``y_true`` and hand back ``y_pred`` unchanged."""
    loss = y_pred
    return loss
class ScaleLayer(tf.keras.layers.Layer):
    """Element-wise gate that multiplies its input by a per-unit scale variable.

    The scale is a ``(1, dense_len)`` variable initialised to ones and created
    with a constraint that clips its values to ``[min, max]``.

    Args:
        dense_len: number of units of the layer whose output is scaled.
        min: lower clip bound for the scale values (default -1).
        max: upper clip bound for the scale values (default 1).
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """
    def __init__(self, dense_len, min=-1, max=1, **kwargs):
        super(ScaleLayer, self).__init__(**kwargs)
        # Keep the clip bounds so get_config() can serialize them; without this a
        # reloaded layer would silently fall back to the default [-1, 1] range.
        self.min_value = min
        self.max_value = max
        # The variable name 'ffff' is kept as-is so previously saved weights still match.
        self.scale = K.variable([[1.] * dense_len], name='ffff',
                                constraint=lambda t: tf.clip_by_value(t, min, max))
        self.dense_len = dense_len
    def call(self, inputs, **kwargs):
        """Return ``inputs`` multiplied element-wise by the scale variable."""
        return inputs * self.scale
    def get_config(self):
        """Return the layer config, including ``dense_len`` and the clip bounds."""
        config = {'dense_len': self.dense_len,
                  'min': self.min_value,
                  'max': self.max_value}
        base_config = super(ScaleLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    
def construct_model(neurons, top_layer, name, min, max, need_weights=True):
    """Build the six-Dense-layer network with a ScaleLayer after the first ``top_layer`` layers.

    Args:
        neurons: sequence of ``(neg, pos)`` pairs of neuron indices; entry ``i`` is
            applied to the i-th ScaleLayer. Only read when ``need_weights`` is true.
        top_layer: number of leading Dense layers that are set non-trainable and
            followed by their ScaleLayer.
        name: name given to the extra scalar output tensor.
        min: lower clip bound forwarded to every ScaleLayer.
        max: upper clip bound forwarded to every ScaleLayer.
        need_weights: when false, return a model with the prediction output only.

    Returns:
        A ``keras.Model`` whose output is the sigmoid prediction, or, when
        ``need_weights`` is true, ``[prediction, new_w]`` where ``new_w`` is a
        ``[1, 1]`` tensor equal to the sum of the scale weights at the ``neg``
        indices minus the sum of those at the ``pos`` indices.

    NOTE(review): reads the global ``X_train`` for the input shape; it is not
    defined in the cells shown here — confirm it exists before calling.
    NOTE(review): the order in which layers are created presumably determines the
    auto-generated ``scale_layer_N`` names that ``layer_map`` refers to — confirm
    before reordering anything.
    """
    in_shape = X_train.shape[1:]
    input = keras.Input(shape=in_shape)
    layer1 = keras.layers.Dense(30, name="layer1")
    d1 = ScaleLayer(30, min, max)
    layer2 = keras.layers.Dense(20, name="layer2")
    d2 = ScaleLayer(20, min, max)
    layer3 = keras.layers.Dense(15, name="layer3")
    d3 = ScaleLayer(15, min, max)
    layer4 = keras.layers.Dense(15, name="layer4")
    d4 = ScaleLayer(15, min, max)
    layer5 = keras.layers.Dense(10,name="layer5")
    d5 = ScaleLayer(10, min, max)
    layer6 = keras.layers.Dense(1, activation="sigmoid", name="layer6")

    layer_lst = [layer1, layer2, layer3, layer4, layer5]
    ds = [d1, d2, d3, d4, d5]
    # Freeze the Dense layers that get a scale gate.
    for layer in layer_lst[0: top_layer]:
        layer.trainable = False

    # Chain the Dense layers; only the first `top_layer` of them are followed by a ScaleLayer.
    x = input
    for i, l in enumerate(layer_lst):
        x = l(x)
        if i < top_layer:
            x = ds[i](x)
    x = layer6(x)

    if not need_weights:
        return keras.Model(input, x)

    # Accumulate a scalar from the scale weights: add the `neg` entries, subtract the `pos` entries.
    w = 0.
    for i, re in enumerate(neurons):
        neg = re[0]
        pos = re[1]
        d = ds[i]
        for m in neg:
            w = tf.math.add(w, d.weights[0][0][m])
        for n in pos:
            w = tf.math.subtract(w, d.weights[0][0][n])
    # Expose the scalar as a named [1, 1] tensor so it can be a second model output.
    new_w = tf.identity(tf.reshape(w, [1, 1]), name=name)

    model = keras.Model(input, [x, new_w])
    return model

def similar_set(X, num_attribs, protected_attribs, constraint):
    """Enumerate copies of ``X`` covering every combination of protected-attribute values.

    Args:
        X: 2-D array of samples; it is copied, never modified.
        num_attribs: accepted for interface compatibility; not used.
        protected_attribs: column indices of the protected attributes.
        constraint: indexable by column; ``constraint[c][0]`` and ``constraint[c][1]``
            are the inclusive lower and upper integer bounds of column ``c``.

    Returns:
        A list with one copy of ``X`` per value combination (in
        ``itertools.product`` order), each with the protected columns overwritten
        by that combination and all other columns unchanged.
    """
    # Inclusive value range of each protected attribute, in the order given.
    value_ranges = [
        list(range(constraint[col][0], constraint[col][1] + 1))
        for col in protected_attribs
    ]
    combinations = np.array(list(itertools.product(*value_ranges)))

    variants = []
    for values in combinations:
        variant = copy.deepcopy(X)
        for column, value in zip(protected_attribs, values):
            variant[:, column] = value
        variants.append(variant)
    return variants

# Maps an attribute key (single letter, or two joined by '&') to the column
# indices of the protected attribute(s) in the feature matrix.
# NOTE(review): 'a' / 'r' / 'g' presumably stand for age / race / gender — confirm
# against the preprocessing module.
pos_map = { 'a': [0],
            'r': [6],
            'g': [7],
            'a&r': [0, 6],
            'a&g': [0, 7],
            'r&g': [6, 7]
            }

# Protected attribute analysed in this notebook.
attr = 'g'
protected_attribs = pos_map[attr]

# Path of a saved sample file; only used by the commented-out np.load below.
data_name = f"data/adult/C-{attr}_ids_EIDIG_INF.npy"
# dis_data = np.load(data_name)

# The full training set is used as the analysis data.
dis_data = pre_census_income.X_train
num_attribs = len(dis_data[0])
# Copy with column 7 replaced by (1 - value); only referenced by the commented-out
# alternative for `similar_X` below.
new_data = dis_data.copy()
new_data[:, 7] = 1 - dis_data[:, 7]

# One copy of the data per combination of protected-attribute values.
similar_X = similar_set(dis_data, num_attribs, protected_attribs, pre_census_income.constraint)
# similar_X = [dis_data, new_data]

# (dense layer name, scale layer name) pairs, indexed by `layer_index`.
# NOTE(review): layer4, layer5 and layer6 all map to 'scale_layer_8' — confirm this
# is intended before using a layer_index above 3.
layer_map = [('layer1', 'scale_layer_5'), ('layer2', 'scale_layer_6'), ('layer3', 'scale_layer_7'), ('layer4', 'scale_layer_8'), ('layer5', 'scale_layer_8'), ('layer6', 'scale_layer_8')]

# Index into `layer_map` of the layer inspected by the following cells.
layer_index = 0

In [24]:
from preprocessing import pre_census_income
from tensorflow.keras.models import Model

# Load the gated model; ScaleLayer has to be supplied as a custom object.
model_path = "models/diff_adult_g_gated_4_diff.h5"
# adult_g_gated_4_0.3_0.2_p-0.1_p0.9.h5
model = keras.models.load_model(
    model_path,
    custom_objects={'ScaleLayer': ScaleLayer},
)
model.summary()

# Sub-model ending at the scale layer (activations after scaling).
layer_name = layer_map[layer_index][1]
scaled_output = model.get_layer(layer_name).output
inter_model = Model(model.input, scaled_output)

# Sub-model ending at the dense layer (activations before scaling).
layer_name = layer_map[layer_index][0]
dense_output = model.get_layer(layer_name).output
inter_model_before = Model(model.input, dense_output)

# Activations for the first two protected-attribute variants of the data.
inter_output_ori = inter_model.predict(similar_X[0])
inter_output_adv = inter_model.predict(similar_X[1])

inter_output_ori_before = inter_model_before.predict(similar_X[0])
inter_output_adv_before = inter_model_before.predict(similar_X[1])

# Total absolute activation difference between the two variants: after, then before scaling.
print(np.abs(inter_output_adv - inter_output_ori).sum())
print(np.abs(inter_output_adv_before - inter_output_ori_before).sum())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
layer1 (Dense)               (None, 30)                390       
_________________________________________________________________
scale_layer_5 (ScaleLayer)   (None, 30)                30        
_________________________________________________________________
layer2 (Dense)               (None, 20)                620       
_________________________________________________________________
scale_layer_6 (ScaleLayer)   (None, 20)                20        
_________________________________________________________________
layer3 (Dense)               (None, 15)                315       
_________________________________________________________________
scale_layer_7 (ScaleLayer)   (None, 15)                15  

In [25]:
# Show the weights stored in the first scale layer of the loaded model.
scale_layer = model.get_layer('scale_layer_5')
weight = scale_layer.get_weights()
print("weight", weight)

weight [array([[ 7.5974339e-04, -5.3571258e-04, -5.2292697e-04, -8.3338644e-05,
         1.9389287e-03,  2.4732639e-04, -4.9206632e-05,  2.5967529e-04,
        -1.2399700e-01,  6.0464849e-04,  9.1479821e-03,  2.2583730e-04,
         2.8091979e-03,  1.4539480e-03,  6.5577053e-04, -1.7852495e-04,
         9.0948762e-03, -7.5216143e-05,  8.5201718e-05, -6.2989420e-04,
         1.3095517e-02, -2.0224368e-04, -1.3083528e-03, -2.0436089e-02,
         9.8191772e-04, -2.0736131e-04, -2.0158633e-03, -1.9681137e-03,
        -8.8466989e-04,  4.5708522e-02]], dtype=float32)]


In [26]:
from explain import  get_relevance, get_critical_neurons

def my_filter(layer_critical, total_num, threshold=0.2):
    """Return the values that occur in more than ``threshold`` of ``total_num`` samples.

    Args:
        layer_critical: array of values (flattened by ``np.unique`` if needed).
        total_num: denominator used to turn occurrence counts into rates.
        threshold: minimum rate (exclusive) a value needs to be kept. Defaults
            to 0.2, the value that used to be hard-coded.

    Returns:
        Sorted array of the unique values whose ``count / total_num`` exceeds
        ``threshold``.
    """
    values, counts = np.unique(layer_critical, return_counts=True)
    rates = counts / total_num
    return values[rates > threshold]

def get_path_dict():
    """List the fine-tuned model files per protected-attribute key.

    Scans ``models/finetuned_models_protected_attributes/adult/`` and, for each
    of the keys ``'r'``, ``'g'`` and ``'a'``, collects the entries whose name
    contains ``"<key>_adult"``, prefixed with the directory and sorted.

    Returns:
        Dict mapping ``'r'``, ``'g'``, ``'a'`` to the sorted list of paths. The
        dict is also printed.
    """
    saved_model_path = "models/finetuned_models_protected_attributes/adult/"
    file_names = os.listdir(saved_model_path)
    markers = (('r', "r_adult"), ('g', "g_adult"), ('a', "a_adult"))
    path_dict = {
        key: sorted(saved_model_path + fname for fname in file_names if marker in fname)
        for key, marker in markers
    }
    print(path_dict)
    return path_dict


def get_penalty_awarded(top_n, layer_num, total_num, income_critical, protected_critical_ls):
    """Split each layer's frequently-critical neurons into penalty and awarded sets.

    For every layer index below ``layer_num`` the critical-neuron arrays of the
    income model and of the first protected-attribute model are reduced with
    ``my_filter``; then

    * penalty = neurons kept for the protected model but not the income model,
    * awarded = neurons kept for the income model but not the protected model.

    Args:
        top_n: number of layer entries to return.
        layer_num: number of layers to process.
        total_num: sample count handed to ``my_filter``.
        income_critical: per-layer critical-neuron arrays of the income model.
        protected_critical_ls: list of such per-layer collections; only element
            0 is read.

    Returns:
        List of ``(penalty, awarded)`` tuples for computed entries ``1`` through
        ``top_n`` (entry 0 is dropped by the final slice).
    """
    per_layer = []
    for layer_idx in range(layer_num):
        income_ids = my_filter(income_critical[layer_idx].flatten(), total_num)
        protected_ids = my_filter(protected_critical_ls[0][layer_idx].flatten(), total_num)
        current_penalty = np.setdiff1d(protected_ids, income_ids)
        current_awarded = np.setdiff1d(income_ids, protected_ids)
        print("current_penalty", current_penalty, "current_awarded", current_awarded)
        per_layer.append((current_penalty, current_awarded))
    return per_layer[1: (top_n + 1)]


# Compare the activation differences of the penalty / awarded / remaining neurons.
# NOTE(review): relies on `inter_output_ori`, `inter_output_adv` and `layer_index`
# from earlier cells, and rebinds the global `model_path`.
path_dict = get_path_dict()
model_path = "models/adult_model.h5"

# Scores and critical neurons for the income model.
# NOTE(review): the meaning of the 0.3 argument is defined by `get_critical_neurons`
# in the `explain` module — confirm there.
income_train_scores = get_relevance(model_path, pre_census_income.X_train,
                                        save_path=os.path.join('scores/adult', os.path.basename(model_path) + ".score"))
income_critical = get_critical_neurons(income_train_scores, 0.3)
finals = []

top_n = 4
protected_critical_ls = []

# Same computation for the `top_n`-th (1-based, in sorted path order) fine-tuned 'g' model.
a = 'g'
path = path_dict[a][top_n - 1]
train_scores = get_relevance(path, pre_census_income.X_train,  save_path=os.path.join('scores/adult', os.path.basename(path) + ".score"))
protected_critical = get_critical_neurons(train_scores, 0.3)
protected_critical_ls.append(protected_critical)

layer_num = len(income_critical)
total_num = len(pre_census_income.X_train)
neurons = get_penalty_awarded(top_n, layer_num, total_num, income_critical, protected_critical_ls)

penalty = neurons[layer_index][0]
awarded = neurons[layer_index][1]
# Neurons of the inspected layer that are in neither set.
normal_neurons = [i for i in range(len(inter_output_adv[0])) if i not in neurons[layer_index][0] and i not in neurons[layer_index][1]]
# Mean (per neuron) total absolute activation difference for each group.
# NOTE(review): an empty group makes the divisor 0 — confirm that cannot happen.
print((np.abs(inter_output_adv[:, penalty] - inter_output_ori[:, penalty])).sum() / len(penalty))
print((np.abs(inter_output_adv[:, awarded] - inter_output_ori[:, awarded])).sum() / len(awarded))
print((np.abs(inter_output_adv[:, normal_neurons] - inter_output_ori[:, normal_neurons])).sum() / len(normal_neurons))
print("penalty", penalty)
print("awarded", awarded)
print("normal", normal_neurons)

# Total absolute activation difference over all neurons of the layer.
print((np.abs(inter_output_adv - inter_output_ori)).sum())

{'r': ['models/finetuned_models_protected_attributes/adult/r_adult_model_1_0.986.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_2_0.946.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_3_0.881.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_4_0.871.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_5_0.859.h5'], 'g': ['models/finetuned_models_protected_attributes/adult/g_adult_model_1_0.997.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_2_0.968.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_3_0.848.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_4_0.826.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_5_0.768.h5'], 'a': ['models/finetuned_models_protected_attributes/adult/a_adult_model_1_0.994.h5', 'models/finetuned_models_protected_attributes/adult/a_adult_model_2_0.965.h5', 'models/finetuned_models_protec

In [27]:
# Reload the gated model (ScaleLayer supplied as a custom object) and rebuild the
# intermediate-activation sub-models.
model_path = "models/diff_adult_g_gated_4_diff.h5"
# adult_g_gated_4_0.3_0.2_p-0.1_p0.2.h5
model = keras.models.load_model(
    model_path,
    custom_objects={'ScaleLayer': ScaleLayer},
)
model.summary()

# Output of the scale layer selected by `layer_index`.
layer_name = layer_map[layer_index][1]
scaled_output = model.get_layer(layer_name).output
inter_model = Model(model.input, scaled_output)

# Output of the matching dense layer, i.e. before scaling.
layer_name = layer_map[layer_index][0]
dense_output = model.get_layer(layer_name).output
inter_model_before = Model(model.input, dense_output)

# Activations for the first two protected-attribute variants of the data.
inter_output_ori = inter_model.predict(similar_X[0])
inter_output_adv = inter_model.predict(similar_X[1])

inter_output_ori_before = inter_model_before.predict(similar_X[0])
inter_output_adv_before = inter_model_before.predict(similar_X[1])

# Total absolute activation difference: after scaling, then before scaling.
print(np.abs(inter_output_adv - inter_output_ori).sum())
print(np.abs(inter_output_adv_before - inter_output_ori_before).sum())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
layer1 (Dense)               (None, 30)                390       
_________________________________________________________________
scale_layer_5 (ScaleLayer)   (None, 30)                30        
_________________________________________________________________
layer2 (Dense)               (None, 20)                620       
_________________________________________________________________
scale_layer_6 (ScaleLayer)   (None, 20)                20        
_________________________________________________________________
layer3 (Dense)               (None, 15)                315       
_________________________________________________________________
scale_layer_7 (ScaleLayer)   (None, 15)                15  

In [28]:
# Show the weights stored in the first scale layer of the reloaded model.
scale_layer = model.get_layer('scale_layer_5')
weight = scale_layer.get_weights()
print("weight", weight)

weight [array([[ 7.5974339e-04, -5.3571258e-04, -5.2292697e-04, -8.3338644e-05,
         1.9389287e-03,  2.4732639e-04, -4.9206632e-05,  2.5967529e-04,
        -1.2399700e-01,  6.0464849e-04,  9.1479821e-03,  2.2583730e-04,
         2.8091979e-03,  1.4539480e-03,  6.5577053e-04, -1.7852495e-04,
         9.0948762e-03, -7.5216143e-05,  8.5201718e-05, -6.2989420e-04,
         1.3095517e-02, -2.0224368e-04, -1.3083528e-03, -2.0436089e-02,
         9.8191772e-04, -2.0736131e-04, -2.0158633e-03, -1.9681137e-03,
        -8.8466989e-04,  4.5708522e-02]], dtype=float32)]


In [29]:
def my_filter(layer_critical, total_num, threshold=0.2):
    """Return the values that occur in more than ``threshold`` of ``total_num`` samples.

    Args:
        layer_critical: array of values (flattened by ``np.unique`` if needed).
        total_num: denominator used to turn occurrence counts into rates.
        threshold: minimum rate (exclusive) a value needs to be kept. Defaults
            to 0.2, the value that used to be hard-coded.

    Returns:
        Sorted array of the unique values whose ``count / total_num`` exceeds
        ``threshold``.
    """
    values, counts = np.unique(layer_critical, return_counts=True)
    rates = counts / total_num
    return values[rates > threshold]

def get_path_dict():
    """Collect the fine-tuned model paths for the keys ``'r'``, ``'g'`` and ``'a'``.

    Every entry of ``models/finetuned_models_protected_attributes/adult/`` whose
    name contains ``"r_adult"``, ``"g_adult"`` or ``"a_adult"`` is stored under the
    matching key, prefixed with the directory; each list is sorted.

    Returns:
        The resulting dict, which is also printed.
    """
    saved_model_path = "models/finetuned_models_protected_attributes/adult/"
    entries = os.listdir(saved_model_path)
    path_dict = {}
    for key, marker in (('r', "r_adult"), ('g', "g_adult"), ('a', "a_adult")):
        matches = [saved_model_path + entry for entry in entries if marker in entry]
        matches.sort()
        path_dict[key] = matches
    print(path_dict)
    return path_dict

def get_penalty_awarded(top_n, layer_num, total_num, income_critical, protected_critical_ls):
    """Split each layer's frequently-critical neurons into penalty and awarded sets.

    For every layer index below ``layer_num`` the critical-neuron arrays of the
    income model and of the first protected-attribute model are reduced with
    ``my_filter`` (both results are printed); then

    * penalty = neurons kept for the protected model but not the income model,
    * awarded = neurons kept for the income model but not the protected model.

    Args:
        top_n: number of layer entries to return.
        layer_num: number of layers to process.
        total_num: sample count handed to ``my_filter``.
        income_critical: per-layer critical-neuron arrays of the income model.
        protected_critical_ls: list of such per-layer collections; only element
            0 is read.

    Returns:
        List of ``(penalty, awarded)`` tuples for computed entries ``1`` through
        ``top_n`` (entry 0 is dropped by the final slice).
    """
    per_layer = []
    for layer_idx in range(layer_num):
        i_critical = my_filter(income_critical[layer_idx].flatten(), total_num)
        print("i_critical", i_critical)
        p_critical = my_filter(protected_critical_ls[0][layer_idx].flatten(), total_num)
        print("p_critical", p_critical)
        current_penalty = np.setdiff1d(p_critical, i_critical)
        current_awarded = np.setdiff1d(i_critical, p_critical)
        print("current_penalty", current_penalty, "current_awarded", current_awarded)
        per_layer.append((current_penalty, current_awarded))
    return per_layer[1: (top_n + 1)]

In [30]:
# Compare the activation differences of the penalty / awarded / remaining neurons,
# then count how often each neuron is active.
# NOTE(review): relies on `inter_output_ori`, `inter_output_adv` and `layer_index`
# from earlier cells, and rebinds the global `model_path`.
path_dict = get_path_dict()
model_path = "models/adult_model.h5"

# Scores and critical neurons for the income model.
income_train_scores = get_relevance(model_path, pre_census_income.X_train,
                                        save_path=os.path.join('scores/adult', os.path.basename(model_path) + ".score"))
income_critical = get_critical_neurons(income_train_scores, 0.3)
finals = []

top_n = 4
protected_critical_ls = []

# Same computation for the `top_n`-th (1-based, in sorted path order) fine-tuned 'g' model.
a = 'g'
path = path_dict[a][top_n - 1]
train_scores = get_relevance(path, pre_census_income.X_train,  save_path=os.path.join('scores/adult', os.path.basename(path) + ".score"))
protected_critical = get_critical_neurons(train_scores, 0.3)
protected_critical_ls.append(protected_critical)

layer_num = len(income_critical)
total_num = len(pre_census_income.X_train)
neurons = get_penalty_awarded(top_n, layer_num, total_num, income_critical, protected_critical_ls)

penalty = neurons[layer_index][0]
awarded = neurons[layer_index][1]
# Neurons of the inspected layer that are in neither set.
normal_neurons = [i for i in range(len(inter_output_adv[0])) if i not in neurons[layer_index][0] and i not in neurons[layer_index][1]]
# Mean (per neuron) total absolute activation difference for each group.
print((np.abs(inter_output_adv[:, penalty] - inter_output_ori[:, penalty])).sum() / len(penalty))
print((np.abs(inter_output_adv[:, awarded] - inter_output_ori[:, awarded])).sum() / len(awarded))
print((np.abs(inter_output_adv[:, normal_neurons] - inter_output_ori[:, normal_neurons])).sum() / len(normal_neurons))
print("penalty", penalty)
print("awarded", awarded)
print("normal", normal_neurons)

# Total absolute activation difference over all neurons of the layer.
print((np.abs(inter_output_adv - inter_output_ori)).sum())

# print(intermediate_output1 - intermediate_output0)

# Per-neuron count of samples with a strictly positive activation. A vectorized
# reduction replaces the nested Python loops; .tolist() keeps `s` a list of ints
# so the printed output is unchanged.
s = (inter_output_ori > 0).sum(axis=0).tolist()
print("values", s)

s = (inter_output_adv > 0).sum(axis=0).tolist()
print("values", s)
# [0, 2, 
# [0, 2, 

{'r': ['models/finetuned_models_protected_attributes/adult/r_adult_model_1_0.986.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_2_0.946.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_3_0.881.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_4_0.871.h5', 'models/finetuned_models_protected_attributes/adult/r_adult_model_5_0.859.h5'], 'g': ['models/finetuned_models_protected_attributes/adult/g_adult_model_1_0.997.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_2_0.968.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_3_0.848.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_4_0.826.h5', 'models/finetuned_models_protected_attributes/adult/g_adult_model_5_0.768.h5'], 'a': ['models/finetuned_models_protected_attributes/adult/a_adult_model_1_0.994.h5', 'models/finetuned_models_protected_attributes/adult/a_adult_model_2_0.965.h5', 'models/finetuned_models_protec

In [31]:
# Per-neuron total absolute activation difference between the two input variants.
r = np.abs(inter_output_adv - inter_output_ori).sum(axis=0)
r

array([ 2.71061   ,  3.0536819 ,  2.024089  ,  0.58497125, 15.076021  ,
        1.8527092 ,  0.569338  ,  1.9673985 , 17.34945   , 13.808918  ,
       14.42024   ,  1.6813678 , 22.541544  ,  3.9344692 ,  7.0689754 ,
        1.0338857 , 71.630424  ,  0.5526728 ,  1.275471  ,  0.97431767,
       43.557945  ,  0.81608135,  3.742669  , 41.862415  ,  2.7926705 ,
        2.3318763 , 23.685438  ,  4.577374  ,  1.0079285 , 60.42346   ],
      dtype=float32)

In [32]:
# Neuron indices ordered from the smallest to the largest value of `r`.
np.argsort(r)

array([17,  6,  3, 21, 19, 28, 15, 18, 11,  5,  7,  2, 25,  0, 24,  1, 22,
       13, 27, 14,  9, 10,  4,  8, 12, 26, 23, 20, 29, 16])

In [51]:
# Count, per neuron (column), how many rows have a strictly positive activation.
# NOTE(review): `intermediate_output_ori_0` is not defined in any cell shown here;
# this cell presumably depends on state from a deleted or out-of-order cell — confirm.
s = [0 for i in range(len(intermediate_output_ori_0[0]))]
for i in intermediate_output_ori_0:
    for j in range(len(i)):
        if i[j] > 0:   
            s[j] += 1
print(s)
# [0, 2, 5, 6, 8, 12]
#  [4, 9 ,10, 13, 14]

[13762, 14828, 28289, 27386, 31148, 1354, 29, 26, 24738, 417, 28818, 26726, 1344, 949, 24587]


In [24]:
# Baseline (ungated) model: measure how much the layer1 activations change
# between the first two protected-attribute variants of the saved sample file.
model_path = "models/adult_model.h5"
model = keras.models.load_model(model_path)
model.summary()

attr = 'g'
protected_attribs = pos_map[attr]

# Samples come from the saved .npy file for this attribute, not from X_train.
data_name = f"data/adult/C-{attr}_ids_EIDIG_INF.npy"
dis_data = np.load(data_name)
num_attribs = len(dis_data[0])

similar_X = similar_set(
    dis_data,
    num_attribs,
    protected_attribs,
    pre_census_income.constraint,
)

# Sub-model exposing the output of the first dense layer.
layer_name = 'layer1'
layer1_output = model.get_layer(layer_name).output
intermediate_layer_model = Model(model.input, layer1_output)

intermediate_output0 = intermediate_layer_model.predict(similar_X[0])
intermediate_output1 = intermediate_layer_model.predict(similar_X[1])
# Total absolute activation difference between the two variants.
print(np.abs(intermediate_output1 - intermediate_output0).sum())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer1 (Dense)               (None, 30)                390       
_________________________________________________________________
layer2 (Dense)               (None, 20)                620       
_________________________________________________________________
layer3 (Dense)               (None, 15)                315       
_________________________________________________________________
layer4 (Dense)               (None, 15)                240       
_________________________________________________________________
layer5 (Dense)               (None, 10)                160       
_________________________________________________________________
layer6 (Dense)               (None, 1)                 11        
Total params: 1,736
Trainable params: 1,736
Non-trainable params: 0
______________________________________________________