#Imports and Dataset processing

In [31]:
import pandas as pd
import time
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
import copy
from sympy import Symbol
from sympy.solvers import solve
pd.options.mode.chained_assignment = None
import csv
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter('ignore')

Import Dataset

In [32]:

url = "https://raw.githubusercontent.com/niceIrene/remedy/main/datasets/CleanAdult_numerical_cat.csv"
data = pd.read_csv(url)
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,0,2,226802,1,7,4,6,3,2,1,0,0,0,38,0
1,1,2,89814,11,9,2,4,0,4,1,0,0,1,38,0
2,0,1,336951,7,12,2,10,0,4,1,0,0,0,38,1
3,1,2,160323,15,10,2,6,0,2,1,1,0,0,38,1
4,1,2,198693,0,6,4,7,1,4,1,0,0,0,38,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,0,2,257302,7,12,2,12,5,4,0,0,0,0,38,0
45218,1,2,154374,11,9,2,6,0,4,1,0,0,0,38,1
45219,2,2,151910,11,9,6,0,4,4,0,0,0,0,38,0
45220,0,2,201490,11,9,4,0,3,4,1,0,0,0,38,0


In [4]:
# get training and testing set

# protected attributes
columns_compas = ['age', 'marital-status','relationship', 'race','gender',
                                          'native-country']

# all columns of dataset
columns_all = ['age', 'workclass','education', 'educational-num', 'marital-status', 'occupation', 'relationship', 'race','gender', 'capital-gain', 'capital-loss','hours-per-week', 'native-country']

compas_y = 'income'
def split_train_test(data,test_ratio):
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

def get_train_test(data, split, list_cols, y_label):
  all_list = copy.deepcopy(list_cols)
  all_list.append(y_label)
  data = pd.DataFrame(data, columns = all_list)
  train_set,test_set = split_train_test(data,split)
  print(len(train_set), "train +", len(test_set), "test")
  train_x = pd.DataFrame(train_set, columns = list_cols)
  train_label = train_set[y_label]
  test_x = pd.DataFrame(test_set, columns = list_cols)
  test_label = test_set[y_label]
  return train_x, test_x, train_label, test_label, train_set, test_set

In [5]:
train_x, test_x, train_label, test_label, train_set, test_set  = get_train_test(data, 0.3, columns_all, compas_y)

31656 train + 13566 test


In [6]:
def fpr_onegroup(true, predict):
    fp = 0
    tn = 0
    for i in range(len(true)):
        if (true[i] == 0 and predict[i] == 1):
            fp += 1
        if(true[i] == 0 and predict[i] == 0):
            tn += 1
    return fp/(fp+tn)


def fnr_onegroup(true, predict):
    fn = 0
    tp = 0
    for i in range(len(true)):
        if (true[i] == 1 and predict[i] == 0):
            fn += 1
        if(true[i] == 1 and predict[i] == 1):
            tp += 1
    return fn/(fn+tp)

Initialize ML models of Choice

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer


filter_count = 30



scoring = make_scorer(accuracy_score)


# #####################
param_gridlg = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100]
}
logreg = LogisticRegression(random_state=42, max_iter=1000)
gridlg = GridSearchCV(logreg, param_grid=param_gridlg, scoring=scoring, cv=5)
# #####################
param_griddt = {
    'max_depth': [2, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
dt = DecisionTreeClassifier(random_state=42)
griddt = GridSearchCV(dt, param_grid=param_griddt, scoring=scoring, cv=5)

# #####################
param_gridrf = {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50, 100], 'random_state':[17]}
# param_gridrf = {
#     'max_depth': [10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
# }
rf = RandomForestClassifier(random_state=42)
gridrf = GridSearchCV(rf, param_grid=param_gridrf, scoring=scoring, cv=5)




# Divexplorer

In [8]:
!pip install DivExplorer==0.1.1

Collecting DivExplorer==0.1.1
  Downloading divexplorer-0.1.1-py3-none-any.whl (28 kB)
Collecting python-igraph>=0.8.3 (from DivExplorer==0.1.1)
  Downloading python_igraph-0.11.4-py3-none-any.whl (9.1 kB)
Collecting igraph==0.11.4 (from python-igraph>=0.8.3->DivExplorer==0.1.1)
  Downloading igraph-0.11.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable>=1.6.2 (from igraph==0.11.4->python-igraph>=0.8.3->DivExplorer==0.1.1)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets>=7.2.1->DivExplorer==0.1.1)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: texttable, jedi, igraph, python-igraph, DivExplorer
Successfu

In [9]:
def fairness_score_computation(d, metrics):
    sum_of_score = 0
    for idx, row in d.iterrows():
      sum_of_score += row['support'] * row[metrics]
    return sum_of_score

In [10]:
def get_test_predict(gridalg, train_x, alg_name):
  gridalg.fit(train_x, train_label)
  print("best {}".format(alg_name), gridalg.best_score_)
  test_predict = gridalg.predict(test_x)
  data_all = pd.concat([train_x,test_x])
  data_predict = gridalg.predict(data_all)
  test_predict = gridalg.predict(test_x)
  data['predicted'] = data_predict
  test_set['predicted'] = test_predict

In [11]:

# # for SVC
clf = SVC(kernel='rbf', C=6.0, gamma = 100, random_state =42)
# clf.fit(train_x, train_label)
# test_predict = clf.predict(test_x)
# test_set['predicted'] = test_predict

In [12]:
with open('Adult_results.csv', 'w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(["Dataset","Remedy", "Algorithm","d_fpr","d_fnr", "d_acc", "model_acc"])
  # writer.writerow(["Adult", "Original", "DT", dfpr, dfnr, dacc, accuracy])

# For entire dataset


In [13]:
import itertools
def get_unfair_group(list_parse, entire = 1):
  unfair_group = []
  unfair_dict = {}
  names = []
  for col in columns_compas:
    found = False
    for item in list_parse:
      attr_given = item.split("=")[0]
      if col == attr_given:
        unfair_group.append(int(item.split("=")[1]))
        names.append(attr_given)
        unfair_dict[attr_given] = int(item.split("=")[1])
        found = True
  # if use the entire dataset
  if entire:
    return unfair_group, names, columns_compas, unfair_dict
        # break
    # if found == False:
    #   unfair_group.append(-1)
  return unfair_group, names, list(set(columns_compas).symmetric_difference(set(names))), unfair_dict
def candidate_groups(skew_candidates, unfair_dict, ordering, names):
  candidate_combos = []
  candidate_ind = {}
  num = 0
  for i in range(len(skew_candidates)+1):
    temp_candidate = list(itertools.combinations(skew_candidates, i))
    for tc in temp_candidate:
      candidate_ind[num] = list(tc)
      num += 1
  return candidate_ind

def name_val_dict(train_set,names):
  names_values = {}
  for n in names:
    names_values[n] = list(train_set[n].unique())
  return names_values



In [14]:
def get_temp(train_set, names, y_label):
  names2 = copy.deepcopy(names)
  names2.append(y_label)
  temp = train_set[names2]
  temp['cnt'] = 0
  temp2 = temp.groupby(names2)['cnt'].count().reset_index()
  temp2['cnt'].sum()
  return temp2, names
temp2, names = get_temp(train_set, columns_compas, compas_y)
temp2

Unnamed: 0,age,marital-status,relationship,race,gender,native-country,income,cnt
0,0,0,1,0,0,38,0,4
1,0,0,1,0,1,38,0,4
2,0,0,1,0,1,38,1,1
3,0,0,1,1,1,18,1,1
4,0,0,1,1,1,29,0,1
...,...,...,...,...,...,...,...,...
1743,2,6,4,4,0,38,1,8
1744,2,6,4,4,1,7,1,1
1745,2,6,4,4,1,25,1,1
1746,2,6,4,4,1,38,0,17


In [15]:
def get_temp_g(train_set, names, y_label):
  names2 = copy.deepcopy(names)
  names2.append(y_label)
  temp = train_set[names2]
  temp['cnt'] = 0
  temp_g = temp.groupby(names)['cnt'].count().reset_index()
  return temp, temp_g

In [16]:
unfair_group, unfair_names, skew_candidates, unfair_dict = get_unfair_group([])
print(unfair_group, unfair_names, skew_candidates, unfair_dict)
all_names = candidate_groups(skew_candidates, unfair_dict, columns_compas, unfair_names)
names_values = name_val_dict(train_set, names)

all_names_lst = list(all_names.keys())[1:] # CHANGED HERE
all_names_lst.reverse()
all_names_lst

[] [] ['age', 'marital-status', 'relationship', 'race', 'gender', 'native-country'] {}


[63,
 62,
 61,
 60,
 59,
 58,
 57,
 56,
 55,
 54,
 53,
 52,
 51,
 50,
 49,
 48,
 47,
 46,
 45,
 44,
 43,
 42,
 41,
 40,
 39,
 38,
 37,
 36,
 35,
 34,
 33,
 32,
 31,
 30,
 29,
 28,
 27,
 26,
 25,
 24,
 23,
 22,
 21,
 20,
 19,
 18,
 17,
 16,
 15,
 14,
 13,
 12,
 11,
 10,
 9,
 8,
 7,
 6,
 5,
 4,
 3,
 2,
 1]

# Helper Functions

## General Helper Functions


In [17]:
def get_one_degree_neighbors(temp2, names, group_lst):
    result = []
    for i in range(len(group_lst)):
        d = copy.copy(temp2)
        for k in range(len(group_lst)):
            if k != i:
                d = d[d[names[k]] == group_lst[k]]
            else:
                d = d[d[names[k]] != group_lst[k]]
        # print(d)
        result.append(d)
    return result

In [18]:
# compute the pos/neg ration of this neighbor
def compute_neighbors(group_lst, result):
    # compute the ratio of positive and negative records
    start2 = time.time()
    pos = 0
    neg = 0
    for r in result:
        total  = r['cnt'].sum()
        r = r[r[compas_y] == 1]
        pos += r['cnt'].sum()
        neg += total - r['cnt'].sum()
    if(neg == 0):
        return (pos, neg, -1)
    end2 = time.time()
    # print("The time to compute the neighbor counts for " +  str(group_lst) +" is " + str(end2-start2))
    return(pos, neg, pos/neg)

In [19]:
def compute_diff_add_and_remove(group_lst, temp2, need_positive_or_negative, label, names):
    d = copy.copy(temp2)
    for i in range(len(group_lst)):
        d = d[d[names[i]] == group_lst[i]]
        # print(len(d))
    total =  d['cnt'].sum()
    # Total here was 0: here, errors when this is commented out
    if total == 0:
      return -1
    d = d[d[label] == 1]
    pos = d['cnt'].sum()
    # print(d, group_lst)
    neg = total - pos
    result = get_one_degree_neighbors(temp2,names, group_lst)
    neighbors = compute_neighbors(group_lst, result)
    if(need_positive_or_negative == 1):
        # need pos
        x = Symbol('x')
        try:
          diff = solve((pos + x)/ (neg - x) - neighbors[2])[0]
        except:
          return -1

    else:
        #need negative
        x = Symbol('x')
        try:
          diff = solve((pos - x)/ (neg + x) - neighbors[2])[0]
        except:
          return -1
    return diff

In [20]:
def compute_diff_add(group_lst, temp2, names, label_y, need_positive_or_negative):
    d = copy.copy(temp2)

    for i in range(len(group_lst)):
        d = d[d[names[i]] == group_lst[i]]
    total =  d['cnt'].sum()
    d = d[d[label_y] == 1]
    pos = d['cnt'].sum()
    neg = total - pos
    result = get_one_degree_neighbors(temp2, names, group_lst)
    neighbors = compute_neighbors(group_lst, result)
    if(need_positive_or_negative == 1):

        x = Symbol('x')
        try:
          diff = solve((pos + x)/ neg -  neighbors[2])[0]
        except:
          return -1

        print(neighbors[2], pos, neg, diff)
    else:
        #need negative
        x = Symbol('x')
        try:
          diff = solve(pos/ (neg + x) -  neighbors[2])[0]
        except:
          return -1
    print(neighbors[2], pos, neg, diff)
    return diff

def compute_diff_remove(group_lst, temp2, names, label_y, need_positive_or_negative):
    d = copy.copy(temp2)
    for i in range(len(group_lst)):
        d = d[d[names[i]] == group_lst[i]]
    total =  d['cnt'].sum()
    d = d[d[label_y] == 1]
    pos = d['cnt'].sum()
    neg = total - pos
    result = get_one_degree_neighbors(temp2, names, group_lst)
    neighbors = compute_neighbors(group_lst, result)
    if(need_positive_or_negative == 1):
        # need pos, remove some neg
        x = Symbol('x')
        try:
          diff = solve( pos/ (neg - x) -  neighbors[2])[0]
        except:
          return -1
        print(neighbors[2], pos, neg, diff)
    else:
        #need negative
        x = Symbol('x')
        try:
          diff = solve((pos -x )/ neg -  neighbors[2])[0]
        except:
          return -1
        print(neighbors[2], pos, neg, diff)
    return diff


In [21]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer

def div_results(db, remedy, algo):
  columns_compas.extend([compas_y, "predicted"])

  df = pd.DataFrame(test_set, columns = columns_compas)

  columns_compas.remove(compas_y)
  columns_compas.remove('predicted')
  class_map={'N': 0, 'P': 1}

  min_sup=0.1



  fp_diver=FP_DivergenceExplorer(df,compas_y, "predicted", class_map=class_map)
  FP_fm=fp_diver.getFrequentPatternDivergence(min_support=min_sup, metrics=["d_fpr", "d_fnr", "d_accuracy"])
  from divexplorer.FP_Divergence import FP_Divergence
  fp_divergence_fpr=FP_Divergence(FP_fm, "d_fpr")
  fp_divergence_fnr=FP_Divergence(FP_fm, "d_fnr")
  fp_divergence_acc=FP_Divergence(FP_fm, "d_accuracy")

  INFO_VIZ=["support", "itemsets",  fp_divergence_fpr.metric, fp_divergence_fpr.t_value_col]
  INFO_VIZ2=["support", "itemsets",  fp_divergence_fnr.metric, fp_divergence_fnr.t_value_col]
  INFO_VIZ3=["support", "itemsets",  fp_divergence_acc.metric, fp_divergence_acc.t_value_col]

  K=200
  # summerization
  eps=0.01



  d = fp_divergence_fpr.getDivergence(th_redundancy=eps)[INFO_VIZ]
  d2 = fp_divergence_fnr.getDivergence(th_redundancy=eps)[INFO_VIZ2]
  d3 = fp_divergence_acc.getDivergence(th_redundancy=eps)[INFO_VIZ3]


  d= d[d['d_fpr'] > 0]
  d2= d2[d2['d_fnr'] > 0]
  d3= d3[d3['d_accuracy'] > 0]

  dfpr = fairness_score_computation(d, 'd_fpr')
  dfnr = fairness_score_computation(d2, 'd_fnr')
  dacc = fairness_score_computation(d3, 'd_accuracy')
  print(dfpr)
  print(dfnr)
  print(dacc)
  accuracy = accuracy_score(test_label, test_set['predicted'])
  print("accuracy is " , accuracy)
  writelist = [db,remedy,algo, dfpr, dfnr, dacc, accuracy]
  with open('Adult_results.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(writelist)
  print(writelist)
  return d,d2,d3

## Optimized Helper Function


In [22]:
# helper function for optimized
def compute_neighbors_opt(group_lst,lst_of_counts, pos, neg):
    #start2 = time.time()
    times = len(group_lst)
    pos_cnt = 0
    neg_cnt = 0
    for i in range(times):
        df_groupby = lst_of_counts[i]
        temp_group_lst_pos = copy.copy(group_lst)
        temp_group_lst_neg = copy.copy(group_lst)
        del temp_group_lst_pos[i]
        del temp_group_lst_neg[i]
        # count positive
        temp_group_lst_pos.append(1)
        group_tuple_pos = tuple(temp_group_lst_pos)
        if group_tuple_pos in df_groupby.keys():
            pos_cnt += df_groupby[group_tuple_pos]
        else:
            pos_cnt += 0
        # count negative
        temp_group_lst_neg.append(0)
        group_tuple_neg = tuple(temp_group_lst_neg)
        if group_tuple_neg in df_groupby.keys():
            neg_cnt += df_groupby[group_tuple_neg]
        else:
            neg_cnt += 0
    pos_val = pos_cnt - times* pos
    neg_val = neg_cnt - times* neg
    #end2 = time.time()
    #print("The time to compute the neighbor counts for " +  str(group_lst) +" is " + str(end2-start2))
    if neg_val == -1 or (neg_val == 0 and pos_val == 0):
        return (pos_val, neg_val, -1)
    if pos_val == 0 or neg_val == 0:
        return (pos_val, neg_val, 0)
    # print("here", pos_val, neg_val, pos_val/neg_val)

    return (pos_val, neg_val, pos_val/neg_val)

  and should_run_async(code)


In [23]:
# get the list of neighbors
def get_one_degree_neighbors_opt(group_lst):
    start1 = time.time()
    result = []
    for i in range(len(group_lst)):
        d = copy.copy(group_lst)
        d[i] = 'x'
        result.append(d)
    end1 = time.time()
    return result

  and should_run_async(code)


In [24]:
def determine_problematic_opt(group_lst, names, temp2, lst_of_counts, label, threshold= 0.3):
    #0: ok group, 1: need negative records, 2: need positive records
    d = copy.copy(temp2)
    for i in range(len(group_lst)):
        d = d[d[names[i]] == group_lst[i]]
    total =  d['cnt'].sum()
    d = d[d[label] == 1]
    pos = d['cnt'].sum()
    neg = total - pos
    neighbors = compute_neighbors_opt(group_lst,lst_of_counts, pos, neg)
    if(neighbors[2] == -1):
        # there is no neighbors
        return 0
    if(total > 30):
        # need to be large enough, need to adjust with different datasets.
        if neg == 0:
            if (pos > neighbors[2]):
                return 1
            if(pos <= neighbors[2]):
                return 0
        if (pos/(neg) - neighbors[2] > threshold):
            # too many positive records
            return 1
        if (neighbors[2] - pos/(neg) > threshold):
            return 2
    return 0

  and should_run_async(code)


In [25]:
def compute_problematic_opt(temp2, temp_g, names, label, lst_of_counts):
    need_pos = []
    need_neg = []
    for index, row in temp_g.iterrows():
        group_lst = []
        for n in names:
            group_lst.append(row[n])
        problematic = determine_problematic_opt(group_lst, names, temp2, lst_of_counts,label)
#         #print(problematic)
        if(problematic == 1):
            if group_lst not in need_neg:
                need_neg.append(group_lst)
        if(problematic == 2):
            if group_lst not in need_pos:
                need_pos.append(group_lst)
    return need_pos, need_neg

  and should_run_async(code)


In [26]:
# build the list of X00
def compute_lst_of_counts(temp, names, label):
    # get the list of group-by attributes
    lst_of_counts = []
    for i in range(len(names)):
        grp_names = copy.copy(names)
        del grp_names[i]
        grp_names.append(label)
        temp_df = temp.groupby(grp_names)['cnt'].count()
        lst_of_counts.append(temp_df)
    return lst_of_counts

def get_tuple(group_lst):
    return tuple(group_lst)


def get_temp_g(train_set, names, y_label):
  names2 = copy.deepcopy(names)
  names2.append(y_label)
  temp = train_set[names2]
  temp['cnt'] = 0
  temp_g = temp.groupby(names)['cnt'].count().reset_index()
  return temp, temp_g

  and should_run_async(code)


# Preferential Sampling

In [27]:
from sklearn.naive_bayes import MultinomialNB
def pref_sampling_opt(train_set, cols_given, label, need_pos, need_neg):
    if len(need_pos)+ len(need_neg) > 0:
        temp_train_x = pd.DataFrame(train_set, columns = columns_all)
        temp_train_label = pd.DataFrame(train_set, columns = [label])
        temp_train_label = temp_train_label[label]
        temp_train_label = temp_train_label.astype('int')
        mnb = MultinomialNB()
        mnb = mnb.fit(temp_train_x, temp_train_label)
        probs = mnb.predict_proba(temp_train_x)[:,0]
        train_set["prob"] = abs(probs - 0.5)
        # get the set of
    new_train_set = pd.DataFrame(columns = list(train_set.columns))
    updated_pos = 0
    for i in need_pos:
        # needs to updated more positive records
        # print(i)
        temp_df = copy.deepcopy(train_set)
        for n in range(len(i)):
          temp_df = temp_df[temp_df[cols_given[n]] == i[n]]
        # update the skew and diff
        idx = list(temp_df.index)
        train_set.loc[idx, 'skewed'] = 1
        idx_pos = list(temp_df[(getattr(temp_df, label) == 1)].index)
        if(len(idx_pos) == 0):
          # if there is no positive
          idx_neg = list(temp_df[(getattr(temp_df, label) == 0)].index)
          neg_ranked = train_set.loc[idx_neg].sort_values(by="prob", ascending=True)
          new_train_set = pd.concat([new_train_set, neg_ranked], ignore_index=True)
          continue
        idx_neg = list(temp_df[(getattr(temp_df, label) == 0)].index)
        pos_ranked = train_set.loc[idx_pos].sort_values(by="prob", ascending=True)
        neg_ranked = train_set.loc[idx_neg].sort_values(by="prob", ascending=True)
        diff = compute_diff_add_and_remove(i, temp2,  1, compas_y, names)
        if diff == -1:
          new_train_set = pd.concat([new_train_set, pos_ranked], ignore_index=True)
          new_train_set = pd.concat([new_train_set, neg_ranked], ignore_index=True)
          continue
        train_set.loc[idx, 'diff'] = int(diff)
        cnt = int(train_set.loc[idx_pos[0]]["diff"])
        updated_pos += cnt * 2
        # add more records when there are not enough available records
        new_train_set = pd.concat([new_train_set, pos_ranked], ignore_index=True)
        temp_cnt = cnt
        if len(pos_ranked) >= temp_cnt:
            new_train_set = pd.concat([new_train_set,pos_ranked[0:cnt]], ignore_index=True)
        else:
            while(temp_cnt > 0 ):
                new_train_set = pd.concat([new_train_set,pos_ranked[0:temp_cnt]], ignore_index=True)
            # duplicate the dataframe
                temp_cnt = temp_cnt - len(pos_ranked)
        # duplicate the top cnt records from the pos
        # remove the top cnt records from the neg
        if cnt == 0:
          new_train_set = pd.concat([new_train_set, neg_ranked], ignore_index=True)
          # print("+++++++++++++++++++++++++++++++++++++++++++++")
          # print(i)
          # print(len(pos_ranked)+cnt)
          # print(len(neg_ranked))
        else:
          new_train_set = pd.concat([new_train_set, neg_ranked[cnt-1:-1]], ignore_index=True)
          # print("+++++++++++++++++++++++++++++++++++++++++++++")
          # print(i)
          # print(len(pos_ranked)+cnt)
          # print(len(neg_ranked[cnt-1:-1]))
    print("updated {} positive records".format(str(updated_pos)))
    updated_neg = 0
    # adding more records to the need_neg set
    for i in need_neg:
        # print(i)
        # list of idx belongs to this group
        temp_df = copy.deepcopy(train_set)
        for n in range(len(i)):
          temp_df = temp_df[temp_df[cols_given[n]] == i[n]]
        # update the skew and diff
        idx = list(temp_df.index)
        train_set.loc[idx, 'skewed'] = 1
        idx_pos = list(temp_df[(getattr(temp_df, label) == 1)].index)
        idx_neg = list(temp_df[(getattr(temp_df, label) == 0)].index)
        if(len(idx_neg) == 0):
          pos_ranked = train_set.loc[idx_pos].sort_values(by="prob", ascending=True)
          new_train_set = pd.concat([new_train_set, pos_ranked], ignore_index=True)
          continue
        pos_ranked = train_set.loc[idx_pos].sort_values(by="prob", ascending=True)
        neg_ranked = train_set.loc[idx_neg].sort_values(by="prob", ascending=True)
        diff = compute_diff_add_and_remove(i, temp2, 0, compas_y, names)
        if diff == -1:
          new_train_set = pd.concat([new_train_set, neg_ranked], ignore_index=True)
          new_train_set = pd.concat([new_train_set, pos_ranked], ignore_index=True)
          continue
        train_set.loc[idx, 'diff'] = int(diff)
        cnt = int(train_set.loc[idx_pos[0]]["diff"])
        updated_neg += cnt * 2
        # add more records when there are not enough available records
        new_train_set = pd.concat([new_train_set, neg_ranked], ignore_index=True)
        temp_cnt = cnt
        if len(neg_ranked) >= temp_cnt:
            new_train_set = pd.concat([new_train_set,neg_ranked[0:cnt]], ignore_index=True)
        else:
            while(temp_cnt > 0 ):
                new_train_set = pd.concat([new_train_set,neg_ranked[0:temp_cnt]], ignore_index=True)
            # duplicate the dataframe
                temp_cnt = temp_cnt - len(neg_ranked)
        # duplicate the top cnt records from the pos
        # remove the top cnt records from the neg
        if cnt ==0:
          new_train_set = pd.concat([new_train_set, pos_ranked], ignore_index=True)
          # print("+++++++++++++++++++++++++++++++++++++++++++++")
          # print(i)
          # print(len(neg_ranked)+cnt)
          # print(len(pos_ranked))
        else:
          new_train_set = pd.concat([new_train_set, pos_ranked[cnt-1:-1]], ignore_index=True)
          # print("+++++++++++++++++++++++++++++++++++++++++++++")
          # print(i)
          # print(len(neg_ranked)+cnt)
          # print(len(pos_ranked[cnt-1:-1]))
        #print(len(new_train_set[new_train_set['income'] == 1]), len(new_train_set[new_train_set['income'] == 0]))
        # print(train_set.loc[idx_neg])
    print("updated {} negative records".format(str(updated_neg)))
    # add the other irrelavant items:
    idx_irr = list(train_set[train_set['skewed'] == 0].index)
    irr_df = train_set.loc[idx_irr]
    new_train_set = pd.concat([new_train_set, irr_df], ignore_index=True)
    print("The new dataset contains {} rows.".format(str(len(new_train_set))))
    new_train_set.reset_index()
    return new_train_set



  and should_run_async(code)


In [28]:


def find_top(all_names):
  all_names_lst_top = []
  for all in range(len(all_names)):
    if len(all_names[all]) == 1: # CHANGED HERE
      all_names_lst_top.append(all)
  return all_names_lst_top

  and should_run_async(code)


## Run Algorithm Lattice


In [33]:

#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in all_names_lst:
# for a in [all_names_lst[0]]: # leaf
# for a in [all_names_lst[-1:x]]: # top
  print("/////////////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  # start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  # end = time.time()
  # excute_time = end - start
  # print("The time to compute unfair group is {}".format(str(excute_time)))
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  print("started pref sampling")
  new_train_data = pref_sampling_opt(new_train_data, names, compas_y, need_pos, need_neg)
  # print(new_train_data)
  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

/////////////
63
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
started pref sampling
updated 1002 positive records
updated 1728 negative records
The new dataset contains 31656 rows.
0    24137
1     7519
Name: income, dtype: int64
/////////////
62
The sets of need pos and neg are
[[2, 0, 1, 1, 2], [2, 0, 1, 1, 29], [2, 0, 3, 1, 38], [2, 2, 4, 0, 38], [2, 2, 4, 1, 38]]
[[2, 0, 1, 1, 18], [2, 0, 1, 1, 38], [2, 0, 4, 1, 1], [6, 4, 4, 1, 38]]
started pref sampling
updated 60 positive records
updated 68 negative records
The new dataset contains 31656 rows.
0    24141
1     7515
Name: income, dtype: int64
/////////////
61
The sets of need pos and neg are
[[0, 0, 2, 1, 38], [0, 0, 4, 1, 25], [1, 0, 1, 1, 18], [1, 3, 4, 1, 38], [2, 4, 4, 1, 38]]
[[0, 0, 4, 1, 38], [0, 5, 4, 0, 38], [1, 0, 0, 1, 38]

### Results Lattice

In [32]:
test_set['predicted'] = test_predict
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label,test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Lattice","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label,test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Lattice","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label,test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Lattice","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
# print("best", svc.best_score_)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Preferential Sampling-Lattice","SVM")

  and should_run_async(code)



svm
fpr and fnr
0.0138671875
0.9073962717979555
accuracy
0.7670647206250921


  fp = fp.append(row_root, ignore_index=True)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_ty

0
0.03992377631588995
0.4287345702498045
accuracy is  0.7670647206250921
['Adult', 'Preferential Sampling-Lattice', 'SVM', 0, 0.03992377631588995, 0.4287345702498045, 0.7670647206250921]


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


## Run Algorithm Leaf

In [35]:

#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in [all_names_lst[0]]:
  print("/////////////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)

  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  print("started pref sampling")
  new_train_data = pref_sampling_opt(new_train_data, names, compas_y, need_pos, need_neg)

  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

/////////////
63
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
started pref sampling
updated 1002 positive records
updated 1728 negative records
The new dataset contains 31656 rows.
0    24137
1     7519
Name: income, dtype: int64


### Results Leaf

In [35]:
test_set['predicted'] = test_predict
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Leaf","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Leaf","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Leaf","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
# print("best", svc.best_score_)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Preferential Sampling-Leaf","SVM")


dt
best 0.769244179763805
fpr and fnr
0.0447265625
0.6337943475646423
accuracy
0.8108506560518944


  fp = fp.append(row_root, ignore_index=True)


0.18120808872062508
0.5568230300005409
0.31414292945465616
accuracy is  0.8108506560518944
['Adult', 'Preferential Sampling-Leaf', 'DT', 0.18120808872062508, 0.5568230300005409, 0.31414292945465616, 0.8108506560518944]

rf


KeyboardInterrupt: 

## Run Algorithm Top

In [34]:

#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)
all_names_lst_top = find_top(all_names)
#iterate over all the names to get the temp2 df for each name
for a in all_names_lst_top:
  print("/////////////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start

  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  print("started pref sampling")
  new_train_data = pref_sampling_opt(new_train_data, names, compas_y, need_pos, need_neg)

  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')


/////////////
1
The sets of need pos and neg are
[]
[]
started pref sampling
updated 0 positive records
updated 0 negative records
The new dataset contains 31656 rows.
0    23774
1     7882
Name: income, dtype: int64
/////////////
2
The sets of need pos and neg are
[]
[]
started pref sampling
updated 0 positive records
updated 0 negative records
The new dataset contains 31656 rows.
0    23774
1     7882
Name: income, dtype: int64
/////////////
3
The sets of need pos and neg are
[]
[]
started pref sampling
updated 0 positive records
updated 0 negative records
The new dataset contains 31656 rows.
0    23774
1     7882
Name: income, dtype: int64
/////////////
4
The sets of need pos and neg are
[]
[]
started pref sampling
updated 0 positive records
updated 0 negative records
The new dataset contains 31656 rows.
0    23774
1     7882
Name: income, dtype: int64
/////////////
5
The sets of need pos and neg are
[]
[]
started pref sampling
updated 0 positive records
updated 0 negative records
T

### Results Top

In [None]:
test_set['predicted'] = test_predict
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Top","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Top","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Preferential Sampling-Top","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Preferential Sampling-Top","SVM")

# Duplication

In [38]:
def round_int(x):
    if x in [float("-inf"),float("inf")]: return 0
    return int(round(x))


def make_duplicate(d, group_lst, diff, label_y, names, need_positive_or_negative):
    selected = copy.deepcopy(d)
    print("names ", names, group_lst)
    for i in range(len(group_lst)):
        att_name = names[i]
        selected = selected[(selected[att_name] == group_lst[i])]
    selected = selected[(selected[label_y] == need_positive_or_negative)]

    if len(selected) == 0:
        return pd.DataFrame()
    while(len(selected) < diff):
        # duplicate the dataframe
        select_copy = selected.copy(deep=True)
        selected = pd.concat([selected, select_copy])

        # the number needed is more than the not needed numbers.

    generated = selected.sample(n = diff, replace = False, axis = 0)

    return generated


def naive_duplicate(d, temp2, names, need_pos, need_neg, label_y):
    # add more records for all groups
    # The smote algorithm to boost the coverage
    for r in need_pos:
    # add more positive records
        # determine how many points to add
        print("pos_vals", r)
        diff = compute_diff_add(r, temp2, names, label_y, 1)
        if diff == -1:
          continue
        diff = round_int(diff)
        # add more records
        print("Adding " + str(diff) +" positive records")
        samples_to_add = make_duplicate(d, r, diff, label_y, names, need_positive_or_negative = 1)
        d = pd.concat([d, samples_to_add], ignore_index=True)
    for k in need_neg:
        print("neg_vals", k)
        diff = compute_diff_add(k, temp2, names, label_y, need_positive_or_negative = 0)
        if diff == -1:
          continue
        diff = round_int(diff)
        print("Adding " + str(diff) +" negative records")
        samples_to_add = make_duplicate(d, k, diff, label_y, names, need_positive_or_negative = 0)
        d = pd.concat([d, samples_to_add], ignore_index=True)
    return d

## Run Algorithm Lattice

In [39]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in all_names_lst:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  print("started duplication")
  new_train_data = naive_duplicate(new_train_data, temp2, names, need_pos, need_neg, compas_y)
  print("label y ", new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

?????/////
63
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
started duplication
pos_vals [0, 2, 0, 4, 1, 38]
0.9786427145708583 366 1102 712.464271457086
0.9786427145708583 366 1102 712.464271457086
Adding 712 positive records
names  ['age', 'marital-status', 'relationship', 'race', 'gender', 'native-country'] [0, 2, 0, 4, 1, 38]
pos_vals [0, 2, 5, 4, 0, 38]
1.0848861283643891 77 154 90.0724637681159
1.0848861283643891 77 154 90.0724637681159
Adding 90 positive records
names  ['age', 'marital-status', 'relationship', 'race', 'gender', 'native-country'] [0, 2, 5, 4, 0, 38]
pos_vals [1, 2, 0, 0, 1, 38]
1.0353780313837375 14 44 31.5566333808844
1.0353780313837375 14 44 31.5566333808844
Adding 32 positive records
names  ['age', 'marital-status', 'relationship', 'race', 'gender', 'native-country

### Results Lattice


In [None]:
test_set['predicted'] = test_predict
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
d,d1,d2 = div_results("Adult","Duplication-Lattice","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")

print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Duplication-Lattice","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Duplication-Lattice","L")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Duplication-Lattice","SVM")

In [None]:
def fairness_score_computation(d, metrics):
    sum_of_score = 0
    for idx, row in d.iterrows():
      sum_of_score += row['support'] * row[metrics]
    return sum_of_score

print(fairness_score_computation(d, 'd_fpr'))

## Run Algorithm Leaf

In [40]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in [all_names_lst[0]]:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)

  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  print("started duplication")
  new_train_data = naive_duplicate(new_train_data, temp2, names, need_pos, need_neg, compas_y)
  print("label y", new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

?????/////
63
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
started duplication
pos_vals [0, 2, 0, 4, 1, 38]
0.9786427145708583 366 1102 712.464271457086
0.9786427145708583 366 1102 712.464271457086
Adding 712 positive records
names  ['age', 'marital-status', 'relationship', 'race', 'gender', 'native-country'] [0, 2, 0, 4, 1, 38]
pos_vals [0, 2, 5, 4, 0, 38]
1.0848861283643891 77 154 90.0724637681159
1.0848861283643891 77 154 90.0724637681159
Adding 90 positive records
names  ['age', 'marital-status', 'relationship', 'race', 'gender', 'native-country'] [0, 2, 5, 4, 0, 38]
pos_vals [1, 2, 0, 0, 1, 38]
1.0353780313837375 14 44 31.5566333808844
1.0353780313837375 14 44 31.5566333808844
Adding 32 positive records
names  ['age', 'marital-status', 'relationship', 'race', 'gender', 'native-country

### Results Leaf

In [None]:
test_set['predicted'] = test_predict
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
d,d1,d2 = div_results("Adult","Duplication-Leaf","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Duplication-Leaf","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Duplication-Leaf","L")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Duplication-Leaf","SVM")

## Run Algorithm Top

In [41]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
all_names_lst_top = find_top(all_names)
for a in all_names_lst_top:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)

  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  print("started duplication")
  new_train_data = naive_duplicate(new_train_data, temp2, names, need_pos, need_neg, compas_y)

  print("Label y", new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

?????/////
1
The sets of need pos and neg are
[]
[]
started duplication
Label y 0    23774
1     7882
Name: income, dtype: int64
?????/////
2
The sets of need pos and neg are
[]
[]
started duplication
Label y 0    23774
1     7882
Name: income, dtype: int64
?????/////
3
The sets of need pos and neg are
[]
[]
started duplication
Label y 0    23774
1     7882
Name: income, dtype: int64
?????/////
4
The sets of need pos and neg are
[]
[]
started duplication
Label y 0    23774
1     7882
Name: income, dtype: int64
?????/////
5
The sets of need pos and neg are
[]
[]
started duplication
Label y 0    23774
1     7882
Name: income, dtype: int64
?????/////
6
The sets of need pos and neg are
[]
[]
started duplication
Label y 0    23774
1     7882
Name: income, dtype: int64


### Results Top

In [None]:
test_set['predicted'] = test_predict
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
d,d1,d2 = div_results("Adult","Duplication-Top","DT")


print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
# print(fpr_onegroup(new_labels.tolist(), predict))
# print(fnr_onegroup(new_labels.tolist(), predict))
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Duplication-Top","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Duplication-Top","L")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Duplication-Top","SVM")

#Down-sampling

In [48]:
def round_int(x):
    if x in [float("-inf"),float("inf")]: return 0
    return int(round(x))


def make_remove(d, group_lst, diff, names, label_y, need_positive_or_negative):
    temp = copy.deepcopy(d)
    for i in range(len(group_lst)):
        att_name = names[i]
        temp = temp[(temp[att_name] == group_lst[i])]
    temp = temp[(temp[label_y] == need_positive_or_negative)]
    # randomly generated diff samples
    # the number needed is more than the not needed numbers.
    if(diff>len(temp)):
        diff = len(temp)
    generated = temp.sample(n = diff, replace = False, axis = 0)
    return generated.index


def naive_downsampling(d, temp2, names, need_pos, need_neg, label_y):
    # add more records for all groups
    # The smote algorithm to boost the coverage
    for r in need_pos:
        print("removing more negative")
    # add more positive records
        # determine how many points to add
        print(r)
        diff = compute_diff_remove(r, temp2, names, label_y, need_positive_or_negative = 1)
        if diff == -1:
          continue
        diff = round_int(diff)
        # add more records
        print("Removed " + str(diff) +" negative records")
        samples_to_remove = make_remove(d, r, diff, names, label_y, need_positive_or_negative = 0)
        d.drop(index  = samples_to_remove, inplace = True)
    for k in need_neg:
        print(k)
        diff = compute_diff_remove(k, temp2, names, label_y, need_positive_or_negative = 0)
        if diff == -1:
          continue
        diff = round_int(diff)
        print("Removed " + str(diff) +" positive records")
        samples_to_remove = make_remove(d, k, diff, names, label_y, need_positive_or_negative = 1)
        d.drop(index  = samples_to_remove, inplace = True)
    return d

## Run Algorithm Lattice

In [49]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in all_names_lst:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start
  print("The time to compute unfair group is {}".format(str(excute_time)))
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  new_train_data = naive_downsampling(new_train_data, temp2, names, need_pos, need_neg, compas_y)
  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

?????/////
63
The time to compute unfair group is 0.6379449367523193
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
removing more negative
[0, 2, 0, 4, 1, 38]
0.9786427145708583 366 1102 728.012645319192
Removed 728 negative records
removing more negative
[0, 2, 5, 4, 0, 38]
1.0848861283643891 77 154 83.0248091603054
Removed 83 negative records
removing more negative
[1, 2, 0, 0, 1, 38]
1.0353780313837375 14 44 30.4783686966107
Removed 30 negative records
removing more negative
[1, 2, 0, 4, 1, 25]
1.014265335235378 17 137 120.239099859353
Removed 120 negative records
removing more negative
[1, 2, 5, 2, 0, 38]
1.3063063063063063 31 47 23.2689655172414
Removed 23 negative records
removing more negative
[2, 2, 0, 4, 1, 25]
0.8496659242761693 6 29 21.9384010484928
Removed 22 negative records
[1,

### Results Lattice

In [None]:
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Lattice","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Lattice","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Lattice","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Down Sampling-Lattice","SVM")

## Run Algorithm Leaf

In [50]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in [all_names_lst[0]]:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start
  print("The time to compute unfair group is {}".format(str(excute_time)))
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0

  new_train_data = naive_downsampling(new_train_data, temp2, names, need_pos, need_neg, compas_y)
  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')


?????/////
63
The time to compute unfair group is 0.9208121299743652
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
removing more negative
[0, 2, 0, 4, 1, 38]
0.9786427145708583 366 1102 728.012645319192
Removed 728 negative records
removing more negative
[0, 2, 5, 4, 0, 38]
1.0848861283643891 77 154 83.0248091603054
Removed 83 negative records
removing more negative
[1, 2, 0, 0, 1, 38]
1.0353780313837375 14 44 30.4783686966107
Removed 30 negative records
removing more negative
[1, 2, 0, 4, 1, 25]
1.014265335235378 17 137 120.239099859353
Removed 120 negative records
removing more negative
[1, 2, 5, 2, 0, 38]
1.3063063063063063 31 47 23.2689655172414
Removed 23 negative records
removing more negative
[2, 2, 0, 4, 1, 25]
0.8496659242761693 6 29 21.9384010484928
Removed 22 negative records
[1,

### Results Leaf

In [None]:
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Leaf","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Leaf","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Leaf","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Down Sampling-Leaf","SVM")

## Run Algorithm Top

In [51]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
all_names_lst_top = find_top(all_names)
for a in all_names_lst_top:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start
  print("The time to compute unfair group is {}".format(str(excute_time)))
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  new_train_data = naive_downsampling(new_train_data, temp2, names, need_pos, need_neg, compas_y)
  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')


?????/////
1
The time to compute unfair group is 0.017656564712524414
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
2
The time to compute unfair group is 0.043984413146972656
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
3
The time to compute unfair group is 0.026145458221435547
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
4
The time to compute unfair group is 0.03699064254760742
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
5
The time to compute unfair group is 0.011373043060302734
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
6
The time to compute unfair group is 0.15406370162963867
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64


### Results Top

In [None]:
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Top","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Top","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Down Sampling-Top","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Down Sampling-Top","SVM")

# Massaging

In [44]:
from sklearn.naive_bayes import MultinomialNB
def round_int(x):
    if x in [float("-inf"),float("inf")]: return 0
    return int(round(x))

def get_depromotion(d, diff, group_lst, names, label_y, flag_depro):
    input_test = pd.DataFrame(d, columns = columns_compas)

    clf = MultinomialNB()
    temp_train_label = pd.DataFrame(d, columns = [label_y])
    temp_train_label = temp_train_label[label_y]
    temp_train_label = temp_train_label.astype('int')
    clf = clf.fit(input_test, temp_train_label)
    prob  = clf.predict_proba(input_test)[:,0]
    select = copy.deepcopy(d)
    select['prob'] = prob # the higher the probablity is, the more likely for it to be 0
    # filter out those belongs to this group
    for i in range(len(group_lst)):
        att_name = names[i]
        select = select[(select[att_name] == group_lst[i])]
    select = select[(select[label_y] == flag_depro)]
    # rank them according to the probability
    # filp the records and remove the records from d
    if (flag_depro == 0):
        select.sort_values(by="prob", ascending=True, inplace=True)
        select[label_y] = 1
    else:
        select.sort_values(by="prob", ascending=False, inplace=True)
        select[label_y] = 0
    head = select.head(diff)
    index_list = []
    index_list = list(head.index)
    d.drop(index_list,inplace = True)
    head.drop(columns = ['prob'],inplace = True)
    return head



def naive_massaging(d, temp2, names, need_pos, need_neg,label_y):
    # add more records for all groups
    # The smote algorithm to boost the coverage
    for r in need_pos:
        print("adding more positive")
    # add more positive records
        # determine how many points to add
        print(r)
        diff = compute_diff_add_and_remove(r, temp2, 1, label_y, names)
        diff =  round_int(diff)
        # add more records
        #0 for promotion
        samples_to_add = get_depromotion(d, diff, r, names, label_y, flag_depro = 0)
        print("Changed " + str(len(samples_to_add)) +" records")
        d = pd.concat([d, samples_to_add])
        print(len(d))
    for k in need_neg:
        print(k)
        print("adding more negative")
        diff = compute_diff_add_and_remove(k, temp2, 0, label_y, names)
        diff =  round_int(diff)
        #1 for demotion
        samples_to_add = get_depromotion(d, diff, k, names, label_y, flag_depro = 1)
        print("Changed " + str(len(samples_to_add)) +" records")
        d = pd.concat([d, samples_to_add])
        print(len(d))
    return d

## Run Algorithm Lattice

In [45]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in all_names_lst:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]

  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start
  print("The time to compute unfair group is {}".format(str(excute_time)))
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  new_train_data = naive_massaging(new_train_data, temp2, names, need_pos, need_neg, compas_y)

  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

?????/////
63
The time to compute unfair group is 0.7699806690216064
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
adding more positive
[0, 2, 0, 4, 1, 38]
Changed 360 records
31656
adding more positive
[0, 2, 5, 4, 0, 38]
Changed 43 records
31656
adding more positive
[1, 2, 0, 0, 1, 38]
Changed 16 records
31656
adding more positive
[1, 2, 0, 4, 1, 25]
Changed 61 records
31656
adding more positive
[1, 2, 5, 2, 0, 38]
Changed 13 records
31656
adding more positive
[2, 2, 0, 4, 1, 25]
Changed 10 records
31656
[1, 2, 0, 1, 1, 29]
adding more negative
Changed 7 records
31656
[1, 2, 0, 1, 1, 38]
adding more negative
Changed 6 records
31656
[1, 2, 0, 4, 1, 38]
adding more negative
Changed 723 records
31656
[1, 2, 5, 4, 0, 38]
adding more negative
Changed 131 records
31656
0    24138
1     7518
Nam

### Results Lattice

In [None]:
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Lattice","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label,test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Lattice","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Lattice","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Massage-Lattice","SVM")

In [None]:
r3

## Run Algorithm Leaf

In [46]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
for a in [all_names_lst[0]]:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]

  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start
  print("The time to compute unfair group is {}".format(str(excute_time)))
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  new_train_data = naive_massaging(new_train_data, temp2, names, need_pos, need_neg, compas_y)
  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

?????/////
63
The time to compute unfair group is 1.2597715854644775
The sets of need pos and neg are
[[0, 2, 0, 4, 1, 38], [0, 2, 5, 4, 0, 38], [1, 2, 0, 0, 1, 38], [1, 2, 0, 4, 1, 25], [1, 2, 5, 2, 0, 38], [2, 2, 0, 4, 1, 25]]
[[1, 2, 0, 1, 1, 29], [1, 2, 0, 1, 1, 38], [1, 2, 0, 4, 1, 38], [1, 2, 5, 4, 0, 38]]
adding more positive
[0, 2, 0, 4, 1, 38]
Changed 360 records
31656
adding more positive
[0, 2, 5, 4, 0, 38]
Changed 43 records
31656
adding more positive
[1, 2, 0, 0, 1, 38]
Changed 16 records
31656
adding more positive
[1, 2, 0, 4, 1, 25]
Changed 61 records
31656
adding more positive
[1, 2, 5, 2, 0, 38]
Changed 13 records
31656
adding more positive
[2, 2, 0, 4, 1, 25]
Changed 10 records
31656
[1, 2, 0, 1, 1, 29]
adding more negative
Changed 7 records
31656
[1, 2, 0, 1, 1, 38]
adding more negative
Changed 6 records
31656
[1, 2, 0, 4, 1, 38]
adding more negative
Changed 723 records
31656
[1, 2, 5, 4, 0, 38]
adding more negative
Changed 131 records
31656
0    24138
1     7518
Nam

### Results Leaf

In [None]:
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Leaf","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Leaf","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Leaf","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Massage-Leaf","SVM")

## Run Algorithm Top

In [47]:
#get all of the candidate groups possible with the combos and names

new_train_data = copy.deepcopy(train_set)

#iterate over all the names to get the temp2 df for each name
all_names_lst_top = find_top(all_names)
for a in all_names_lst_top:
  print("?????/////")
  print(a)
  temp2, names = get_temp(new_train_data, all_names[a], compas_y)
  temp, temp_g = get_temp_g(new_train_data, names, compas_y)
  temp_g = temp_g[temp_g['cnt'] > filter_count]
  lst_of_counts = compute_lst_of_counts(temp, names, compas_y)
  start = time.time()
  need_pos, need_neg = compute_problematic_opt(temp2, temp_g, names, compas_y, lst_of_counts)
  end = time.time()
  excute_time = end - start
  print("The time to compute unfair group is {}".format(str(excute_time)))
  print("The sets of need pos and neg are")
  print(need_pos)
  print(need_neg)
  new_train_data['skewed'] = 0
  new_train_data["diff"] = 0
  new_train_data = naive_massaging(new_train_data, temp2, names, need_pos, need_neg, compas_y)
  print(new_train_data[compas_y].value_counts())
new_train_x = pd.DataFrame(new_train_data, columns = columns_all)
new_train_label = pd.DataFrame(new_train_data, columns = [compas_y])
new_train_label = new_train_label[compas_y]
new_train_label = new_train_label.astype('int')

?????/////
1
The time to compute unfair group is 0.013807058334350586
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
2
The time to compute unfair group is 0.01045083999633789
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
3
The time to compute unfair group is 0.028435468673706055
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
4
The time to compute unfair group is 0.0140228271484375
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
5
The time to compute unfair group is 0.010902166366577148
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64
?????/////
6
The time to compute unfair group is 0.07302522659301758
The sets of need pos and neg are
[]
[]
0    23774
1     7882
Name: income, dtype: int64


### Results Top

In [None]:
print()
print("dt")
griddt.fit(new_train_x, new_train_label)
print("best", griddt.best_score_)

test_predict = griddt.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Top","DT")

print()
print("rf")
gridrf.fit(new_train_x, new_train_label)
print("best", gridrf.best_score_)
test_predict = gridrf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Top","RF")

print()
print("logistic")
gridlg.fit(new_train_x, new_train_label)
print("best", gridlg.best_score_)
test_predict = gridlg.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
r,r2,r3 = div_results("Adult","Massage-Top","LG")

print()
print("svm")
clf.fit(new_train_x, new_train_label)
test_predict = clf.predict(test_x)
print("fpr and fnr")
print(fpr_onegroup(list(test_label), test_predict))
print(fnr_onegroup(list(test_label), test_predict))
accuracy = accuracy_score(test_label, test_predict)
print("accuracy")
print(accuracy)
test_set['predicted'] = test_predict
s,s2,s3 = div_results("Adult","Massage-Top","SVM")