# Under-Representation Bias (w/ Synthetic Data)

This notebook recreates the finding that Equalized Odds constrained model can recover from under-representation bias.

### Setup

Please run the code block below to install the necessary packages (if needed).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.base import clone

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve, auc
from collections import Counter

import fairlearn
from fairlearn.metrics import *
from fairlearn.reductions import *
import aif360

import copy, random

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Synthetic Dataset Generation

## Parameters (User Input)

In [2]:
'''

r is the proportion of training examples in the minority group, 

which means 1-r is proportion of examples in the majority group

eta is the probability of flipping the label

n is the number of training examples

beta is the probability of keeping a positively labeled example
from the minority class

NOTE: results can be replicated if and only if the following condition holds:

(1-r)(1-2*eta) + r((1-eta)*beta - eta) > 0

'''
def get_params(r = 1/3, eta = 1/4, n = 2000, beta = 0.5):
    return r, eta, n, beta

r, eta, n, beta = get_params(n = 20000)

In [3]:
# check if above constraint holds
def check_constraints(r, eta, beta):
    first = (1-r)*(1-2*eta)
    second = r * ((1-eta)*beta - eta)
    res = first + second
    print("constraint: ", res)
    print("yes!", r, eta, beta) if res > 0 else print("no", r, eta, beta)
    
bias_amts = np.divide(list(range(10, -1, -1)),10)

for beta in bias_amts:
    check_constraints(r=r, eta=eta, beta=beta)

constraint:  0.5
yes! 0.3333333333333333 0.25 1.0
constraint:  0.47500000000000003
yes! 0.3333333333333333 0.25 0.9
constraint:  0.45000000000000007
yes! 0.3333333333333333 0.25 0.8
constraint:  0.425
yes! 0.3333333333333333 0.25 0.7
constraint:  0.4
yes! 0.3333333333333333 0.25 0.6
constraint:  0.37500000000000006
yes! 0.3333333333333333 0.25 0.5
constraint:  0.35000000000000003
yes! 0.3333333333333333 0.25 0.4
constraint:  0.325
yes! 0.3333333333333333 0.25 0.3
constraint:  0.30000000000000004
yes! 0.3333333333333333 0.25 0.2
constraint:  0.275
yes! 0.3333333333333333 0.25 0.1
constraint:  0.25000000000000006
yes! 0.3333333333333333 0.25 0.0


## True Label Generation

In [4]:
# create minority and majority groups
def get_cat_features(n, r):
    num_minority = int(r * n)
    num_majority = n - num_minority
    
    minority = np.zeros((num_minority, 1))
    majority = np.ones((num_majority, 1))
    
    cat_features = np.vstack((minority, majority))
    
    # shuffle so as to ensure randomness
    np.random.shuffle(cat_features)
    
    return cat_features

In [5]:
# return labels from Bayes Optimal Classifier
def get_bayes_optimal_labels(features, effect_param):
    outcome_continuous = 1/(1+np.exp(-np.matmul(features, effect_param)))
    return np.where(outcome_continuous >= 0.5, 1, 0)

# flip labels with probability eta
def flip_labels(df_synthetic, eta):
    labels = df_synthetic['outcome']
    
    for i in range(len(labels)):
        if random.uniform(0,1) <= eta:
            labels[i] = 1 if labels[i] == 0 else 0
    df_synthetic['outcome'] = labels
    
    return df_synthetic

In [8]:
def true_label_generation(r, eta, n):
    # causal effect params
    effect_param_min = [0.5] 
    effect_param_maj = [-0.7]
    
    num_min = int(n*r)
    num_maj = n - num_min

    # numerical feature params
    means = [0]
    cov_matrix = [[1]]

    # features
    cat_features = get_cat_features(r=r, n=n)
    #num_features_min = np.random.multivariate_normal(means, cov_matrix, num_min)
    num_features_min = np.random.standard_normal(num_min).reshape((num_min,1))
    
    #num_features_maj = np.random.multivariate_normal(means, cov_matrix, num_maj)
    num_features_maj = np.random.standard_normal(num_maj).reshape((num_maj,1))
    num_features = np.concatenate((num_features_min, num_features_maj))

    # outcomes
    outcome_binary_min = get_bayes_optimal_labels(features=num_features_min, effect_param=effect_param_min)
    #outcome_binary_min = np.where(np.matmul(num_features_min, effect_param_min) > 0.5, 1, 0)
    outcome_binary_maj = get_bayes_optimal_labels(features=num_features_maj, effect_param=effect_param_maj)
    #outcome_binary_maj = np.where(np.matmul(num_features_maj, effect_param_maj) > 0.5, 1, 0)
    
    outcome = np.hstack((outcome_binary_min,outcome_binary_maj)).reshape(n,1)
    temp_data = np.hstack((num_features,cat_features, outcome))
    np.random.shuffle(temp_data) # randomly shuffle the data
    
    df_synthetic = pd.DataFrame(temp_data)
    df_synthetic.columns = ['num1','cat','outcome']
    
    df_majority = df_synthetic[df_synthetic['cat'] == 1]
    df_minority = df_synthetic[df_synthetic['cat'] == 0]
    
    #df_synthetic = flip_labels(df_synthetic, eta)
    
    df_majority = df_synthetic[df_synthetic['cat'] == 1]
    df_minority = df_synthetic[df_synthetic['cat'] == 0]
    
    return df_synthetic 

df_synthetic = true_label_generation(r=r, eta=eta, n=n)

In [9]:
# split into train and test
df_train = df_synthetic.loc[range(0,int(len(df_synthetic)/2)), :]
# if original dataset has odd number of samples, remove 1 sample to be even
if (len(df_synthetic) % 2 == 1):
    df_test = df_synthetic.loc[range(int(len(df_synthetic)/2)+1, len(df_synthetic)), :]
else:
    df_test = df_synthetic.loc[range(int(len(df_synthetic)/2), len(df_synthetic)), :]
    
df_test_maj = df_test[df_test['cat'] == 1]
df_test_min = df_test[df_test['cat'] == 0]

# format data
X_true = df_test.iloc[:, :-1].values
y_true = df_test.iloc[:, -1].values

X_true_maj = df_test_maj.iloc[:, :-1].values
y_true_maj = df_test_maj.iloc[:, -1].values
X_true_min = df_test_min.iloc[:, :-1].values
y_true_min = df_test_min.iloc[:, -1].values

sens_attrs_true = [df_test['cat']]

In [10]:
classifier_b = LogisticRegression(solver = 'liblinear', random_state=42)
classifier_bo = classifier_b.fit(X_true, y_true)
bo_pred = classifier_bo.predict(X_true)

classifier_b_maj = LogisticRegression(solver = 'liblinear', random_state=42)
classifier_maj = classifier_b_maj.fit(X_true_maj, y_true_maj)
bo_maj_pred = classifier_maj.predict(X_true_maj)

classifier_b_min = LogisticRegression(solver = 'liblinear', random_state=42)
classifier_min = classifier_b_min.fit(X_true_min, y_true_min)
bo_min_pred = classifier_min.predict(X_true_min)

In [11]:
print("\nAccuracy of Bayes Optimal Model on Ground Truth Data: ", accuracy_score(bo_pred, y_true))
print("Accuracy of Bayes Optimal Model on Ground Truth Data (Maj): ", accuracy_score(bo_maj_pred, y_true_maj))
print("Accuracy of Bayes Optimal Model on Ground Truth Data (Min): ", accuracy_score(bo_min_pred, y_true_min))

effect_param_maj = [-0.7]
effect_param_min = [0.5]

print("\nCoefs Majority: ", classifier_maj.coef_)
print("Coefs Minority: ", classifier_min.coef_)


Accuracy of Bayes Optimal Model on Ground Truth Data:  0.6573
Accuracy of Bayes Optimal Model on Ground Truth Data (Maj):  0.6471997604073075
Accuracy of Bayes Optimal Model on Ground Truth Data (Min):  0.6779048765803732

Coefs Majority:  [[-0.54824553 -0.03266046]]
Coefs Minority:  [[-0.68667007  0.        ]]


In [55]:
# Dumb Example

# X has two features, one integer and one randomly selected value from standard normal distribution

effect_params = [1.3, -0.5]

X = np.array([[np.random.randint(-200, 200), np.random.randn()] for i in range(50)])
y = np.where(1/(1+np.exp(-np.matmul(X, effect_params))) >= 0.5, 1, 0)
    
# Model
model = LogisticRegression(solver='liblinear', random_state=42).fit(X,y)

y_pred = model.predict(X)

print("Effect Parameters: ", effect_params)
print("Coeffs: ", model.coef_)

Effect Parameters:  [1.3, -0.5]
Coeffs:  [[0.73243107 0.03238579]]
