# Fair Binary Classification with SearchFair on CelebA and Adult

Here, we show how to use SearchFair on two datasets: CelebA and Adult.

## Imports

We start by importing SearchFair from the installed package.

In [1]:
from searchfair import SearchFair

Second, we load some necessary methods and numpy.

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np

## The CelebA dataset

The Celebrity Faces (CelebA) dataset provides descriptions of celebrity faces in the form of 40 binary attributes. Here, we use the attribute 'Smiling' as the class label and sex as the sensitive attribute.

In [3]:
import get_real_data as get_data

# Load the CelebA features, class labels and sensitive attribute
# (load_data_size=None presumably requests the complete dataset).
x_data, y_data, s_data = get_data.get_celebA_data(load_data_size=None)

# Shuffled train/test split. The training set is kept small to reduce
# running time; everything else ends up in the test set.
train_size = 1000
x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(
    x_data,
    y_data,
    s_data,
    train_size=train_size,
    shuffle=True,
)

Here is some basic information about the dataset:

In [4]:
import utils as ut
# Print group and class statistics for the full (un-split) data.
ut.print_data_stats(s_data, y_data)

Total data points: 202599
# non-protected examples: 118165
# protected examples: 84434
# non-protected examples in positive class: 63871 (54.1%)
# protected examples in positive class: 33798 (40.0%)


In [5]:
""" Fix Parameters for Cross Validation"""

# Settings shared by every candidate of the grid search.
kernel = 'linear'
# 'DDP' = Demographic Parity, 'DEO' = Equality of Opportunity.
fairness_notion = 'DDP'
verbose = False

# Candidate values for the regularization parameter beta; the grid search
# addresses it through the estimator parameter name 'reg_beta'.
beta_params = [0.0001, 0.001]
cv_params = dict(reg_beta=beta_params)
    

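The warnings printed below can be ignored (or suppressed): cvxpy reports that the problem might not be DPP (Disciplined Parametrized Programming) compliant, but that is not a problem here.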

In [6]:
# Base estimator. reg_beta is not set here because the grid search assigns
# it for every candidate in cv_params.
model = SearchFair(
    fairness_regularizer='wu',
    wu_bound='hinge',
    kernel=kernel,
    fairness_notion=fairness_notion,
    verbose=verbose,
)

# 2-fold cross-validation over cv_params, scored by accuracy, run in a
# single job.
grid_clf = GridSearchCV(
    estimator=model,
    param_grid=cv_params,
    cv=2,
    verbose=2,
    n_jobs=1,
    scoring='accuracy',
)
# s_train is forwarded to the estimator's fit as an extra fit parameter.
grid_clf.fit(x_train, y_train, s_train=s_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] reg_beta=0.0001 .................................................
Is DPP?  True
Is DCP?  True


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Is DPP?  False
Is DCP?  True


	https://www.cvxpy.org/tutorial/advanced/index.html#disciplined-parametrized-programming


Failure:Interrupted
[CV] .................................. reg_beta=0.0001, total= 1.6min
[CV] reg_beta=0.0001 .................................................
Is DPP?  True
Is DCP?  True


Traceback (most recent call last):
  File "/Users/mlohaus/anaconda3/envs/SearchFair/lib/python3.8/site-packages/scikit_learn-0.23.1-py3.8-macosx-10.9-x86_64.egg/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mlohaus/anaconda3/envs/SearchFair/lib/python3.8/site-packages/searchfair-0.0.1-py3.8.egg/searchfair/classifiers.py", line 147, in fit
    new_rd, new_alpha = learn(lbda_new, None)
  File "/Users/mlohaus/anaconda3/envs/SearchFair/lib/python3.8/site-packages/searchfair-0.0.1-py3.8.egg/searchfair/classifiers.py", line 112, in learn
    self.optimize()
  File "/Users/mlohaus/anaconda3/envs/SearchFair/lib/python3.8/site-packages/searchfair-0.0.1-py3.8.egg/searchfair/classifiers.py", line 301, in optimize
    self.prob.solve(solver=cp.SCS, max_iters=self.max_iter, verbose=self.verbose, warm_start=True)
  File "/Users/mlohaus/anaconda3/envs/SearchFair/lib/python3.8/site-packages/cvxpy-1.1.1-py3.8-macosx

Is DPP?  False
Is DCP?  True


	https://www.cvxpy.org/tutorial/advanced/index.html#disciplined-parametrized-programming


[CV] .................................. reg_beta=0.0001, total= 3.2min
[CV] reg_beta=0.001 ..................................................
Is DPP?  True
Is DCP?  True
Is DPP?  False
Is DCP?  True


	https://www.cvxpy.org/tutorial/advanced/index.html#disciplined-parametrized-programming


[CV] ................................... reg_beta=0.001, total= 2.5min
[CV] reg_beta=0.001 ..................................................
Is DPP?  True
Is DCP?  True
Is DPP?  False
Is DCP?  True


	https://www.cvxpy.org/tutorial/advanced/index.html#disciplined-parametrized-programming


[CV] ................................... reg_beta=0.001, total= 3.0min
Is DPP?  True
Is DCP?  True


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 10.4min finished


Is DPP?  False
Is DCP?  True


	https://www.cvxpy.org/tutorial/advanced/index.html#disciplined-parametrized-programming


GridSearchCV(cv=2, estimator=SearchFair(), n_jobs=1,
             param_grid={'reg_beta': [0.0001, 0.001]}, scoring='accuracy',
             verbose=2)

To print out the accuracy and the fairness notions Demographic Parity and Equality of Opportunity, we define the following function.

In [7]:
def print_clf_stats(model, x_train, x_test, y_train, y_test, s_train, s_test):
    """Print accuracy, DDP and DEO of ``model`` on the train and test split.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_train, x_test: Feature data of the two splits.
        y_train, y_test: Class labels of the two splits.
        s_train, s_test: Sensitive attribute values of the two splits.

    Returns:
        None. All statistics are printed as percentages.
    """
    # Predict once per split and reuse the result: accuracy and fairness
    # measures are derived from the same predictions, so a second predict
    # call per split would only repeat the work.
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    # Accuracy is computed on the sign of the predictions, while the
    # fairness measures receive the predictions unchanged.
    train_acc = ut.get_accuracy(np.sign(train_pred), y_train)
    test_acc = ut.get_accuracy(np.sign(test_pred), y_test)
    test_DDP, test_DEO = ut.compute_fairness_measures(test_pred, y_test, s_test)
    train_DDP, train_DEO = ut.compute_fairness_measures(train_pred, y_train, s_train)

    rule = 10 * '-'
    print(rule + "Train" + rule)
    print("Accuracy: %0.4f%%" % (train_acc * 100))
    print("DDP: %0.4f%%" % (train_DDP * 100), "DEO: %0.4f%%" % (train_DEO * 100))
    print(rule + "Test" + rule)
    print("Accuracy: %0.4f%%" % (test_acc * 100))
    print("DDP: %0.4f%%" % (test_DDP * 100), "DEO: %0.4f%%" % (test_DEO * 100))

In [None]:
Now let's see if we obtained a fair classifier with respect to the fairness notions we specified.

In [8]:
# Report train/test accuracy, DDP and DEO of the fitted grid search object.
print_clf_stats(grid_clf, x_train, x_test, y_train, y_test, s_train, s_test)

----------Train----------
Accuracy: 86.1667%
DDP: 0.2830% DEO: -10.7086%
----------Test----------
Accuracy: 82.8128%
DDP: 2.5627% DEO: -3.1142%


In [None]:
# NOTE(review): as placed, this cell runs after the CelebA grid search and
# cv_params is redefined further down for the Adult dataset, so it appears
# to affect neither search -- confirm the intended position.
if kernel == 'rbf':
    # For an RBF kernel, additionally cross-validate the kernel width gamma
    # over three values: the default 1 / n_features, the power of ten at or
    # just below it, and the next power of ten above that.
    n_features = x_data.shape[1]
    default_width = 1 / n_features
    order_of_magn = np.floor(np.log10(default_width))
    kernel_widths = [
        10**(order_of_magn),
        default_width,
        10**(order_of_magn + 1),
    ]
    cv_params['gamma'] = kernel_widths

## Adult dataset

The Adult dataset is very popular in the fairness literature. It contains US census data from 1994, and the class label indicates whether the income is higher or lower than $50,000. The binary sensitive attribute here is sex.

In [None]:
# Load the Adult features, class labels and sensitive attribute
# (load_data_size=None presumably requests the complete dataset).
x_data, y_data, s_data = get_data.get_adult_data(load_data_size=None)

# Shuffled train/test split. The training set is kept small to reduce
# running time; everything else ends up in the test set.
train_size = 1000
x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(
    x_data,
    y_data,
    s_data,
    train_size=train_size,
    shuffle=True,
)

# Print group and class statistics for the full (un-split) data.
ut.print_data_stats(s_data, y_data)

In [None]:
""" Fix Parameters for Cross Validation"""

# Settings shared by every candidate of the grid search.
kernel = 'linear'
fairness_notion = 'DDP'
verbose = False

# Candidate values for the regularization parameter beta; the grid search
# addresses it through the estimator parameter name 'reg_beta'.
beta_params = [0.0001, 0.001]
cv_params = dict(reg_beta=beta_params)

In [None]:
# Base estimator. reg_beta is not set here because the grid search assigns
# it for every candidate in cv_params.
model = SearchFair(
    fairness_regularizer='wu',
    wu_bound='hinge',
    kernel=kernel,
    fairness_notion=fairness_notion,
    verbose=verbose,
)

# 3-fold cross-validation over cv_params, scored by accuracy, run in a
# single job.
grid_clf = GridSearchCV(
    estimator=model,
    param_grid=cv_params,
    cv=3,
    verbose=2,
    n_jobs=1,
    scoring='accuracy',
)
# s_train is forwarded to the estimator's fit as an extra fit parameter.
grid_clf.fit(x_train, y_train, s_train=s_train)