#### Examples for machine learning algorithms by dsldPy

The goal is for users to train models through a simple, intuitive interface and to understand how hyperparameter selection affects the fairness-utility tradeoff. The examples use train/test splits and k-fold cross validation.

1) regression examples using dsldPyFairML and dsldPyQeFairML
2) classification examples using dsldPyFairML and dsldPyQeFairML
3) k-fold cross validation to choose best hyperparameters for fairness utility tradeoff

In [None]:
## requires R and the dsld (R) package installed
# !pip install dsldPy

In [None]:
# load libraries
from dsldPy import (
# data reading and preprocessing
preprocess_data, read_data,

# fairML wrappers
dsldPyFrrm, dsldPyFgrrm, dsldPyNclm, dsldPyZlm, dsldPyZlrm, dsldPyFairML_Summary, dsldPyFairML_Predict,

# qeFairML wrappers
dsldPyQeFairKNN, dsldPyQeFairRF, dsldPyQeFairRidgeLin, dsldPyQeFairRidgeLog, dsldPyQeFairML_Predict,

dsldPyFairUtils
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score


In [None]:
### regression example --- frrm(), nclm(), zlm(), qeFairKNN(), qeFairRF(), qeFairRidgeLin()

### read and preprocess data

### data preprocessing

### all dsldPy functions require a R data frame object as input (NOT pandas dataframe)
### the preprocessing is done by the function preprocess_data
### user needs to manually provide the categorical and numerical features (list)
### the function preprocess_data returns a R data.frame object -> required input for the dsldPy functions

# train and test split
#### REPLACE WITH YOUR PATH TO svcensus.RData, then uncomment the next line
# df = read_data("")

# train_test_split returns (train, test) in that order, so with test_size=0.3
# train_df receives 70% of the rows and test_df the remaining 30%
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# hold out the response so that it is not part of the test features
test_y = test_df['wageinc']
test_df = test_df.drop(columns=['wageinc'])

# preprocess data
cat_features_train = ['educ', 'occ', 'gender']
num_features_train = ['age', 'wageinc', 'wkswrkd']
svcensus_train = preprocess_data(train_df, cat_features_train, num_features_train)

# the test frame no longer contains 'wageinc', so it is left out of the numeric list
cat_features_test = ['educ', 'occ', 'gender']
num_features_test = ['age', 'wkswrkd']
svcensus_test = preprocess_data(test_df, cat_features_test, num_features_test)


In [None]:
### using dsldPyFairML() function

### model training --- frrm()
### unfairness = 0.05 // can also try different values for unfairness
a = dsldPyFrrm(
    data=svcensus_train,
    yName='wageinc',
    sName='gender',
    unfairness=0.05,
    definition="sp-komiyama",
    lamda=0,
    save=False,
)

# report the training-set results stored on the fitted model
print(f"train predictions: {a['train_predictions']}")
print(f"train accuracy: {a['train_accuracy']}")
print(f"train correlations: {a['train_correlations']}")

In [None]:
### predict() on test data
a_preds = dsldPyFairML_Predict(a, svcensus_test)

# report the test-set predictions and correlations
print(f"test predictions: {a_preds['test_predictions']}")
print(f"test correlations: {a_preds['test_correlations']}")

# manually compute the test error as the mean absolute error (MAE)
test_accuracy = mean_absolute_error(
    y_true=test_y,
    y_pred=a_preds['test_predictions'],
)
print(f"test accuracy: {test_accuracy}")

### the same can be done for other models --- nclm(), zlm() with dsldPyFairML_Predict() method

In [None]:
### using dsldPyQeFairML() functions

### model training --- dsldQeFairRF()
### deweightPars = {'educ': 0.2, 'occ': 0.05} // try different values for proxies
deweightPars = dict(educ=0.2, occ=0.05)

a = dsldPyQeFairRF(
    data=svcensus_train,
    yName='wageinc',
    sNames='gender',
    deweightPars=deweightPars,
)

# report the training-set results stored on the fitted model
print(f"train predictions: {a['train_predictions']}")
print(f"train accuracy: {a['train_accuracy']}")
print(f"train correlations: {a['train_correlations']}")

In [None]:
### predict on test data
a_preds = dsldPyQeFairML_Predict(a, svcensus_test)

# report the test-set predictions and correlations
print(f"test predictions: {a_preds['test_predictions']}")
print(f"test correlations: {a_preds['test_correlations']}")

# manually compute the test error as the mean absolute error (MAE)
test_accuracy = mean_absolute_error(
    y_true=test_y,
    y_pred=a_preds['test_predictions'],
)
print(f"test accuracy: {test_accuracy}")

### the same can be done for other models --- qeFairKNN(), qeFairRidgeLin() with dsldPyQeFairML_Predict() method

In [None]:
### classification examples --- fgrrm(), zlrm(), qeFairKNN(), qeFairRF(), qeFairRidgeLog()

### read and preprocess data

# train and test split
#### REPLACE WITH YOUR PATH TO compas1.RData, then uncomment the next line
# df = read_data("")

# train_test_split returns (train, test) in that order, so with test_size=0.3
# train_df receives 70% of the rows and test_df the remaining 30%
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# hold out the response, recoded from 'Yes'/'No' to 1/0 so it can be compared
# with rounded predicted probabilities later on
test_y = test_df['two_year_recid'].map({'Yes': 1, 'No': 0})
test_df = test_df.drop(columns=['two_year_recid'])

# preprocess data
# NOTE: use the COMPAS feature lists defined here, not the svcensus lists
# (cat_features_train / num_features_train) from the regression example
num_features = ["age", "juv_fel_count","decile_score","juv_misd_count","juv_other_count","priors_count","c_jail_in","c_jail_out","c_offense_date","screening_date","in_custody","out_custody"]

# the training frame still contains the response 'two_year_recid'
cat_features = ['sex', 'race', 'two_year_recid']
compas1_train = preprocess_data(train_df, cat_features, num_features)

# the response was dropped from the test frame above
cat_features = ['sex', 'race']
compas1_test = preprocess_data(test_df, cat_features, num_features)


In [None]:
### using dsldPyFairML() functions

### model training --- fgrrm()
### unfairness = 0.1 // try different values for unfairness
a = dsldPyFgrrm(
    data=compas1_train,
    yName='two_year_recid',
    sName='race',
    unfairness=0.1,
    definition="sp-komiyama",
    family="binomial",
    lamda=0,
    save=False,
    yesYVal="Yes",
)

# report the training-set results stored on the fitted model
print(f"train predictions: {a['train_predictions']}")             # returns prob = Yes
print(f"train accuracy (misclassification rate): {a['train_accuracy']}")
print(f"train correlations: {a['train_correlations']}")

In [None]:
### predict() on test set
a_preds = dsldPyFairML_Predict(a, compas1_test)

# report the test-set predictions and correlations
print(f"test predictions: {a_preds['test_predictions']}") # returns prob = Yes
print(f"test correlations: {a_preds['test_correlations']}")

# manually compute the test misclassification rate:
# round each predicted probability to a 0/1 class label, then compare with test_y
y_pred = [int(round(prob)) for prob in a_preds['test_predictions']]
test_accuracy = accuracy_score(y_true=test_y, y_pred=y_pred)
misclass_rate = 1 - test_accuracy

# print the test misclassification rate
print(f"test accuracy (misclassification rate): {misclass_rate}")

### the same can be done for other models --- zlrm() with dsldPyFairML_Predict() method

In [None]:
### using dsldPyQeFairML() functions

### model training --- dsldQeFairKNN()
### deweightPars = {'decile_score': 0.2, 'priors_count': 0.5} // try different values for deweightPars
deweightPars = dict(decile_score=0.2, priors_count=0.5)

a = dsldPyQeFairKNN(
    data=compas1_train,
    yName='two_year_recid',
    sNames='race',
    deweightPars=deweightPars,
    k=10,
    scaleX=True,
    yesYVal="Yes",
)

# report the training-set results stored on the fitted model
# in the case of classification, the train_predictions returns both predClasses and prob = Yes
print(f"train predictions: {a['train_predictions']}")
print(f"train accuracy: {a['train_accuracy']}")
print(f"train correlations: {a['train_correlations']}")

In [None]:
### predict() on test set
a_preds = dsldPyQeFairML_Predict(a, compas1_test)

# report the test-set predictions and correlations
print(f"test predictions: {a_preds['test_predictions']}")
print(f"test correlations: {a_preds['test_correlations']}")

# compute the test misclassification rate
# element [1] of test_predictions is rounded to 0/1 labels
# (presumably the prob = Yes component --- confirm against the printed output above)
y_pred = [int(round(prob)) for prob in list(a_preds['test_predictions'][1])]
test_accuracy = accuracy_score(y_true=test_y, y_pred=y_pred)
misclass_rate = 1 - test_accuracy

# print the test misclassification rate
print(f"test accuracy (misclassification rate): {misclass_rate}")

### the same can be done for other models --- dsldQeFairRF(), dsldQeFairRidgeLog() with dsldPyQeFairML_Predict() method

In [None]:
### k-fold cross validation to find best model based on fairness and accuracy
### here: dsldFrrm over a list of candidate unfairness values, with 10 folds
dsldPyFairUtils(
    data=svcensus_train,
    yName='wageinc',
    sName='gender',
    dsldFTNname="dsldFrrm",
    unfairness=[0.01, 0.05, 0.1, 0.2, 0.8],
    k_folds=10,
)

In [None]:
### same utility for dsldQeFairKNN: a list of candidate deweighting values for 'occ', with 10 folds
dsldPyFairUtils(
    data=svcensus_train,
    yName='wageinc',
    sName='gender',
    dsldFTNname="dsldQeFairKNN",
    deweightPars={'occ': [0.9, 0.8, 0.5, 0.3, 0.1, 0.05, 0.01]},
    k_folds=10,
)