# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sklearn
import dalex as dx

from copy import copy

from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Display the installed version of every core library used in this notebook,
# keyed by package name, so the recorded outputs can be tied to an environment.
{
    package_name: module.__version__
    for package_name, module in (
        ("numpy", np),
        ("pandas", pd),
        ("matplotlib", matplotlib),
        ("seaborn", sns),
        ("sklearn", sklearn),
        ("dalex", dx),
    )
}

  import pkg_resources


{'numpy': '2.3.3',
 'pandas': '2.3.2',
 'matplotlib': '3.10.6',
 'seaborn': '0.13.2',
 'sklearn': '1.7.2',
 'dalex': '1.7.2'}

In [2]:
# Name of the column the models below are trained to predict.
target = "Employed"

# Load the dataset from the working directory; the first CSV column is used
# as the row index.
df = pd.read_csv(
    "./stackoverflow_full.csv",
    index_col=0,
)

### Insert here the "no changes model cells"

## ML prerequisites

In [3]:
# Split the dataset into a training part and a testing part: 30% of the rows
# are held out for testing, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target]),
    df[target],
    random_state=42,
    test_size=0.3,
)

In [5]:
def build_protected_groups(features):
    """Return one "<gender flag>_<age band>" group label per row of `features`.

    The gender flag is '1' where the "Gender" column equals "Woman" and '0'
    for every other value (so '0' is not restricted to men). The age band is
    the row's "Age" value, appended unchanged.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame with "Gender" and "Age" columns. "Age" is assumed to hold
        strings such as '<35' / '>35' (string concatenation is used) --
        TODO confirm against the raw CSV.

    Returns
    -------
    pandas.Series
        Group labels such as '0_<35', carrying the same index as `features`.
    """
    gender_flag = pd.Series(
        np.where(features["Gender"] == "Woman", "1", "0"),
        index=features.index,
    )
    return gender_flag + "_" + features["Age"]


# Build the protected attribute the same way for both splits, so the labels
# cannot drift apart between training and testing.
protected = build_protected_groups(X_test)
protected_train = build_protected_groups(X_train)

# Privileged (reference) group: respondents whose gender is not "Woman"
# (flag 0) in the '<35' age band.
privileged = '0_<35'

In [6]:
# Column-wise preprocessing: numeric columns are forwarded unchanged and
# object-typed columns are one-hot encoded. Categories that were not seen
# during fitting are ignored at transform time instead of raising an error.
preprocessor = make_column_transformer(
    (
        "passthrough",
        make_column_selector(dtype_include=np.number),
    ),
    (
        OneHotEncoder(handle_unknown="ignore"),
        make_column_selector(dtype_include=object),
    ),
)

# You can change the decision tree hyperparameters or the classifier below.
clf_decisiontree = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", DecisionTreeClassifier(random_state=123, max_depth=10)),
    ]
)

In [7]:
# Fit on the training split only (rather than on the full `df`), so that the
# test split stays unseen and can be used for evaluation afterwards.
clf_decisiontree.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('passthrough', ...), ('onehotencoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,123
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [8]:
# Wrap the fitted pipeline in a dalex explainer built on the held-out test
# split (rather than on the full `df`), so performance and fairness are
# measured on rows the model was not trained on.
exp_decisiontree = dx.Explainer(
    model=clf_decisiontree,
    data=X_test,
    y=y_test,
    verbose=True,
)

Preparation of a new explainer is initiated

  -> data              : 22039 rows 13 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 22039 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x00000174C550DE40> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.536, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.0, mean = -0.000922, max = 1.0
  -> model_info        : package sklearn

A new explainer has been created!


In [9]:
# Performance summary of the baseline tree on the data held by the explainer.
(
    exp_decisiontree
    .model_performance()
    .result
)

Unnamed: 0,recall,precision,f1,accuracy,auc
DecisionTreeClassifier,0.804868,0.788158,0.796425,0.779845,0.860599


In [10]:
# Group-fairness object for the baseline tree: every protected group is
# compared against the privileged reference group.
fairness_decisiontree = exp_decisiontree.model_fairness(
    protected=protected,
    privileged=privileged,
)

In [11]:
# Default epsilon: metric ratios are expected to fall within (0.8, 1.25).
fairness_decisiontree.fairness_check(epsilon=0.8)

Bias detected in 2 metrics: FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on '0_<35'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
            TPR       ACC       PPV       FPR       STP
0_>35  0.969325  0.989770  0.982346  0.942085  0.937943
1_<35  0.947239  1.006394  0.979823  0.768340  0.833333
1_>35  0.900613  1.046036  1.034048  0.459459  0.673759


In [12]:
# Plot the fairness check of the baseline tree without the textual report.
fairness_decisiontree.plot(
    verbose=False,
)

  data = data.stack(dropna=False)


###

### Strategy 3: Post-processing: ROC-pivot

#### After-Training

This method requires no re-training because it is a post-processing method. The idea is to adjust the model's outputs to the advantage or disadvantage of some groups in order to improve the fairness metric scores (privileged group vs. others).

From a math point of view, 

Let, 
* `P` be the probability output of the model (a higher probability means a higher chance of getting the favorable outcome, "1" in our case).
* `cutoff` be the threshold used to turn a probability into a class: 0 (below the cutoff) or 1 (above the cutoff).
* `𝜃` be the margin parameter that controls which results are altered (it represents the notion of "close enough" to the cutoff).
* `Privileged` be the boolean value indicating whether the observation belongs to the privileged group.

The ROC pivot method distinguishes two cases:

* The first case: if `|P - cutoff| < 𝜃 AND Privileged AND P > cutoff` is `True`, then the new probability becomes `P = cutoff - (P - cutoff)`, which is now below the cutoff.

* The second case: if `|P - cutoff| < 𝜃 AND NOT(Privileged) AND cutoff > P` is `True`, then the new probability becomes `P = cutoff + (cutoff - P)`, which is now above the cutoff.


In [13]:
from dalex.fairness import roc_pivot

# Work on a copy so that the baseline explainer itself is never handed to the
# mitigation step and stays available for the before/after comparison.
exp_decisiontree_roc = copy(exp_decisiontree)

# ROC pivot on the copy (previously the copy was discarded and the original
# explainer was passed instead). Theta, the "close enough to the cutoff"
# margin, is arbitrarily set to 0.1; the cutoff is left at the library default.
exp_decisiontree_roc = roc_pivot(
    exp_decisiontree_roc,
    protected,
    privileged,
    theta=0.1,
    verbose=False,
)

#### Algorithmic performance

In [14]:
# Performance summary after the ROC-pivot adjustment, to compare with the
# baseline tree.
(
    exp_decisiontree_roc
    .model_performance()
    .result
)

Unnamed: 0,recall,precision,f1,accuracy,auc
DecisionTreeClassifier,0.803765,0.780982,0.79221,0.7744,0.859652


#### Fairness performance

In [15]:
# Fairness object for the ROC-pivot explainer, using the same protected
# attribute and privileged group as the baseline; the explicit label tells
# the two models apart when they are plotted together.
fairness_decisiontree_roc = exp_decisiontree_roc.model_fairness(
    protected=protected,
    privileged=privileged,
    label='DecisionTreeClassifier_roc',
)

# Same epsilon as for the baseline check, so both reports are comparable.
fairness_decisiontree_roc.fairness_check(epsilon=0.8)

Bias detected in 1 metric: FPR

Conclusion: your model cannot be called fair because 1 criterion exceeded acceptable limits set by epsilon.
It does not mean that your model is unfair but it cannot be automatically approved based on these metrics.

Ratios of metrics, based on '0_<35'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
            TPR       ACC       PPV       FPR       STP
0_>35  1.157687  0.993557  0.897185  1.685990  1.225049
1_<35  1.128778  1.001289  0.883721  1.434783  1.101761
1_>35  1.070959  1.032216  0.909425  1.004831  0.911937


In [16]:
# Plot the baseline fairness check together with the ROC-pivot one for a
# side-by-side visual comparison.
fairness_decisiontree.plot([fairness_decisiontree_roc])







- Is this strategy effective in terms of algorithmic performance?
- What comment can you make based on the fairness metric result?
- Could you think of a way to improve this strategy (not necessarily in Python, but as a complementary idea to this solution)?

If you have more time or wish to compare, feel free to try the other strategies and compare the results (you can plot the fairness metrics to have a visual comparison).