# Detect unfair

In [132]:
import pandas as pd

loan_dataset = pd.read_csv("../datasets/loan.csv")
loan_dataset.head()

categorical_features = ["sex", "rent", "minority", "ZIP", "occupation"]
numeric_features = [
    "education", "age", "income", "loan_size", "payment_timing",
    "year", "job_stability"
]
for cat in categorical_features:
    loan_dataset[cat] = loan_dataset[cat].astype("object")

pred="default"

X = loan_dataset.copy().drop([pred], axis=1)
y = (loan_dataset.copy()[pred] != f"{pred}-no").astype(int).values

In [120]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Create preprocessor of features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())]
)

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create pipeline
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ]
)

# Split into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

# Train classifier
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['education', 'age', 'income',
                                                   'loan_size',
                                                   'payment_timing', 'year',
                                                   'job_stability']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['sex', 'rent', 'minority',
                                                   'ZIP', 'occupation'])])),
                ('classifier', LogisticRegression())])

In [122]:
# Predict
y_pred = clf.predict(X)
y_pred

array([1, 1, 1, ..., 0, 0, 0])

# Quality of service harm

Let's define the __sensitive features__ for this dataset:

In [123]:
# Define sensitive features
sensitive_features = ["minority", "sex"]

Let's inspect the accuracy of the model breaked by the values on each sensitive feature:

In [124]:
from fairlearn.metrics import MetricFrame
from sklearn.metrics import recall_score, precision_score 

for sf in sensitive_features:
    grouped_metric = MetricFrame(
        {"precision": precision_score, "recall": recall_score}, y, y_pred,
        sensitive_features=loan_dataset["minority"]
    )
    grouped_metric.overall
    grouped_metric_df = grouped_metric.by_group
    display(grouped_metric_df)


Unnamed: 0_level_0,precision,recall
minority,Unnamed: 1_level_1,Unnamed: 2_level_1
minority-no,1.0,1.0
minority-yes,0.999695,1.0


Unnamed: 0_level_0,precision,recall
minority,Unnamed: 1_level_1,Unnamed: 2_level_1
minority-no,1.0,1.0
minority-yes,0.999695,1.0


# Quality of allocation harm

In [125]:
for sf in sensitive_features:
    pred_grouped = pd.DataFrame({f"{sf}": loan_dataset[sf], "y_pred": y_pred, "y_true": y})
    pred_vals = pred_grouped.groupby(sf).sum().values / loan_dataset[sf].value_counts().values
    pred_grouped = pd.DataFrame(pred_vals, columns=[f"{pred}_predicted", f"{pred}_true"])
    display(pred_grouped)


Unnamed: 0,default_predicted,default_true
0,0.001453,0.001455
1,0.997851,0.998937


Unnamed: 0,default_predicted,default_true
0,0.5,0.499696
1,0.5,0.5
