# Measuring Discrimination with SolasAI

In [None]:
# In some environments plotly figures do not render properly. If that happens, uncomment and run the following lines:
# import plotly.io as pio
# pio.renderers.default = "svg"

In [None]:
import solas_disparity as sd

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
from IPython.display import display, HTML
# Inject a CSS rule that forces `.container` elements to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Let pandas show up to 500 columns before truncating a displayed frame.
pd.options.display.max_columns = 500

## Importing Data and Building a Model

In [None]:
# Load the HMDA extract from the .gz CSV, using the "id" column as the row index.
df = pd.read_csv("hmda.csv.gz", index_col="id")

# Preview five rows; the fixed random_state keeps the preview the same on every run.
df.sample(n=5, random_state=161803)

In [None]:
# Columns fed to the model as inputs, and the column used as the target.
features = [
    "Loan Amount",
    "Loan-to-Value Ratio",
    "Intro Rate Period",
    "Property Value",
    "Income",
    "Debt-to-Income Ratio",
    "Term 360",
    "Conforming",
]
label = "Low-Priced"

# Randomly tag each row as 'train' (p=0.8) or 'valid' (p=0.2).
# The draw uses a dedicated, seeded generator so that the split -- and with it
# the fitted model and every disparity metric computed later -- is identical
# on each Restart & Run All. Drawing from the unseeded global np.random state
# would produce a different partition on every run.
SPLIT_SEED = 271828
split_rng = np.random.default_rng(SPLIT_SEED)
df['train'] = split_rng.choice(a=['train', 'valid'], replace=True, size=len(df), p=[0.8, 0.2])

# Boolean mask selecting the training rows; `~train` selects the validation rows.
train = (df['train'] == 'train')

# Cross-tabulate the label against the partition to check the balance of each split.
pd.crosstab(df[label], df['train'])

In [None]:
# Keyword arguments handed to the XGBoost classifier constructor.
params = {
    "objective": "binary:logistic",
    "max_depth": 3,
    "learning_rate": 0.02,
    "n_estimators": 200,
    # Initial score: the mean of the label over the training rows.
    "base_score": df.loc[train, label].mean(),
    "random_state": 31415,
}

# Fit on the training rows only; validation rows are held out for evaluation.
xgb_classifier = xgb.XGBClassifier(**params).fit(
    X=df.loc[train, features],
    y=df.loc[train, label],
)

In [None]:

# Store column 1 of predict_proba as the model score, one partition at a time.
df.loc[train, 'predictions'] = xgb_classifier.predict_proba(df.loc[train, features])[:, 1]
df.loc[~train, 'predictions'] = xgb_classifier.predict_proba(df.loc[~train, features])[:, 1]

# ROC-AUC on the training rows first, then on the validation rows.
auc_train, auc_valid = (
    metrics.roc_auc_score(y_true=df.loc[rows, label], y_score=df.loc[rows, 'predictions'])
    for rows in (train, ~train)
)

# Report both AUCs and the relative change from training to validation.
print(
    f"\n************************"
    f"\n**** Model ROC-AUC: ****"
    f"\nTraining:          {auc_train:0.3f}"
    f"\nValidation:        {auc_valid:0.3f}"
    f"\nPercent Change:   {auc_valid / auc_train - 1: 0.2%}"
    f"\n************************"
)

In [None]:
# Summarize the distribution of training scores. This is not the cell's last
# expression, so it has to go through display() explicitly -- as a bare
# expression its result would be computed and then silently discarded.
display(df.loc[train, 'predictions'].describe())

# Score threshold: rows scoring strictly above it are marked as getting an offer.
cutoff = 0.90

# Binary outcome (1 = score above the cutoff, 0 = otherwise).
df['Gets Offer'] = (df['predictions'] > cutoff).astype(int)

# Share of rows in each outcome class (missing values counted as well).
df['Gets Offer'].value_counts(dropna=False, normalize=True)

In [None]:
# Keyword arguments shared by every disparity test below, restricted to the
# validation rows. The three lists have equal length; presumably entry i of
# each list describes one protected/reference comparison -- confirm against
# the solas_disparity documentation.
common_info_for_testing = {
    "group_data": df.loc[~train, :],
    "protected_groups": ["Black", "Asian", "Native American", "Hispanic", "Female"],
    "reference_groups": ["White", "White", "White", "Non-Hispanic", "Male"],
    "group_categories": ["Race", "Race", "Race", "Ethnicity", "Sex"],
}

## Adverse Impact Ratio (AIR)

In [None]:
# Adverse impact ratio on the binary "Gets Offer" outcome for the validation rows.
air = sd.adverse_impact_ratio(
    outcome=df.loc[~train, 'Gets Offer'],
    air_threshold=0.8,
    percent_difference_threshold=0.0,
    **common_info_for_testing,
)

In [None]:
# Bare expression: renders the AIR result object as this cell's output.
air

## Adverse Impact Ratio by Quantile

In [None]:
# Adverse impact ratio evaluated on the continuous validation scores at the
# quantile levels 0.1, 0.2, ..., 1.0.
airq = sd.adverse_impact_ratio_by_quantile(
    outcome=df.loc[~train, 'predictions'],
    quantiles=[tenth / 10 for tenth in range(1, 11)],
    lower_score_favorable=False,
    air_threshold=0.8,
    percent_difference_threshold=0.0,
    **common_info_for_testing,
)

# Last expression of the cell, so the returned plot is rendered as output.
airq.plot()

## Standardized Mean Difference (SMD)

In [None]:
# Standardized mean difference of the validation scores between groups.
# A higher score is marked as favorable here (lower_score_favorable=False),
# and the threshold is passed as a negative value.
smd = sd.standardized_mean_difference(
    outcome=df.loc[~train, 'predictions'],
    lower_score_favorable=False,
    smd_threshold=-30,
    **common_info_for_testing,
)

# Render the result object as the cell output.
smd

## Residual Standardized Mean Difference

In [None]:
# Residual standardized mean difference on the validation rows, computed from
# the model scores and the observed label.
rsmd = sd.residual_standardized_mean_difference(
    label=df.loc[~train, label],
    prediction=df.loc[~train, 'predictions'],
    lower_score_favorable=True,
    residual_smd_threshold=30,
    **common_info_for_testing,
)

# Show the plot first, then the summary table through the SolasAI UI helper.
display(rsmd.plot())
sd.ui.show(rsmd.summary_table)