# SolasAI Disparity Calculations

In [1]:
# In some environments, plotly does not render properly.  If this is the case, run the following code:
# import plotly.io as pio
# pio.renderers.default = "svg"

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import solas_disparity as sd
import xgboost as xgb

pd.set_option('display.max_columns', 500)

## Building a Model

In [8]:
df = pd.read_csv("hmda.csv.gz", index_col="id")
df.sample(random_state=161803, n=5)

Unnamed: 0_level_0,Low-Priced,Interest Rate,Rate Spread,Loan Amount,Loan-to-Value Ratio,No Intro Rate Period,Intro Rate Period,Property Value,Income,Debt-to-Income Ratio,Term 360,Conforming,State,Product Type,Black,Asian,White,Native American,Hawaiian Or Pacific Islander,Hispanic,Non-Hispanic,Male,Female,Age >= 62,Age < 62,Race,Ethnicity,Sex
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
13451,1.0,0.04875,0.00596,155000.0,0.97,1,0,165000.0,35000.0,0.33,1.0,1.0,FL,conventional,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,,,White,Hispanic,Male
18248,1.0,0.0575,0.01268,305000.0,1.0,1,0,295000.0,60000.0,0.55,1.0,1.0,CO,va,,,,,,,,,,,,Unknown,Unknown,Unknown
19610,1.0,0.055,0.01214,485000.0,0.95,1,0,515000.0,100000.0,0.43,1.0,1.0,CO,conventional,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,1.0,0.0,White,Non-Hispanic,Unknown
3339,1.0,0.03875,-0.00087,675000.0,1.0,1,0,675000.0,190000.0,0.33,1.0,1.0,VA,va,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,,0.0,1.0,Black,Non-Hispanic,Unknown
19675,1.0,0.04375,0.00076,275000.0,0.3507,1,0,775000.0,209000.0,0.25,1.0,1.0,AZ,conventional,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,0.0,1.0,White,Non-Hispanic,Unknown


In [9]:
features = [
    "Loan Amount",
    "Loan-to-Value Ratio",
    "Intro Rate Period",
    "Property Value",
    "Income",
    "Debt-to-Income Ratio",
    "Term 360",
    "Conforming",
    # "Product Type",
]
label = "Low-Priced"
df['Product Type'] = df['Product Type'].astype('category')
df[label] = df[label].astype('int')
X_train, X_valid, y_train, y_valid = train_test_split(df[features], df[label], test_size=0.25, random_state=161803)

y_train.value_counts(dropna=False)


1    13586
0     1414
Name: Low-Priced, dtype: int64

In [21]:
params = dict(
    objective="binary:logistic",
    max_depth=6,
    learning_rate=0.10,
    n_estimators=1000,
    base_score=y_train.mean(),
    random_state=31415,
    # enable_categorical=True,
)
xgb_classifier = xgb.XGBClassifier(**params).fit(X=X_train, y=y_train)

pred_valid = pd.Series(data=xgb_classifier.predict(X_valid), index=X_valid.index)
f"Model ROC-AUC: {metrics.roc_auc_score(y_true=y_valid, y_score=pred_valid):0.3f}"

'Model ROC-AUC: 0.598'

Store arguments in a dictionary for reusability in multiple calls to disparity functions.

In [12]:
reused_arguments = dict(
    protected_groups=["Black", "Asian", "Native American", "Hispanic", "Female"],
    reference_groups=["White", "White", "White", "Non-Hispanic", "Male"],
    group_categories=["Race", "Race", "Race", "Ethnicity", "Sex"],
    sample_weight=None,
)

## Adverse Impact Ratio (AIR)

In [None]:
air = sd.adverse_impact_ratio(
    outcome=predictions <= predictions.quantile(0.5),
    air_threshold=0.8,
    percent_difference_threshold=0.0,
    **reused_arguments,
)

In [None]:
air

In [None]:
sd.ui.show(air.summary_table)

## Standardized Mean Difference (SMD)

In [None]:
smd = sd.standardized_mean_difference(
    outcome=predictions,
    label=y_test,
    smd_threshold=30,
    lower_score_favorable=True,
    **reused_arguments,
)

In [None]:
smd

In [None]:
sd.ui.show(smd.summary_table)

## Adverse Impact Ratio by Quantile

In [None]:
airq = sd.adverse_impact_ratio_by_quantile(
    outcome=predictions,
    air_threshold=0.8,
    percent_difference_threshold=0.0,
    quantiles=[decile / 10 for decile in range(1, 11)],
    lower_score_favorable=True,
    **reused_arguments,
)

In [None]:
airq

In [None]:
sd.ui.show(airq.summary_table)

## Odds Ratio

In [None]:
odds_ratio = sd.odds_ratio(
    outcome=predictions <= predictions.quantile(0.5),
    odds_ratio_threshold=0.68,
    percent_difference_threshold=0.0,
    **reused_arguments,
)

In [None]:
odds_ratio

In [None]:
sd.ui.show(odds_ratio.summary_table)

## Categorical Adverse Impact Ratio

Generate an example categorical outcome.

In [None]:
categorical_outcome = pd.qcut(predictions, q=[0.0, 0.25, 0.5, 0.75, 1.0])
categories = categorical_outcome.cat.categories.to_series()
categories = pd.Series(["Best", "Great", "Good", "Bad"], index=categories.index)
categorical_outcome.replace(categories.to_dict(), inplace=True)

In [None]:
cair = sd.categorical_adverse_impact_ratio(
    outcome=categorical_outcome,
    ordinal_categories=list(reversed(categories.tolist())),
    air_threshold=0.8,
    percent_difference_threshold=0.0,
    **reused_arguments,
)

In [None]:
cair

In [None]:
sd.ui.show(cair.summary_table)

## Residual Standardized Mean Difference

In [None]:
rsmd = sd.residual_standardized_mean_difference(
    prediction=predictions,
    label=y_test,
    residual_smd_threshold=30,
    lower_score_favorable=True,
    **reused_arguments,
)

In [None]:
rsmd

In [None]:
sd.ui.show(rsmd.summary_table)

## Segmented Adverse Impact Ratio

Generate example income segments.

In [None]:
segments = pd.qcut(df.loc[test_index, "Income"], q=[0.0, 1 / 3, 2 / 3, 1.0])
categories = segments.cat.categories.to_series()
categories = pd.Series(["Low Income", "Mid Income", "High Income"], index=categories.index)
segments.replace(categories.to_dict(), inplace=True)

In [None]:
sair = sd.segmented_adverse_impact_ratio(
    outcome=predictions <= predictions.quantile(0.5),
    air_threshold=0.8,
    percent_difference_threshold=0.0,
    fdr_threshold=0.2,
    segment=segments,
    **reused_arguments,
)

In [None]:
sair.summary_table

In [None]:
sair.summary_table_by_segments

In [None]:
sair

In [None]:
sd.ui.show(sair.summary_table)