In [1]:
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
    brier_score_loss,
)
from sklearn.calibration import calibration_curve
import numpy as np
import pandas as pd
import os
import sys

import equiboots as eqb

### Fetching Dataset

In [2]:
# Processed feature matrix and income target, both read from parquet.
adult_x = pd.read_parquet(path="../data/processed/X.parquet")
adult_y = pd.read_parquet(path="../data/processed/y_income.parquet")

In [9]:
adult_x = pd.read_parquet("../data/raw/df.parquet")

In [10]:
adult_x

Unnamed: 0_level_0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
census_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
582248222,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
561810758,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
598098459,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
776705221,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
479262902,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416926381,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
325172833,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
730978234,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
161527603,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [3]:
from adult_income.functions import find_best_model, mlflow_load_model

In [4]:
def return_best_model(outcome, metric, mlruns_location):
    """Load the best logged model for ``outcome`` as ranked by ``metric``.

    Args:
        outcome: Name of the modelled outcome (e.g. ``"income"``). The
            experiment looked up is ``f"{outcome}_model"`` and the model
            name is ``f"{estimator_name}_{outcome}"``.
        metric: Metric name passed to ``find_best_model`` to select the run.
        mlruns_location: Forwarded unchanged as ``mlruns_location`` to both
            ``find_best_model`` and ``mlflow_load_model``.

    Returns:
        The object returned by ``mlflow_load_model`` for the selected run.
    """
    # Use the caller's `outcome`. It used to be overwritten with the
    # hard-coded value "income", which silently ignored the argument.
    experiment_name = f"{outcome}_model"

    # find_best_model yields the winning run and the estimator it used.
    run_name, estimator_name = find_best_model(
        experiment_name, metric, mlruns_location=mlruns_location
    )

    model_name = f"{estimator_name}_{outcome}"
    return mlflow_load_model(
        experiment_name, run_name, model_name, mlruns_location=mlruns_location
    )

In [5]:
# Pick the income model with the highest validation average precision.
best_model = return_best_model(
    outcome="income",
    metric="valid Average Precision",
    mlruns_location="../mlruns/models/",
)

Best Run ID: 802618f54b144225b05e37f61d2cd070, Best valid Average Precision: 0.74


In [None]:
# Hard class predictions for every row of `adult_x`.
y_pred = best_model.predict(adult_x)

# Keep only the positive-class column so the score vector is 1-D, which is
# what the binary metrics imported above (roc_curve, brier_score_loss, ...)
# require.
# NOTE(review): assumes `best_model.predict_proba` follows the scikit-learn
# convention of returning an (n_samples, n_classes) array — confirm.
y_prob = best_model.predict_proba(adult_x)[:, 1]

# `adult_y` comes from pd.read_parquet and is therefore a DataFrame; flatten
# it to a 1-D NumPy array so it matches y_pred / y_prob.
# NOTE(review): assumes the target frame holds a single column — confirm.
y_true = adult_y.to_numpy().ravel()

In [7]:
# Show the column labels of `adult_x`, the frame passed to the model.
adult_x.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'percentage_missing', 'age_missing', 'fnlwgt_missing',
       'education-num_missing', 'capital-gain_missing', 'capital-loss_missing',
       'hours-per-week_missing', 'percentage_missing_missing'],
      dtype='object')

In [None]:
# Build the EquiBoots object from the labels, the model outputs and the
# frame holding the fairness variables ("race" and "sex").
# NOTE(review): `fairness_df` is not assigned in this cell; it must already
# exist in the kernel namespace when the cell runs.
eq = eqb.EquiBoots(
    y_true=y_true,
    y_prob=y_prob,
    y_pred=y_pred,
    fairness_df=fairness_df,
    fairness_vars=["race", "sex"],
)
# Call grouper with both fairness variables, then slicer for "race"; the
# slicer result is kept in `sliced_data`.
# NOTE(review): presumably one entry per race value — confirm against the
# equiboots documentation.
eq.grouper(groupings_vars=["race", "sex"])
sliced_data = eq.slicer("race")