In [75]:
import pickle
import time

import numpy as np
import pandas as pd
import lightgbm as lgb

# metrics
from fairlearn.metrics import demographic_parity_difference
from fairlearn.reductions import ExponentiatedGradient, DemographicParity
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error

In [4]:
%load_ext autoreload
%autoreload 2
from main.utils.preprocessing import preprocess_census
from main.estimators.evaluation import cv_early_stopping
from main.fairness.metrics import unfairness, get_all_predictions, calculate_metrics
from main.utils.dataloader import load_sunbelt_data

In [5]:
# Baseline (default split) load + preprocessing of the census extract.
# The names bound here (sunbelt_states, data_all, cat_features, ret_dict)
# are module-level notebook state.

# Two-letter state codes handed to the project data loader.
sunbelt_states = [
    'AL', 'AZ', 'FL', 'GA', 'LA',
    'MS', 'NM', 'SC', 'TX', 'CA',
]

# Columns passed to the preprocessor as categorical features.
cat_features = ['OCCP', 'POBP', 'SCHL', 'RELP']

data_all = load_sunbelt_data(states=sunbelt_states)

# PINCP is the prediction target; SEX and RAC1P_black are the sensitive
# attributes. No split_seed is given here, so the preprocessor's own default
# split applies.
ret_dict = preprocess_census(
    data=data_all,
    target_feature='PINCP',
    sensitive_features=['SEX', 'RAC1P_black'],
    categorical_features=cat_features,
    continuous_features=['WKHP', 'AGEP'],
    objective='regression',
)

In [None]:
# Repeated-split experiment: for every split seed, preprocess the census data,
# pick the number of boosting rounds by 5-fold CV with early stopping, refit
# on the whole training split, collect predictions/metrics, and checkpoint the
# accumulated results to disk.
res_dict = {}
res_dict['bench'] = {}  # NOTE(review): nothing in this cell fills 'bench' — confirm where it is populated
res_dict['ours'] = {}

# Loop-invariant constants, defined once instead of on every iteration.
sunbelt_states = ['AL', 'AZ', 'FL', 'GA', 'LA',
                  'MS', 'NM', 'SC', 'TX', 'CA']

cat_features = ['OCCP', 'POBP', 'SCHL', 'RELP']

for seed_ in [42, 1029, 3948, 103, 56, 93983838, 828, 1928838, 900, 10]:

    # Reloaded for every seed, as in the original cell.
    # NOTE(review): could be hoisted out of the loop if preprocess_census
    # does not modify its input — confirm before changing.
    data_all = load_sunbelt_data(states=sunbelt_states)

    # Only the train/test split seed varies between iterations.
    ret_dict = preprocess_census(data=data_all,
                                 target_feature='PINCP',
                                 sensitive_features=['SEX', 'RAC1P_black'],
                                 categorical_features=cat_features,
                                 continuous_features=['WKHP', 'AGEP'],
                                 objective='regression',
                                 split_seed=seed_)

    # The whole pipeline is run with objective='regression' (preprocessing,
    # CV helper, metric calculation), so the booster must be a regressor too.
    # The previous "binary"/"auc" setting trained a classifier on the PINCP
    # target and contradicted the np.argmin selection below, which only makes
    # sense for an error metric (AUC would have to be maximised).
    params = {
        "objective": "regression",
        "metric": "l2",
        "min_data_in_leaf": 50,
        "learning_rate": 0.05,
        "feature_fraction": 0.9,
        "verbose": -1
    }

    cv_results = cv_early_stopping(params=params,
                                   nfolds=5,
                                   max_rounds=1000,
                                   early_stopping_rounds=20,
                                   X_train=ret_dict['X_train'],
                                   y_train=ret_dict['y_train'],
                                   categorical_feats=cat_features,
                                   objective='regression')

    # Entry with the smallest CV metric, and the iteration count recorded
    # at the same position.
    best_res = np.argmin(cv_results['metric'])
    best_iter = cv_results['iterations'][best_res]

    # Retrain on the whole training split; the timer covers the refit and
    # the prediction step.
    start_ours = time.time()

    data_train_all = lgb.Dataset(data=ret_dict['X_train'],
                                 label=ret_dict['y_train'],
                                 categorical_feature=cat_features)

    best_estimator = lgb.train(params=params,
                               train_set=data_train_all,
                               num_boost_round=best_iter)

    output_check = get_all_predictions(best_estimator,
                                       ['SEX_2', 'RAC1P_black'],
                                       ret_dict)

    end_ours = time.time()

    time_ours = end_ours - start_ours

    ret_metrics = calculate_metrics(output_check,
                                    ret_dict['y_test'],
                                    objective='regression')

    # Record this seed's outcome. Previously ret_metrics and time_ours were
    # computed and then dropped, so every pickle held only empty dicts.
    res_dict['ours'][seed_] = {
        'metrics': ret_metrics,
        'time': time_ours,
    }

    # Checkpoint after each seed: each file holds all results gathered so
    # far, so an interrupted run keeps the seeds that already finished.
    with open(f'data/results/output_regression_after_seed_{seed_}.pkl', 'wb') as con_:
        pickle.dump(res_dict, con_)

