## Function

In [34]:
import pandas as pd
from sklearn.base import BaseEstimator #, ClassifierMixin
import numpy as np 

class DiscriminationFreeEstimator(BaseEstimator):
    def __init__(self, estimator, sensitive_variable='sex'):
        self.estimator = estimator
        self.sensitive_variable = sensitive_variable
        self.is_fitted = False

    def fit(self, X, y):
        # in case of multiple protected variables 
        self.sensitive_index = list(X.columns).index(self.sensitive_variable)
        # Fit the underlying estimator
        self.estimator.fit(X, y)
        self.is_fitted = True
        return self

    def get_group_weights(self, X):
        # Extract unique values of the protected variable
        sensitive_values = X.iloc[:, self.sensitive_index]
        unique_groups = sensitive_values.unique()
        group_weights = {}

        for group in unique_groups:
            # Calculate the weight for the each group from protected variable 
            group_weight = np.mean(sensitive_values == group)
            group_weights[group] = group_weight
        return group_weights
    
    # return a table of best_estimates for each group and discrimination-free price
    def get_predictions(self, X):
        predictions = pd.DataFrame(index=X.index)
        group_weights = self.get_group_weights(X)
        # Calculate the discrimination-free prediction by weighting the group predictions
        discrimination_free_prediction = pd.Series(0, index=predictions.index)

        #Calculate the best estimate
        for group, weight in group_weights.items():
            # Create a copy of X and set the sensitive variable to the current group value
            X_copy = X.copy()
            X_copy[self.sensitive_variable] = group
            # Predict for the modified dataset
            predictions[group] = self.estimator.predict(X_copy)
            # Add weighted values of each group one by one to discrimination_free_prediction
            discrimination_free_prediction += predictions[group] * weight

        # Add the discrimination-free prediction to the DataFrame
        predictions['discrimination_free'] = discrimination_free_prediction
        return predictions

    def predict(self, X):
        # Ensure the instance is fitted before predicting
        if not self.is_fitted:
            raise NotFittedError("This DiscriminationFreeEstimator instance is not fitted yet.")
        
        # Get the predictions including the discrimination-free prediction
        predictions = self.get_predictions(X)
        return predictions['discrimination_free']
    

## Data 

In [3]:
import pandas as pd
data = pd.read_csv('Medical_insurance.csv')
data['sex'] = [0 if i == 'male' else 1 for i in data.sex]
data['smoker'] = [0 if i == 'no' else 1 for i in data.smoker]

In [4]:
# Create a mapping dictionary
region_mapping = {
    'northwest': 1,
    'northeast': 2,
    'southwest': 3,
    'southeast': 4
}

# Use the map function to apply the mapping
data['region'] = data['region'].map(region_mapping).fillna(0).astype(int)

In [5]:
from sklearn.model_selection import train_test_split
X =data.drop(['charges'], axis = 1)
y = data[['charges']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [6]:
from sklearn.tree import DecisionTreeRegressor
model = DiscriminationFreeEstimator(DecisionTreeRegressor(random_state = 1))
model.fit(X_train,y_train)
model.predict(X_test)

1106     9136.578240
1321    28101.333050
2313    12032.326000
2274     1927.109626
1432     3393.356350
            ...     
1948     2836.444222
2268     2585.850650
1922     5972.378000
361      4998.390707
1748    13796.677224
Name: discrimination_free, Length: 693, dtype: float64

In [36]:
model.get_predictions(X_test)

Unnamed: 0,1,0,discrimination_free
1106,8988.15875,9288.02670,9136.578240
1321,28101.33305,28101.33305,28101.333050
2313,12032.32600,12032.32600,12032.326000
2274,2166.73200,1682.59700,1927.109626
1432,3393.35635,3393.35635,3393.356350
...,...,...,...
1948,3172.01800,2494.02200,2836.444222
2268,2585.85065,2585.85065,2585.850650
1922,5972.37800,5972.37800,5972.378000
361,5240.76500,4751.07000,4998.390707


In [12]:
normal_model = DecisionTreeRegressor(random_state=1)
normal_model.fit(X_train, y_train)
normal_model.predict(X_test)

array([ 8988.15875, 28101.33305, 12032.326  ,  1682.597  ,  3393.35635,
       24106.91255,  5002.7827 , 47269.854  ,  8068.185  ,  2639.0429 ,
       11299.343  , 10594.2257 ,  2498.4144 , 20878.78443,  4340.4409 ,
        6799.458  , 30284.64294,  9861.025  ,  3761.292  , 24059.68019,
       10370.91255, 58571.07448, 19594.80965, 49577.6624 , 33907.548  ,
        8547.6913 ,  1621.3402 , 14210.53595, 24603.04837,  1121.8739 ,
       10579.711  ,  4719.52405,  1639.5631 ,  1725.5523 , 36910.60803,
        8457.818  ,  1664.9996 ,  2395.17155,  6185.3208 , 21259.37795,
        3490.5491 ,  7358.17565,  9704.66805,  3597.596  ,  2117.33885,
       47269.854  ,  3554.203  , 41919.097  ,  2103.08   ,  6184.2994 ,
       43896.3763 ,  9301.89355,  2801.2588 ,  4753.6368 ,  3987.926  ,
       12333.828  ,  1909.52745,  2221.56445,  6414.178  ,  8428.0693 ,
        3213.62205, 16586.49771, 25517.11363,  1149.3959 , 11015.1747 ,
       12479.70895,  4687.797  ,  3732.6251 ,  6389.37785,  5002

In [15]:
from sklearn.neural_network import MLPRegressor
normal_model = MLPRegressor(random_state=1, max_iter=500)
normal_model.fit(X_train, y_train)
normal_model.predict(X_test)

  y = column_or_1d(y, warn=True)


array([14445.64874125, 17708.84806501, 14970.24457834, 11162.5694089 ,
       10548.80267462, 15441.29589137, 11992.08597502, 19079.26866567,
       15603.42121864,  9394.72585325, 16553.73028544, 17109.7408287 ,
       11316.0394999 , 14737.9922585 , 10778.43464516, 12538.12714512,
       14695.00373815, 14725.17470568, 11046.36575027,  9303.37728337,
       15742.25415884, 13628.89790767, 13263.10737886, 19639.35253049,
       10184.03552016, 14651.00853895,  6739.04487843, 18859.06104745,
       13916.86890989,  7856.10623255, 17488.75169275,  9866.42090314,
        8872.75151469,  9272.75894344, 18160.23738441, 16135.37600994,
        9228.749329  ,  8791.69921294, 13407.86509069, 14072.80299763,
       10983.53091901, 13593.58383555, 16349.39306095, 11887.42954548,
        7375.79859163, 19587.18970972, 10623.04856743, 16198.23992842,
       10964.8576027 , 12705.44645962, 16709.20291543, 14281.67424901,
        9736.87129406, 13872.70353767, 11512.44389861, 16748.26126746,
      

In [16]:
model = DiscriminationFreeEstimator(MLPRegressor(random_state=1, max_iter=500))
model.fit(X_train,y_train)
model.predict(X_test)

  y = column_or_1d(y, warn=True)


1106    14536.082611
1321    17616.568606
2313    15060.677899
2274    11070.289950
1432    10639.236544
            ...     
1948    10582.809794
2268     7921.955894
1922    14042.721537
361     12453.170556
1748     8525.418655
Name: discrimination_free, Length: 693, dtype: float64