# Definitions

In [None]:
# Mount Google Drive (Colab-only API); the dataset cells below read their CSV
# files from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install dataprep

from scipy.stats import spearmanr, kendalltau, gamma
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import resample
from sklearn.utils import compute_sample_weight
import pandas as pd
import numpy as np
from scipy.stats import skew, iqr, jarque_bera
from sklearn.preprocessing import QuantileTransformer
from sklearn import preprocessing
from dataprep.eda import plot, plot_correlation, create_report, plot_missing
import matplotlib.pyplot as plt

In [None]:

def bin_splitting(data, column_to_split, target_variable, skew_threshold=0.5, alpha=0.05):
    """Split ``data`` on a skewed column and hand both parts to ``fix_distribution``.

    The column is treated as skewed when the absolute sample skewness exceeds
    ``skew_threshold`` AND at least one of these holds:

    * quartile check: ``Q3 - Q1 > 2 * (Q2 - Q1)``;
    * the Jarque-Bera test rejects normality (p-value below ``alpha``).

    For a skewed column the values are cut into (up to) three quantile bins.
    Right-skewed data uses the last interior bin edge (rows at or above it are
    the "minority"); left-skewed data uses the first interior edge (rows at or
    below it are the "minority").

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is not modified.
    column_to_split : str
        Name of the numeric column whose distribution is inspected.
    target_variable : str
        Name of the target column, forwarded to ``fix_distribution``.
    skew_threshold : float, default 0.5
        Minimum absolute skewness for the column to be considered skewed.
    alpha : float, default 0.05
        Significance level for the Jarque-Bera normality test.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when the column is not considered skewed or cannot be
        cut into at least two quantile bins; otherwise whatever
        ``fix_distribution`` returns for the two groups.
    """
    print("---- Bin Splitting started ----")
    column = data[column_to_split]
    skewness = column.skew()

    # Quartile-based tail check on the column.
    quartiles = column.quantile([0.25, 0.5, 0.75])
    iqr_condition = quartiles[0.75] - quartiles[0.25] > 2 * (quartiles[0.5] - quartiles[0.25])

    # jarque_bera returns (statistic, p-value). Normality is rejected when the
    # p-value is below alpha; comparing the statistic with its own p-value (as
    # the previous version did) is not a valid test.
    jb_pvalue = jarque_bera(column)[1]
    jb_condition = jb_pvalue < alpha

    if not (abs(skewness) > skew_threshold and (iqr_condition or jb_condition)):
        return data

    # Transform the column into quantile bins; only the bin edges are needed.
    _, split_values = pd.qcut(column, q=3, retbins=True, labels=False, duplicates='drop')

    # With duplicates='drop' heavy ties can leave only [min, max]; there is then
    # no interior edge to cut on and one of the groups would be empty.
    if len(split_values) < 3:
        return data

    if skewness > 0:
        print(f"The {column_to_split} column is right-skewed")
        cutoff = split_values[-2]
        minority_data = data[column >= cutoff]
        majority_data = data[column < cutoff]
    else:
        # abs(skewness) exceeded the threshold, so skewness is negative here.
        print(f"The {column_to_split} column is left-skewed")
        cutoff = split_values[1]
        minority_data = data[column <= cutoff]
        majority_data = data[column > cutoff]

    return fix_distribution(minority_data, majority_data, target_variable, column_to_split)
    



def fix_distribution(minority_data, majority_data, target_variable, protected_attribute):
    """Quantile-normalise the minority group's features and recombine with the majority.

    A ``QuantileTransformer`` (normal output distribution, ``random_state=42``)
    is fitted on every column of ``minority_data`` except the target. The
    protected attribute and the target column are then restored to their
    original values, so only the remaining features are changed.

    Parameters
    ----------
    minority_data : pandas.DataFrame
        Rows of the minority group; must contain ``target_variable`` and
        ``protected_attribute``.
    majority_data : pandas.DataFrame
        Rows of the majority group. They are appended as-is (untransformed),
        which matches the output of the previous implementation.
    target_variable : str
        Name of the target column (excluded from the transformation).
    protected_attribute : str
        Name of the protected column (restored after the transformation).

    Returns
    -------
    pandas.DataFrame
        Normalised minority rows followed by the majority rows, with rows
        containing NaN removed. The original row labels are preserved.
    """
    print("---- Distribution Tool in process ----")
    print("--- Starting to handle skewness ---")

    transformer = QuantileTransformer(output_distribution='normal', random_state=42)

    minority_features = minority_data.drop(target_variable, axis=1)

    # Keep the original row labels. Without index= the new frame gets a fresh
    # RangeIndex, the column assignments below align on the wrong labels and
    # fill in NaN, and dropna() then silently discards those minority rows.
    minority_normalized = pd.DataFrame(
        transformer.fit_transform(minority_features),
        columns=minority_features.columns,
        index=minority_features.index,
    )

    # Put the untransformed protected attribute and target back.
    minority_normalized[protected_attribute] = minority_data[protected_attribute]
    minority_normalized[target_variable] = minority_data[target_variable]

    # Combine the normalised minority rows with the (raw) majority rows.
    # NOTE(review): the previous version also transformed the majority rows but
    # never used the result; confirm that leaving them raw is intended.
    balanced_data = pd.concat([minority_normalized, majority_data])
    return balanced_data.dropna()


In [None]:

# Seed NumPy's global random generator. The sklearn/pandas calls visible in this
# notebook pass their own random_state, so this presumably only matters for
# code that draws from np.random directly.
np.random.seed(123)

# the function will always return majority before minority
def get_majority_minority(data, column_to_split, target_variable, skew_threshold=0.5, alpha=0.05):
    """Split ``data`` into a majority and a minority group on one column.

    The column is treated as skewed when the absolute sample skewness exceeds
    ``skew_threshold`` AND either the quartile check
    ``Q3 - Q1 > 2 * (Q2 - Q1)`` holds or the Jarque-Bera test rejects normality
    at level ``alpha``. A skewed column is cut at a tertile edge; otherwise the
    data is split at the median.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is not modified.
    column_to_split : str
        Name of the numeric column to split on.
    target_variable : str
        Unused; kept so existing callers keep working.
    skew_threshold : float, default 0.5
        Minimum absolute skewness for the column to be considered skewed.
    alpha : float, default 0.05
        Significance level for the Jarque-Bera normality test.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(majority_data, minority_data)`` - always majority first.
        Right-skewed: minority is the rows at or above the last interior
        tertile edge. Left-skewed: minority is the rows at or below the first
        interior edge. Not skewed: rows ``>= median`` are returned first and
        rows ``< median`` second.
    """
    column = data[column_to_split]
    skewness = column.skew()

    # Quartile-based tail check on the column.
    quartiles = column.quantile([0.25, 0.5, 0.75])
    iqr_condition = quartiles[0.75] - quartiles[0.25] > 2 * (quartiles[0.5] - quartiles[0.25])

    # jarque_bera returns (statistic, p-value). Normality is rejected when the
    # p-value is below alpha; comparing the statistic with its own p-value (as
    # the previous version did) is not a valid test.
    jb_pvalue = jarque_bera(column)[1]
    jb_condition = jb_pvalue < alpha

    if abs(skewness) > skew_threshold and (iqr_condition or jb_condition):
        # Transform the column into quantile bins; only the bin edges are needed.
        _, split_values = pd.qcut(column, q=3, retbins=True, labels=False, duplicates='drop')

        if skewness > 0:
            print(f"The {column_to_split} column is right-skewed")
            cutoff = split_values[-2]
            return data[column < cutoff], data[column >= cutoff]

        if skewness < 0:
            print(f"The {column_to_split} column is left-skewed")
            cutoff = split_values[1]
            return data[column > cutoff], data[column <= cutoff]

    # Not skewed: fall back to a median split (computed once).
    median = np.median(column)
    return data[column >= median], data[column < median]
        



def compute_fairness_matrics(data, protected_variable, target_variable):
  """Train a logistic regression and report a separation gap plus accuracy.

  ``data`` is divided into a majority and a minority group with
  ``get_majority_minority``. Each group is split 80/20 (``random_state=42``),
  one classifier is fitted on the union of both training parts (protected and
  target columns excluded from the features), and it is evaluated on the two
  test parts.

  Returns a tuple ``(gap, accuracy)``: ``gap`` is the absolute difference
  between the groups in the share of actual positives (label ``1``) whose
  predicted probability is at least 0.5, and ``accuracy`` is the accuracy over
  both test parts combined.
  """
  group_major, group_minor = get_majority_minority(data, protected_variable, target_variable)

  def _split(group):
    # Features exclude both the protected attribute and the target.
    features = group.drop(columns=[protected_variable, target_variable])
    return train_test_split(features, group[target_variable], group[protected_variable],
                            test_size=0.2, random_state=42)

  X_fit_major, X_eval_major, y_fit_major, y_eval_major, _, _ = _split(group_major)
  X_fit_minor, X_eval_minor, y_fit_minor, y_eval_minor, _, _ = _split(group_minor)

  # Fit a single logistic regression on both groups' training rows.
  model = LogisticRegression(random_state=123, max_iter=1200)
  model.fit(pd.concat([X_fit_major, X_fit_minor]), pd.concat([y_fit_major, y_fit_minor]))

  # Probability cut-off used to call a prediction positive.
  cutoff = 0.5

  def _positive_rate(features, labels):
    # Method: Separation - among actual positives, the share predicted positive.
    predicted_positive = model.predict_proba(features)[:, 1] >= cutoff
    return np.sum(predicted_positive * (labels == 1)) / np.sum((labels == 1))

  rate_minor = _positive_rate(X_eval_minor, y_eval_minor)
  rate_major = _positive_rate(X_eval_major, y_eval_major)

  overall_accuracy = accuracy_score(
      pd.concat([y_eval_major, y_eval_minor]),
      model.predict(pd.concat([X_eval_major, X_eval_minor])),
  )

  return abs(rate_minor - rate_major), overall_accuracy
 

## Measuring Fail Utilities
These are utility functions that we used in order to check why our tool works well or badly.  

Checking for non-linearity, since it could cause our method to fail.
We chose Spearman's rank correlation test, as it was discussed in class for this purpose.

In [None]:
from scipy.stats import spearmanr, kendalltau, gamma


def spearman(data, protected_attribte, target_attribute):
    """Print Spearman's rank correlation between two columns of ``data``.

    Runs ``scipy.stats.spearmanr`` on ``data[protected_attribte]`` and
    ``data[target_attribute]`` and prints the coefficient and its p-value on
    separate lines. Nothing is returned.
    """
    coefficient, p_value = spearmanr(data[protected_attribte], data[target_attribute])

    report = (
        ("Spearman's correlation coefficient:", coefficient),
        ("p-value:", p_value),
    )
    for label, value in report:
        print(label, value)

## Baselines:

In [None]:
from sklearn.utils.class_weight import compute_class_weight


def resampling(data, protected_variable, target_variable):
    """Baseline: oversample the minority group to the size of the majority group.

    ``data`` is divided with ``get_majority_minority``; the minority rows are
    then drawn with replacement (``random_state=42``) until there are as many
    of them as majority rows.

    Returns
    -------
    pandas.DataFrame
        The majority rows followed by the resampled minority rows. The previous
        version returned only the resampled minority rows, which dropped the
        whole majority group from the baseline dataset.
    """
    majority_data, minority_data = get_majority_minority(data, protected_variable, target_variable)
    minority_data_resampled = resample(
        minority_data, replace=True, n_samples=len(majority_data), random_state=42
    )
    # Combine the two groups back into one dataset, as the reweighting baseline does.
    return pd.concat([majority_data, minority_data_resampled])




def reweighting(data, protected_variable, target_variable):
    """Baseline: resample the minority group by the "balanced" weight of class 1.

    ``data`` is divided with ``get_majority_minority``. The balanced class
    weights of the target column are computed over the whole of ``data`` for
    the labels 0 and 1, and the weight of label 1 is used as the sampling
    fraction (with replacement, ``random_state=42``) for the minority rows.

    The target column must be encoded as 0/1.

    Returns
    -------
    pandas.DataFrame
        The majority rows followed by the re-sampled minority rows.
    """
    majority_data, minority_data = get_majority_minority(data, protected_variable, target_variable)

    # Balanced weights for target labels 0 and 1. `classes` is passed as a NumPy
    # array: recent scikit-learn releases validate it and reject a plain list.
    class_weights = compute_class_weight("balanced", classes=np.array([0, 1]), y=data[target_variable])

    # NOTE(review): the weight of target class 1 is used as the sampling
    # fraction for the minority *group*; confirm this is the intended scheme.
    minority_fraction = class_weights[1]
    minority_data_weighted = minority_data.sample(frac=minority_fraction, replace=True, random_state=42)

    # Combine the two groups back into one dataset.
    data_weighted = pd.concat([majority_data, minority_data_weighted])

    return data_weighted



# Breast Cancer

In [None]:
# Breast Cancer dataset: load, then remove duplicate rows and rows with missing values.
data = (
    pd.read_csv('/content/drive/MyDrive/datasets/BreastCancer/Breast_Cancer.csv')
    .drop_duplicates()
    .dropna()
)

# Columns used by the Run cell as protected attribute and prediction target.
protected_attribute = 'Tumor Size'
target_variable = 'Status'

data.head(3)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive


In [None]:
from sklearn import preprocessing

# Replace every non-numeric column of `data` with integer codes.
# A single LabelEncoder is re-fitted per column, so after the loop `le` only
# holds the mapping of the last encoded column.
le = preprocessing.LabelEncoder()

# Columns whose dtype is not a NumPy number.
categorial_columns = data.select_dtypes(exclude=[np.number]).columns
for col in categorial_columns:
    data[col] = le.fit_transform(data[col])

data.head(3)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,2,1,0,0,0,1,3,1,4,1,1,24,1,60,0
1,50,2,1,1,1,2,0,2,1,35,1,1,14,5,62,0
2,58,2,0,2,2,4,0,2,1,63,1,1,14,7,75,0


# Diabetes

In [None]:
# Diabetes dataset: load, then remove duplicate rows and rows with missing values.
# NOTE(review): the file has an .xls extension but is parsed with read_csv -
# presumably it contains CSV text; confirm.
data = (
    pd.read_csv('/content/drive/MyDrive/datasets/Diabetes/diabetes.xls')
    .drop_duplicates()
    .dropna()
)

# Columns used by the Run cell as protected attribute and prediction target.
protected_attribute = 'Age'
target_variable = 'Outcome'

data.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


# Heart

In [None]:
# Heart (Cleveland) dataset: read into a pandas DataFrame, then remove duplicate
# rows and rows with missing values.
# NOTE(review): the file is read from the BreastCancer folder - confirm the path.
data = (
    pd.read_csv('/content/drive/MyDrive/datasets/BreastCancer/heart_cleveland_upload.csv')
    .drop_duplicates()
    .dropna()
)

# Columns used by the Run cell as protected attribute and prediction target.
protected_attribute = 'oldpeak'
target_variable = 'condition'

data.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0


# Adult

In [None]:
from sklearn import preprocessing

# Adult dataset: load, then remove duplicate rows and rows with missing values.
data = (
    pd.read_csv("/content/drive/MyDrive/datasets/Adult/adult.csv")
    .drop_duplicates()
    .dropna()
)

data.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K


In [None]:
# Adult dataset set-up: choose the protected/target columns and integer-encode
# every non-numeric column of `data`.
le = preprocessing.LabelEncoder()

protected_attribute = 'fnlwgt'
target_variable = 'income'


# Disabled leftover (it compares the target column with "Male"); kept for reference only.
# data[target_variable] = np.where(data[target_variable] == "Male", 1, 0)


# Columns whose dtype is not a NumPy number. The single LabelEncoder is
# re-fitted per column, so afterwards `le` only holds the last column's mapping.
categorial_columns = data.select_dtypes(exclude=[np.number]).columns
for col in categorial_columns:
    data[col] = le.fit_transform(data[col])

# Run

In [None]:
# Compare four treatments of whichever dataset is currently loaded in `data`.
# Each row of `results` receives the (separation gap, accuracy) tuple returned
# by compute_fairness_matrics.
results = pd.DataFrame(columns = ['separation diff', 'accuracy'],
        index = ['raw', 'QuantileTransformer', 'Resample', 'reweighting'])

#### Raw run ####
# NOTE(review): X, y and Z are reassigned in every stage but are not read
# anywhere in the visible notebook; compute_fairness_matrics builds its own splits.
X = data.drop(columns=[target_variable, protected_attribute])
y = data[target_variable]
Z = data[protected_attribute]

results.loc["raw"] = compute_fairness_matrics(data, protected_attribute, target_variable)

# Running spearman to evaluate the non-linearity of the protected attribute and the target attribute
spearman(data, protected_attribute, target_variable)

print("\n")

#### Bin Splitting - QuantileTransformer ####

# The proposed method: returns `data` unchanged when the protected attribute is
# not considered skewed.
fixed_data = bin_splitting(data, protected_attribute, target_variable)

X = fixed_data.drop(columns=[target_variable, protected_attribute])
y = fixed_data[target_variable]
Z = fixed_data[protected_attribute]


results.loc["QuantileTransformer"] = compute_fairness_matrics(fixed_data, protected_attribute, target_variable)

print("\n")

#### Resample run ####
# Baseline 1: always starts again from the untouched `data`.
fixed_data = resampling(data, protected_attribute, target_variable)

X = fixed_data.drop(columns=[target_variable, protected_attribute])
y = fixed_data[target_variable]
Z = fixed_data[protected_attribute]

results.loc["Resample"] = compute_fairness_matrics(fixed_data, protected_attribute, target_variable)

print("\n")

#### Reweight run ####
# Baseline 2: always starts again from the untouched `data`.
fixed_data = reweighting(data, protected_attribute, target_variable)

X = fixed_data.drop(columns=[target_variable, protected_attribute])
y = fixed_data[target_variable]
Z = fixed_data[protected_attribute]


results.loc["reweighting"] = compute_fairness_matrics(fixed_data, protected_attribute, target_variable)

print("\n")

# Last expression of the cell: display the comparison table.
results

[0.  0.1 1.4 6.2]
The oldpeak column is right-skewed
Spearman's correlation coefficient: 0.41103152452191627
p-value: 1.552780442721676e-13


---- Bin Splitting started ----
The oldpeak column is right-skewed
---- Distribution Tool in process ----
--- Starting to handle skewness ---
[0.  0.8 4.2]
The oldpeak column is right-skewed


[0.  0.1 1.4 6.2]
The oldpeak column is right-skewed
[1.4 1.8 2.6 6.2]
The oldpeak column is right-skewed






[0.  0.1 1.4 6.2]
The oldpeak column is right-skewed
[0.         0.16666667 1.4        5.6       ]
The oldpeak column is right-skewed




Unnamed: 0,separation diff,accuracy
raw,0.160784,0.816667
QuantileTransformer,0.555556,0.711111
Resample,0.1,0.820513
reweighting,0.294118,0.854839
