In [1]:
import pandas as pd
from aif360.algorithms.preprocessing import LFR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.datasets import BinaryLabelDataset

pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [14]:
filepath = "datasets/processed_datasets/compass.csv"
sensitive_attribute = "race"
target_column = "two_year_recid"


df = pd.read_csv(filepath, header=0,skipinitialspace=True)
target_name = "predicted_" + target_column
df[sensitive_attribute] = LabelEncoder().fit_transform(df[sensitive_attribute])



In [19]:
from sklearn.utils import resample

y = df[target_column]
X = df.drop(columns=[target_column])

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Combine X_train and y_train for balancing
train_data = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

# Check class distribution before balancing
print("Class distribution before balancing:")
print(train_data[target_column].value_counts())

# Separate majority and minority classes
majority_class = train_data[train_data[target_column] == 1]
minority_class = train_data[train_data[target_column] == 0]

# Upsample the minority class to balance the dataset
if len(minority_class) < len(majority_class):
    minority_class_upsampled = resample(minority_class,
                                        replace=True,
                                        n_samples=len(majority_class),
                                        random_state=42)
    # Combine majority and upsampled minority class
    balanced_train_data = pd.concat([majority_class, minority_class_upsampled])
else:
    balanced_train_data = train_data

# Shuffle the balanced dataset
balanced_train_data = balanced_train_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split balanced data into features and target
X_train_balanced = balanced_train_data.drop(columns=[target_column])
y_train_balanced = balanced_train_data[target_column]

# Display class distribution after balancing
print("Class distribution after balancing:")
print(y_train_balanced.value_counts())

# Prepare the BinaryLabelDataset for training and testing
train_dataset = BinaryLabelDataset(df=pd.concat([X_train_balanced.reset_index(drop=True), y_train_balanced.reset_index(drop=True)], axis=1),
                                   label_names=[target_column],
                                   protected_attribute_names=[sensitive_attribute])

test_dataset = BinaryLabelDataset(df=pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1),
                                  label_names=[target_column],
                                  protected_attribute_names=[sensitive_attribute])

print("Train and test datasets created successfully.")

Class distribution before balancing:
two_year_recid
1    2756
0    2293
Name: count, dtype: int64
Class distribution after balancing:
two_year_recid
0    2756
1    2756
Name: count, dtype: int64
Train and test datasets created successfully.


In [24]:
from collections import Counter

y = df[target_column]
X = df.drop(columns=[target_column])

# Step 2: Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Combine X_train and y_train for balancing
train_data = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

# Step 4: Check class distribution before balancing
print("Class distribution before balancing:", Counter(train_data[target_column]))

# Step 5: Balancing the dataset with upsampling
majority_class = train_data[train_data[target_column] == 0]
minority_class = train_data[train_data[target_column] == 1]

if len(minority_class) < len(majority_class):
    # Upsample minority class
    minority_class_upsampled = resample(minority_class, 
                                        replace=True,
                                        n_samples=len(majority_class), 
                                        random_state=42)
    # Combine majority class with upsampled minority class
    balanced_train_data = pd.concat([majority_class, minority_class_upsampled])
else:
    balanced_train_data = train_data

# Shuffle the balanced data
balanced_train_data = balanced_train_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 6: Split balanced data into features and target
X_train_balanced = balanced_train_data.drop(columns=[target_column])
y_train_balanced = balanced_train_data[target_column]

# Step 7: Create BinaryLabelDataset for balanced training and test data
train_dataset = BinaryLabelDataset(df=pd.concat([X_train_balanced.reset_index(drop=True), y_train_balanced.reset_index(drop=True)], axis=1),
                                   label_names=[target_column],
                                   protected_attribute_names=[sensitive_attribute])

test_dataset = BinaryLabelDataset(df=pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1),
                                  label_names=[target_column],
                                  protected_attribute_names=[sensitive_attribute])

# Step 8: Verify label distribution after balancing
print("Balanced Train Dataset Distribution:", Counter(train_dataset.labels.ravel()))
print("Test Dataset Distribution:", Counter(test_dataset.labels.ravel()))

# Step 9: Apply LFR Transformation
from aif360.algorithms.preprocessing import LFR

lfr = LFR(unprivileged_groups=[{sensitive_attribute: 0}],
          privileged_groups=[{sensitive_attribute: 1}],
          verbose=0,
          seed=4048)

# Fit LFR only if dataset has at least two classes
if len(set(train_dataset.labels.ravel())) > 1:
    lfr.fit(train_dataset)
    train_dataset_transformed = lfr.transform(train_dataset)
    test_dataset_transformed = lfr.transform(test_dataset)

    # Extract transformed features and labels
    X_train_transformed = train_dataset_transformed.features
    y_train_transformed = train_dataset_transformed.labels.ravel()

    X_test_transformed = test_dataset_transformed.features
    y_test_transformed = test_dataset_transformed.labels.ravel()

    # Verify label distribution post-transformation
    print("Label Distribution After LFR Transformation (Training):", Counter(y_train_transformed))
    print("Label Distribution After LFR Transformation (Testing):", Counter(y_test_transformed))

Class distribution before balancing: Counter({1: 2756, 0: 2293})
Balanced Train Dataset Distribution: Counter({np.float64(1.0): 2756, np.float64(0.0): 2293})
Test Dataset Distribution: Counter({np.float64(1.0): 1207, np.float64(0.0): 958})
Label Distribution After LFR Transformation (Training): Counter({np.float64(1.0): 5049})
Label Distribution After LFR Transformation (Testing): Counter({np.float64(1.0): 2165})


In [22]:
print(len(set(y_train_transformed)))
model = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', random_state=1))
model.fit(X_train_transformed, y_train_transformed)
y_pred = model.predict(X_test_transformed)


1


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.float64(1.0)

In [12]:
y_test_transformed = y_test.to_numpy()

accuracy = accuracy_score(y_test_transformed, y_pred)
precision = precision_score(y_test_transformed, y_pred)
recall = recall_score(y_test_transformed, y_pred)
f1 = f1_score(y_test_transformed, y_pred)

X_test_copy = X_test.copy() 
X_test_copy[target_column] = y_test_transformed

dataset_true = BinaryLabelDataset(df=X_test_copy,
                                  label_names=[target_column],
                                  protected_attribute_names=[sensitive_attribute])

X_test_pred = X_test.copy()
X_test_pred[target_column] = y_pred

dataset_predicted = BinaryLabelDataset(df=X_test_pred,
                                       label_names=[target_column],
                                       protected_attribute_names=[sensitive_attribute])

assert dataset_true.labels.shape == dataset_predicted.labels.shape, \
    f"Shape mismatch: True labels {dataset_true.labels.shape}, Predicted labels {dataset_predicted.labels.shape}"

metric = BinaryLabelDatasetMetric(dataset_true,
                                  privileged_groups=[{sensitive_attribute: 1}],
                                  unprivileged_groups=[{sensitive_attribute: 0}])

classification_metric = ClassificationMetric(dataset_true, dataset_predicted,
                                             privileged_groups=[{sensitive_attribute: 1}],
                                             unprivileged_groups=[{sensitive_attribute: 0}])

disparate_impact = metric.disparate_impact()
statistical_parity_diff = metric.statistical_parity_difference()

ppv_privileged = classification_metric.positive_predictive_value(privileged=True)
ppv_unprivileged = classification_metric.positive_predictive_value(privileged=False)
ppv_parity = abs(ppv_privileged - ppv_unprivileged)

fpr_privileged = classification_metric.false_positive_rate(privileged=True)
fpr_unprivileged = classification_metric.false_positive_rate(privileged=False)
fpr_parity = abs(fpr_privileged - fpr_unprivileged)

results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1,
    "Disparate Impact": disparate_impact,
    "Statistical Parity Difference": statistical_parity_diff,
    "PPV Parity": ppv_parity,
    "FPR Parity": fpr_parity
}

print(results)


{'Accuracy': 0.7548900430986849, 'Precision': np.float64(0.5747663551401869), 'Recall': np.float64(0.10780017528483786), 'F1 Score': np.float64(0.181549815498155), 'Disparate Impact': np.float64(0.3589947942816373), 'Statistical Parity Difference': np.float64(-0.2036429050424906), 'PPV Parity': np.float64(0.314696106362773), 'FPR Parity': np.float64(0.00022729688361621675)}
