In [19]:
import pandas as pd
import numpy as np

# ==============================================================================
# 1. DATA LOADING AND PREPARATION
# ==============================================================================
print("--- 1. Loading and Preparing Data ---")

# Load the dataset. Only the read itself sits inside the `try`, so a
# FileNotFoundError from anywhere else cannot be mistaken for a missing CSV.
try:
    df = pd.read_csv('state_GA_reduced_encoded.csv')
except FileNotFoundError:
    print("Error: 'state_GA_reduced_encoded.csv' not found.")
    print("Please make sure the CSV file is in the same directory as this notebook.")
    # `exit()` is the interactive-shell helper installed by the `site` module
    # and is not meant for use in programs. Raising SystemExit directly stops
    # execution here and reports a non-zero status, so the statements below
    # never run without `df`.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")
    print(f"Original dataset shape: {df.shape}")

# Convert 'interest_rate' to a numeric type; values that cannot be parsed
# become NaN (errors='coerce') and those rows are then dropped.
# NOTE(review): this drop applies to the whole frame, so every later analysis
# (including the 'action_taken' metrics) only sees rows that have a numeric
# interest rate. If applications without a rate are systematically a particular
# outcome, the 'action_taken' rates are computed on a filtered population --
# confirm this is intended.
df['interest_rate'] = pd.to_numeric(df['interest_rate'], errors='coerce')
df.dropna(subset=['interest_rate'], inplace=True)

# Create the binary 'favorable_interest_rate' outcome variable:
# 1 when the rate is strictly below the threshold, 0 otherwise.
FAVORABLE_INTEREST_RATE_THRESHOLD = 7.5
df['favorable_interest_rate'] = np.where(df['interest_rate'] < FAVORABLE_INTEREST_RATE_THRESHOLD, 1, 0)

print("\nDependent variables prepared.")

# --- Protected Class & Group Definition ---
# Each analysis frame keeps only the rows that belong to one of the two
# groups being compared. Group definitions are single-entry dicts of
# {column name: group value}, the shape the helper functions expect.
# NOTE(review): 'action_taken' is used downstream as a 0/1 outcome; presumably
# the CSV is already encoded that way -- verify against the dataset.

# Protected Class 1: Sex
sex_df = df[df['applicant_sex'].isin(['Male', 'Female'])].copy()
privileged_sex_group = {'applicant_sex': 'Male'}
unprivileged_sex_group = {'applicant_sex': 'Female'}
print(f"\nFiltered for Sex analysis. New shape: {sex_df.shape}")

# Protected Class 2: Race
race_df = df[df['derived_race_new'].isin(['White', 'Black or African American'])].copy()
privileged_race_group = {'derived_race_new': 'White'}
unprivileged_race_group = {'derived_race_new': 'Black or African American'}
print(f"Filtered for Race analysis. New shape: {race_df.shape}")


# ==============================================================================
# 2. HELPER FUNCTIONS (MANUAL IMPLEMENTATION - NO AIF360)
# ==============================================================================

def compute_manual_fairness_metrics(df, protected_attribute, dependent_variable, privileged_group, unprivileged_group, weights_col=None):
    """
    Computes Statistical Parity Difference and Disparate Impact manually.

    Args:
        df: DataFrame holding the group column(s), the outcome column and,
            when used, the weight column.
        protected_attribute: Unused; kept so existing call sites keep working.
            The group column is taken from the group dicts instead.
        dependent_variable: Name of the outcome column. Its (weighted) mean
            is used as the favorable-outcome rate, so it is expected to be
            coded 1 = favorable, 0 = unfavorable.
        privileged_group: Single-entry dict {column: value} selecting the
            privileged rows. Only the first entry is used.
        unprivileged_group: Single-entry dict {column: value} selecting the
            unprivileged rows. Only the first entry is used.
        weights_col: Optional name of a sample-weight column. When given,
            rates are weight-averaged; otherwise a plain mean is used.

    Returns:
        dict with two entries:
            'Statistical Parity Difference': rate_unpriv - rate_priv
            'Disparate Impact': rate_unpriv / rate_priv
        When the privileged rate is exactly 0 the ratio is undefined and is
        reported as inf (unprivileged rate non-zero) or NaN (both rates 0).
        An empty group produces NaN for both metrics.
    """
    # Isolate the privileged and unprivileged groups from the dataframe
    priv_col, priv_val = list(privileged_group.items())[0]
    unpriv_col, unpriv_val = list(unprivileged_group.items())[0]
    df_priv = df[df[priv_col] == priv_val]
    df_unpriv = df[df[unpriv_col] == unpriv_val]

    def favorable_rate(group):
        """Return the (optionally weighted) mean of the outcome column."""
        outcome = group[dependent_variable]
        if weights_col:
            weights = group[weights_col]
            return (outcome * weights).sum() / weights.sum()
        return outcome.mean()

    rate_priv = favorable_rate(df_priv)
    rate_unpriv = favorable_rate(df_unpriv)

    # Calculate metrics
    spd = rate_unpriv - rate_priv

    # Use the exact ratio. Padding the denominator with an epsilon biases
    # every result (perfect parity would not report exactly 1.0) and turns an
    # undefined ratio into an arbitrary large number, so the zero-denominator
    # case is handled explicitly instead.
    if rate_priv == 0:
        di = float('nan') if rate_unpriv == 0 else float('inf')
    else:
        di = rate_unpriv / rate_priv

    return {'Statistical Parity Difference': spd, 'Disparate Impact': di}

def apply_reweighting(df, protected_attribute, dependent_variable, privileged_group, unprivileged_group):
    """
    Manual implementation of the Reweighting pre-processing step.

    Every row that falls in one of the four (group, outcome) cells receives
    the weight P(group) * P(outcome) / P(group, outcome), with all
    probabilities taken as counts divided by the total number of rows in
    `df`. Rows outside those four cells, and cells that contain no rows,
    keep the neutral weight 1.0.

    Args:
        df: Input DataFrame; it is copied, never modified.
        protected_attribute: Not used by the computation; the group column
            comes from the group dicts.
        dependent_variable: Outcome column, compared against 1 (favorable)
            and 0 (unfavorable).
        privileged_group: Dict {column: value}; only its first entry is used.
        unprivileged_group: Dict {column: value}; only its first entry is used.

    Returns:
        A copy of `df` with an added (or overwritten) 'sample_weight' column.
    """
    weighted = df.copy()
    total_rows = len(weighted)

    # First (column, value) pair of each group definition
    priv_col, priv_level = tuple(privileged_group.items())[0]
    unpriv_col, unpriv_level = tuple(unprivileged_group.items())[0]

    # Row masks for group membership and for each outcome
    in_priv = weighted[priv_col] == priv_level
    in_unpriv = weighted[unpriv_col] == unpriv_level
    is_favorable = weighted[dependent_variable] == 1
    is_unfavorable = weighted[dependent_variable] == 0

    # The four (group, outcome) cells, in the order their weights are written
    cells = (
        (in_priv, is_favorable),
        (in_priv, is_unfavorable),
        (in_unpriv, is_favorable),
        (in_unpriv, is_unfavorable),
    )

    assignments = []
    for group_mask, outcome_mask in cells:
        cell_mask = group_mask & outcome_mask
        p_cell = cell_mask.sum() / total_rows
        p_group = group_mask.sum() / total_rows
        p_outcome = outcome_mask.sum() / total_rows
        if p_cell > 0:
            cell_weight = (p_group * p_outcome) / p_cell
        else:
            # Empty cell: fall back to the neutral weight instead of dividing by zero
            cell_weight = 1.0
        assignments.append((cell_mask, cell_weight))

    # Start from the neutral weight, then overwrite cell by cell
    weighted['sample_weight'] = 1.0
    for cell_mask, cell_weight in assignments:
        weighted.loc[cell_mask, 'sample_weight'] = cell_weight

    return weighted

# ==============================================================================
# 3. STEP 3.1 & 3.2: CALCULATE FAIRNESS METRICS ON ORIGINAL DATASET
# ==============================================================================
print("\n--- 3. Calculating Fairness Metrics on Original Data ---")

# One entry per (protected class, outcome) pair; rows of the resulting table
# appear in the order the entries are listed here (Sex first, then Race).
results = {
    'Sex vs. Action Taken (Original)': compute_manual_fairness_metrics(
        df=sex_df,
        protected_attribute='applicant_sex',
        dependent_variable='action_taken',
        privileged_group=privileged_sex_group,
        unprivileged_group=unprivileged_sex_group,
    ),
    'Sex vs. Favorable Interest Rate (Original)': compute_manual_fairness_metrics(
        df=sex_df,
        protected_attribute='applicant_sex',
        dependent_variable='favorable_interest_rate',
        privileged_group=privileged_sex_group,
        unprivileged_group=unprivileged_sex_group,
    ),
    'Race vs. Action Taken (Original)': compute_manual_fairness_metrics(
        df=race_df,
        protected_attribute='derived_race_new',
        dependent_variable='action_taken',
        privileged_group=privileged_race_group,
        unprivileged_group=unprivileged_race_group,
    ),
    'Race vs. Favorable Interest Rate (Original)': compute_manual_fairness_metrics(
        df=race_df,
        protected_attribute='derived_race_new',
        dependent_variable='favorable_interest_rate',
        privileged_group=privileged_race_group,
        unprivileged_group=unprivileged_race_group,
    ),
}

original_metrics_df = pd.DataFrame.from_dict(results, orient='index')
print("\n--- Fairness Metrics on Original Dataset ---")
print(original_metrics_df)

# ==============================================================================
# 4. STEP 3.3: APPLY PRE-PROCESSING BIAS MITIGATION (MANUAL REWEIGHTING)
# ==============================================================================
print("\n--- 4. Applying Pre-processing Bias Mitigation (Reweighting) ---")

# Mitigation is applied with respect to a single dependent variable,
# 'action_taken'. The weights are therefore derived from that outcome only,
# and the same weights are reused below for both outcomes.

sex_df_transformed = apply_reweighting(
    df=sex_df,
    protected_attribute='applicant_sex',
    dependent_variable='action_taken',
    privileged_group=privileged_sex_group,
    unprivileged_group=unprivileged_sex_group,
)
print("\nApplied Reweighting for Sex on 'action_taken'.")

race_df_transformed = apply_reweighting(
    df=race_df,
    protected_attribute='derived_race_new',
    dependent_variable='action_taken',
    privileged_group=privileged_race_group,
    unprivileged_group=unprivileged_race_group,
)
print("Applied Reweighting for Race on 'action_taken'.")

# ==============================================================================
# 5. STEP 3.4: CALCULATE FAIRNESS METRICS ON TRANSFORMED DATASET
# ==============================================================================
print("\n--- 5. Calculating Fairness Metrics on Transformed Data ---")

# Same four metric computations as on the original data, now on the
# reweighted frames and using their 'sample_weight' column.
transformed_results = {
    'Sex vs. Action Taken (Transformed)': compute_manual_fairness_metrics(
        df=sex_df_transformed,
        protected_attribute='applicant_sex',
        dependent_variable='action_taken',
        privileged_group=privileged_sex_group,
        unprivileged_group=unprivileged_sex_group,
        weights_col='sample_weight',
    ),
    'Sex vs. Favorable Interest Rate (Transformed)': compute_manual_fairness_metrics(
        df=sex_df_transformed,
        protected_attribute='applicant_sex',
        dependent_variable='favorable_interest_rate',
        privileged_group=privileged_sex_group,
        unprivileged_group=unprivileged_sex_group,
        weights_col='sample_weight',
    ),
    'Race vs. Action Taken (Transformed)': compute_manual_fairness_metrics(
        df=race_df_transformed,
        protected_attribute='derived_race_new',
        dependent_variable='action_taken',
        privileged_group=privileged_race_group,
        unprivileged_group=unprivileged_race_group,
        weights_col='sample_weight',
    ),
    'Race vs. Favorable Interest Rate (Transformed)': compute_manual_fairness_metrics(
        df=race_df_transformed,
        protected_attribute='derived_race_new',
        dependent_variable='favorable_interest_rate',
        privileged_group=privileged_race_group,
        unprivileged_group=unprivileged_race_group,
        weights_col='sample_weight',
    ),
}

transformed_metrics_df = pd.DataFrame.from_dict(transformed_results, orient='index')
print("\n--- Fairness Metrics on Transformed (Reweighted) Dataset ---")
print(transformed_metrics_df)


# ==============================================================================
# 6. ANSWERS TO STEP 3 QUESTIONS
# ==============================================================================
# Prints the written answers for the project report. The heading number is
# kept in step with this section's number (6) so the printed section headings
# stay consistent with the section headers in this file.
print("\n\n--- 6. Summary for Project Report (Step 3) ---")
print("""
1.  **Privileged and Unprivileged Groups:**
    * **Protected Class 'Sex':** We used the `applicant_sex` column.
        * Privileged Group: 'Male'
        * Unprivileged Group: 'Female'
    * **Protected Class 'Race':** We used the `derived_race_new` column.
        * Privileged Group: 'White'
        * Unprivileged Group: 'Black or African American'

2.  **Fairness Metric Selection:**
    * **Metric 1: Statistical Parity Difference (SPD):** This metric calculates the difference in the rate of favorable outcomes between unprivileged and privileged groups. A value of 0 indicates perfect fairness.
    * **Metric 2: Disparate Impact (DI):** This metric is a ratio of the rate of favorable outcomes. A value of 1.0 indicates perfect fairness. Values below 1.0 indicate bias.

3.  **Pre-processing Bias Mitigation Algorithm:**
    * **Algorithm:** We implemented the `Reweighting` algorithm manually using pandas.
    * **Justification:** Due to persistent library installation issues with `aif360`, we chose to implement a standard pre-processing algorithm from scratch, as permitted by the project instructions. Reweighting works by assigning weights to data samples to balance the dataset, ensuring that the joint distribution of the outcome and the protected attribute are similar across groups. This directly mitigates bias in the dataset before a model is trained.

4.  **Results:**
    * The two tables printed above ('Fairness Metrics on Original Dataset' and 'Fairness Metrics on Transformed (Reweighted) Dataset') contain the 8 required metric values for each stage.
    * **Interpretation:** By comparing the two tables, you can analyze the effect of our manual `Reweighting` algorithm. For the `Race vs. Action Taken` and `Sex vs. Action Taken` rows, you should see the SPD move to 0.0 and the DI move to 1.0 in the transformed table, indicating that the mitigation was mathematically successful for that specific outcome. You can also observe how mitigating for `action_taken` incidentally affected the fairness metrics for `favorable_interest_rate`.
""")


--- 1. Loading and Preparing Data ---
Dataset loaded successfully.
Original dataset shape: (109250, 26)

Dependent variables prepared.

Filtered for Sex analysis. New shape: (66740, 27)
Filtered for Race analysis. New shape: (60811, 27)

--- 3. Calculating Fairness Metrics on Original Data ---

--- Fairness Metrics on Original Dataset ---
                                             Statistical Parity Difference  \
Sex vs. Action Taken (Original)                                   0.004767   
Sex vs. Favorable Interest Rate (Original)                        0.011061   
Race vs. Action Taken (Original)                                  0.024952   
Race vs. Favorable Interest Rate (Original)                       0.048079   

                                             Disparate Impact  
Sex vs. Action Taken (Original)                      1.004526  
Sex vs. Favorable Interest Rate (Original)           1.015075  
Race vs. Action Taken (Original)                     1.023766  
Race vs. Fav