### Bias & Fairness in Data: Bias Mitigation Techniques
**Question**: Use the Adult Income dataset and apply reweighing technique to balance the
class weights based on sensitive attributes (e.g., gender).

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

def load_adult_income():
    try:
        data = fetch_openml(name='adult', version=2, as_frame=True)
        df = data.frame.copy()

        # Replace '?' with NaN and drop rows with missing values in key columns
        df.replace('?', np.nan, inplace=True)
        required_cols = ['sex', 'class']
        if df[required_cols].isnull().any().any():
            df.dropna(subset=required_cols, inplace=True)

        # Encode target and sensitive attribute
        df['income'] = df['class'].apply(lambda x: 1 if x == '>50K' else 0)
        df.drop(columns=['class'], inplace=True)
        df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})

        if df['sex'].isnull().any():
            raise ValueError("Missing or invalid values in sensitive attribute 'sex' after mapping.")

        return df.reset_index(drop=True)
    except Exception as e:
        print(f"Error loading Adult Income dataset: {e}")
        return None

def compute_reweighing_weights(df, sensitive_attr='sex', target_attr='income'):
    if df is None or sensitive_attr not in df.columns or target_attr not in df.columns:
        raise ValueError(f"Dataframe must contain columns: {sensitive_attr}, {target_attr}")

    if df.empty:
        raise ValueError("Input dataframe is empty.")

    # Compute probabilities with smoothing to avoid zero division
    epsilon = 1e-8

    joint_counts = df.groupby([sensitive_attr, target_attr]).size()
    joint_probs = joint_counts / len(df)

    p_s = df[sensitive_attr].value_counts(normalize=True)
    p_y = df[target_attr].value_counts(normalize=True)

    weights = []
    for idx, row in df.iterrows():
        s_val = row[sensitive_attr]
        y_val = row[target_attr]

        p_s_val = p_s.get(s_val, 0)
        p_y_val = p_y.get(y_val, 0)
        p_joint = joint_probs.get((s_val, y_val), 0)

        # Prevent division by zero by adding epsilon
        if p_joint < epsilon:
            weight = 0
        else:
            weight = (p_s_val * p_y_val) / p_joint
        weights.append(weight)

    weights = pd.Series(weights, index=df.index)

    if (weights < 0).any():
        raise ValueError("Negative weights computed, which is invalid.")

    return weights

def analyze_bias(df, sensitive_attr='sex', target_attr='income'):
    print("Original class distribution by sensitive attribute:")
    print(pd.crosstab(df[sensitive_attr], df[target_attr], margins=True, normalize='index'))

def analyze_weights(df, weights, sensitive_attr='sex', target_attr='income'):
    df_weighted = df.copy()
    df_weighted['weight'] = weights

    print("\nWeighted class distribution by sensitive attribute:")
    weighted_dist = df_weighted.groupby([sensitive_attr, target_attr])['weight'].sum()
    total_weights = df_weighted.groupby(sensitive_attr)['weight'].sum()
    weighted_dist_norm = weighted_dist / total_weights
    print(weighted_dist_norm.unstack())

def test_reweighing():
    print("Testing bias mitigation with reweighing...\n")

    # Test normal run
    df = load_adult_income()
    if df is None:
        print("Failed to load data.")
        return

    analyze_bias(df)

    weights = compute_reweighing_weights(df)
    print(f"\nSample weights (first 10):\n{weights.head(10)}")

    analyze_weights(df, weights)

    assert (weights >= 0).all(), "Weights must be non-negative."

    # Test missing column error
    try:
        compute_reweighing_weights(df.drop(columns=['sex']))
    except ValueError as e:
        print(f"Missing column test caught: {e}")

    # Test empty DataFrame error
    try:
        compute_reweighing_weights(pd.DataFrame(columns=['sex', 'income']))
    except ValueError as e:
        print(f"Empty DataFrame test caught: {e}")

    # Test invalid data types (string in numeric column)
    try:
        df_invalid = df.copy()
        df_invalid.loc[0, 'sex'] = 'invalid'
        compute_reweighing_weights(df_invalid)
    except Exception as e:
        print(f"Invalid data type test caught: {e}")

    print("\nReweighing tests completed successfully.")

if __name__ == "__main__":
    test_reweighing()

Testing bias mitigation with reweighing...

Original class distribution by sensitive attribute:
income         0         1
sex                       
0       0.890749  0.109251
1       0.696233  0.303767
All     0.760718  0.239282


  joint_counts = df.groupby([sensitive_attr, target_attr]).size()



Sample weights (first 10):
0    1.092621
1    1.092621
2    0.787714
3    0.787714
4    0.854021
5    1.092621
6    1.092621
7    0.787714
8    0.854021
9    1.092621
dtype: float64

Weighted class distribution by sensitive attribute:
income         0         1
sex                       
0       0.760718  0.239282
1       0.760718  0.239282
Missing column test caught: Dataframe must contain columns: sex, income
Empty DataFrame test caught: Input dataframe is empty.
Invalid data type test caught: Cannot setitem on a Categorical with a new category (invalid), set the categories first

Reweighing tests completed successfully.


  weighted_dist = df_weighted.groupby([sensitive_attr, target_attr])['weight'].sum()
  total_weights = df_weighted.groupby(sensitive_attr)['weight'].sum()


In [1]:
# write your code from here  

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

def load_adult_income():
    try:
        # Load dataset from OpenML
        data = fetch_openml(name='adult', version=2, as_frame=True)
        df = data.frame

        # Basic cleaning: drop rows with missing values '?'
        df.replace('?', np.nan, inplace=True)
        df.dropna(inplace=True)

        # Binary target: >50K or <=50K
        df['income'] = df['class'].apply(lambda x: 1 if x == '>50K' else 0)
        df.drop(columns=['class'], inplace=True)

        # Sensitive attribute: sex (Male/Female)
        # Encode as binary: Male=1, Female=0
        df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})

        if df['sex'].isnull().any():
            raise ValueError("Missing values in sensitive attribute 'sex' after mapping.")

        return df
    except Exception as e:
        print(f"Error loading Adult Income dataset: {e}")
        return None

def compute_reweighing_weights(df, sensitive_attr='sex', target_attr='income'):
    """
    Computes instance weights using reweighing technique:
    w(x) = P(s) * P(y) / P(s,y)
    where s=sensitive attribute, y=target

    Returns weights as a Pandas Series aligned with df index.
    """

    if df is None or sensitive_attr not in df or target_attr not in df:
        raise ValueError("Dataframe missing required columns")

    # Joint distribution P(s,y)
    joint_counts = df.groupby([sensitive_attr, target_attr]).size()
    joint_probs = joint_counts / len(df)

    # Marginal distributions
    p_s = df[sensitive_attr].value_counts(normalize=True)
    p_y = df[target_attr].value_counts(normalize=True)

    weights = []
    for idx, row in df.iterrows():
        s_val = row[sensitive_attr]
        y_val = row[target_attr]
        p_s_val = p_s[s_val]
        p_y_val = p_y[y_val]
        p_joint = joint_probs[s_val][y_val] if (s_val, y_val) in joint_probs else 0

        if p_joint == 0:
            weight = 0
        else:
            weight = (p_s_val * p_y_val) / p_joint
        weights.append(weight)

    return pd.Series(weights, index=df.index)

def analyze_bias(df, sensitive_attr='sex', target_attr='income'):
    print("Original class distribution by sensitive attribute:")
    print(pd.crosstab(df[sensitive_attr], df[target_attr], margins=True, normalize='index'))

def analyze_weights(df, weights, sensitive_attr='sex', target_attr='income'):
    df_weighted = df.copy()
    df_weighted['weight'] = weights

    print("\nWeighted class distribution by sensitive attribute:")
    weighted_dist = df_weighted.groupby([sensitive_attr, target_attr])['weight'].sum()
    total_weights = df_weighted.groupby(sensitive_attr)['weight'].sum()
    weighted_dist_norm = weighted_dist / total_weights
    print(weighted_dist_norm.unstack())

def test_reweighing():
    print("Testing bias mitigation with reweighing...")

    df = load_adult_income()
    if df is None:
        print("Failed to load data.")
        return

    analyze_bias(df)

    weights = compute_reweighing_weights(df)
    print(f"\nSample weights (first 10):\n{weights.head(10)}")

    analyze_weights(df, weights)

    # Basic check: weights should all be positive
    assert (weights > 0).all(), "Weights must be positive"

    print("\nReweighing test completed successfully.")

if __name__ == "__main__":
    test_reweighing()

Testing bias mitigation with reweighing...
Original class distribution by sensitive attribute:
income         0         1
sex                       
0       0.886424  0.113576
1       0.687523  0.312477
All     0.752156  0.247844


  joint_counts = df.groupby([sensitive_attr, target_attr]).size()



Sample weights (first 10):
0     1.094009
1     1.094009
2     0.793158
3     0.793158
5     1.094009
7     0.793158
8     0.848529
9     1.094009
10    0.793158
11    1.094009
dtype: float64

Weighted class distribution by sensitive attribute:
income         0         1
sex                       
0       0.752156  0.247844
1       0.752156  0.247844

Reweighing test completed successfully.


  weighted_dist = df_weighted.groupby([sensitive_attr, target_attr])['weight'].sum()
  total_weights = df_weighted.groupby(sensitive_attr)['weight'].sum()
