In [None]:
import os
import pandas
import numpy as np
# Feature columns that disparate_impact() routes through the categorical
# (category-level proportion ratio) path; membership is checked by name.
CATEGORICAL_FEATURES = [
    'Workclass',
    'Occupation',
    'Education',
]
# Score columns listed as numerical features. Any feature that is not in
# CATEGORICAL_FEATURES is handled with the mean-based comparison.
NUMERICAL_FEATURES = [
    'interview_score',
    'cv_assessment_score',
]

*** Feature Selection Pipeline ***

In [3]:
def bin_age(age_series, bin_size=15):
    """Bin ages into consecutive, left-closed groups of ``bin_size`` years.

    Bin edges start at the (integer-truncated) minimum age and advance in
    steps of ``bin_size`` until the maximum age is covered. Each bin is the
    half-open interval ``[lo, lo + bin_size)`` and is labelled
    ``"lo-(lo + bin_size - 1)"``, so the label always matches the ages that
    actually fall in the bin.

    Parameters
    ----------
    age_series : pandas.Series
        Numeric ages. Missing values stay missing in the result.
    bin_size : int, default 15
        Width of each age group; must be positive.

    Returns
    -------
    pandas.Series
        Categorical series of age-group labels aligned with ``age_series``.

    Raises
    ------
    ValueError
        If ``bin_size`` is not positive or the series has no non-missing age.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be positive")

    lowest = age_series.min()
    highest = age_series.max()
    if pandas.isna(lowest) or pandas.isna(highest):
        raise ValueError("age_series contains no non-missing ages to bin")

    min_age = int(lowest)
    max_age = int(highest)
    # The "+ bin_size + 1" stop guarantees the last edge lies strictly above
    # max_age, so the maximum lands inside the final left-closed bin.
    bins = list(range(min_age, max_age + bin_size + 1, bin_size))
    labels = [f"{lo}-{hi - 1}" for lo, hi in zip(bins[:-1], bins[1:])]
    # right=False makes the intervals [lo, hi), matching the "lo-(hi-1)" labels;
    # the default right-closed bins would label an age equal to an interior
    # edge with the previous group.
    return pandas.cut(age_series, bins=bins, labels=labels, right=False)

In [4]:
def compute_categorical_disparate_impact(df, feature, protected_char, min_category_n=30):
    """
    Compute disparate impact for a categorical feature using category-level proportion ratios.

    For every category of ``feature`` with at least ``min_category_n`` rows, the
    share of each protected group that falls in that category is divided by the
    share of the privileged group (the most frequent value of ``protected_char``).
    The per-category ratios are then summarised.

    Rows whose protected value is missing are not treated as a group. Infinite
    ratios (privileged share is 0 while the group's share is positive) are kept
    in ``category_ratios`` but excluded from every summary metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``feature`` and ``protected_char`` columns.
    feature : str
        Name of the categorical feature column.
    protected_char : str
        Name of the protected-characteristic column.
    min_category_n : int, default 30
        Categories with fewer rows than this are skipped.

    Returns
    -------
    dict or None
        Summary metrics (``weighted_avg_ratio``, ``min_ratio``, ``max_ratio``,
        ``geometric_mean``, ``pct_passing``), their pass flags against the
        0.8-1.25 band, ``privileged_group``, ``n_categories`` and the raw
        ``category_ratios``. ``None`` when there is no protected group, no
        category is large enough, or no finite ratio exists.
    """
    C = df[protected_char]
    # Drop missing values: NaN/None cannot be sorted alongside strings and
    # would otherwise show up as an empty pseudo-group with ratio 0.
    unique_groups = sorted(C.dropna().unique())
    if not unique_groups:
        return None
    privileged_val = C.value_counts().idxmax()

    # Group membership does not depend on the category, so build it once.
    group_masks = {group: C == group for group in unique_groups}
    group_totals = {group: int(mask.sum()) for group, mask in group_masks.items()}

    category_ratios = {}
    category_counts = {}

    for cat in df[feature].dropna().unique():
        cat_mask = df[feature] == cat
        cat_count = int(cat_mask.sum())

        if cat_count < min_category_n:
            continue

        # Share of each protected group that belongs to this category.
        proportions = {}
        for group in unique_groups:
            group_total = group_totals[group]
            group_in_cat = (cat_mask & group_masks[group]).sum()
            proportions[group] = group_in_cat / group_total if group_total > 0 else 0

        prop_privileged = proportions[privileged_val]

        ratios_for_cat = {}
        for group in unique_groups:
            if group == privileged_val:
                continue
            if prop_privileged > 0:
                ratios_for_cat[str(group)] = float(proportions[group] / prop_privileged)
            elif proportions[group] > 0:
                ratios_for_cat[str(group)] = float('inf')
            else:
                # Neither group appears in the category: treat as parity.
                ratios_for_cat[str(group)] = 1.0

        category_ratios[str(cat)] = ratios_for_cat
        category_counts[str(cat)] = cat_count

    if not category_ratios:
        return None

    all_ratios = []
    weighted_sum = 0.0
    total_weight = 0
    for cat, cat_ratios in category_ratios.items():
        finite_for_cat = [r for r in cat_ratios.values() if r != float('inf')]
        if not finite_for_cat:
            # A category with only infinite ratios contributes neither to the
            # numerator nor to the weight, so it cannot drag the average to 0.
            continue
        all_ratios.extend(finite_for_cat)
        weighted_sum += float(np.mean(finite_for_cat)) * category_counts[cat]
        total_weight += category_counts[cat]

    if not all_ratios:
        return None

    weighted_avg = weighted_sum / total_weight if total_weight > 0 else 1.0

    min_ratio = min(all_ratios)
    max_ratio = max(all_ratios)

    # log() is undefined at 0, so the geometric mean uses positive ratios only.
    positive_ratios = [r for r in all_ratios if r > 0]
    geometric_mean = float(np.exp(np.mean(np.log(positive_ratios)))) if positive_ratios else 1.0

    passing = [r for r in all_ratios if 0.8 <= r <= 1.25]
    pct_passing = len(passing) / len(all_ratios)

    return {
        'weighted_avg_ratio': float(weighted_avg),
        'min_ratio': float(min_ratio),
        'max_ratio': float(max_ratio),
        'geometric_mean': float(geometric_mean),
        'pct_passing': float(pct_passing),
        'passes_weighted_avg': bool(0.8 <= weighted_avg <= 1.25),
        'passes_min': bool(min_ratio >= 0.8),
        'passes_max': bool(max_ratio <= 1.25),
        'passes_geometric': bool(0.8 <= geometric_mean <= 1.25),
        'passes_pct': bool(pct_passing >= 0.8),
        'privileged_group': str(privileged_val),
        'n_categories': int(len(category_ratios)),
        'category_ratios': category_ratios
    }

In [5]:
def _mean_ratio(mean_group, mean_privileged):
    """Return mean_group / mean_privileged, guarding a zero privileged mean.

    With a zero denominator the ratio is ``inf`` when the group mean is
    positive and ``1.0`` (parity) otherwise.
    """
    if mean_privileged != 0:
        return mean_group / mean_privileged
    return float('inf') if mean_group > 0 else 1.0


def disparate_impact(df, feature, protected_characteristics=["Sex", "Race"], threshold=0.8):
    """
    Calculate disparate impact ratio for a feature across protected groups.

    For categorical features (names listed in CATEGORICAL_FEATURES): delegates to
    compute_categorical_disparate_impact, which applies its own fixed pass band.
    For numerical features: compares each group's mean with the mean of the
    privileged group (the most frequent value of the protected characteristic).
    When Age is a protected characteristic: bins Age into 15-year groups first.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``feature`` and every protected-characteristic column.
        It is not modified.
    feature : str
        Column to evaluate.
    protected_characteristics : iterable of str
        Protected-characteristic column names.
    threshold : float, default 0.8
        Lower bound of the pass band for numerical features; the upper bound is
        ``1 / threshold`` (1.25 for the default).

    Returns
    -------
    dict
        Maps each characteristic to its result dict. Numerical features with
        exactly two groups yield ``ratio``, ``mean_privileged``,
        ``mean_unprivileged`` and ``passes``; otherwise ``privileged_group``,
        ``group_ratios``, ``min_ratio``, ``max_ratio`` and ``passes``.
        Characteristics for which the categorical computation returns nothing
        are omitted.
    """
    results = {}
    is_categorical = feature in CATEGORICAL_FEATURES
    lower_bound = threshold
    upper_bound = 1.0 / threshold if threshold > 0 else float('inf')

    for char in protected_characteristics:
        if char == 'Age':
            # assign() returns a new frame, so the caller's df is left untouched.
            df_temp = df.assign(Age_binned=bin_age(df['Age'], bin_size=15))
            protected_col = 'Age_binned'
        else:
            # Nothing below mutates the frame, so no copy is needed here.
            df_temp = df
            protected_col = char

        if is_categorical:
            result = compute_categorical_disparate_impact(df_temp, feature, protected_col)
            if result:
                results[char] = result
            continue

        X = df_temp[feature]
        C = df_temp[protected_col]
        unique_vals = C.dropna().unique()
        privileged_val = C.value_counts().idxmax()
        mean_privileged = float(X[C == privileged_val].mean())

        if len(unique_vals) == 2:
            unprivileged_val = [v for v in unique_vals if v != privileged_val][0]
            mean_unprivileged = float(X[C == unprivileged_val].mean())
            ratio = _mean_ratio(mean_unprivileged, mean_privileged)

            results[char] = {
                'ratio': float(ratio),
                'mean_privileged': float(mean_privileged),
                'mean_unprivileged': float(mean_unprivileged),
                'passes': bool(lower_bound <= ratio <= upper_bound) if ratio != float('inf') else False
            }
        else:
            group_ratios = {}
            for val in unique_vals:
                if val != privileged_val:
                    mean_group = float(X[C == val].mean())
                    group_ratios[str(val)] = float(_mean_ratio(mean_group, mean_privileged))

            finite_ratios = [r for r in group_ratios.values() if r != float('inf')]
            has_infinite = len(finite_ratios) != len(group_ratios)
            min_r = min(finite_ratios) if finite_ratios else 1.0
            max_r = max(finite_ratios) if finite_ratios else 1.0

            results[char] = {
                'privileged_group': str(privileged_val),
                'group_ratios': group_ratios,
                'min_ratio': float(min_r),
                'max_ratio': float(max_r),
                # An infinite ratio is a failure, as in the two-group case.
                'passes': bool(not has_infinite and min_r >= lower_bound and max_r <= upper_bound)
            }

    return results

In [None]:
def valid_features(filepath):
    """Run the disparate-impact check on every candidate feature of one CSV file.

    The file is read, three columns are dropped, and every remaining column
    that is neither a protected characteristic nor ``prior_hiring_decision``
    is passed to ``disparate_impact``. Each evaluated column name is printed
    as progress output.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to load.

    Returns
    -------
    dict
        ``{"validity": {column_name: disparate_impact_result, ...}}``.
    """
    protected_characteristics = ["Age", "Sex", "Race","Place_Of_Birth"]
    skipped_columns = protected_characteristics + ["prior_hiring_decision"]

    frame = pandas.read_csv(filepath)
    frame = frame.drop(columns=["Hours_Per_Week","Marital_Status","Relationship"])

    per_feature = {}
    for column in list(frame.columns):
        if column in skipped_columns:
            continue
        print(column)
        # Evaluate this feature against all protected characteristics.
        per_feature[column] = disparate_impact(frame, column, protected_characteristics)

    return {"validity": per_feature}

In [7]:
import json

data_path = "../data"
# os.listdir() returns entries in arbitrary order; sort so the processing
# order (and the log below) is the same on every run and platform.
for path in sorted(os.listdir(data_path)):
    if not path.endswith('.csv'):
        continue
    # Drop only the trailing extension; str.replace would also remove any
    # ".csv" that appears earlier in the file name.
    name = path[:-len('.csv')]
    print(f"\n=== Processing {name} ===")
    result = valid_features(os.path.join(data_path, path))

    # NOTE: non-finite ratios (inf/NaN) in the results are written by
    # json.dump as Infinity/NaN tokens, which strict JSON parsers reject.
    with open(f"feature_info_{name}.json", "w") as f:
        json.dump(result, f, indent=2)
    print(f"Saved feature_info_{name}.json")


=== Processing test ===
Workclass
Education
Occupation
interview_score
cv_assessment_score
Saved feature_info_test.json

=== Processing train ===
Workclass
Education
Occupation
interview_score
cv_assessment_score
Saved feature_info_train.json

=== Processing val ===
Workclass
Education
Occupation
interview_score
cv_assessment_score
Saved feature_info_val.json
