In [32]:
import os
import pandas
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

*** Feature Selection Pipeline ***

In [33]:
# Module-level accumulator: one result dict per processed dataset file is appended to it.
feature_info = list()

In [34]:
def _pairwise_twin_distances(X_with, X_without, distance):
    """Return the (with, without) pairwise distance matrices for the requested metric."""
    if distance == 'mahalanobis':
        # atleast_2d: np.cov returns a 0-d array for a single feature, which
        # np.linalg.inv rejects with LinAlgError (silently forcing the fallback).
        cov_with = np.atleast_2d(np.cov(X_with.T))
        cov_without = np.atleast_2d(np.cov(X_without.T))
        try:
            vi_with = np.linalg.inv(cov_with)
            vi_without = np.linalg.inv(cov_without)
            return (
                cdist(X_with, X_with, metric='mahalanobis', VI=vi_with),
                cdist(X_without, X_without, metric='mahalanobis', VI=vi_without),
            )
        except np.linalg.LinAlgError:
            # Singular covariance in either space: use Euclidean for BOTH so the
            # two averages stay comparable.
            pass
    return (
        cdist(X_with, X_with, metric='euclidean'),
        cdist(X_without, X_without, metric='euclidean'),
    )


def reliability(df, feature, protected_characteristics=['Sex', 'Race'], n_twins=1000, distance='mahalanobis',
                max_samples=2000, random_state=0):
    """
    Assess feature reliability using the distance-based "twins" method.

    1. Compute pairwise distances WITH the feature
    2. Find twins: for every row, its nearest neighbour in that space
    3. Compute pairwise distances WITHOUT the feature
    4. Compare the average twin distance before and after removing the feature

    Parameters
    ----------
    df : DataFrame holding the candidate features, the protected
        characteristics and the 'prior_hiring_decision' target column.
    feature : name of the column whose reliability is assessed.
    protected_characteristics : column names excluded from the feature space.
    n_twins : currently unused; kept so existing callers keep working.
    distance : 'mahalanobis' or 'euclidean'. Any value other than
        'mahalanobis' selects Euclidean distance.
    max_samples : rows are randomly sub-sampled (without replacement) to this
        size when the frame is larger; None disables sub-sampling.
    random_state : seed for the sub-sampling so repeated runs give the same
        score; None draws a fresh, non-reproducible sample.

    Returns
    -------
    dict with 'reliability_score' (average twin distance without the feature
    minus the average with it), 'avg_twin_dist_with_feature',
    'avg_twin_dist_without_feature' and 'n_samples'; or None when the score is
    undefined (feature is not a candidate column, no other feature remains,
    or fewer than two rows are available).
    """

    target = 'prior_hiring_decision'

    # Features excluding protected characteristics and target
    excluded = set(protected_characteristics) | {target}
    all_features = [c for c in df.columns if c not in excluded]

    if feature not in all_features:
        return None

    features_without = [f for f in all_features if f != feature]

    # A twin needs at least one other row, and the "without" space needs at
    # least one remaining feature; otherwise the comparison is undefined.
    n_rows = len(df)
    n_used = n_rows if max_samples is None else min(n_rows, max_samples)
    if n_used < 2 or not features_without:
        return None

    # Scale each feature space independently
    X_with = StandardScaler().fit_transform(df[all_features].values)
    X_without = StandardScaler().fit_transform(df[features_without].values)

    # Sample for efficiency if dataset is large (same rows in both spaces)
    if max_samples is not None and n_rows > max_samples:
        rng = np.random.default_rng(random_state)
        idx = rng.choice(n_rows, max_samples, replace=False)
        X_with = X_with[idx]
        X_without = X_without[idx]

    dist_with, dist_without = _pairwise_twin_distances(X_with, X_without, distance)

    # Find twins: for each point, its closest neighbour (excluding itself)
    np.fill_diagonal(dist_with, np.inf)
    twin_indices = np.argmin(dist_with, axis=1)

    # Distances of the SAME twin pairs in both spaces
    rows = np.arange(len(twin_indices))
    avg_dist_with = float(np.mean(dist_with[rows, twin_indices]))
    avg_dist_without = float(np.mean(dist_without[rows, twin_indices]))

    # Reliability: how much does removing the feature increase distance between twins
    reliability_score = float(avg_dist_without - avg_dist_with)

    return {
        'reliability_score': reliability_score,
        'avg_twin_dist_with_feature': avg_dist_with,
        'avg_twin_dist_without_feature': avg_dist_without,
        'n_samples': int(len(X_with))
    }

In [41]:
def _native_label(value):
    """Convert a group label into a plain Python value that is safe to put in JSON."""
    if hasattr(value, 'item'):  # numpy scalar -> Python scalar
        value = value.item()
    # Integral floats and bools keep the integer form callers already receive.
    if isinstance(value, bool) or (isinstance(value, float) and value.is_integer()):
        return int(value)
    return value


def _mean_ratio(group_mean, reference_mean):
    """Return group_mean / reference_mean with explicit handling of a zero reference."""
    if reference_mean != 0:
        return group_mean / reference_mean
    if group_mean > 0:
        return float('inf')
    if group_mean < 0:
        # A negative group mean against a zero reference is not parity.
        return float('-inf')
    return 1.0


def disparate_impact(df, feature, protected_characteristics=["Sex", "Race"], threshold=0.8):
    """
    Calculate disparate impact ratio for a feature across protected groups.

    Measures whether the feature values differ between protected groups.
    DI = mean(feature | unprivileged) / mean(feature | privileged)

    A ratio between 0.8 and 1.25 is generally considered acceptable (4/5ths rule).

    Parameters
    ----------
    df : DataFrame containing `feature` and every protected column.
    feature : name of the column whose group means are compared.
    protected_characteristics : names of the protected columns to evaluate.
    threshold : currently unused; no pass/fail decision is made here.

    Returns
    -------
    dict keyed by protected characteristic.
    - Exactly two observed groups: {'ratio', 'mean_privileged',
      'mean_unprivileged'}, where the group with the larger value is assumed
      to be privileged.
    - Otherwise: {'privileged_group', 'group_ratios', 'min_ratio'}, where the
      most frequent group is the reference.
    When the reference mean is 0 the ratio is +inf / -inf by the sign of the
    group mean, or 1.0 if the group mean is 0 as well.

    Group labels may be of any type (e.g. strings). Integer and
    integral-float labels are returned as int. Missing values in a protected
    column are ignored rather than treated as a group.
    """

    X = df[feature]
    results = {}

    for char in protected_characteristics:
        C = df[char]
        observed = C.dropna()
        unique_vals = observed.unique()

        if len(unique_vals) == 2:
            # Binary protected characteristic
            privileged = C == observed.max()  # Assume higher value is privileged
            unprivileged = C.notna() & ~privileged  # rows with a missing group belong to neither side

            mean_privileged = float(X[privileged].mean())
            mean_unprivileged = float(X[unprivileged].mean())

            results[char] = {
                'ratio': float(_mean_ratio(mean_unprivileged, mean_privileged)),
                'mean_privileged': mean_privileged,
                'mean_unprivileged': mean_unprivileged
            }
        else:
            # Multi-class: compare each group to the most common group
            reference = C.value_counts().idxmax()
            mean_privileged = float(X[C == reference].mean())

            # Rows are selected with the raw labels; only the reported keys are
            # converted, so string or float coded groups no longer crash.
            group_ratios = {}
            for val in unique_vals:
                if val != reference:
                    mean_group = float(X[C == val].mean())
                    group_ratios[_native_label(val)] = float(_mean_ratio(mean_group, mean_privileged))

            results[char] = {
                'privileged_group': _native_label(reference),
                'group_ratios': group_ratios,
                'min_ratio': float(min(group_ratios.values())) if group_ratios else 1.0
            }

    return results

In [36]:
def valid_features(filepath):
    """
    Score every candidate feature of one dataset file.

    Reads the CSV at `filepath` and, for each column that is neither a
    protected characteristic ("Age", "Sex", "Race") nor the
    "prior_hiring_decision" target, computes `reliability` and
    `disparate_impact`. Each column name is printed as it is processed.

    Returns
    -------
    dict with two entries, each keyed by column name:
    - "reliability": the result of `reliability` for that column
    - "validity": the result of `disparate_impact` for that column
    """
    protected_characteristics = ["Age", "Sex", "Race"]
    target = "prior_hiring_decision"

    df = pandas.read_csv(filepath)
    candidate_columns = [
        col for col in df.columns
        if col not in protected_characteristics and col != target
    ]

    # Named so they do not shadow this function's own name.
    reliability_by_feature = {}
    impact_by_feature = {}
    for col in candidate_columns:
        print(col)  # progress indicator
        # The stray positional 0.1 was dropped: it landed in `n_twins`,
        # which `reliability` never reads.
        reliability_by_feature[col] = reliability(df, col, protected_characteristics)
        impact_by_feature[col] = disparate_impact(df, col, protected_characteristics)

    return {"reliability": reliability_by_feature, "validity": impact_by_feature}

In [37]:
# NOTE(review): `os` is already imported in the first cell, so this import is redundant there.
import os

# Print the kernel's current working directory (the data path used later is relative).
print(os.getcwd())

c:\Users\nikhi\Documents\Imperial\ethics\automated_interview_screening\other_files


In [38]:
data_path = "../data"

# Rebuild the list from scratch so re-running this cell does not append
# duplicate results to the previous run's entries.
feature_info = [
    valid_features(os.path.join(data_path, name))
    for name in sorted(os.listdir(data_path))  # os.listdir order is arbitrary; sort for a stable result order
    if os.path.isfile(os.path.join(data_path, name))  # skip sub-directories, which read_csv cannot open
]
    




Workclass
Education
Marital_Status
Occupation
Relationship
Hours_Per_Week
Place_Of_Birth
interview_score
cv_assessment_score
Workclass
Education
Marital_Status
Occupation
Relationship
Hours_Per_Week
Place_Of_Birth
interview_score
cv_assessment_score
Workclass
Education
Marital_Status
Occupation
Relationship
Hours_Per_Week
Place_Of_Birth
interview_score
cv_assessment_score


In [40]:
import json


# Serialize before opening the file: open(..., "w") truncates immediately, so
# a failure while encoding would otherwise leave a partial feature_info.json.
feature_info_json = json.dumps(feature_info, indent=2)

with open("feature_info.json", "w") as f:
    f.write(feature_info_json)