In [1]:
# TrustyAI Bias Detection Implementation Guide
# ==========================================

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Import TrustyAI components
import trustyai
from trustyai.explainers import LimeExplainer
from trustyai.model import FeatureFactory, PredictionInput
from trustyai.utils import TestModels
from trustyai.metrics import ExplainabilityMetrics
from java.util import Arrays

# Banner: print the installed TrustyAI version alongside the results.
print(f"🎯 TrustyAI Bias Detection Guide")
print(f"TrustyAI version: {trustyai.__version__}")
print("=" * 50)

# 1. Create Biased Dataset for Demonstration
print("\n📊 CREATING BIASED DATASET:")
print("-" * 30)

# Create a dataset with intentional bias patterns
# NOTE: every draw below uses NumPy's global RNG, so both the seed and the
# ORDER of the np.random calls determine the generated data. Reordering or
# batching these calls changes every number printed further down.
np.random.seed(42)
n_samples = 2000

# Create demographic features (protected attributes)
# Group sizes are deliberately unequal (60/40 gender split, 50/20/20/10 race split).
gender = np.random.choice(['Male', 'Female'], n_samples, p=[0.6, 0.4])
race = np.random.choice(['White', 'Black', 'Asian', 'Hispanic'], n_samples, p=[0.5, 0.2, 0.2, 0.1])
# Ages are drawn from N(40, 12) and clipped into [18, 80].
age = np.random.normal(40, 12, n_samples).clip(18, 80)

# Create other features
# Education is an ordinal code 0-4; experience is derived from age plus noise
# and floored at zero so it can never be negative.
education = np.random.choice([0, 1, 2, 3, 4], n_samples, p=[0.1, 0.2, 0.3, 0.3, 0.1])
experience = age - 22 + np.random.normal(0, 3, n_samples)
experience = experience.clip(0, None)

# Create biased target (loan approval) - intentionally discriminatory
# Higher approval rates for certain demographics (THIS IS WRONG AND ILLEGAL)
loan_approved = np.zeros(n_samples)

for i in range(n_samples):
    # Legitimate part of the score: depends only on education and experience.
    base_score = 0.3 + 0.1 * education[i] + 0.002 * experience[i]

    # Add bias (DO NOT DO THIS IN REAL SYSTEMS!)
    # Fixed bonuses keyed purely on protected attributes; 'Black' and
    # 'Hispanic' receive no bonus at all.
    if gender[i] == 'Male':
        base_score += 0.15  # Male bias
    if race[i] == 'White':
        base_score += 0.12  # Racial bias
    elif race[i] == 'Asian':
        base_score += 0.08

    # Add some randomness
    # One scalar draw per row (kept inside the loop; see the RNG note above).
    base_score += np.random.normal(0, 0.1)

    # Approve when the noisy score exceeds 0.5; stored as 1.0 / 0.0 floats.
    loan_approved[i] = 1 if base_score > 0.5 else 0

# Create DataFrame
# Raw (un-encoded) view of the data: string labels for gender/race plus the target.
bias_df = pd.DataFrame({
    'gender': gender,
    'race': race,
    'age': age,
    'education': education,
    'experience': experience,
    'loan_approved': loan_approved
})

print(f"✅ Created biased dataset: {bias_df.shape}")
print(f"Loan approval rate: {loan_approved.mean():.1%}")

# Show bias in raw data: per-group sample count and mean approval rate,
# computed directly on the generated labels (no model involved yet).
print("\n🚨 RAW DATA BIAS ANALYSIS:")
print("Loan approval rates by demographic:")

gender_bias = (
    bias_df.groupby('gender')['loan_approved']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'Count', 'mean': 'Approval_Rate'})
)
print("\nBy Gender:", gender_bias, sep="\n")

race_bias = (
    bias_df.groupby('race')['loan_approved']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'Count', 'mean': 'Approval_Rate'})
)
print("\nBy Race:", race_bias, sep="\n")

# 2. Prepare Data for ML Model
print("\n⚙️ PREPARING DATA FOR ML MODEL:")
print("-" * 35)

# One LabelEncoder per categorical column. The fitted encoders are kept so
# the integer codes can be mapped back to readable labels after prediction.
label_encoders = {col: LabelEncoder() for col in ('gender', 'race')}
for col, le in label_encoders.items():
    bias_df[f'{col}_encoded'] = le.fit_transform(bias_df[col])

# Feature matrix and target. The column order here fixes the column indices
# used by everything that later works on the raw arrays.
feature_columns = ['age', 'education', 'experience', 'gender_encoded', 'race_encoded']
X = bias_df[feature_columns].to_numpy()
y = bias_df['loan_approved'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train the classifier (the protected attributes are among its inputs).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print(f"✅ Model trained - Accuracy: {model.score(X_test, y_test):.3f}")

# 3. Statistical Bias Detection
print("\n📈 STATISTICAL BIAS DETECTION:")
print("=" * 40)

# Hard predictions plus the second column of predict_proba for the test set.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Test-set frame: features, ground truth and model outputs side by side.
test_df = pd.DataFrame(X_test, columns=feature_columns).assign(
    actual=y_test,
    predicted=y_pred,
    prediction_probability=y_pred_proba,
)

# Restore human-readable group labels from the encoded columns. The cast to
# int is needed because the feature matrix stores the codes as floats.
test_df['gender'] = label_encoders['gender'].inverse_transform(
    test_df['gender_encoded'].astype(int)
)
test_df['race'] = label_encoders['race'].inverse_transform(
    test_df['race_encoded'].astype(int)
)

def calculate_bias_metrics(df, protected_attr, outcome_col='predicted'):
    """Summarise outcomes for every group of a protected attribute.

    Args:
        df: DataFrame holding the protected attribute and the outcome column.
        protected_attr: Name of the column whose distinct values define the groups.
        outcome_col: Name of the column that is summed to count positive
            outcomes (defaults to ``'predicted'``).

    Returns:
        dict: Maps each distinct value of ``protected_attr`` (in order of first
        appearance) to a dict with ``'count'`` (rows in the group),
        ``'positive_predictions'`` (sum of ``outcome_col``) and
        ``'approval_rate'`` (sum divided by count, or 0 for an empty group).
    """
    def _summarise(rows):
        # Per-group statistics; the zero guard avoids dividing by an empty group.
        n_rows = len(rows)
        n_positive = rows[outcome_col].sum()
        return {
            'count': n_rows,
            'positive_predictions': n_positive,
            'approval_rate': n_positive / n_rows if n_rows > 0 else 0,
        }

    attr_values = df[protected_attr]
    return {
        value: _summarise(df[attr_values == value])
        for value in attr_values.unique()
    }

# Calculate bias metrics
print("🔍 DEMOGRAPHIC PARITY ANALYSIS:")
print("(Equal approval rates across groups)")

gender_metrics = calculate_bias_metrics(test_df, 'gender')
race_metrics = calculate_bias_metrics(test_df, 'race')

# The loop variables are deliberately not named `gender` / `race`: at module
# level a `for gender, ... in` loop would rebind the `gender` and `race`
# arrays built during dataset generation to a single group label.
print("\nGender Bias Metrics:")
for group_name, group_stats in gender_metrics.items():
    print(f"  {group_name}: {group_stats['approval_rate']:.1%} approval rate ({group_stats['count']} samples)")

print("\nRace Bias Metrics:")
for group_name, group_stats in race_metrics.items():
    print(f"  {group_name}: {group_stats['approval_rate']:.1%} approval rate ({group_stats['count']} samples)")

# Calculate disparate impact ratios
def calculate_disparate_impact(metrics):
    """Return the disparate impact ratio: lowest group approval rate / highest.

    A value of 1.0 means every group has the same approval rate; smaller
    values mean a larger gap between the best- and worst-treated group.

    Args:
        metrics: Dict mapping group label to a dict containing an
            ``'approval_rate'`` entry (the shape produced by
            ``calculate_bias_metrics``).

    Returns:
        float: ``min_rate / max_rate``. When no group has any positive outcome
        (all rates are 0) the groups are treated identically, so 1.0 is
        returned instead of an undefined 0/0.

    Raises:
        ValueError: If ``metrics`` contains no groups.
    """
    rates = [m['approval_rate'] for m in metrics.values()]
    if not rates:
        # Fail with a clear message rather than the opaque "max() arg is an
        # empty sequence" raised by max([]).
        raise ValueError("calculate_disparate_impact requires at least one group")

    max_rate = max(rates)
    if max_rate == 0:
        # Every rate is zero: there is no gap between groups, so report parity.
        # Returning inf here would print as "inf" and sit outside the ratio's
        # documented 0..1 range.
        return 1.0

    return min(rates) / max_rate

# Disparate impact per protected attribute, from the per-group metrics above.
gender_di = calculate_disparate_impact(gender_metrics)
race_di = calculate_disparate_impact(race_metrics)

print(
    "\n⚖️ DISPARATE IMPACT RATIOS:",
    f"Gender Disparate Impact: {gender_di:.3f} (1.0 = perfect fairness)",
    f"Race Disparate Impact: {race_di:.3f} (1.0 = perfect fairness)",
    sep="\n",
)

# Flag concerning bias levels: below 0.8 is reported as severe, below 0.9 as
# moderate, anything else as acceptable.
print("\n🚨 BIAS ALERTS:")
print(
    f"❌ SEVERE GENDER BIAS DETECTED (DI: {gender_di:.3f})" if gender_di < 0.8
    else f"⚠️  MODERATE GENDER BIAS DETECTED (DI: {gender_di:.3f})" if gender_di < 0.9
    else f"✅ ACCEPTABLE GENDER FAIRNESS (DI: {gender_di:.3f})"
)
print(
    f"❌ SEVERE RACIAL BIAS DETECTED (DI: {race_di:.3f})" if race_di < 0.8
    else f"⚠️  MODERATE RACIAL BIAS DETECTED (DI: {race_di:.3f})" if race_di < 0.9
    else f"✅ ACCEPTABLE RACIAL FAIRNESS (DI: {race_di:.3f})"
)

# 4. TrustyAI Explanation-Based Bias Detection
print(f"\n🎯 TRUSTYAI EXPLANATION-BASED BIAS DETECTION:")
print("=" * 55)

# Convert sample data to TrustyAI format
# Only the first `sample_size` test rows are explained. The loops below index
# X_sample with range(sample_size), so X_test must hold at least that many rows.
sample_size = 20
X_sample = X_test[:sample_size]
feature_names = feature_columns

# One PredictionInput per sampled row; every column (including the encoded
# protected attributes) is registered as a numerical feature.
prediction_inputs = []
for i in range(sample_size):
    features = []
    for j, feature_name in enumerate(feature_names):
        feature = FeatureFactory.newNumericalFeature(feature_name, X_sample[i, j])
        features.append(feature)

    pred_input = PredictionInput(features)
    prediction_inputs.append(pred_input)

# Create TrustyAI model
# The model is built from TestModels.getLinearModel with five hard-coded
# weights, one per entry of feature_columns. The trained RandomForest `model`
# is not referenced here.
# NOTE(review): explanations computed against `trusty_model` therefore
# presumably reflect these fixed weights rather than the trained classifier.
# Confirm this is intended, or wrap `model` so the real predictor is explained.
weights = np.array([0.1, 0.3, 0.2, 0.25, 0.15])  # Weights for features
trusty_model = TestModels.getLinearModel(weights)

print(f"✅ Created {len(prediction_inputs)} TrustyAI prediction inputs")

# Generate explanations for different demographic groups
def analyze_group_explanations(prediction_inputs, test_samples, demographic_col, model):
    """Collect LIME saliencies for each sample, bucketed by demographic group.

    Args:
        prediction_inputs: Sequence of TrustyAI prediction inputs, positionally
            aligned with the rows of ``test_samples``.
        test_samples: DataFrame whose ``demographic_col`` gives each row's group.
        demographic_col: Column of ``test_samples`` used as the grouping key.
        model: Object passed to the explainer; it must offer ``predictAsync``.

    Returns:
        dict: group label -> list of ``{feature name: saliency}`` dicts, one
        per sample that was explained. A sample whose explanation raises is
        reported with a printed warning and skipped (best effort).
    """
    explainer = LimeExplainer()
    by_group = {}

    # Only positions present in both inputs can be paired up.
    n_pairs = min(len(prediction_inputs), len(test_samples))

    for i in range(n_pairs):
        # Group lookup stays outside the try block so a bad column name still raises.
        group = test_samples.iloc[i][demographic_col]

        try:
            current_input = prediction_inputs[i]
            output = model.predictAsync(Arrays.asList([current_input])).get().get(0)
            frames = explainer.explain(current_input, output, model).as_dataframe()

            # Only a dict result is handled; use its first DataFrame-like value.
            if not isinstance(frames, dict):
                continue
            saliency_frame = next(
                (candidate for candidate in frames.values() if hasattr(candidate, 'iterrows')),
                None,
            )
            if saliency_frame is None:
                continue

            bucket = by_group.setdefault(group, [])
            bucket.append({
                row['Feature']: row['Saliency']
                for _, row in saliency_frame.iterrows()
            })
        except Exception as e:
            print(f"Warning: Explanation failed for sample {i}: {e}")

    return by_group

# Analyze explanations by gender: pair the TrustyAI inputs with the matching
# leading rows of the test frame so each explanation is filed under its group.
print("🔍 ANALYZING EXPLANATIONS BY GENDER:")
gender_explanations = analyze_group_explanations(
    prediction_inputs=prediction_inputs,
    test_samples=test_df.head(sample_size),
    demographic_col='gender',
    model=trusty_model,
)

# Calculate average feature importance by group
def calculate_average_importance(group_explanations):
    """Average each feature's importance within every group.

    Args:
        group_explanations: Dict mapping group label to a list of
            ``{feature: importance}`` dicts.

    Returns:
        dict: group label -> ``{feature: mean importance}``. A feature is
        averaged over the explanations that contain it. Groups with an empty
        explanation list are left out of the result.
    """
    averaged = {}

    for group_name, explanation_list in group_explanations.items():
        if not explanation_list:
            continue

        # Gather every observed value per feature before averaging.
        collected = {}
        for single_explanation in explanation_list:
            for feature_name, value in single_explanation.items():
                collected.setdefault(feature_name, []).append(value)

        averaged[group_name] = {
            feature_name: np.mean(values)
            for feature_name, values in collected.items()
        }

    return averaged

# Summarise and print saliencies only when at least one group was collected.
# `avg_gender_importance` is therefore defined conditionally; later code checks
# for its existence before using it.
if gender_explanations:
    avg_gender_importance = calculate_average_importance(gender_explanations)

    print("Average feature importance by gender:")
    # The loop variable is not named `gender`, so it does not rebind the
    # module-level `gender` name used for the dataset column.
    for group_name, features in avg_gender_importance.items():
        print(f"  {group_name}:")
        # Largest absolute saliency first, regardless of sign.
        for feature, importance in sorted(features.items(), key=lambda x: abs(x[1]), reverse=True):
            print(f"    {feature}: {importance:.3f}")

# 5. Bias Detection in Feature Importance
print("\n🔍 BIAS DETECTION IN FEATURE IMPORTANCE:")
print("=" * 45)

def detect_protected_attribute_importance(explanations, protected_features):
    """Flag protected attributes whose absolute importance exceeds 0.5.

    Args:
        explanations: Dict mapping group label to ``{feature: importance}``.
        protected_features: Iterable of feature names treated as protected.

    Returns:
        list: One dict per flagged (group, feature) pair with keys ``'group'``,
        ``'feature'``, ``'importance'`` (absolute value) and ``'severity'``
        (``'HIGH'`` above 1.0, otherwise ``'MEDIUM'``). The list follows the
        group order, then the ``protected_features`` order.
    """
    flagged = []

    for group_name, saliencies in explanations.items():
        for protected in protected_features:
            if protected not in saliencies:
                continue

            magnitude = abs(saliencies[protected])
            # Threshold for concern: strictly greater than 0.5.
            if not magnitude > 0.5:
                continue

            if magnitude > 1.0:
                level = 'HIGH'
            else:
                level = 'MEDIUM'

            flagged.append({
                'group': group_name,
                'feature': protected,
                'importance': magnitude,
                'severity': level,
            })

    return flagged

# Runs only when the averaged saliencies exist; they are created conditionally,
# hence the existence check instead of a direct reference.
if 'avg_gender_importance' in locals():
    protected_features = ['gender_encoded', 'race_encoded']
    bias_alerts = detect_protected_attribute_importance(
        avg_gender_importance, protected_features
    )

    if not bias_alerts:
        print("✅ No high importance detected for protected attributes")
    else:
        print("🚨 PROTECTED ATTRIBUTE BIAS ALERTS:")
        for alert in bias_alerts:
            print(f"  {alert['severity']} BIAS: {alert['feature']} has importance {alert['importance']:.3f} for {alert['group']}")

# 6. Comprehensive Bias Report
print(f"\n📋 COMPREHENSIVE BIAS DETECTION REPORT:")
print("=" * 50)

def generate_bias_report(statistical_metrics, explanation_metrics, bias_alerts):
    """Build a summary report of the bias findings.

    The disparate impact figures are derived from ``statistical_metrics``
    itself, not from module-level variables, so the report always describes
    the metrics it was given.

    Args:
        statistical_metrics: Dict mapping a protected attribute name (e.g.
            ``'gender'``) to its per-group metrics, in the shape accepted by
            ``calculate_disparate_impact``.
        explanation_metrics: Accepted for interface compatibility; it is not
            used when building the report.
        bias_alerts: Explanation-based alerts; a non-empty value adds one
            extra recommendation.

    Returns:
        dict: Report with keys ``'timestamp'``, ``'model_accuracy'``,
        ``'total_samples'``, ``'statistical_bias'`` (one
        ``'<attribute>_disparate_impact'`` entry per attribute),
        ``'bias_severity'`` (``'HIGH'``, ``'MEDIUM'`` or ``'LOW'``) and
        ``'recommendations'``.

    Note:
        Accuracy and sample count are still read from the module-level
        ``model``, ``X_test`` and ``y_test``.
    """
    disparate_impacts = {
        f'{attribute}_disparate_impact': calculate_disparate_impact(group_metrics)
        for attribute, group_metrics in statistical_metrics.items()
    }
    ratios = list(disparate_impacts.values())

    # Determine overall bias severity from the worst attribute: any ratio
    # below 0.8 is HIGH, otherwise any ratio below 0.9 is MEDIUM.
    if any(ratio < 0.8 for ratio in ratios):
        severity = 'HIGH'
        recommendations = [
            'URGENT: Review and retrain model without protected attributes',
            'Implement fairness constraints in model training',
            'Conduct legal review before deployment'
        ]
    elif any(ratio < 0.9 for ratio in ratios):
        severity = 'MEDIUM'
        recommendations = [
            'Consider bias mitigation techniques',
            'Monitor model performance across demographics',
            'Review feature engineering process'
        ]
    else:
        severity = 'LOW'
        recommendations = ['Continue monitoring for bias drift']

    # Add explanation-based recommendations
    if bias_alerts:
        recommendations.append('Protected attributes showing high importance in explanations')

    return {
        'timestamp': pd.Timestamp.now(),
        'model_accuracy': model.score(X_test, y_test),
        'total_samples': len(X_test),
        'statistical_bias': disparate_impacts,
        'bias_severity': severity,
        'recommendations': recommendations
    }

# Build the report. The explanation-based inputs exist only if the earlier
# conditional sections ran, so empty defaults are used when they are absent.
bias_report = generate_bias_report(
    statistical_metrics={'gender': gender_metrics, 'race': race_metrics},
    explanation_metrics=locals().get('avg_gender_importance', {}),
    bias_alerts=locals().get('bias_alerts', []),
)

print(
    "📊 BIAS DETECTION SUMMARY:",
    f"   Model Accuracy: {bias_report['model_accuracy']:.3f}",
    f"   Overall Bias Severity: {bias_report['bias_severity']}",
    f"   Gender Disparate Impact: {bias_report['statistical_bias']['gender_disparate_impact']:.3f}",
    f"   Race Disparate Impact: {bias_report['statistical_bias']['race_disparate_impact']:.3f}",
    sep="\n",
)

# Numbered list of recommendations, starting at 1.
print("\n🛠️ RECOMMENDATIONS:")
for i, rec in enumerate(bias_report['recommendations'], start=1):
    print(f"   {i}. {rec}")

# 7. Bias Monitoring Helper Functions
print(f"\n🔧 BIAS MONITORING HELPER FUNCTIONS:")
print("=" * 40)

def continuous_bias_monitor(model, X_new, y_new, protected_attributes, threshold=0.8):
    """
    Continuously monitor model for bias on new data

    For every protected attribute, the positive-prediction rate of each group
    is computed and compared via the disparate impact ratio (min rate / max
    rate).

    Args:
        model: Trained ML model exposing ``predict``
        X_new: New input data as a 2-D array-like (ndarray, nested list or
            DataFrame); it is viewed as a NumPy array for column access
        y_new: New true labels (accepted for interface compatibility; the
            computation only uses predictions)
        protected_attributes: Dict mapping attribute names to column indices
        threshold: Disparate impact threshold (default 0.8)

    Returns:
        dict: attribute name -> ``{'disparate_impact', 'bias_detected',
        'group_rates'}``. Attributes with fewer than two groups in ``X_new``
        are omitted because no ratio can be formed.
    """
    # Accept list / DataFrame input as well as ndarrays for column slicing.
    features = np.asarray(X_new)
    predictions = np.asarray(model.predict(X_new))
    results = {}

    for attr_name, col_idx in protected_attributes.items():
        column = features[:, col_idx]
        group_rates = {}

        for group in np.unique(column):
            mask = column == group
            # NaN never equals itself, so a NaN "group" has an empty mask and
            # is skipped here.
            if mask.any():
                group_rates[group] = np.mean(predictions[mask])

        # Calculate disparate impact
        if len(group_rates) > 1:
            rates = list(group_rates.values())
            highest = max(rates)
            # When no group gets a positive prediction all rates are equal, so
            # report parity (1.0). Returning 0 here would raise a false
            # bias alert even though every group is treated the same.
            di_ratio = min(rates) / highest if highest > 0 else 1.0

            results[attr_name] = {
                'disparate_impact': di_ratio,
                'bias_detected': di_ratio < threshold,
                'group_rates': group_rates
            }

    return results

def bias_alert_system(bias_results, alert_threshold=0.8):
    """
    Turn monitoring results into a list of alerts.

    Args:
        bias_results: Dict mapping attribute name to a result dict holding a
            'disparate_impact' value (as produced by continuous_bias_monitor)
        alert_threshold: An alert is raised when the disparate impact is
            strictly below this value

    Returns:
        list: One alert dict per offending attribute with keys 'attribute',
        'severity', 'disparate_impact', 'message' and 'action_required'.
        Severity is 'CRITICAL' below 0.7, 'HIGH' below 0.8, otherwise 'MEDIUM';
        action is required for 'CRITICAL' and 'HIGH' only.
    """
    def _grade(ratio):
        # Fixed severity bands, independent of alert_threshold.
        if ratio < 0.7:
            return 'CRITICAL'
        if ratio < 0.8:
            return 'HIGH'
        return 'MEDIUM'

    raised = []

    for attribute, outcome in bias_results.items():
        di_ratio = outcome['disparate_impact']
        if not di_ratio < alert_threshold:
            continue

        severity = _grade(di_ratio)
        raised.append({
            'attribute': attribute,
            'severity': severity,
            'disparate_impact': di_ratio,
            'message': f"{severity} bias detected in {attribute} (DI: {di_ratio:.3f})",
            'action_required': severity in ('CRITICAL', 'HIGH'),
        })

    return raised

# Demonstrate monitoring functions
print(
    "✅ Bias monitoring functions defined:",
    "   • continuous_bias_monitor()",
    "   • bias_alert_system()",
    sep="\n",
)

# Smoke test on the first 100 test rows. The indices are the positions of
# 'gender_encoded' (3) and 'race_encoded' (4) within feature_columns.
test_protected_attrs = dict(gender=3, race=4)
test_results = continuous_bias_monitor(
    model,
    X_test[:100],
    y_test[:100],
    protected_attributes=test_protected_attrs,
)
test_alerts = bias_alert_system(bias_results=test_results)

print("\n🔍 MONITORING SYSTEM TEST:")
print(f"   Detected bias in {len([r for r in test_results.values() if r['bias_detected']])} attributes")
print(f"   Generated {len(test_alerts)} alerts")

# The summary header is printed only when there is something to list.
if test_alerts:
    print("   Alert summary:")
    for alert in test_alerts:
        print(f"     {alert['severity']}: {alert['message']}")

# 8. Production Deployment Recommendations
# Static guidance text only; each print call below emits one section.
print("\n🚀 PRODUCTION DEPLOYMENT RECOMMENDATIONS:", "=" * 50, sep="\n")

print(
    "📋 BIAS DETECTION WORKFLOW:",
    "1. 📊 Pre-deployment:",
    "   • Run comprehensive bias analysis on test data",
    "   • Generate bias detection report",
    "   • Review with legal/compliance team",
    "   • Set up bias monitoring thresholds",
    sep="\n",
)

print(
    "\n2. 🔄 Production monitoring:",
    "   • Implement continuous bias monitoring",
    "   • Set up automated alerts for bias drift",
    "   • Regular bias audits (monthly/quarterly)",
    "   • Track bias metrics over time",
    sep="\n",
)

print(
    "\n3. 🛠️ Bias mitigation:",
    "   • Remove protected attributes from features",
    "   • Use fairness-aware ML algorithms",
    "   • Implement post-processing fairness corrections",
    "   • Regular model retraining with bias constraints",
    sep="\n",
)

print(
    "\n4. 📈 Governance:",
    "   • Document all bias detection procedures",
    "   • Maintain audit trails of bias metrics",
    "   • Regular stakeholder reporting",
    "   • Legal compliance verification",
    sep="\n",
)

# Closing summary of what the walkthrough covered.
print("\n✅ BIAS DETECTION IMPLEMENTATION COMPLETE!", "=" * 50, sep="\n")
print(
    "You now have a comprehensive bias detection system using TrustyAI that includes:",
    "• Statistical bias metrics (disparate impact)",
    "• Explanation-based bias detection",
    "• Continuous monitoring capabilities",
    "• Automated alert systems",
    "• Production-ready helper functions",
    "• Comprehensive reporting",
    sep="\n",
)

🎯 TrustyAI Bias Detection Guide
TrustyAI version: 0.6.1

📊 CREATING BIASED DATASET:
------------------------------
✅ Created biased dataset: (2000, 6)
Loan approval rate: 87.3%

🚨 RAW DATA BIAS ANALYSIS:
Loan approval rates by demographic:

By Gender:
        Count  Approval_Rate
gender                      
Female    817       0.757650
Male     1183       0.952663

By Race:
          Count  Approval_Rate
race                          
Asian       403       0.903226
Black       398       0.763819
Hispanic    189       0.777778
White      1010       0.921782

⚙️ PREPARING DATA FOR ML MODEL:
-----------------------------------
✅ Model trained - Accuracy: 0.883

📈 STATISTICAL BIAS DETECTION:
🔍 DEMOGRAPHIC PARITY ANALYSIS:
(Equal approval rates across groups)

Gender Bias Metrics:
  Female: 78.9% approval rate (251 samples)
  Male: 96.6% approval rate (349 samples)

Race Bias Metrics:
  Asian: 93.2% approval rate (133 samples)
  White: 92.7% approval rate (286 samples)
  Black: 79.4% appro