### Detect Bias in Data
**Description**: Use statistical tests to detect bias in data, because biased data can undermine the fairness of AI models trained on it.

In [1]:
# Write your code from here

import pandas as pd
from scipy.stats import ttest_ind, f_oneway

def check_nulls(df, columns):
    """
    Guard against columns that are mostly missing.

    Each column in `columns` is inspected in order; the first one whose
    fraction of null entries exceeds 50% triggers a ValueError. Returns
    None when every column passes.
    """
    for name in columns:
        missing_fraction = df[name].isna().mean()
        # Columns at or below the 50% threshold are acceptable.
        if not missing_fraction > 0.5:
            continue
        raise ValueError(
            f"Too many missing values in '{name}' ({missing_fraction:.2%}) to perform bias detection."
        )

def detect_bias_two_groups(df, group_col, target_col, group1, group2, alpha=0.05):
    """
    Detect bias between two groups using Welch's independent t-test.

    Parameters:
    - df: DataFrame containing data
    - group_col: column with categorical groups (e.g., 'Gender')
    - target_col: numeric target variable (e.g., 'Salary')
    - group1, group2: the two values of group_col to compare
    - alpha: significance level; a p-value below it is reported as possible bias

    Returns:
    - The p-value of the test. The p-value and its interpretation are
      also printed.

    Raises:
    - ValueError: if check_nulls rejects either column, or if either group
      has fewer than two non-null target values.
    """
    check_nulls(df, [group_col, target_col])

    data1 = df.loc[df[group_col] == group1, target_col].dropna()
    data2 = df.loc[df[group_col] == group2, target_col].dropna()

    if len(data1) < 2 or len(data2) < 2:
        raise ValueError("Not enough data points for one or both groups to perform t-test.")

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    _, p_value = ttest_ind(data1, data2, equal_var=False)
    # Use a general format with 4 significant digits: a fixed '.4f' would
    # print small but non-zero p-values as a misleading 0.0000.
    print(f"T-test between {group1} and {group2}: p-value = {p_value:.4g}")

    if p_value < alpha:
        print("Statistically significant difference detected → Possible bias.")
    else:
        print("No statistically significant difference detected → No evidence of bias.")

    return p_value

def detect_bias_multiple_groups(df, group_col, target_col, alpha=0.05):
    """
    Detect bias among multiple groups using one-way ANOVA.

    Parameters:
    - df: DataFrame containing data
    - group_col: categorical group column
    - target_col: numeric target variable
    - alpha: significance level; a p-value below it is reported as possible bias

    Returns:
    - The p-value of the test. The p-value and its interpretation are
      also printed.

    Raises:
    - ValueError: if check_nulls rejects either column, if group_col holds
      fewer than two distinct non-null groups, or if any group has fewer
      than two non-null target values.
    """
    check_nulls(df, [group_col, target_col])

    groups = df[group_col].dropna().unique()
    # ANOVA compares groups against each other; with zero or one group the
    # test is undefined, so fail early with a clear message instead of
    # letting the scipy call fail or return NaN.
    if len(groups) < 2:
        raise ValueError("At least two groups are required to perform ANOVA.")

    group_data = [df.loc[df[group_col] == g, target_col].dropna() for g in groups]

    if any(len(gd) < 2 for gd in group_data):
        raise ValueError("Not enough data points in one or more groups to perform ANOVA.")

    _, p_value = f_oneway(*group_data)
    # Use a general format with 4 significant digits: a fixed '.4f' would
    # print small but non-zero p-values as a misleading 0.0000.
    print(f"ANOVA test across groups {groups}: p-value = {p_value:.4g}")

    if p_value < alpha:
        print("Statistically significant differences detected among groups → Possible bias.")
    else:
        print("No statistically significant differences detected → No evidence of bias.")

    return p_value

# Toy dataset: ten salary records, five per gender
data = {
    'Gender': ['M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'F'],
    'Salary': [70000, 65000, 62000, 72000, 63000, 71000, 61000, 69000, 70000, 64000],
}
df = pd.DataFrame(data)

# Compare the salaries of the 'M' and 'F' groups with the two-group test
detect_bias_two_groups(df, 'Gender', 'Salary', group1='M', group2='F')

# For a group column with more than two categories, switch to ANOVA:
# detect_bias_multiple_groups(df, group_col='Gender', target_col='Salary')

T-test between M and F: p-value = 0.0000
Statistically significant difference detected → Possible bias.


np.float64(4.992482360733886e-05)