In [0]:
import pyspark.sql.functions as F
from typing import List, Dict, Optional

In [0]:
def safe_divide(numerator_col, denominator_col, default_value=None):
    """Divide one column by another, falling back to ``default_value``.

    Either operand may be passed as a column name or as a Column expression;
    names are resolved with ``F.col``.

    The quotient is produced only for rows where the denominator is non-null,
    strictly greater than zero and not an ACS special value, and the numerator
    is not an ACS special value. Every other row gets ``default_value``.
    """
    # ACS special values that must never take part in a division.
    special_codes = [-666666666, -999999999, -888888888, -555555555, -333333333, -222222222]

    top = F.col(numerator_col) if isinstance(numerator_col, str) else numerator_col
    bottom = F.col(denominator_col) if isinstance(denominator_col, str) else denominator_col

    # Individual guards, combined below in a single AND chain.
    bottom_present = bottom.isNotNull()
    bottom_positive = bottom > 0
    bottom_is_real = ~bottom.isin(special_codes)
    top_is_real = ~top.isin(special_codes)
    can_divide = bottom_present & bottom_positive & bottom_is_real & top_is_real

    return F.when(can_divide, top / bottom).otherwise(default_value)

def safe_column_sum(columns: List[str]):
    """Add up the named columns, counting nulls and ACS special values as 0.

    An empty (or otherwise falsy) ``columns`` argument yields a literal 0.
    """
    special_codes = [-666666666, -999999999, -888888888, -555555555, -333333333, -222222222]

    def _addend(name):
        # A special value contributes 0; otherwise the value itself is used,
        # with null replaced by 0.
        return F.when(F.col(name).isin(special_codes), F.lit(0)).otherwise(
            F.coalesce(F.col(name), F.lit(0))
        )

    total = F.lit(0)
    # `or ()` keeps the literal-0 result when `columns` is None or empty.
    for name in columns or ():
        total = total + _addend(name)
    return total

def clean_acs_value(column_expr):
    """Convert ACS special values to null.

    Args:
        column_expr: A Column expression, or a column name. Names are
            resolved with ``F.col``, the same way ``safe_divide`` treats its
            operands.

    Returns:
        A Column that is null where the input equals one of the ACS special
        values and carries the input value unchanged otherwise.
    """
    # Accept a bare column name for consistency with safe_divide; previously a
    # string argument failed because str has no `.isin`.
    if isinstance(column_expr, str):
        column_expr = F.col(column_expr)

    acs_special_values = [-666666666, -999999999, -888888888, -555555555, -333333333, -222222222]

    return F.when(
        column_expr.isin(acs_special_values),
        F.lit(None)
    ).otherwise(column_expr)

def validate_rate(column_expr):
    """Clamp rate values into the closed interval [0, 1].

    Args:
        column_expr: A Column expression, or a column name. Names are
            resolved with ``F.col``, the same way ``safe_divide`` treats its
            operands.

    Returns:
        A Column equal to 0 where the input is below 0, 1 where the input is
        above 1, and the input itself otherwise.

    NOTE(review): Spark orders NaN above every other numeric value, so a NaN
    rate would presumably be clamped to 1 here — confirm whether NaN can reach
    this function.
    """
    # Accept a bare column name for consistency with safe_divide; previously a
    # string argument failed on the `< 0` comparison.
    if isinstance(column_expr, str):
        column_expr = F.col(column_expr)

    return F.when(column_expr < 0, F.lit(0)) \
           .when(column_expr > 1, F.lit(1)) \
           .otherwise(column_expr)


In [0]:
# =============================================================================
# MEDICARE-RELEVANT COMPOSITE INDICATORS (EVIDENCE-BASED)
# =============================================================================
# Mathematical Note on Composite Calculations:
# - Direct rates: Used when measuring single population characteristics
# - Minimum (F.least): Used for logical AND - both conditions must be present
# - Maximum (F.greatest): Used for logical OR - either condition matters
# - Average: Used when both factors contribute independently
# - Conditional scaling: When one factor modifies another's importance
#
# Research Foundation:
# 1. STAR RATINGS: Gupta et al. 2024 - Social vulnerability reduces quality
# 2. HEALTH EQUITY INDEX: CMS 2024 - Rewards for serving vulnerable populations
# 3. RISK ADJUSTMENT: CMS V28 - Higher payments for complex conditions
# 4. ACO PERFORMANCE: CBO 2024 - Stability and SDOH drive savings
# 5. SUPPLEMENTAL BENEFITS: CMS 2025 - Transportation/nutrition show highest ROI
# =============================================================================
def _mimi_inverse(field):
    """Return the complement of a rate column: ``1 - rate``."""
    return F.lit(1) - F.col(field)


def _mimi_weighted(component, weight):
    """Multiply a component (Column, or column name) by a literal weight."""
    if isinstance(component, str):
        component = F.col(component)
    return component * F.lit(weight)


def _mimi_blend(first, *others):
    """Add the weighted components left to right, starting from the first."""
    combined = first
    for part in others:
        combined = combined + part
    return combined


# Maps each composite indicator name to the Column expression that computes it.
# Every entry is a weighted sum of rate columns; the weights within an entry
# add up to 1.
composite_measure = {
    # -------------------------------------------------------------------------
    # 1. MA MARKET ENTRY SCORE
    # Purpose: flags markets that are attractive for a new MA plan
    # Range: 0-1, higher = more attractive for entry
    # -------------------------------------------------------------------------
    "mimi_ma_market_entry_score": _mimi_blend(
        # Addressable market size (40%)
        _mimi_weighted("medicare_65plus_rate", 0.30),
        _mimi_weighted("no_insurance_55_64_rate", 0.10),
        # Revenue adequacy (35%) - direct measures
        _mimi_weighted("dual_eligible_rate", 0.20),
        _mimi_weighted("disability_rate_all_ages", 0.15),
        # Growth indicators (25%) - population turnover in Medicare markets
        _mimi_weighted("moved_in_past_year_rate", 0.15),
        _mimi_weighted("hispanic_latino_rate", 0.10),
    ),

    # -------------------------------------------------------------------------
    # 2. D-SNP PRODUCT OPPORTUNITY
    # Purpose: flags markets best suited to D-SNP products
    # Range: 0-1, higher = stronger D-SNP opportunity
    # -------------------------------------------------------------------------
    "mimi_dsnp_product_opportunity": _mimi_blend(
        # Target population density (40%)
        _mimi_weighted("dual_eligible_rate", 0.30),
        _mimi_weighted("supplemental_security_income_rate", 0.10),
        # Care complexity payments (30%) - minimum models "duals WITH condition"
        _mimi_weighted(
            F.least(F.col("cognitive_difficulty_rate"), F.col("dual_eligible_rate")),
            0.15,
        ),
        _mimi_weighted(
            F.least(F.col("self_care_difficulty_rate"), F.col("dual_eligible_rate")),
            0.15,
        ),
        # State Medicaid environment (15%)
        _mimi_weighted("medicaid_coverage_rate", 0.15),
        # Supplemental benefit needs (15%)
        _mimi_weighted("snap_recipients_rate", 0.08),
        _mimi_weighted("no_vehicle_households_rate", 0.07),
    ),

    # -------------------------------------------------------------------------
    # 3. ACO SAVINGS OPPORTUNITY
    # Purpose: flags markets with the highest ACO savings potential
    # Range: 0-1, higher = greater savings opportunity
    # -------------------------------------------------------------------------
    "mimi_aco_savings_opportunity": _mimi_blend(
        # High-cost Medicare populations (40%) - average of the two factors
        _mimi_weighted(
            (F.col("disability_rate_all_ages") + F.col("medicare_65plus_rate")) / 2,
            0.25,
        ),
        _mimi_weighted("dual_eligible_rate", 0.15),
        # Preventable utilization (25%) - direct measures of utilization risk
        _mimi_weighted("ambulatory_difficulty_rate", 0.15),
        _mimi_weighted("living_alone_65_plus_rate", 0.10),
        # Care coordination gaps (20%) - digital divide affects all ages
        _mimi_weighted(_mimi_inverse("broadband_internet_rate"), 0.10),
        _mimi_weighted("cognitive_difficulty_rate", 0.10),
        # Attribution stability (15%)
        _mimi_weighted(_mimi_inverse("moved_in_past_year_rate"), 0.15),
    ),

    # -------------------------------------------------------------------------
    # 4. RISK ADJUSTMENT OPPORTUNITY
    # Purpose: quantifies potential for RAF score improvement
    # Range: 0-1, higher = greater documentation opportunity
    # -------------------------------------------------------------------------
    "mimi_risk_adjustment_opportunity": _mimi_blend(
        # Complex conditions undercoding (45%) - likely present, underdocumented
        _mimi_weighted("disability_rate_all_ages", 0.20),
        _mimi_weighted("cognitive_difficulty_rate", 0.15),
        _mimi_weighted("hearing_difficulty_rate", 0.10),
        # Primary care access barriers (35%) - barriers to diagnosis documentation
        _mimi_weighted("no_vehicle_households_rate", 0.20),
        _mimi_weighted("poverty_65_plus_rate", 0.15),
        # Health system navigation (20%)
        _mimi_weighted("limited_english_proficiency_rate", 0.10),
        _mimi_weighted(_mimi_inverse("broadband_internet_rate"), 0.10),
    ),

    # -------------------------------------------------------------------------
    # 5. SUPPLEMENTAL BENEFITS ROI
    # Purpose: flags where supplemental benefits drive outcomes
    # Range: 0-1, higher = greater ROI from benefits
    # -------------------------------------------------------------------------
    "mimi_supplemental_benefits_roi": _mimi_blend(
        # Transportation impact (30%)
        _mimi_weighted("no_vehicle_households_rate", 0.30),
        # Food insecurity (25%)
        _mimi_weighted("snap_recipients_rate", 0.15),
        _mimi_weighted("poverty_65_plus_rate", 0.10),
        # Social isolation (20%)
        _mimi_weighted("living_alone_65_plus_rate", 0.20),
        # Functional support needs (25%)
        _mimi_weighted("self_care_difficulty_rate", 0.15),
        _mimi_weighted("independent_living_difficulty_rate", 0.10),
    ),

    # -------------------------------------------------------------------------
    # 6. NETWORK ADEQUACY CHALLENGE
    # Purpose: quantifies difficulty meeting CMS network standards
    # Range: 0-1, higher = more challenging
    # -------------------------------------------------------------------------
    "mimi_network_adequacy_challenge": _mimi_blend(
        # Access barriers (30%)
        _mimi_weighted("no_vehicle_households_rate", 0.30),
        # Specialist needs (30%)
        _mimi_weighted("disability_rate_all_ages", 0.15),
        _mimi_weighted("cognitive_difficulty_rate", 0.15),
        # Language access (30%)
        _mimi_weighted("limited_english_proficiency_rate", 0.30),
        # Geriatric gaps (10%)
        _mimi_weighted("age_65_and_over_rate", 0.10),
    ),

    # -------------------------------------------------------------------------
    # 7. CARE MANAGEMENT INTENSITY
    # Purpose: estimates care management resource requirements
    # Range: 0-1, higher = more intensive management needed
    # -------------------------------------------------------------------------
    "mimi_care_management_intensity": _mimi_blend(
        # Cognitive support (25%)
        _mimi_weighted("cognitive_difficulty_rate", 0.25),
        # Social support gaps (25%)
        _mimi_weighted("living_alone_65_plus_rate", 0.25),
        # Medical complexity (25%)
        _mimi_weighted("dual_eligible_rate", 0.12),
        _mimi_weighted("disability_rate_all_ages", 0.13),
        # Communication needs (25%)
        _mimi_weighted("limited_english_proficiency_rate", 0.20),
        _mimi_weighted("less_than_high_school_rate", 0.05),
    ),

    # -------------------------------------------------------------------------
    # 8. QUALITY MEASURE CHALLENGE
    # Purpose: predicts difficulty achieving high Star Ratings
    # Range: 0-1, higher = more challenging quality performance
    # -------------------------------------------------------------------------
    "mimi_quality_measure_challenge": _mimi_blend(
        # Medication adherence barriers (40%)
        _mimi_weighted("poverty_rate_all_ages", 0.20),
        _mimi_weighted("cognitive_difficulty_rate", 0.20),
        # Preventive care gaps (30%)
        _mimi_weighted("no_vehicle_households_rate", 0.15),
        _mimi_weighted("living_alone_65_plus_rate", 0.15),
        # Member experience (30%)
        _mimi_weighted("limited_english_proficiency_rate", 0.15),
        _mimi_weighted("disability_rate_all_ages", 0.15),
    ),

    # -------------------------------------------------------------------------
    # 9. TOTAL COST OF CARE RISK
    # Purpose: estimates relative medical cost burden
    # Range: 0-1, higher = higher expected PMPM
    # -------------------------------------------------------------------------
    "mimi_total_cost_of_care_risk": _mimi_blend(
        # Complex conditions (30%)
        _mimi_weighted("disability_rate_all_ages", 0.30),
        # Behavioral/cognitive (25%)
        _mimi_weighted("cognitive_difficulty_rate", 0.15),
        _mimi_weighted("self_care_difficulty_rate", 0.10),
        # Social risk multipliers (25%)
        _mimi_weighted("living_alone_65_plus_rate", 0.15),
        _mimi_weighted("no_vehicle_households_rate", 0.10),
        # Age and poverty (20%)
        _mimi_weighted("age_65_and_over_rate", 0.10),
        _mimi_weighted("poverty_rate_all_ages", 0.10),
    ),

    # -------------------------------------------------------------------------
    # 10. MA GROWTH POTENTIAL
    # Purpose: flags markets with future enrollment growth
    # Range: 0-1, higher = stronger growth trajectory
    # -------------------------------------------------------------------------
    "mimi_ma_growth_potential": _mimi_blend(
        # Pre-Medicare pipeline (40%)
        _mimi_weighted("no_insurance_55_64_rate", 0.25),
        _mimi_weighted("near_poverty_65_plus_rate", 0.15),
        # Underserved populations (35%)
        _mimi_weighted("dual_eligible_rate", 0.20),
        _mimi_weighted("poverty_65_plus_rate", 0.15),
        # Demographic dynamics (25%) - minimum models "Medicare movers" and
        # "Hispanic seniors"
        _mimi_weighted(
            F.least(F.col("moved_in_past_year_rate"), F.col("medicare_65plus_rate")),
            0.15,
        ),
        _mimi_weighted(
            F.least(F.col("hispanic_latino_rate"), F.col("age_65_and_over_rate")),
            0.10,
        ),
    ),

    # -------------------------------------------------------------------------
    # 11. HEALTH EQUITY INDEX ALIGNMENT
    # Purpose: predicts alignment with CMS Health Equity Index priorities
    # Range: 0-1, higher = better positioned for HEI rewards
    # New measure based on 2027 HEI implementation
    # -------------------------------------------------------------------------
    "mimi_health_equity_index_alignment": _mimi_blend(
        # Social risk factor enrollment (50%) - must exceed median for full
        # HEI benefit
        _mimi_weighted("dual_eligible_rate", 0.20),
        _mimi_weighted("supplemental_security_income_rate", 0.15),
        _mimi_weighted("disability_rate_all_ages", 0.15),
        # Quality achievement potential (50%) - inverse of the major barriers
        _mimi_weighted(_mimi_inverse("limited_english_proficiency_rate"), 0.15),
        _mimi_weighted(_mimi_inverse("no_vehicle_households_rate"), 0.15),
        _mimi_weighted(_mimi_inverse("cognitive_difficulty_rate"), 0.10),
        _mimi_weighted(_mimi_inverse("poverty_rate_all_ages"), 0.10),
    ),

    # -------------------------------------------------------------------------
    # 12. VALUE-BASED CARE READINESS
    # Purpose: assesses market readiness for value-based care models
    # Range: 0-1, higher = better prepared for VBC
    # -------------------------------------------------------------------------
    "mimi_value_based_care_readiness": _mimi_blend(
        # Population stability (25%)
        _mimi_weighted(_mimi_inverse("moved_in_past_year_rate"), 0.25),
        # Digital enablement (25%)
        _mimi_weighted("broadband_internet_rate", 0.15),
        _mimi_weighted("has_computer_rate", 0.10),
        # Care coordination capability (25%) - maximum, since either education
        # or language skills help
        _mimi_weighted(
            F.greatest(
                _mimi_inverse("limited_english_proficiency_rate"),
                _mimi_inverse("less_than_high_school_rate"),
            ),
            0.25,
        ),
        # Preventive care culture (25%) - inverse of barriers
        _mimi_weighted(_mimi_inverse("no_health_insurance_rate"), 0.15),
        _mimi_weighted(_mimi_inverse("poverty_rate_all_ages"), 0.10),
    ),
}