In [1]:
#!/usr/bin/env python3
# dhs_sti_figure42_bulletproof.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import math

# ------------------------------------------------------------------
# 1. SETTINGS
# ------------------------------------------------------------------
# Input location: both recode files are expected inside DATA_DIR.
DATA_DIR = "data"
WOMEN_FILE, MEN_FILE = (
    os.path.join(DATA_DIR, fname) for fname in ("RWIR81FL.DTA", "RWMR81FL.DTA")
)

# District code -> district name for the Eastern Province.
DIST_MAP = {
    51: 'Rwamagana',
    52: 'Nyagatare',
    53: 'Gatsibo',
    54: 'Kayonza',
    55: 'Kirehe',
    56: 'Ngoma',
    57: 'Bugesera',
}

# Presentation order of the rows/bars: the districts in the order they are
# listed in DIST_MAP (dicts keep insertion order), then the two aggregates.
ORDER = list(DIST_MAP.values()) + ["East", "Rwanda"]

# ------------------------------------------------------------------
# 2. ANALYSIS ENGINE
# ------------------------------------------------------------------
def process_sti_report(file_path, is_men=False):
    """Compute weighted STI / STI-symptom prevalence from one Stata recode file.

    The file is read with pandas, column names are lower-cased, and a 0/1
    indicator is built that is 1 when any of the available v763a / v763b /
    v763c variables equals 1. Percentages are weighted by v005 / 1,000,000
    and rounded half-up to an integer.

    Args:
        file_path: Path to the ``.DTA`` file.
        is_men: When True, variable names carry the ``m`` prefix
            (``mv005``, ``mv024``, ...) and the district column is
            ``smdistrict`` instead of ``sdistrict``.

    Returns:
        dict mapping ``"Rwanda"``, ``"East"`` (region code 5) and each
        district name in ``DIST_MAP`` to an integer percentage. District
        entries are omitted when the district column is absent. An empty
        dict is returned (after printing the reason) when the file does not
        exist or the weight, region or v763a variable is missing.
    """
    # Check if file exists before trying to read
    if not os.path.exists(file_path):
        print(f"❌ Error: File NOT FOUND at {file_path}")
        return {}

    print(f"Loading {'Men' if is_men else 'Women'} data...")
    df = pd.read_stata(file_path, convert_categoricals=False)
    df.columns = df.columns.str.lower()

    p = 'm' if is_men else ''

    # --- DYNAMIC COLUMN PICKER ---
    def find_col(base):
        # Prefer the exact variable name; fall back to the first column that
        # merely starts with it (e.g. a suffixed variant).
        target = f'{p}{base}'
        if target in df.columns:
            return target
        return next((c for c in df.columns if c.startswith(target)), None)

    def existing(name):
        # Resolve a fixed variable name to None when the file lacks it, so
        # every later step can test for presence the same way.
        return name if name in df.columns else None

    v_wt = existing(f'{p}v005')
    v_reg = existing(f'{p}v024')
    v_defacto = existing(f'{p}v103')
    v_dist = existing('smdistrict' if is_men else 'sdistrict')

    # STI symptom variables: STI (763a), Discharge (763b), Sore (763c)
    v_sti = find_col('v763a')
    v_dis = find_col('v763b')
    v_sor = find_col('v763c')

    # Weight, region and the main STI variable are mandatory. The previous
    # check tested the *names* (always truthy) rather than their presence in
    # the file, so a missing column surfaced later as a KeyError.
    if not all([v_wt, v_reg, v_sti]):
        print(f"❌ Error: Required STI variables not found in {file_path}")
        return {}

    # Cleanup: Force specific columns to numeric to avoid PerformanceWarnings/0 results
    needed = [v_wt, v_reg, v_defacto, v_dist, v_sti, v_dis, v_sor]
    work_df = df[[c for c in needed if c is not None]].copy()
    for col in work_df.columns:
        work_df[col] = pd.to_numeric(work_df[col], errors='coerce').fillna(0)

    # --- ACCURACY FILTERS ---
    # 1. De Facto Population (Slept in house last night)
    # NOTE(review): confirm that v103/mv103 really is the de facto flag in
    # this recode; the filter is kept exactly as originally written.
    if v_defacto is not None:
        work_df = work_df[work_df[v_defacto] == 1].copy()

    # 2. Weights
    work_df['w'] = work_df[v_wt] / 1000000.0

    # 3. INDICATOR LOGIC: "Any STI or STI Symptom in last 12 months"
    # Matches RDHS 2019-20 Table 12.10/12.11
    # Only the symptom variables actually present are OR-ed together; an
    # absent optional variable contributes nothing instead of raising.
    symptom_cols = [c for c in (v_sti, v_dis, v_sor) if c is not None]
    work_df['any_symptom'] = (work_df[symptom_cols] == 1).any(axis=1).astype(int)

    def calc_pct(subset):
        # No respondents (or zero total weight) -> report 0 rather than
        # dividing by zero inside np.average.
        if subset.empty or subset['w'].sum() == 0:
            return 0
        val = np.average(subset['any_symptom'], weights=subset['w']) * 100
        # Round half up (Python's round() would round half to even).
        return int(math.floor(val + 0.5))

    # --- AGGREGATION ---
    results = {}
    df_east = work_df[work_df[v_reg] == 5].copy()

    results["Rwanda"] = calc_pct(work_df)
    results["East"] = calc_pct(df_east)

    if v_dist is None:
        print(f"⚠️ Warning: district variable not found in {file_path}; "
              "district breakdown skipped")
        return results

    for code, name in DIST_MAP.items():
        subset = df_east[df_east[v_dist] == code]
        results[name] = calc_pct(subset)

    return results

# ------------------------------------------------------------------
# 3. EXECUTION
# ------------------------------------------------------------------
if __name__ == "__main__":
    # Run the same analysis on the women's and men's recode files.
    w_data = process_sti_report(WOMEN_FILE, is_men=False)
    m_data = process_sti_report(MEN_FILE, is_men=True)

    # process_sti_report returns {} on failure and has already printed the
    # reason. Stop here instead of writing an all-zero table, JSON and chart
    # and then reporting success.
    if not w_data or not m_data:
        raise SystemExit(
            "❌ Error: STI results unavailable for women and/or men; "
            "no table, JSON or figure was written."
        )

    # Combined results: one {"Female": pct, "Male": pct} entry per location,
    # in display order. A location missing from a result set defaults to 0.
    final_dict = {}
    for loc in ORDER:
        final_dict[loc] = {
            "Female": w_data.get(loc, 0),
            "Male": m_data.get(loc, 0)
        }

    final_df = pd.DataFrame(final_dict).T[["Female", "Male"]]
    print("\n--- Final Matched Results (Figure 42 Targets) ---")
    print(final_df)

    # JSON Output
    # final_dict already has the {location: {"Female": .., "Male": ..}} shape
    # and holds plain Python ints, so it is dumped directly rather than being
    # round-tripped through the DataFrame.
    with open("eastern_sti_prevalence_figure42.json", "w") as f:
        json.dump(final_dict, f, indent=4)

    # Plot (Matches visual image style)
    ax = final_df.plot(kind="bar", figsize=(14, 7), color=["#4F81BD", "#C0504D"], width=0.8, edgecolor="white")

    plt.title("Figure 42: Prevalence of sexually transmitted infections (STIs) and STI symptoms in last 12 months",
              fontsize=13, fontweight='bold', loc='left', pad=25)

    plt.xticks(rotation=0, fontsize=11)
    plt.xlabel("")
    # Values are printed on the bars, so the y axis and frame are hidden.
    ax.yaxis.set_visible(False)
    for s in ["top", "right", "left"]:
        ax.spines[s].set_visible(False)
    plt.legend(["Female", "Male"], loc="lower center", bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False, fontsize=12)

    # Label every bar with its integer percentage.
    for c in ax.containers:
        ax.bar_label(c, fmt='%d', padding=3, fontweight="bold", fontsize=11)

    plt.tight_layout()
    plt.savefig("STI_Figure42_Final.png", dpi=300)
    print("✅ Success! Check the table above for your matched numbers.")

❌ Error: Missing data files.


AttributeError: 'NoneType' object has no attribute 'get'

: 