In [None]:
#!/usr/bin/env python3
# dhs_birth_interval_eastern.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json

# ------------------------------------------------------------------
# 1. SETTINGS
# ------------------------------------------------------------------
# Folder holding the survey files and the Stata dataset read by the pipeline
DATA_DIR = "../data"
FILE_PATH = os.path.join(DATA_DIR, "RWIR81FL.DTA")

# Eastern Province districts: `sdistrict` code -> display name.
# Codes run consecutively from 51 to 57, in the order the bars are plotted.
DIST_MAP = dict(
    zip(
        range(51, 58),
        (
            'Rwamagana',
            'Nyagatare',
            'Gatsibo',
            'Kayonza',
            'Kirehe',
            'Ngoma',
            'Bugesera',
        ),
    )
)

# Bar palette in plotting order: one blue per district (7), then green for
# the Eastern Province total and orange for the national figure.
BAR_COLORS = [
    *(["#4472C4"] * 7),
    "#00B050",
    "#ED7D31",
]

# ------------------------------------------------------------------
# 2. HELPER: INTERPOLATED MEDIAN (EXACT FLOAT)
# ------------------------------------------------------------------
def get_interpolated_median(df, val_col, wt_col):
    """Return the weighted, linearly interpolated median of ``df[val_col]``.

    Weights from ``wt_col`` are summed per distinct value, turned into
    proportions, and accumulated in ascending value order. The median bin
    is the first value whose cumulative proportion reaches 0.5; the result
    is that value plus the fraction of the bin needed to reach 0.5. The
    fraction is added directly to the bin value, i.e. bins are treated as
    one unit wide.

    Args:
        df: DataFrame holding the value and weight columns.
        val_col: Name of the column with the values (e.g. months).
        wt_col: Name of the column with the weights.

    Returns:
        The interpolated median as a float, or ``np.nan`` when ``df`` is
        empty, the weights sum to zero, no complete (value, weight) rows
        remain, or no bin reaches a cumulative proportion of 0.5.
    """
    # Nothing to weigh: no rows at all, or the weights sum to zero.
    if df.empty or df[wt_col].sum() == 0:
        return np.nan

    # Keep only rows where both the value and its weight are present.
    complete_rows = df[[val_col, wt_col]].dropna()
    weight_by_value = complete_rows.groupby(val_col)[wt_col].sum().sort_index()
    if weight_by_value.empty:
        return np.nan

    shares = weight_by_value / weight_by_value.sum()
    running_share = shares.cumsum()

    # Position of the first bin whose cumulative share reaches one half.
    reached_half = (running_share >= 0.5).to_numpy()
    if not reached_half.any():
        return np.nan
    pos = int(reached_half.argmax())

    # Cumulative share accumulated strictly before the median bin.
    share_below = running_share.iloc[pos - 1] if pos > 0 else 0.0

    # Lower limit + ((0.5 - cumulative share below) / share of median bin)
    fraction_into_bin = (0.5 - share_below) / shares.iloc[pos]
    return running_share.index[pos] + fraction_into_bin

# ------------------------------------------------------------------
# 3. ANALYSIS PIPELINE
# ------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: load the Stata file, reshape the per-woman birth
    # history to one row per birth, compute weighted interpolated medians of
    # the preceding birth interval for each Eastern Province district, the
    # province and the whole sample, then write a JSON summary and a bar chart.
    if not os.path.exists(FILE_PATH):
        print(f"❌ Error: {FILE_PATH} not found.")
        # Non-zero status so callers can detect the failure; avoids relying
        # on the interactive `exit()` helper injected by the `site` module.
        raise SystemExit(1)

    print("Loading Women's Data...")

    # Load necessary columns
    cols = ["v005", "v008", "v024", "sdistrict"]
    for i in range(1, 21):
        idx = f"_{i:02d}"
        cols.append(f"b3{idx}")  # Date of birth
        cols.append(f"b11{idx}") # Interval

    try:
        df = pd.read_stata(FILE_PATH, convert_categoricals=False, columns=cols)
    except ValueError:
        # Column selection failed (e.g. a name does not match the file
        # exactly); fall back to reading everything and select afterwards.
        df = pd.read_stata(FILE_PATH, convert_categoricals=False)

    df.columns = df.columns.str.lower()
    # Keep only the columns the analysis uses. This is a no-op on the normal
    # path and stops the fallback path from dragging every column of the
    # file through the reshape below.
    df = df[[c for c in cols if c in df.columns]].copy()
    # Scale v005 down by 1e6 to obtain the weight used for every estimate.
    df["w"] = df["v005"] / 1000000.0

    # --- RESHAPE TO CHILD LEVEL ---
    print("Reshaping to Child Level...")
    df['row_id'] = df.index
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    df_long = pd.wide_to_long(
        df, stubnames=["b3", "b11"], i="row_id", j="birth_idx", sep="_", suffix=r'\d+'
    ).reset_index()

    # --- FILTERS ---
    df_long = df_long.dropna(subset=["b3"]) # Existing children
    # Difference between v008 and the birth date, in months (v008 is
    # presumably the interview date in the same units as b3 -- confirm).
    months_ago = df_long["v008"] - df_long["b3"]
    df_long = df_long[months_ago < 60]       # Last 5 years
    df_long = df_long.dropna(subset=["b11"]) # Exclude first births

    print(f"Analyzing {len(df_long)} birth intervals...")

    # --- AGGREGATION ---
    # Insertion order matters: it fixes the bar order, which BAR_COLORS
    # relies on (districts first, then province, then national).
    results = {}

    # A. Districts (Eastern Province Only - Region 5)
    df_east = df_long[df_long['v024'] == 5]
    for d_code, d_name in DIST_MAP.items():
        subset = df_east[df_east['sdistrict'] == d_code]
        results[d_name] = get_interpolated_median(subset, 'b11', 'w')

    # B. Eastern Province Total
    results["Eastern Province"] = get_interpolated_median(df_east, 'b11', 'w')

    # C. Rwanda (National)
    results["Rwanda (National)"] = get_interpolated_median(df_long, 'b11', 'w')

    # Create DataFrame
    final_df = pd.DataFrame(list(results.items()), columns=["Location", "Median Months"])
    final_df.set_index("Location", inplace=True)
    final_df["Median Months"] = final_df["Median Months"].astype(float).round(1)

    print("\n--- Summary Results ---")
    print(final_df)

    # --- JSON OUTPUT (Replaces CSV) ---
    json_filename = "eastern_Median_Birth_Interval.json"
    # A location with no qualifying births has a NaN median; json.dump would
    # write the bare token NaN, which is not valid JSON, so emit null instead.
    medians_for_json = {
        location: (None if pd.isna(value) else float(value))
        for location, value in final_df["Median Months"].items()
    }
    output_dict = {
        "indicator": "Median birth interval (months) since preceding birth",
        "unit": "Months",
        "population_type": "Births in the last 5 years (interpolated median)",
        "data": medians_for_json
    }
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(output_dict, f, indent=4)
    print(f"✅ JSON saved: {json_filename}")

    # ------------------------------------------------------------------
    # 4. PLOTTING
    # ------------------------------------------------------------------
    # Plot the Series rather than the one-column DataFrame: for a DataFrame
    # pandas assigns colours per column, so every bar would get only the
    # first entry of BAR_COLORS. A Series receives the list per bar.
    ax = final_df["Median Months"].plot(
        kind="bar", legend=False, color=BAR_COLORS, figsize=(12, 7),
        width=0.6, edgecolor="white"
    )

    plt.title("Median Birth Interval (Months) - Eastern Province\n(Preceding Births in Last 5 Years)",
              fontsize=14, fontweight="bold", pad=20)

    # Rotated labels because there are 7 districts
    plt.xticks(rotation=45, ha='right', fontsize=11)
    plt.xlabel("")
    plt.grid(axis="y", ls="--", alpha=0.3)

    # Values are printed on the bars, so the y axis itself is hidden.
    ax.yaxis.set_visible(False)
    for s in ["top", "right", "left"]:
        ax.spines[s].set_visible(False)

    # Add Labels
    for c in ax.containers:
        ax.bar_label(c, fmt='%.1f', label_type="edge", padding=3,
                     fontsize=11, fontweight="bold", color="black")

    plt.tight_layout()
    plt.savefig("Eastern_Median_Birth_Interval.png", dpi=300)
    print("✅ Plot saved: Eastern_Median_Birth_Interval.png")