## Run RQ5 scenario simulation using the trained RQ4 model

In [None]:
import joblib
from src.config import FEATURE_MATRIX_CSV, MODEL_DIR, RQ5_OUTPUTS_DIR, RQ5_SCENARIOS_CSV, RQ5_TRADEOFF_CSV
from src.io_utils import read_csv, write_csv
from src.rq5_policy import generate_scenarios, simulate_tradeoffs, build_policy_memo_table

# Inputs: the feature matrix and the persisted RQ4 model artifact.
# joblib.load unpickles the file, so only load artifacts produced by this project.
rq4_model_path = MODEL_DIR / "rq4_ew_logit_calibrated.joblib"
X = read_csv(FEATURE_MATRIX_CSV)
model = joblib.load(rq4_model_path)

# Step 1: generate the scenarios and persist them before simulating.
scenarios = generate_scenarios()
write_csv(scenarios, RQ5_SCENARIOS_CSV, index=False)

# Step 2: run the model over the scenarios and persist the resulting tradeoffs.
tradeoffs = simulate_tradeoffs(X, model, scenarios)
write_csv(tradeoffs, RQ5_TRADEOFF_CSV, index=False)

# Step 3: build the policy memo table from the tradeoffs and persist it.
policy_top10_path = RQ5_OUTPUTS_DIR / "rq5_policy_top10.csv"
policy_top10 = build_policy_memo_table(tradeoffs)
write_csv(policy_top10, policy_top10_path, index=False)

# Bare expression so the notebook renders the table as the cell output.
policy_top10


## Train the RQ4 tree model and compute SHAP feature importance

In [None]:
from src.config import FEATURE_MATRIX_CSV, OUTPUTS_DIR
from src.io_utils import read_csv
from src.labels import make_quantile_label
from src.rq4_shap import train_tree_model, compute_shap_values, shap_global_importance, export_rq4_shap_outputs

df = read_csv(FEATURE_MATRIX_CSV)

# Column names shared by the labelling, feature-selection and training steps.
target_col = "fraud_loss_ratio"
label_col = "y_high_risk"
time_col = "year"

# Derive the binary label from the target column (0.75 is passed as the quantile).
# The column must exist in the feature matrix; no presence check is done here.
df = make_quantile_label(df, target_col=target_col, q=0.75, label_col=label_col)

# Features = everything except the label, the time/period identifiers AND the
# column the label was derived from. Keeping `fraud_loss_ratio` as a feature
# would let the model recover the label from its own source variable (target
# leakage), inflating the metrics and dominating the SHAP ranking.
# NOTE(review): other columns that mechanically determine the ratio (e.g. raw
# fraud losses together with payment value) may leak as well — confirm.
non_feature_cols = {label_col, time_col, "period", target_col}
feature_cols = [c for c in df.columns if c not in non_feature_cols]

res = train_tree_model(df, feature_cols=feature_cols, label_col=label_col, time_col=time_col)
model = res["model"]
data = res["data"]
proba = res["proba"]

# Prediction frame: identifier columns (period only when available) plus the
# predicted probability and the same value rescaled to 0-100.
id_cols = [time_col, "period"] if "period" in data.columns else [time_col]
preds = data[id_cols].copy()
preds["p_high_risk_tree"] = proba
preds["ew_fri_tree"] = 100 * preds["p_high_risk_tree"]

# SHAP values on the rows returned by training, then the global importance table.
shap_out = compute_shap_values(model, data[feature_cols])
shap_imp = shap_global_importance(shap_out["shap_values"], feature_cols)

# Persist the model, metrics, predictions and SHAP importance.
export_rq4_shap_outputs(
    model_pipe=model,
    metrics=res["metrics"],
    preds=preds,
    shap_importance=shap_imp,
    model_path=OUTPUTS_DIR / "models" / "rq4_tree_model.joblib",
    metrics_path=OUTPUTS_DIR / "metrics" / "rq4_tree_metrics.json",
    preds_path=OUTPUTS_DIR / "predictions" / "rq4_tree_predictions.csv",
    shap_path=OUTPUTS_DIR / "metrics" / "rq4_tree_shap_importance.csv",
)

# Bare expression so the notebook renders the first 10 rows as the cell output.
shap_imp.head(10)


## Generate figures/tables

In [None]:
from src.config import MERGED_PANEL_CSV, OUTPUTS_DIR, FIGURES_DIR
from src.io_utils import read_csv
from src.reporting import save_table, plot_time_series, plot_bar

panel = read_csv(MERGED_PANEL_CSV)

# --- Figures 1-2: one time-series plot per column, skipped when the column is absent ---
# Each entry: (column to plot, figure title, output file name).
time_series_figures = [
    (
        "total_epayment_value_ngn",
        "Total E-Payment Value (NGN) Over Time",
        "fig_payments_total_value.png",
    ),
    (
        "fraud_loss_ratio",
        "Fraud Loss Ratio Over Time (Loss / Payment Value)",
        "fig_fraud_loss_ratio.png",
    ),
]
for y_col, fig_title, file_name in time_series_figures:
    if y_col not in panel.columns:
        continue
    # Drop rows missing either axis value and order chronologically before plotting.
    series = panel.dropna(subset=["year", y_col]).sort_values("year")
    plot_time_series(
        series, x="year", y=y_col,
        title=fig_title,
        out_path=FIGURES_DIR / file_name,
    )

# --- Table: Summary statistics for core RQ3/RQ4 variables ---
# Only the candidate columns actually present in the panel are summarised.
candidate_cols = [
    "total_epayment_volume", "total_epayment_value_ngn",
    "fraud_cases", "fraud_losses_ngn_billion", "fraud_loss_ratio",
    "pay_value_g", "pay_volume_g", "fraud_losses_g", "fraud_cases_per_1m_txn",
]
summary_cols = [c for c in candidate_cols if c in panel.columns]

if summary_cols:
    # describe() yields one column per variable; transpose so each variable is a row.
    summary_table = (
        panel[summary_cols]
        .describe()
        .T
        .reset_index()
        .rename(columns={"index": "variable"})
    )
    save_table(summary_table, OUTPUTS_DIR / "tables" / "tbl_summary_stats_rq3_rq4.csv")
else:
    # DataFrame.describe() raises ValueError on a frame with no columns, so skip
    # the table (as the figures are skipped) instead of crashing the cell.
    summary_table = panel[summary_cols].head(0)
    print("No summary-statistic columns found in the merged panel; summary table not written.")

# Bare expression so the notebook renders the table preview as the cell output.
summary_table.head()


In [None]:
from src.reporting import plot_bar
from src.config import FIGURES_DIR

# Bar chart of the global SHAP importances (top 12 requested via top_n).
# Requires `shap_imp` to already exist in the notebook session (created earlier).
# The title's dash was mojibake ("â€”", a mis-decoded UTF-8 em dash) and would
# have been rendered verbatim in the figure; it is restored to a real em dash.
plot_bar(
    shap_imp, x="feature", y="mean_abs_shap",
    title="Global Feature Importance (Mean |SHAP|) — RQ4 Tree Model",
    out_path=FIGURES_DIR / "fig_rq4_shap_global_importance.png",
    top_n=12,
)
