1. Extracting and forming the macro-economic indicators csv file

In [11]:
import sys
import time
import json
import math
import requests
import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT")
OUT = BASE / "phase-3(final)" / "macro_indicators_wb.csv"

# World Bank indicator → output column name
INDICATORS = {
    "NY.GDP.MKTP.KD.ZG": "GDP_Growth_Rate",     # Real GDP growth (annual %)
    "FP.CPI.TOTL.ZG": "Inflation_CPI",          # CPI inflation (annual %)
    "PA.NUS.FCRF": "Exchange_Rate_USD",         # LCU per USD (avg)
    "BN.CAB.XOKA.GD.ZS": "Current_Account_GDP", # Current account balance (% of GDP)
}
WB_CODES = list(INDICATORS.keys())

def wb_fetch_series(country: str, indicator: str, start: int, end: int, retries: int = 3, sleep_s: float = 0.8) -> pd.DataFrame:
    """Fetch a single indicator as Calendar_Year/value using WB JSON API."""
    url = f"https://api.worldbank.org/v2/country/{country}/indicator/{indicator}"
    params = {"format": "json", "per_page": 200, "date": f"{start}:{end}"}
    last_err = None
    for _ in range(retries):
        try:
            r = requests.get(url, params=params, timeout=20)
            r.raise_for_status()
            data = r.json()
            if not isinstance(data, list) or len(data) < 2:
                raise ValueError(f"Unexpected response for {country} {indicator}")
            rows = data[1] or []
            recs = []
            for row in rows:
                year = row.get("date")
                val = row.get("value")
                if year is None:
                    continue
                try:
                    year = int(year)
                except Exception:
                    continue
                # Keep numeric values only
                if val is None or (isinstance(val, float) and (math.isnan(val))):
                    continue
                recs.append({"Calendar_Year": year, indicator: float(val)})
            df = pd.DataFrame(recs).drop_duplicates(subset=["Calendar_Year"])
            return df
        except Exception as e:
            last_err = e
            time.sleep(sleep_s)
    raise RuntimeError(f"World Bank API fetch failed for {country} {indicator}: {last_err}")

def to_fy_label(cal_year: int) -> str:
    y2 = cal_year % 100
    y1 = (cal_year - 1) % 100
    return f"{y1:02d}-{y2:02d}"

def main():
    start, end = 2005, 2025  # CY → will map to FY 05-06 … 24-25

    # Fetch India indicators
    df = None
    for code in WB_CODES:
        part = wb_fetch_series("IND", code, start, end)
        df = part if df is None else pd.merge(df, part, on="Calendar_Year", how="outer")

    # Fetch Global GDP growth for control
    wld = wb_fetch_series("WLD", "NY.GDP.MKTP.KD.ZG", start, end)
    wld = wld.rename(columns={"NY.GDP.MKTP.KD.ZG": "Global_GDP_Growth"})
    df = pd.merge(df, wld, on="Calendar_Year", how="left")

    # Rename indicator codes to friendly names
    df = df.rename(columns=INDICATORS)

    # Map Calendar Year → Fiscal Year label (uses end-year)
    df["Fiscal_Year"] = df["Calendar_Year"].apply(to_fy_label)
    df = df.drop(columns=["Calendar_Year"])
    # Ensure numeric
    for c in ["GDP_Growth_Rate","Inflation_CPI","Exchange_Rate_USD","Current_Account_GDP","Global_GDP_Growth"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Election and inflation flags
    elections = [2009, 2014, 2019, 2024]
    election_fys = {f"{e%100:02d}-{(e+1)%100:02d}": 1 for e in elections}
    df["Election_Year"] = df["Fiscal_Year"].map(election_fys).fillna(0).astype(int)
    df["High_Inflation"] = (df["Inflation_CPI"] > 6).astype("Int64")

    # Reindex to full FY range and make lags
    fy_keep = [to_fy_label(y) for y in range(2006, 2026)]  # 05-06 … 24-25
    df = df.set_index("Fiscal_Year").reindex(fy_keep)
    df["GDP_Growth_Lag1"] = pd.to_numeric(df["GDP_Growth_Rate"], errors="coerce").shift(1)
    df["Inflation_Lag1"] = pd.to_numeric(df["Inflation_CPI"], errors="coerce").shift(1)
    df = df.reset_index().rename(columns={"index": "Fiscal_Year"})

    # Sanity: warn if negative GDP growth before FY 19-20
    pre_covid_bad = df[(df["Fiscal_Year"] < "19-20") & (pd.to_numeric(df["GDP_Growth_Rate"], errors="coerce") < -3)]
    if not pre_covid_bad.empty:
        print("Warning: unexpected negative GDP growth before FY 19-20:\n", pre_covid_bad[["Fiscal_Year","GDP_Growth_Rate"]])

    # Order columns
    cols = [
        "Fiscal_Year",
        "GDP_Growth_Rate",
        "Inflation_CPI",
        "Exchange_Rate_USD",
        "Current_Account_GDP",
        "Global_GDP_Growth",
        "Election_Year",
        "High_Inflation",
        "GDP_Growth_Lag1",
        "Inflation_Lag1",
    ]
    for c in cols:
        if c not in df.columns:
            df[c] = pd.NA
    df_out = df[cols].reset_index(drop=True)

    OUT.parent.mkdir(parents=True, exist_ok=True)
    df_out.to_csv(OUT, index=False)
    print(f"Saved macro indicators to: {OUT}")
    print(df_out.head(5))

if __name__ == "__main__":
    main()

Saved macro indicators to: /Users/vvmohith/Desktop/PROJECT/phase-3(final)/macro_indicators_wb.csv
  Fiscal_Year  GDP_Growth_Rate  Inflation_CPI  Exchange_Rate_USD  \
0       05-06         8.060733       5.796523          45.307008   
1       06-07         7.660815       6.372881          41.348533   
2       07-08         3.086698       8.349267          43.505183   
3       08-09         7.861889      10.882353          48.405267   
4       09-10         8.497585      11.989390          45.725812   

   Current_Account_GDP  Global_GDP_Growth  Election_Year  High_Inflation  \
0            -0.988988           4.471221            0.0               0   
1            -0.663718           4.385583            0.0               1   
2            -2.583377           2.073953            0.0               1   
3            -1.951462          -1.319598            0.0               1   
4            -3.253484           4.527147            1.0               1   

   GDP_Growth_Lag1  Inflation_Lag1  

2. this is where i have combined the columns of csv along with the macro-economic indicators


In [12]:
import re
import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
BUDGETS = BASE / "standardized_budget_time_series.csv"
MACROS = BASE.parent / "phase-3(final)" / "macro_indicators_wb.csv"
OUT = BASE / "cumulative_budget_macro.csv"

# Keep only ministries with a value in every FY column present in the budgets file
FULL_COVERAGE_ONLY = True  # set False to keep all ministries (even with gaps)

def main():
    # Load budgets (wide)
    dfb = pd.read_csv(BUDGETS, dtype={"Base_Ministry": "string"})
    dfb["Base_Ministry"] = (
        dfb["Base_Ministry"]
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

    # Identify FY columns like 05-06, 23-24, etc.
    fy_cols = [c for c in dfb.columns if re.fullmatch(r"\d{2}-\d{2}", str(c))]
    if not fy_cols:
        raise RuntimeError("No fiscal-year columns found in standardized_budget_time_series.csv")

    # Coerce budget numbers to numeric
    dfb[fy_cols] = (
        dfb[fy_cols]
        .replace(r"[,\s]", "", regex=True)
        .apply(pd.to_numeric, errors="coerce")
    )

    # If duplicate ministry names exist, sum their values (min_count=1 keeps NaN if all NaN)
    dfb = dfb.groupby("Base_Ministry", as_index=False)[fy_cols].sum(min_count=1)

    # Tidy to long format: one row per (Base_Ministry, Fiscal_Year)
    long = dfb.melt(
        id_vars="Base_Ministry",
        value_vars=fy_cols,
        var_name="Fiscal_Year",
        value_name="Budget_Amount"
    )

    # Optionally keep only ministries with complete coverage across all FY columns
    if FULL_COVERAGE_ONLY:
        coverage = long.groupby("Base_Ministry")["Budget_Amount"].apply(lambda s: s.notna().sum())
        required = len(fy_cols)
        keep_ministries = coverage[coverage == required].index
        long = long[long["Base_Ministry"].isin(keep_ministries)]

    # Drop rows with no budget
    long = long.dropna(subset=["Budget_Amount"])

    # Load macros and keep overlapping FYs only
    macros = pd.read_csv(MACROS, dtype={"Fiscal_Year": "string"})
    macros = macros[macros["Fiscal_Year"].isin(fy_cols)].copy()

    # Merge
    merged = long.merge(macros, on="Fiscal_Year", how="inner")

    # Sort for readability
    # order FY by end-year
    def fy_sort_key(s: pd.Series) -> pd.Series:
        return s.str.split("-", expand=True)[1].astype(int)
    merged = merged.sort_values(["Base_Ministry", "Fiscal_Year"], key=lambda col: fy_sort_key(col) if col.name=="Fiscal_Year" else col)

    # Save
    OUT.parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(OUT, index=False)

    # Report
    fy_span = ", ".join(fy_cols[:3] + (["..."] if len(fy_cols) > 6 else []) + fy_cols[-3:])
    print(f"Saved: {OUT}")
    print(f"Rows: {len(merged)}, Ministries: {merged['Base_Ministry'].nunique()}, FY columns used: {len(fy_cols)} [{fy_span}]")
    if FULL_COVERAGE_ONLY:
        print("Filter: kept only ministries with values for all FY columns.")

if __name__ == "__main__":
    main()

Saved: /Users/vvmohith/Desktop/PROJECT/final_data/cumulative_budget_macro.csv
Rows: 1037, Ministries: 61, FY columns used: 17 [05-06, 06-07, 07-08, ..., 21-22, 22-23, 23-24]
Filter: kept only ministries with values for all FY columns.


3. Per‑ministry single‑year modeling (FY 23‑24) + visualizations

- Builds features (macro vars + Budget_Lag1) and trains per‑ministry models on data up to FY 22‑23.
- Predicts FY 23‑24 for each ministry using: Linear, Ridge, Lasso, Random Forest, GBM.
- Saves:
  - ministry_macro_predictions.csv and macro_model_metrics.csv
  - Plots in final_data/figs: metrics_*.png, preds_vs_actual_best.png, residuals_best.png, top_ministries_actual_vs_best.png, feature_importance_*.png.
- Use these plots to assess accuracy, bias, and which features matter most (pooled importances).

In [14]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("viridis")

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
DATA = BASE / "cumulative_budget_macro.csv"
OUT_PRED = BASE / "ministry_macro_predictions.csv"
OUT_METRICS = BASE / "macro_model_metrics.csv"
PLOTS_DIR = BASE / "figs"

# Feature columns from macros + we add a per‑ministry budget lag
MACRO_COLS = [
    "GDP_Growth_Rate",
    "Inflation_CPI",
    "Exchange_Rate_USD",
    "Current_Account_GDP",
    "Global_GDP_Growth",
    "Election_Year",
    "High_Inflation",
    "GDP_Growth_Lag1",
    "Inflation_Lag1",
]

def fy_end_year(fy: str) -> int:
    # '05-06' -> 2006
    end = int(str(fy).split("-")[1])
    return 2000 + end

def make_plots(preds: pd.DataFrame, metrics_df: pd.DataFrame, df_all: pd.DataFrame, feature_cols: list):
    PLOTS_DIR.mkdir(parents=True, exist_ok=True)
    if metrics_df.empty or preds.empty:
        print("No metrics/predictions to plot.")
        return

    # 1) Metrics bar charts (MAE and RMSE)
    for metric in ["MAE", "RMSE", "MAPE_%", "R2"]:
        if metric not in metrics_df.columns:
            continue
        plt.figure(figsize=(8, 5))
        order = metrics_df.sort_values(metric, ascending=(metric != "R2"))["Model"]
        sns.barplot(data=metrics_df, x="Model", y=metric, order=order)
        plt.title(f"Model {metric}")
        plt.tight_layout()
        out = PLOTS_DIR / f"metrics_{metric.lower()}.png"
        plt.savefig(out, dpi=300)
        plt.close()
        print(f"Saved: {out}")

    # Best model
    best = metrics_df.iloc[0]["Model"]
    col = f"{best}_Prediction"
    valid = preds.dropna(subset=[col, "Actual_Budget"]).copy()
    if valid.empty:
        print("No valid rows for best-model plots.")
        return

    # 2) Predictions vs Actual (scatter)
    plt.figure(figsize=(6, 6))
    sns.scatterplot(x=valid["Actual_Budget"], y=valid[col])
    lim = [
        0,
        max(valid["Actual_Budget"].max(), valid[col].max()) * 1.05
    ]
    plt.plot(lim, lim, "r--", linewidth=1)
    plt.xlim(lim); plt.ylim(lim)
    plt.xlabel("Actual 23-24")
    plt.ylabel(f"{best} Prediction")
    plt.title(f"Predictions vs Actuals ({best})")
    plt.tight_layout()
    out = PLOTS_DIR / "preds_vs_actual_best.png"
    plt.savefig(out, dpi=300); plt.close()
    print(f"Saved: {out}")

    # 3) Residuals histogram
    residuals = valid[col] - valid["Actual_Budget"]
    plt.figure(figsize=(8, 5))
    plt.hist(residuals, bins=20, alpha=0.8)
    plt.axvline(residuals.mean(), color="red", linestyle="--", label=f"Mean {residuals.mean():.1f}")
    plt.title(f"Residuals ({best})")
    plt.xlabel("Prediction - Actual")
    plt.legend()
    plt.tight_layout()
    out = PLOTS_DIR / "residuals_best.png"
    plt.savefig(out, dpi=300); plt.close()
    print(f"Saved: {out}")

    # 4) Top ministries by actual spend (bar, side-by-side)
    topn = valid.sort_values("Actual_Budget", ascending=False).head(15).copy()
    plt.figure(figsize=(14, 7))
    x = np.arange(len(topn))
    width = 0.45
    plt.bar(x - width/2, topn["Actual_Budget"], width=width, label="Actual")
    plt.bar(x + width/2, topn[col], width=width, label=f"{best}")
    plt.xticks(x, topn["Base_Ministry"], rotation=45, ha="right")
    plt.ylabel("Budget (crores)")
    plt.title(f"Top 15 ministries: Actual vs {best}")
    plt.legend()
    plt.tight_layout()
    out = PLOTS_DIR / "top_ministries_actual_vs_best.png"
    plt.savefig(out, dpi=300); plt.close()
    print(f"Saved: {out}")

    # 5) Pooled feature importance (RF and GBM) trained on all train rows
    train_all = df_all[(df_all["Year_End"] <= 2023)].dropna(subset=["Budget_Amount"] + feature_cols).copy()
    if not train_all.empty:
        X_all = train_all[feature_cols]
        y_all = train_all["Budget_Amount"]

        rf = RandomForestRegressor(n_estimators=400, random_state=42)
        rf.fit(X_all, y_all)
        imp_rf = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False).head(12)

        plt.figure(figsize=(10, 6))
        sns.barplot(x=imp_rf.values, y=imp_rf.index)
        plt.title("Random Forest feature importance (pooled)")
        plt.xlabel("Importance")
        plt.tight_layout()
        out = PLOTS_DIR / "feature_importance_rf.png"
        plt.savefig(out, dpi=300); plt.close()
        print(f"Saved: {out}")

        gbm = GradientBoostingRegressor(n_estimators=400, random_state=42)
        gbm.fit(X_all, y_all)
        imp_gbm = pd.Series(gbm.feature_importances_, index=feature_cols).sort_values(ascending=False).head(12)

        plt.figure(figsize=(10, 6))
        sns.barplot(x=imp_gbm.values, y=imp_gbm.index)
        plt.title("GBM feature importance (pooled)")
        plt.xlabel("Importance")
        plt.tight_layout()
        out = PLOTS_DIR / "feature_importance_gbm.png"
        plt.savefig(out, dpi=300); plt.close()
        print(f"Saved: {out}")
    else:
        print("Skipped pooled feature importance (no training rows).")

def main():
    df = pd.read_csv(DATA, dtype={"Base_Ministry": "string", "Fiscal_Year": "string"})
    # Ensure numerics
    for c in ["Budget_Amount"] + MACRO_COLS:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Sort and create time helper
    df["Year_End"] = df["Fiscal_Year"].apply(fy_end_year)

    # Add per‑ministry budget lag (strong predictor)
    df = df.sort_values(["Base_Ministry", "Year_End"])
    df["Budget_Lag1"] = df.groupby("Base_Ministry")["Budget_Amount"].shift(1)

    # Final feature set
    feature_cols = [c for c in MACRO_COLS if c in df.columns] + ["Budget_Lag1"]

    # Target test year (FY 23-24 -> end year 2024)
    TEST_END_YEAR = 2024
    TRAIN_MAX_END_YEAR = TEST_END_YEAR - 1

    preds_rows = []

    # Models
    linear = Pipeline([("scaler", StandardScaler(with_mean=False)), ("m", LinearRegression())])
    ridge = Pipeline([("scaler", StandardScaler(with_mean=False)), ("m", Ridge(alpha=10.0, random_state=42))])
    lasso = Pipeline([("scaler", StandardScaler(with_mean=False)), ("m", Lasso(alpha=1.0, random_state=42))])
    rf = RandomForestRegressor(n_estimators=300, random_state=42)
    gbm = GradientBoostingRegressor(n_estimators=300, random_state=42)

    models = {
        "Linear": linear,
        "Ridge": ridge,
        "Lasso": lasso,
        "RF": rf,
        "GBM": gbm,
    }

    # Train per ministry and predict 23-24
    for ministry, g in df.groupby("Base_Ministry", dropna=True):
        g = g.copy().sort_values("Year_End")

        train = g[g["Year_End"] <= TRAIN_MAX_END_YEAR].dropna(subset=["Budget_Amount"] + feature_cols)
        test = g[g["Year_End"] == TEST_END_YEAR]

        if train.empty or test.empty:
            continue

        X_train = train[feature_cols]
        y_train = train["Budget_Amount"]
        X_test = test[feature_cols]
        y_test = test["Budget_Amount"]

        row = {"Base_Ministry": ministry, "Fiscal_Year": "23-24", "Actual_Budget": float(y_test.iloc[0]) if not y_test.isna().all() else np.nan}

        for name, model in models.items():
            try:
                model.fit(X_train, y_train)
                y_pred = float(model.predict(X_test)[0])
                row[f"{name}_Prediction"] = y_pred
            except Exception:
                row[f"{name}_Prediction"] = np.nan

        preds_rows.append(row)

    preds = pd.DataFrame(preds_rows)

    # Compute metrics across ministries (only where both pred and actual exist)
    metrics = []
    for name in models.keys():
        col = f"{name}_Prediction"
        valid = preds.dropna(subset=[col, "Actual_Budget"]).copy()
        if valid.empty:
            continue
        y_true = valid["Actual_Budget"].astype(float).values
        y_pred = valid[col].astype(float).values
        mae = mean_absolute_error(y_true, y_pred)
        rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
        r2 = r2_score(y_true, y_pred)
        mape = float(np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-12))) * 100.0)
        metrics.append({"Model": name, "Count": len(valid), "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape})

    metrics_df = pd.DataFrame(metrics).sort_values("MAE")

    # Save CSVs
    OUT_PRED.parent.mkdir(parents=True, exist_ok=True)
    preds.to_csv(OUT_PRED, index=False)
    metrics_df.to_csv(OUT_METRICS, index=False)
    print(f"Saved predictions: {OUT_PRED}")
    print(f"Saved metrics:     {OUT_METRICS}")

    # Plots
    make_plots(preds, metrics_df, df, feature_cols)

if __name__ == "__main__":
    main()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Saved predictions: /Users/vvmohith/Desktop/PROJECT/final_data/ministry_macro_predictions.csv
Saved metrics:     /Users/vvmohith/Desktop/PROJECT/final_data/macro_model_metrics.csv
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/metrics_mae.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/metrics_rmse.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/metrics_mape_%.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/metrics_r2.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/metrics_rmse.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/metrics_mape_%.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/metrics_r2.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/preds_vs_actual_best.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/residuals_best.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/preds_vs_actual_best.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs/residuals_best.png
Saved: /Us

4. Rolling backtest with engineered features and extra models

- Adds richer features: Budget_Lag1/Lag2, Budget_Growth_Lag1, Trend, Inflation×Election.
- Evaluates models via forward‑chaining across multiple years (train < year, predict year).
- Uses log1p target for stability (configurable).
- Models: Linear, Ridge, Lasso, RF, GBM, HistGradientBoosting, and a small MLP.
- Saves:
  - ministry_macro_backtest_predictions.csv and macro_model_metrics_backtest.csv
  - Plots in final_data/figs-2: backtest_mae/rmse/mape_%/r2.png, backtest_error_over_years.png.
- Use these to judge out‑of‑sample performance and stability over time.

In [15]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("viridis")

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
DATA = BASE / "cumulative_budget_macro.csv"
OUT_PRED = BASE / "ministry_macro_backtest_predictions.csv"
OUT_METRICS = BASE / "macro_model_metrics_backtest.csv"
PLOTS = BASE / "figs-2"

MACRO_COLS = [
    "GDP_Growth_Rate",
    "Inflation_CPI",
    "Exchange_Rate_USD",
    "Current_Account_GDP",
    "Global_GDP_Growth",
    "Election_Year",
    "High_Inflation",
    "GDP_Growth_Lag1",
    "Inflation_Lag1",
]

USE_LOG_TARGET = True  # log1p target often stabilizes trends
MIN_TRAIN_YEARS = 6    # require at least this many training rows per split

def fy_end_year(fy: str) -> int:
    end = int(str(fy).split("-")[1])
    return 2000 + end

def build_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["Year_End"] = df["Fiscal_Year"].apply(fy_end_year)
    df = df.sort_values(["Base_Ministry", "Year_End"])

    # Per‑ministry lags and growth
    df["Budget_Lag1"] = df.groupby("Base_Ministry")["Budget_Amount"].shift(1)
    df["Budget_Lag2"] = df.groupby("Base_Ministry")["Budget_Amount"].shift(2)
    df["Budget_Growth_Lag1"] = (df["Budget_Amount"] / df["Budget_Lag1"] - 1).replace([np.inf, -np.inf], np.nan)

    # Time trend (helps linear models)
    df["Trend"] = df.groupby("Base_Ministry")["Year_End"].transform(lambda s: s - s.min())

    # Optional simple interaction
    if "Inflation_CPI" in df.columns and "Election_Year" in df.columns:
        df["Inflation_x_Election"] = pd.to_numeric(df["Inflation_CPI"], errors="coerce") * pd.to_numeric(df["Election_Year"], errors="coerce")
    else:
        df["Inflation_x_Election"] = np.nan

    # Final feature list
    feature_cols = [c for c in MACRO_COLS if c in df.columns] + [
        "Budget_Lag1", "Budget_Lag2", "Budget_Growth_Lag1", "Trend", "Inflation_x_Election"
    ]
    return df, feature_cols

def main():
    PLOTS.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(DATA, dtype={"Base_Ministry": "string", "Fiscal_Year": "string"})
    # Coerce numerics
    df["Budget_Amount"] = pd.to_numeric(df["Budget_Amount"], errors="coerce")
    for c in MACRO_COLS:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    df, feature_cols = build_features(df)

    # Models to compare
    models = {
        "Linear": Pipeline([("scaler", StandardScaler(with_mean=False)), ("m", LinearRegression())]),
        "Ridge":  Pipeline([("scaler", StandardScaler(with_mean=False)), ("m", Ridge(alpha=10.0, random_state=42))]),
        "Lasso":  Pipeline([("scaler", StandardScaler(with_mean=False)), ("m", Lasso(alpha=1.0, random_state=42))]),
        "RF":     RandomForestRegressor(n_estimators=400, random_state=42),
        "GBM":    GradientBoostingRegressor(n_estimators=400, random_state=42),
        "HGB":    HistGradientBoostingRegressor(max_depth=3, l2_regularization=0.1, random_state=42),
        "MLP":    Pipeline([("scaler", StandardScaler(with_mean=False)), ("m", MLPRegressor(hidden_layer_sizes=(64, 32), alpha=1e-3, max_iter=2000, random_state=42))]),
    }

    # Rolling backtest across many years (predict end year t using data <= t-1)
    # Choose a reasonable window given your data span
    all_years = sorted(df["Year_End"].dropna().unique())
    start_eval = max(min(all_years) + 6, 2014)  # ensure enough train years
    end_eval = max(all_years)                   # should include 2024
    eval_years = [y for y in all_years if start_eval <= y <= end_eval]

    preds_rows = []
    for ministry, g in df.groupby("Base_Ministry", dropna=True):
        g = g.copy().sort_values("Year_End")
        for test_year in eval_years:
            train = g[g["Year_End"] < test_year].dropna(subset=["Budget_Amount"] + feature_cols)
            test = g[g["Year_End"] == test_year]
            if len(train) < MIN_TRAIN_YEARS or test.empty:
                continue

            X_train = train[feature_cols]
            y_train = train["Budget_Amount"].astype(float)
            X_test = test[feature_cols]
            y_test = test["Budget_Amount"].astype(float)

            row = {"Base_Ministry": ministry, "Fiscal_Year": test["Fiscal_Year"].iloc[0], "Year_End": test_year,
                   "Actual_Budget": float(y_test.iloc[0]) if not y_test.isna().all() else np.nan}

            for name, model in models.items():
                try:
                    if USE_LOG_TARGET:
                        y_tr = np.log1p(y_train)
                        model.fit(X_train, y_tr)
                        y_pred = float(np.expm1(model.predict(X_test))[0])
                    else:
                        model.fit(X_train, y_train)
                        y_pred = float(model.predict(X_test)[0])
                    row[f"{name}_Prediction"] = y_pred
                except Exception:
                    row[f"{name}_Prediction"] = np.nan
            preds_rows.append(row)

    preds = pd.DataFrame(preds_rows)

    # Metrics aggregated across all backtest forecasts
    metrics = []
    for name in models.keys():
        col = f"{name}_Prediction"
        valid = preds.dropna(subset=[col, "Actual_Budget"]).copy()
        if valid.empty:
            continue
        y_true = valid["Actual_Budget"].values.astype(float)
        y_pred = valid[col].values.astype(float)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
        r2 = r2_score(y_true, y_pred)
        mape = float(np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-12))) * 100.0)
        metrics.append({"Model": name, "Count": len(valid), "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape})

    metrics_df = pd.DataFrame(metrics).sort_values("MAE")

    # Save
    OUT_PRED.parent.mkdir(parents=True, exist_ok=True)
    preds.to_csv(OUT_PRED, index=False)
    metrics_df.to_csv(OUT_METRICS, index=False)
    print(f"Saved predictions: {OUT_PRED}")
    print(f"Saved metrics:     {OUT_METRICS}")
    if not metrics_df.empty:
        print(metrics_df.to_string(index=False))

    # Basic plots
    if not metrics_df.empty and not preds.empty:
        # Metrics bars
        for metric in ["MAE", "RMSE", "MAPE_%", "R2"]:
            if metric not in metrics_df.columns:
                continue
            plt.figure(figsize=(8, 5))
            order = metrics_df.sort_values(metric, ascending=(metric != "R2"))["Model"]
            sns.barplot(data=metrics_df, x="Model", y=metric, order=order)
            plt.title(f"Backtest {metric} by model")
            plt.tight_layout()
            out = PLOTS / f"backtest_{metric.lower()}.png"
            plt.savefig(out, dpi=300); plt.close()
            print(f"Saved: {out}")

        # Error over years for best model
        best = metrics_df.iloc[0]["Model"]
        col = f"{best}_Prediction"
        year_err = preds.dropna(subset=[col, "Actual_Budget"]).copy()
        year_err["Abs_Error"] = (year_err[col] - year_err["Actual_Budget"]).abs()
        yearly = year_err.groupby("Year_End")["Abs_Error"].median().reset_index()
        plt.figure(figsize=(8,5))
        sns.lineplot(data=yearly, x="Year_End", y="Abs_Error", marker="o")
        plt.title(f"Median absolute error over years ({best})")
        plt.tight_layout()
        out = PLOTS / "backtest_error_over_years.png"
        plt.savefig(out, dpi=300); plt.close()
        print(f"Saved: {out}")

if __name__ == "__main__":
    main()



Saved predictions: /Users/vvmohith/Desktop/PROJECT/final_data/ministry_macro_backtest_predictions.csv
Saved metrics:     /Users/vvmohith/Desktop/PROJECT/final_data/macro_model_metrics_backtest.csv
 Model  Count          MAE         RMSE            R2       MAPE_%
   GBM    549 1.085168e+04 3.434682e+04  5.502673e-01 1.273510e+02
    RF    549 1.227256e+04 3.379402e+04  5.646273e-01 1.126442e+02
   HGB    549 1.662989e+04 4.008722e+04  3.873767e-01 1.086272e+02
 Ridge    549 1.737282e+14 4.070558e+15 -6.316689e+21 2.840390e+11
Linear    549 4.766114e+19 1.116735e+21 -4.754251e+32 1.500797e+18
   MLP    549 2.878266e+22 6.743993e+23 -1.733867e+38 4.705406e+19
 Lasso    549 5.087235e+22 1.191977e+24 -5.416483e+38 8.316640e+19
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs-2/backtest_mae.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs-2/backtest_rmse.png
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/figs-2/backtest_mape_%.png
Saved: /Users/vvmohith/Desktop/PROJECT/fi

5. Model comparison and diagnostics (backtest vs single‑year)

- Builds a combined leaderboard of metrics from both runs (MAE/RMSE/R2/MAPE + MedianAE/P90AE/Hit10%/Hit20%).
- Checks rank agreement between backtest and single‑year (Kendall tau).
- Finds hardest/easiest ministries (median absolute error) for the best backtest model.
- Compares FY 23‑24 absolute errors to each ministry’s historical median error.
- Quick calibration for the best model (Pred vs Actual linear fit) to detect systematic bias.
```5. Model comparison and diagnostics (backtest vs single‑year)

- Builds a combined leaderboard of metrics from both runs (MAE/RMSE/R2/MAPE + MedianAE/P90AE/Hit10%/Hit20%).
- Checks rank agreement between backtest and single‑year (Kendall tau).
- Finds hardest/easiest ministries (median absolute error) for the best backtest model.
- Compares FY 23‑24 absolute errors to each ministry’s historical median error.
- Quick calibration for the best model (Pred vs Actual linear fit) to detect systematic bias.

In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import kendalltau

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
pred_single = pd.read_csv(BASE / "ministry_macro_predictions.csv")
met_single  = pd.read_csv(BASE / "macro_model_metrics.csv")
pred_bt     = pd.read_csv(BASE / "ministry_macro_backtest_predictions.csv")
met_bt      = pd.read_csv(BASE / "macro_model_metrics_backtest.csv")

# 1) Build per‑model error summaries
def summarize_preds(df: pd.DataFrame, label: str):
    rows = []
    model_cols = [c for c in df.columns if c.endswith("_Prediction")]
    for mc in model_cols:
        name = mc.replace("_Prediction","")
        valid = df.dropna(subset=["Actual_Budget", mc]).copy()
        if valid.empty: 
            continue
        ae = (valid[mc] - valid["Actual_Budget"]).abs()
        pe = ae / np.maximum(valid["Actual_Budget"].abs(), 1e-12)
        rows.append({
            "Model": name,
            f"{label}_Count": len(valid),
            f"{label}_MedianAE": float(ae.median()),
            f"{label}_P90AE": float(ae.quantile(0.90)),
            f"{label}_MeanError": float((valid[mc] - valid["Actual_Budget"]).mean()),
            f"{label}_Hit10%": float((pe <= 0.10).mean()*100.0),
            f"{label}_Hit20%": float((pe <= 0.20).mean()*100.0),
        })
    return pd.DataFrame(rows)

sum_single = summarize_preds(pred_single, "Single")
sum_bt     = summarize_preds(pred_bt, "BT")

# 2) Merge with your existing metrics (MAE/RMSE/R2/MAPE)
met_single = met_single.rename(columns={c: f"Single_{c}" for c in met_single.columns if c!="Model"})
met_bt     = met_bt.rename(columns={c: f"BT_{c}" for c in met_bt.columns if c!="Model"})

leaderboard = (
    sum_bt.merge(sum_single, on="Model", how="outer")
          .merge(met_bt, on="Model", how="outer")
          .merge(met_single, on="Model", how="outer")
).sort_values("BT_MAE", na_position="last")

print("Combined leaderboard (backtest first):")
display(leaderboard)

# 3) Do rankings agree? (Kendall rank correlation on MAE)
rank_bt = leaderboard[["Model","BT_MAE"]].dropna().sort_values("BT_MAE")
rank_sg = leaderboard[["Model","Single_MAE"]].dropna().sort_values("Single_MAE")
common  = rank_bt.merge(rank_sg, on="Model", how="inner")
if len(common) >= 2:
    tau, p = kendalltau(common["BT_MAE"].rank(), common["Single_MAE"].rank())
    print(f"Rank agreement (Kendall tau) between backtest and single-year MAE: {tau:.2f} (p={p:.3f})")
else:
    print("Not enough common models to compare ranks.")

# 4) Per-ministry difficulty (best backtest model)
best = leaderboard.iloc[0]["Model"]
col  = f"{best}_Prediction"
per_min = pred_bt.dropna(subset=["Actual_Budget", col]).copy()
per_min["AbsErr"] = (per_min[col] - per_min["Actual_Budget"]).abs()
hardest = per_min.groupby("Base_Ministry")["AbsErr"].median().sort_values(ascending=False).head(10)
easiest = per_min.groupby("Base_Ministry")["AbsErr"].median().sort_values().head(10)
print(f"Hardest ministries (median AE) for {best}:")
display(hardest)
print(f"Easiest ministries (median AE) for {best}:")
display(easiest)

# 5) FY23-24 vs history consistency for best model
sg = pred_single.dropna(subset=["Actual_Budget", f"{best}_Prediction"]).copy()
sg["AE_2324"] = (sg[f"{best}_Prediction"] - sg["Actual_Budget"]).abs()
hist = per_min.groupby("Base_Ministry")["AbsErr"].median().rename("Hist_MedianAE")
cmp = sg.merge(hist, on="Base_Ministry", how="left")[["Base_Ministry","AE_2324","Hist_MedianAE"]]
print("FY23-24 AE vs historical median AE (best model):")
display(cmp.sort_values("AE_2324", ascending=False).head(15))

# 6) Optional: quick calibration check for best model (single-year)
from sklearn.linear_model import LinearRegression
X = sg[["Actual_Budget"]].values
y = sg[f"{best}_Prediction"].values
reg = LinearRegression().fit(X, y)
print(f"Calibration (single-year, {best}): Pred ≈ {reg.coef_[0]:.3f} * Actual + {reg.intercept_:.1f}")

Combined leaderboard (backtest first):


Unnamed: 0,Model,BT_Count_x,BT_MedianAE,BT_P90AE,BT_MeanError,BT_Hit10%,BT_Hit20%,Single_Count_x,Single_MedianAE,Single_P90AE,...,BT_Count_y,BT_MAE,BT_RMSE,BT_R2,BT_MAPE_%,Single_Count_y,Single_MAE,Single_RMSE,Single_R2,Single_MAPE_%
0,GBM,549,1219.428844,33454.128934,-5638.307,18.943534,39.162113,61.0,2883.914091,27527.437481,...,549,10851.68,34346.82,0.5502673,127.351,61.0,8315.143387,18891.344179,0.876074,180.66664
5,RF,549,1549.772501,36820.062055,-7531.923,10.018215,25.500911,61.0,3547.925,28403.459967,...,549,12272.56,33794.02,0.5646273,112.6442,61.0,10427.826005,23416.769803,0.809589,151.6299
1,HGB,549,2365.908499,51710.063909,-9816.809,5.282332,9.289617,,,,...,549,16629.89,40087.22,0.3873767,108.6272,,,,,
6,Ridge,549,1444.7376,29355.372091,173728200000000.0,20.400729,37.887067,61.0,3342.977253,25206.988093,...,549,173728200000000.0,4070558000000000.0,-6.316689e+21,284039000000.0,61.0,10911.426538,23961.010083,0.800635,154.165701
3,Linear,549,1958.822291,81094.839575,4.766114e+19,19.672131,34.42623,61.0,2033.639473,30396.899049,...,549,4.766114e+19,1.116735e+21,-4.754251e+32,1.500797e+18,61.0,9721.463409,19071.382845,0.8737,194.410827
4,MLP,549,5602.453669,178634.396963,2.878266e+22,4.918033,9.836066,,,,...,549,2.878266e+22,6.743993e+23,-1.7338670000000002e+38,4.705406e+19,,,,,
2,Lasso,549,2360.605765,51385.083077,5.087235e+22,5.464481,9.471767,61.0,1867.840112,30346.41246,...,549,5.087235e+22,1.191977e+24,-5.416483e+38,8.31664e+19,61.0,9677.767727,19042.355135,0.874084,191.514618


Rank agreement (Kendall tau) between backtest and single-year MAE: 0.00 (p=1.000)
Hardest ministries (median AE) for GBM:


Base_Ministry
Rural Development                            58544.670265
Agriculture and Cooperation                  44414.433090
Road Transport and Highways                  43809.813699
Food and Public Distribution                 37742.484652
Elementary Education and Literacy            36993.951751
Defence Services                             33441.248682
Defence (Civil estimates)                    25222.581700
Economic Affairs (centralised provisions)    19883.838692
Health                                       17919.689064
Drinking Water Supply                        17764.789790
Name: AbsErr, dtype: float64

Easiest ministries (median AE) for GBM:


Base_Ministry
Public Enterprises              1.923152
Parliamentary Affairs           3.676922
Steel                          37.454800
Shipping                       86.880021
Company Affairs               115.630182
Small Scale Industries        189.917503
Food Processing Industries    190.295823
Mines                         202.925571
Land Resources                208.906937
Tourism                       224.311187
Name: AbsErr, dtype: float64

FY23-24 AE vs historical median AE (best model):


Unnamed: 0,Base_Ministry,AE_2324,Hist_MedianAE
42,Road Transport and Highways,87063.467026,43809.813699
22,Fertilisers,71781.230179,13544.143871
1,Agriculture and Cooperation,62355.813946,44414.43309
18,Economic Affairs (centralised provisions),35655.418802,19883.838692
53,Telecommunications,32925.598474,10743.425372
19,Elementary Education and Literacy,30959.097668,36993.951751
43,Rural Development,27527.437481,58544.670265
37,"Personnel, Public Grievances and Pensions",14679.729538,10294.790873
17,Drinking Water Supply,12417.719686,17764.78979
25,Health,10094.893491,17919.689064


Calibration (single-year, GBM): Pred ≈ 0.757 * Actual + 3415.2
