In [13]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Build sector-ministry time series → final_data/data_2/data/sector_ministry_timeseries.csv
BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
DATA.mkdir(parents=True, exist_ok=True)

# Prefer inputs from data_2/data, fallback to final_data/data
CANDIDATE_IN_DIRS = [DATA, BASE / "data"]

def resolve_input(filename: str) -> Path:
    for d in CANDIDATE_IN_DIRS:
        p = d / filename
        if p.exists():
            return p
    return DATA / filename

def tidy_ministry(s: str) -> str:
    return re.sub(r"\s+", " ", str(s)).strip()

def fy_end_year_short(s: str) -> int:
    s = str(s).strip()
    m = re.fullmatch(r"(\d{2})-(\d{2})", s)
    if not m:
        raise ValueError(f"Unexpected Fiscal_Year format: {s}")
    return 2000 + int(m.group(2))

IN_BUDGETS = resolve_input("standardized_budget_time_series.csv")
IN_MAP     = resolve_input("ministry_to_sector12.csv")
OUT_MIN    = DATA / "sector_ministry_timeseries.csv"

print(f"Using budgets: {IN_BUDGETS}")
print(f"Using mapping: {IN_MAP}")
print(f"Will save to : {OUT_MIN}")

# Load standardized budgets
dfb = pd.read_csv(IN_BUDGETS, dtype={"Base_Ministry": "string"})
dfb["Base_Ministry"] = dfb["Base_Ministry"].map(tidy_ministry)

# Find FY columns like 05-06
fy_cols = [c for c in dfb.columns if re.fullmatch(r"\d{2}-\d{2}", str(c))]
if not fy_cols:
    raise RuntimeError("No fiscal-year columns found (expected yy-yy like 05-06).")

# Coerce numerics; collapse duplicate ministry rows if any
dfb[fy_cols] = (
    dfb[fy_cols]
    .replace(r"[, \t]", "", regex=True)
    .replace("", np.nan)
    .apply(pd.to_numeric, errors="coerce")
)
dfb = dfb.groupby("Base_Ministry", as_index=False)[fy_cols].sum(min_count=1)

# Melt to long at ministry level
long = (
    dfb.melt(
        id_vars="Base_Ministry",
        value_vars=fy_cols,
        var_name="Fiscal_Year",
        value_name="Budget_Amount",
    )
    .dropna(subset=["Budget_Amount"])
)

# Map Base_Ministry → Sector_12
map_df = pd.read_csv(IN_MAP, dtype={"Base_Ministry": "string", "Sector_12": "string"})
map_df["Base_Ministry"] = map_df["Base_Ministry"].map(tidy_ministry)
map_df["Sector_12"] = map_df["Sector_12"].astype("string").str.strip()

min_panel = long.merge(map_df, on="Base_Ministry", how="left")
missing = min_panel[min_panel["Sector_12"].isna()]["Base_Ministry"].dropna().unique().tolist()
if missing:
    print(f"Warning: {len(missing)} ministries unmapped to Sector_12. Dropping them.")
min_panel = min_panel.dropna(subset=["Sector_12"])

# Compute Year_End, Sector totals, and shares
min_panel["Year_End"] = min_panel["Fiscal_Year"].map(fy_end_year_short)
min_panel["Sector_Total"] = (
    min_panel.groupby(["Sector_12", "Fiscal_Year"])["Budget_Amount"].transform("sum")
)
min_panel["Ministry_Share_Sector"] = min_panel["Budget_Amount"] / min_panel["Sector_Total"]

# Reorder columns and sort
min_panel = min_panel[
    ["Sector_12", "Base_Ministry", "Fiscal_Year", "Year_End", "Budget_Amount", "Sector_Total", "Ministry_Share_Sector"]
].sort_values(["Sector_12", "Base_Ministry", "Year_End"])

# Save
min_panel.to_csv(OUT_MIN, index=False)
print(f"Saved: {OUT_MIN}")
print("Preview:")
print(min_panel.head(10))

Using budgets: /Users/vvmohith/Desktop/PROJECT/final_data/data/standardized_budget_time_series.csv
Using mapping: /Users/vvmohith/Desktop/PROJECT/final_data/data/ministry_to_sector12.csv
Will save to : /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_ministry_timeseries.csv
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_ministry_timeseries.csv
Preview:
                        Sector_12                        Base_Ministry  \
0    agriculture forestry fishing  Agricultural Research and Education   
61   agriculture forestry fishing  Agricultural Research and Education   
122  agriculture forestry fishing  Agricultural Research and Education   
183  agriculture forestry fishing  Agricultural Research and Education   
244  agriculture forestry fishing  Agricultural Research and Education   
305  agriculture forestry fishing  Agricultural Research and Education   
366  agriculture forestry fishing  Agricultural Research and Education   
427  agriculture f

1. Build sector budget time series
* Input: data/standardized_budget_time_series.csv, data/ministry_to_sector12.csv
* Clean Base_Ministry, melt FY columns (yy-yy), map to Sector_12, drop unmapped, aggregate to (Sector_12, Fiscal_Year).
* Output: data/sector_budget_timeseries.csv

In [2]:
# Step 1: Build sector budget time series → data_2/data/sector_budget_timeseries.csv

import re
import numpy as np
import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
DATA.mkdir(parents=True, exist_ok=True)

# Prefer inputs from data_2/data, fallback to final_data/data
CANDIDATE_IN_DIRS = [DATA, BASE / "data"]

def resolve_input(filename: str) -> Path:
    for d in CANDIDATE_IN_DIRS:
        p = d / filename
        if p.exists():
            return p
    # Default to data_2/data (even if missing) to make path explicit
    return DATA / filename

IN_BUDGETS = resolve_input("standardized_budget_time_series.csv")
IN_MAP     = resolve_input("ministry_to_sector12.csv")
OUT_SECTOR = DATA / "sector_budget_timeseries.csv"

print(f"Using budgets: {IN_BUDGETS}")
print(f"Using mapping: {IN_MAP}")
print(f"Will save to : {OUT_SECTOR}")

def tidy_ministry(s: str) -> str:
    return re.sub(r"\s+", " ", str(s)).strip()

# Load budgets
dfb = pd.read_csv(IN_BUDGETS, dtype={"Base_Ministry": "string"})
dfb["Base_Ministry"] = dfb["Base_Ministry"].map(tidy_ministry)

# Identify FY columns like 05-06, 23-24, etc.
fy_cols = [c for c in dfb.columns if re.fullmatch(r"\d{2}-\d{2}", str(c))]
if not fy_cols:
    raise RuntimeError("No fiscal-year columns found in standardized_budget_time_series.csv (expected yy-yy like 05-06).")

# Sort FY columns by end year (...05-06, 06-07, ..., 24-25)
def fy_end_year(col: str) -> int:
    end = int(str(col).split("-")[1])
    return 2000 + end

fy_cols = sorted(fy_cols, key=fy_end_year)

# Coerce numbers (remove commas/spaces), sum duplicates per ministry if any
dfb[fy_cols] = (
    dfb[fy_cols]
    .replace(r"[, \t]", "", regex=True)
    .replace("", np.nan)
    .apply(pd.to_numeric, errors="coerce")
)
dfb = dfb.groupby("Base_Ministry", as_index=False)[fy_cols].sum(min_count=1)

# Melt to long
long = (
    dfb.melt(
        id_vars="Base_Ministry",
        value_vars=fy_cols,
        var_name="Fiscal_Year",
        value_name="Budget_Amount",
    )
    .dropna(subset=["Budget_Amount"])
)

# Load mapping and clean keys
map_df = pd.read_csv(IN_MAP, dtype={"Base_Ministry": "string", "Sector_12": "string"})
map_df["Base_Ministry"] = map_df["Base_Ministry"].map(tidy_ministry)
map_df["Sector_12"] = map_df["Sector_12"].astype("string").str.strip()

# Join mapping
merged = long.merge(map_df, on="Base_Ministry", how="left")

# Report unmapped ministries
unmapped = merged[merged["Sector_12"].isna()]["Base_Ministry"].dropna().unique().tolist()
if unmapped:
    print(f"Warning: {len(unmapped)} ministries have no Sector_12 mapping. Dropping these rows.")
    print("Examples:", unmapped[:10])

# Drop unmapped and aggregate to Sector_12
merged = merged.dropna(subset=["Sector_12"])
sector_ts = (
    merged.groupby(["Sector_12", "Fiscal_Year"], as_index=False)
          .agg(Budget_Amount=("Budget_Amount", "sum"))
)

# Sort and save
sector_ts = sector_ts.sort_values(
    ["Sector_12", "Fiscal_Year"],
    key=lambda s: s if s.name != "Fiscal_Year" else s.str.split("-").str[1].astype(int)
)
sector_ts.to_csv(OUT_SECTOR, index=False)

# Summary
years = sorted(sector_ts["Fiscal_Year"].unique(), key=lambda s: int(s.split("-")[1]))
print(f"Saved: {OUT_SECTOR}")
print(f"Sectors: {sector_ts['Sector_12'].nunique()}, Years: {len(years)}, Rows: {len(sector_ts)}")
print("Year span:", years[0], "→", years[-1])
print(sector_ts.head(10))

Using budgets: /Users/vvmohith/Desktop/PROJECT/final_data/data/standardized_budget_time_series.csv
Using mapping: /Users/vvmohith/Desktop/PROJECT/final_data/data/ministry_to_sector12.csv
Will save to : /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_timeseries.csv
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_timeseries.csv
Sectors: 11, Years: 17, Rows: 187
Year span: 05-06 → 23-24
                      Sector_12 Fiscal_Year  Budget_Amount
0  agriculture forestry fishing       05-06       27237.86
1  agriculture forestry fishing       06-07       38240.82
2  agriculture forestry fishing       07-08       44713.38
3  agriculture forestry fishing       09-10       86589.42
4  agriculture forestry fishing       10-11       93610.87
5  agriculture forestry fishing       11-12      104371.66
6  agriculture forestry fishing       12-13      109450.59
7  agriculture forestry fishing       13-14      115835.73
8  agriculture forestry fishing  

2.Integrate macro indicators
* Input: data/macro_indicators_wb.csv
* Keep: GDP_Growth_Rate, Inflation_CPI, Exchange_Rate_USD, Fiscal_Deficit_GDP, Global_GDP_Growth, Election_Year, High_Inflation, GDP_Growth_Lag1, Inflation_Lag1.
* Left-join on Fiscal_Year.
* Output: data/sector_budget_macro.csv

In [5]:
# Step 2: Integrate macro indicators → final_data/data_2/data/sector_budget_macro.csv

import re
import numpy as np
import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
DATA.mkdir(parents=True, exist_ok=True)

# Prefer inputs from data_2/data, fallback to final_data/data
CANDIDATE_IN_DIRS = [DATA, BASE / "data"]

def resolve_input(filename: str) -> Path:
    for d in CANDIDATE_IN_DIRS:
        p = d / filename
        if p.exists():
            return p
    return DATA / filename  # explicit default

def fy_end_year(s: str) -> int:
    # Accept "yy-yy", "yyyy-yy", optional "FY" prefix, extra spaces
    s = str(s).strip().upper().replace("FY", "")
    m = re.fullmatch(r"\s*(\d{2,4})\s*-\s*(\d{2})\s*", s)
    if not m:
        raise ValueError(f"Unexpected Fiscal_Year format: {s}")
    end2 = int(m.group(2))
    # Assume 2000s
    return 2000 + end2

def to_short_fy_from_end(end_year: int) -> str:
    yy = end_year % 100
    return f"{(yy-1)%100:02d}-{yy:02d}"

IN_SECTOR = resolve_input("sector_budget_timeseries.csv")
IN_MACRO  = resolve_input("macro_indicators_wb.csv")
OUT_MERGE = DATA / "sector_budget_macro.csv"

print(f"Using sector time series: {IN_SECTOR}")
print(f"Using macro indicators : {IN_MACRO}")
print(f"Will save to           : {OUT_MERGE}")

# Load sector time series
sector = pd.read_csv(IN_SECTOR, dtype={"Sector_12": "string", "Fiscal_Year": "string"})
sector["Fiscal_Year"] = sector["Fiscal_Year"].str.strip()
sector["Budget_Amount"] = pd.to_numeric(sector["Budget_Amount"], errors="coerce")

# Load macros
macro_cols_pref = [
    "GDP_Growth_Rate", "Inflation_CPI", "Exchange_Rate_USD", "Fiscal_Deficit_GDP",
    "Global_GDP_Growth", "Election_Year", "High_Inflation", "GDP_Growth_Lag1", "Inflation_Lag1"
]
mac = pd.read_csv(IN_MACRO, dtype={"Fiscal_Year": "string"})
mac["Fiscal_Year"] = mac["Fiscal_Year"].str.strip()

# Keep only preferred columns that exist
present_initial = [c for c in macro_cols_pref if c in mac.columns]
keep_cols = ["Fiscal_Year"] + present_initial
mac = mac[keep_cols].copy()

# Coerce numerics
for c in mac.columns:
    if c != "Fiscal_Year":
        mac[c] = pd.to_numeric(mac[c], errors="coerce")

# Normalize FY and recompute lags from base series
mac["Year_End"] = mac["Fiscal_Year"].map(fy_end_year)
mac = mac.sort_values("Year_End")
if "GDP_Growth_Rate" in mac.columns:
    mac["GDP_Growth_Lag1"] = mac["GDP_Growth_Rate"].shift(1)
if "Inflation_CPI" in mac.columns:
    mac["Inflation_Lag1"] = mac["Inflation_CPI"].shift(1)

# Collapse duplicates by FY (mean numeric)
mac["Fiscal_Year"] = mac["Year_End"].map(to_short_fy_from_end)
mac = mac.groupby("Fiscal_Year", as_index=False).mean(numeric_only=True)

# Columns present after computing lags
present = [c for c in macro_cols_pref if c in mac.columns]

# Merge (left) on Fiscal_Year
merged = sector.merge(mac, on="Fiscal_Year", how="left")

# Order columns
ordered = ["Sector_12", "Fiscal_Year", "Budget_Amount"] + present
merged = merged[ordered]

# Diagnostics
print("\nDiagnostics:")
print("Sector FY count:", sector["Fiscal_Year"].nunique(), "rows:", len(sector))
print("Macro  FY count:", mac["Fiscal_Year"].nunique(), "rows:", len(mac))

lag_cols = [c for c in ["GDP_Growth_Lag1", "Inflation_Lag1"] if c in merged.columns]
if lag_cols:
    na_by_fy = merged.groupby("Fiscal_Year")[lag_cols].apply(lambda df: df.isna().all()).reset_index()
    na_any = na_by_fy.set_index("Fiscal_Year")[lag_cols].any(axis=1)
    if na_any.any():
        missing_fys = na_any[na_any].index.tolist()
        print("FYs with missing lag values (expected for first available FY if prior year absent):", missing_fys)

# Save
merged.to_csv(OUT_MERGE, index=False)
print(f"\nSaved: {OUT_MERGE}")
print("Preview:")
print(merged.head(10))

Using sector time series: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_timeseries.csv
Using macro indicators : /Users/vvmohith/Desktop/PROJECT/final_data/data/macro_indicators_wb.csv
Will save to           : /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro.csv

Diagnostics:
Sector FY count: 17 rows: 187
Macro  FY count: 20 rows: 20
FYs with missing lag values (expected for first available FY if prior year absent): ['05-06']

Saved: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro.csv
Preview:
                      Sector_12 Fiscal_Year  Budget_Amount  GDP_Growth_Rate  \
0  agriculture forestry fishing       05-06       27237.86         8.060733   
1  agriculture forestry fishing       06-07       38240.82         7.660815   
2  agriculture forestry fishing       07-08       44713.38         3.086698   
3  agriculture forestry fishing       09-10       86589.42         8.497585   
4  agriculture forestry fishing

3.Integrate sector growth (shares of GDP)
* Input: data/sector_shares_gdp.csv
* Melt FY columns (YYYY-YY) to long, map sector keys → Sector_12 (normalize/fuzzy-match), convert to short FY (yy-yy).
* Join with macro panel on (Sector_12, Fiscal_Year).
* Output: data/sector_budget_macro_panel.csv

In [6]:
# Step 3: Integrate sector growth (shares of GDP) → final_data/data_2/data/sector_budget_macro_panel.csv

import re
import difflib
import numpy as np
import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
DATA.mkdir(parents=True, exist_ok=True)

# Prefer inputs from data_2/data, fallback to final_data/data
CANDIDATE_IN_DIRS = [DATA, BASE / "data"]

def resolve_input(filename: str) -> Path:
    for d in CANDIDATE_IN_DIRS:
        p = d / filename
        if p.exists():
            return p
    return DATA / filename  # explicit default

def norm_txt(s: str) -> str:
    s = str(s).lower().strip()
    s = re.sub(r"[\u2013\u2014–—]", "-", s)        # normalize dashes
    s = re.sub(r"[^a-z0-9\s/&-]+", " ", s)        # drop punctuation except separators
    s = re.sub(r"\s+", " ", s)
    return s

def fy_end_year_any(s: str) -> int:
    # Accept "yyyy-yy", "yy-yy", optional "FY" prefix, extra spaces/dashes
    s = str(s).strip().upper().replace("FY", "")
    s = re.sub(r"[\u2013\u2014–—]", "-", s)
    m = re.fullmatch(r"\s*(\d{4})\s*-\s*(\d{2})\s*", s)
    if m:
        return 2000 + int(m.group(2))  # end year from second part
    m2 = re.fullmatch(r"\s*(\d{2})\s*-\s*(\d{2})\s*", s)
    if m2:
        return 2000 + int(m2.group(2))
    raise ValueError(f"Unexpected Fiscal_Year format: {s}")

def to_short_fy_from_end(end_year: int) -> str:
    yy = end_year % 100
    return f"{(yy-1)%100:02d}-{yy:02d}"

IN_MACRO_PANEL = resolve_input("sector_budget_macro.csv")
IN_SECTOR_TS   = resolve_input("sector_budget_timeseries.csv")
IN_SHARES      = resolve_input("sector_shares_gdp.csv")
OUT_PANEL      = DATA / "sector_budget_macro_panel.csv"

print(f"Using macro panel : {IN_MACRO_PANEL}")
print(f"Using sector TS   : {IN_SECTOR_TS}")
print(f"Using shares file : {IN_SHARES}")
print(f"Will save to      : {OUT_PANEL}")

# Load macro panel (may be empty if macro CSV had only headers)
macro_panel = pd.read_csv(IN_MACRO_PANEL, dtype={"Sector_12": "string", "Fiscal_Year": "string"})
macro_panel["Fiscal_Year"] = macro_panel["Fiscal_Year"].astype("string").str.strip()

# Canonical Sector_12 list (fallback to sector TS if macro panel empty)
canon_sectors = macro_panel["Sector_12"].dropna().unique().tolist()
if not canon_sectors:
    sector_ts = pd.read_csv(IN_SECTOR_TS, dtype={"Sector_12": "string", "Fiscal_Year": "string"})
    canon_sectors = sector_ts["Sector_12"].dropna().unique().tolist()

canon_norm_map = {norm_txt(s): s for s in canon_sectors}

# Load sector shares (wide format with FY columns like 2005-06)
shares = pd.read_csv(IN_SHARES)
# Heuristic to find sector name column
candidate_keys = [
    "Sector_12", "Sector", "Sector_Name", "Sector Key", "Sector_Key",
    "sector", "Industry", "Category", "SectorName", "SectorName12"
]
key_col = next((c for c in candidate_keys if c in shares.columns), None)
if key_col is None:
    raise RuntimeError(
        f"Could not find a sector key column in {IN_SHARES.name}. "
        f"Tried: {candidate_keys}"
    )

# Identify FY columns that look like YYYY-YY (also tolerate FY prefix and unicode dashes)
fy_cols = [c for c in shares.columns if re.fullmatch(r"\s*(?:FY)?\s*\d{4}\s*[-\u2013\u2014–—]\s*\d{2}\s*", str(c))]
if not fy_cols:
    raise RuntimeError("No fiscal-year columns found in sector_shares_gdp.csv (expected like 2005-06).")

# Melt to long
long = shares.melt(
    id_vars=[key_col],
    value_vars=fy_cols,
    var_name="Fiscal_Year_raw",
    value_name="Sector_Share_GDP"
)

# Clean share values (e.g., "12.3%" -> 12.3). Values are kept as percentage points, not 0-1.
long["Sector_Share_GDP"] = (
    long["Sector_Share_GDP"]
    .astype("string")
    .str.replace("%", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
    .replace({"": np.nan})
)
long["Sector_Share_GDP"] = pd.to_numeric(long["Sector_Share_GDP"], errors="coerce")
long = long.dropna(subset=["Sector_Share_GDP"])

# Normalize FY to short format yy-yy
fy_raw = long["Fiscal_Year_raw"].astype("string").str.replace("FY", "", case=False, regex=False)
fy_raw = fy_raw.str.replace("\u2013", "-", regex=False).str.replace("\u2014", "-", regex=False).str.replace("–", "-", regex=False).str.replace("—", "-", regex=False)
end_year = fy_raw.map(fy_end_year_any)
long["Fiscal_Year"] = end_year.map(to_short_fy_from_end)

# Map sector keys → Sector_12 (exact-normalized, then fuzzy)
long["key_norm"] = long[key_col].astype("string").map(norm_txt)
long["Sector_12"] = long["key_norm"].map(canon_norm_map)

unmatched = long[long["Sector_12"].isna()]["key_norm"].dropna().unique().tolist()
if unmatched:
    canon_keys = list(canon_norm_map.keys())
    resolved = {}
    for k in unmatched:
        match = difflib.get_close_matches(k, canon_keys, n=1, cutoff=0.86)
        if match:
            resolved[k] = canon_norm_map[match[0]]
    if resolved:
        long.loc[long["Sector_12"].isna(), "Sector_12"] = long.loc[long["Sector_12"].isna(), "key_norm"].map(resolved)

# Final unmatched report
still_unmatched = long[long["Sector_12"].isna()][key_col].dropna().unique().tolist()
if still_unmatched:
    print(f"Warning: {len(still_unmatched)} sector names could not be mapped to Sector_12. Examples:", still_unmatched[:8])

# Keep necessary columns and aggregate per (Sector_12, Fiscal_Year)
shares_panel = (
    long.dropna(subset=["Sector_12"])
        .groupby(["Sector_12", "Fiscal_Year"], as_index=False)
        .agg(Sector_Share_GDP=("Sector_Share_GDP", "mean"))
)

# Join with macro panel on (Sector_12, Fiscal_Year)
# Keep all rows from macro panel; shares fill where available
merged_panel = macro_panel.merge(
    shares_panel,
    on=["Sector_12", "Fiscal_Year"],
    how="left"
)

# Save
merged_panel.to_csv(OUT_PANEL, index=False)

# Diagnostics
print("Saved:", OUT_PANEL)
print("Macro rows:", len(macro_panel), "→ Panel rows:", len(merged_panel))
if len(shares_panel):
    coverage = (
        merged_panel["Sector_Share_GDP"].notna().mean() * 100.0
        if len(merged_panel) else 0.0
    )
    print(f"Share coverage in merged panel: {coverage:.1f}%")
else:
    print("Warning: No valid share rows parsed from sector_shares_gdp.csv")

print("Preview:")
print(merged_panel.head(10))

Using macro panel : /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro.csv
Using sector TS   : /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_timeseries.csv
Using shares file : /Users/vvmohith/Desktop/PROJECT/final_data/data/sector_shares_gdp.csv
Will save to      : /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro_panel.csv
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro_panel.csv
Macro rows: 187 → Panel rows: 187
Share coverage in merged panel: 100.0%
Preview:
                      Sector_12 Fiscal_Year  Budget_Amount  GDP_Growth_Rate  \
0  agriculture forestry fishing       05-06       27237.86         8.060733   
1  agriculture forestry fishing       06-07       38240.82         7.660815   
2  agriculture forestry fishing       07-08       44713.38         3.086698   
3  agriculture forestry fishing       09-10       86589.42         8.497585   
4  agriculture forestry fishing  

4.Feature engineering (growth-focused)
* Add Year_End; sector lags: Budget_Lag1/Lag2, Budget_Growth_Lag1; shares: Sector_Share_Lag1, Sector_Share_Growth.
* Add log levels and YoY log-diff growth: Budget_Log, Share_Log, Budget_Log_Diff1, Share_Log_Diff1; add 3y MA smoothing: Budget_LogDiff1_MA3, Share_LogDiff1_MA3.
* Add Trend, Inflation_x_Election, GDPGrowth_x_Election. Clean inf/NaN.
* Output (overwrite): data/sector_budget_macro_panel.csv

In [7]:
# Step 4: Feature engineering (growth-focused) → final_data/data_2/data/sector_budget_macro_panel.csv

import re
import numpy as np
import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
DATA.mkdir(parents=True, exist_ok=True)

# Prefer inputs from data_2/data, fallback to final_data/data
CANDIDATE_IN_DIRS = [DATA, BASE / "data"]

def resolve_input(filename: str) -> Path:
    for d in CANDIDATE_IN_DIRS:
        p = d / filename
        if p.exists():
            return p
    return DATA / filename  # explicit default

def fy_end_year_short(s: str) -> int:
    s = str(s).strip()
    m = re.fullmatch(r"(\d{2})-(\d{2})", s)
    if not m:
        raise ValueError(f"Unexpected Fiscal_Year format: {s}")
    return 2000 + int(m.group(2))

IN_PANEL = resolve_input("sector_budget_macro_panel.csv")
OUT_PANEL = DATA / "sector_budget_macro_panel.csv"

print(f"Using panel: {IN_PANEL}")
print(f"Will save to: {OUT_PANEL}")

# Load
panel = pd.read_csv(IN_PANEL, dtype={"Sector_12": "string", "Fiscal_Year": "string"})
panel["Fiscal_Year"] = panel["Fiscal_Year"].str.strip()

# Ensure key numerics
for c in ["Budget_Amount", "Sector_Share_GDP", "GDP_Growth_Rate", "Inflation_CPI", "Election_Year"]:
    if c in panel.columns:
        panel[c] = pd.to_numeric(panel[c], errors="coerce")

# Year_End
panel["Year_End"] = panel["Fiscal_Year"].map(fy_end_year_short)

# Sort for time-aware ops
panel = panel.sort_values(["Sector_12", "Year_End"])

# Sector lags for budget
panel["Budget_Lag1"] = panel.groupby("Sector_12")["Budget_Amount"].shift(1)
panel["Budget_Lag2"] = panel.groupby("Sector_12")["Budget_Amount"].shift(2)
panel["Budget_Growth_Lag1"] = (panel["Budget_Amount"] / panel["Budget_Lag1"]) - 1

# Sector lags for shares
if "Sector_Share_GDP" in panel.columns:
    panel["Sector_Share_Lag1"] = panel.groupby("Sector_12")["Sector_Share_GDP"].shift(1)
    panel["Sector_Share_Growth"] = (panel["Sector_Share_GDP"] / panel["Sector_Share_Lag1"]) - 1

# Logs and YoY log-diff growth
panel["Budget_Log"] = np.log(panel["Budget_Amount"].where(panel["Budget_Amount"] > 0))
if "Sector_Share_GDP" in panel.columns:
    panel["Share_Log"] = np.log(panel["Sector_Share_GDP"].where(panel["Sector_Share_GDP"] > 0))

panel["Budget_Log_Diff1"] = panel.groupby("Sector_12")["Budget_Log"].diff(1)
if "Sector_Share_GDP" in panel.columns:
    panel["Share_Log_Diff1"] = panel.groupby("Sector_12")["Share_Log"].diff(1)

# 3y moving average smoothing of log-diff
panel["Budget_LogDiff1_MA3"] = panel.groupby("Sector_12")["Budget_Log_Diff1"].transform(lambda s: s.rolling(3, min_periods=1).mean())
if "Sector_Share_GDP" in panel.columns:
    panel["Share_LogDiff1_MA3"] = panel.groupby("Sector_12")["Share_Log_Diff1"].transform(lambda s: s.rolling(3, min_periods=1).mean())

# Trend (within sector)
panel["Trend"] = panel.groupby("Sector_12").cumcount() + 1

# Interactions
if {"Inflation_CPI", "Election_Year"}.issubset(panel.columns):
    panel["Inflation_x_Election"] = panel["Inflation_CPI"] * panel["Election_Year"]
if {"GDP_Growth_Rate", "Election_Year"}.issubset(panel.columns):
    panel["GDPGrowth_x_Election"] = panel["GDP_Growth_Rate"] * panel["Election_Year"]

# Clean inf/NaN artifacts from divisions/logs
panel.replace([np.inf, -np.inf], np.nan, inplace=True)

# Save (overwrite)
panel.to_csv(OUT_PANEL, index=False)
print("Saved:", OUT_PANEL)
print("Preview:")
print(panel.head(10))

Using panel: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro_panel.csv
Will save to: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro_panel.csv
Saved: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro_panel.csv
Preview:
                      Sector_12 Fiscal_Year  Budget_Amount  GDP_Growth_Rate  \
0  agriculture forestry fishing       05-06       27237.86         8.060733   
1  agriculture forestry fishing       06-07       38240.82         7.660815   
2  agriculture forestry fishing       07-08       44713.38         3.086698   
3  agriculture forestry fishing       09-10       86589.42         8.497585   
4  agriculture forestry fishing       10-11       93610.87         5.241316   
5  agriculture forestry fishing       11-12      104371.66         5.456388   
6  agriculture forestry fishing       12-13      109450.59         6.386106   
7  agriculture forestry fishing       13-14      115835.73         7.4

5.Define splits and scaling (whitelist features)
* Hold out FY23-24 (Year_End=2024) for test; train on ≤2023; validation = 2023.
* Whitelist features: Sector_Share_* (level/lag/growth/logdiff/MA3), Budget_Lag1/Lag2/Budget_Growth_Lag1, macro fields above, interactions, Trend.
* Fit median-impute + StandardScaler on train only; transform all rows.
* Outputs: data/sector_budget_features_train.csv, data/sector_budget_features_test_2024.csv, data/feature_columns.json, data/feature_imputer_scaler.joblib

In [9]:
# Step 5: Define splits and scaling (whitelist features)
# Outputs (all under data_2/data):
# - final_data/data_2/data/sector_budget_features_train.csv
# - final_data/data_2/data/sector_budget_features_test_2024.csv
# - final_data/data_2/data/feature_columns.json
# - final_data/data_2/data/feature_imputer_scaler.joblib

import re
import json
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from joblib import dump

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA2 = ROOT / "data"
DATA2.mkdir(parents=True, exist_ok=True)

# Inputs come from data_2/data only
CANDIDATE_IN_DIRS = [DATA2]

def resolve_input(filename: str) -> Path:
    for d in CANDIDATE_IN_DIRS:
        p = d / filename
        if p.exists():
            return p
    return DATA2 / filename  # explicit default

def fy_end_year_short(s: str) -> int:
    s = str(s).strip()
    m = re.fullmatch(r"(\d{2})-(\d{2})", s)
    if not m:
        raise ValueError(f"Unexpected Fiscal_Year format: {s}")
    return 2000 + int(m.group(2))

IN_PANEL = resolve_input("sector_budget_macro_panel.csv")
OUT_TRAIN = DATA2 / "sector_budget_features_train.csv"
OUT_TEST  = DATA2 / "sector_budget_features_test_2024.csv"
OUT_COLS  = DATA2 / "feature_columns.json"
OUT_MODEL = DATA2 / "feature_imputer_scaler.joblib"

print(f"Using panel: {IN_PANEL}")
print(f"Will save train: {OUT_TRAIN}")
print(f"Will save test : {OUT_TEST}")

# Load panel
panel = pd.read_csv(IN_PANEL, dtype={"Sector_12": "string", "Fiscal_Year": "string"})
panel["Fiscal_Year"] = panel["Fiscal_Year"].str.strip()

# Ensure numerics for potential features and target
num_candidates = [
    "Budget_Amount",
    "Sector_Share_GDP", "Sector_Share_Lag1", "Sector_Share_Growth",
    "Share_Log", "Share_Log_Diff1", "Share_LogDiff1_MA3",
    "Budget_Lag1", "Budget_Lag2", "Budget_Growth_Lag1",
    "GDP_Growth_Rate", "Inflation_CPI", "Exchange_Rate_USD", "Fiscal_Deficit_GDP",
    "Global_GDP_Growth", "Election_Year", "High_Inflation", "GDP_Growth_Lag1", "Inflation_Lag1",
    "Inflation_x_Election", "GDPGrowth_x_Election",
    "Trend", "Year_End"
]
for c in num_candidates:
    if c in panel.columns:
        panel[c] = pd.to_numeric(panel[c], errors="coerce")

# Ensure Year_End
if "Year_End" not in panel.columns:
    panel["Year_End"] = panel["Fiscal_Year"].map(fy_end_year_short)

# Define whitelist
macro_cols = [
    "GDP_Growth_Rate", "Inflation_CPI", "Exchange_Rate_USD", "Fiscal_Deficit_GDP",
    "Global_GDP_Growth", "Election_Year", "High_Inflation", "GDP_Growth_Lag1", "Inflation_Lag1",
]
budget_cols = ["Budget_Lag1", "Budget_Lag2", "Budget_Growth_Lag1"]
share_cols  = ["Sector_Share_GDP", "Sector_Share_Lag1", "Sector_Share_Growth",
               "Share_Log", "Share_Log_Diff1", "Share_LogDiff1_MA3"]
interaction_cols = ["Inflation_x_Election", "GDPGrowth_x_Election"]
trend_cols = ["Trend"]

whitelist = share_cols + budget_cols + macro_cols + interaction_cols + trend_cols
features = [c for c in whitelist if c in panel.columns]

if not features:
    raise RuntimeError("No whitelist features present in panel. Ensure Step 3/4 created the engineered columns.")

# Split flags
panel["Split"] = np.where(panel["Year_End"] == 2024, "test",
                   np.where(panel["Year_End"] == 2023, "val", "train"))

train_mask = panel["Year_End"] <= 2023
test_mask  = panel["Year_End"] == 2024

# Fit imputer + scaler on train only
X_train = panel.loc[train_mask, features]
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_train_imp = imputer.fit_transform(X_train)
_ = scaler.fit(X_train_imp)

# Transform all rows
X_all_imp = imputer.transform(panel[features])
X_all_std = scaler.transform(X_all_imp)

z_cols = [f"z_{c}" for c in features]
for i, c in enumerate(z_cols):
    panel[c] = X_all_std[:, i]

# Prepare outputs
base_cols = ["Sector_12", "Fiscal_Year", "Year_End", "Budget_Amount", "Split"]
present_base = [c for c in base_cols if c in panel.columns]
out_cols = present_base + z_cols

train_out = panel.loc[panel["Split"].isin(["train", "val"]), out_cols].copy()
test_out  = panel.loc[test_mask, out_cols].copy()

# Save CSVs and artifacts to data_2/data
train_out.to_csv(OUT_TRAIN, index=False)
test_out.to_csv(OUT_TEST, index=False)

meta = {
    "original_features": features,
    "z_features": z_cols,
    "target": "Budget_Amount",
    "split": {"train_max_year_end": 2023, "validation_year_end": 2023, "test_year_end": 2024},
}
with open(OUT_COLS, "w") as f:
    json.dump(meta, f, indent=2)

artifacts = {
    "imputer": imputer,
    "scaler": scaler,
    "feature_names": features,
    "z_feature_names": z_cols,
}
dump(artifacts, OUT_MODEL)

# Diagnostics
print("Saved:")
print(" -", OUT_TRAIN)
print(" -", OUT_TEST)
print(" -", OUT_COLS)
print(" -", OUT_MODEL)
print("Train rows:", len(train_out), "Test rows:", len(test_out))
print("n_features:", len(features))
print("Example cols:", out_cols[:8], "...")

Using panel: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro_panel.csv
Will save train: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_features_train.csv
Will save test : /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_features_test_2024.csv
Saved:
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_features_train.csv
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_features_test_2024.csv
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/feature_columns.json
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/feature_imputer_scaler.joblib
Train rows: 176 Test rows: 11
n_features: 21
Example cols: ['Sector_12', 'Fiscal_Year', 'Year_End', 'Budget_Amount', 'Split', 'z_Sector_Share_GDP', 'z_Sector_Share_Lag1', 'z_Sector_Share_Growth'] ...


6.Baseline models (same FY23-24 sample as DL)
* Build Naive(Lag1) from panel for FY2024 rows evaluated by DL.
* Train LinearRegression, Ridge, GBM on z_ features using train ≤2023; evaluate on same FY23-24 rows.
* Metrics: MAE, RMSE, R2, MAPE_%.
* Output: data/metrics/sector_baseline_metrics_23_24.csv


In [12]:
# Step 6: Baseline models (train ≤2023, test on FY2024) → final_data/data_2/data/metrics/sector_baseline_metrics_23_24.csv

import json
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA2 = ROOT / "data"
METRICS_DIR = DATA2 / "metrics"
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# Inputs
IN_FEATS_TRAIN = DATA2 / "sector_budget_features_train.csv"
IN_FEATS_TEST  = DATA2 / "sector_budget_features_test_2024.csv"
IN_PANEL       = DATA2 / "sector_budget_macro_panel.csv"   # for Naive(Lag1)
IN_COLS        = DATA2 / "feature_columns.json"

# Output
OUT_METRICS = METRICS_DIR / "sector_baseline_metrics_23_24.csv"

print(f"Train features: {IN_FEATS_TRAIN}")
print(f"Test  features: {IN_FEATS_TEST}")
print(f"Panel (for Naive): {IN_PANEL}")
print(f"Metrics will be saved to: {OUT_METRICS}")

# Load data
train_df = pd.read_csv(IN_FEATS_TRAIN)
test_df  = pd.read_csv(IN_FEATS_TEST)

# Feature list
with open(IN_COLS, "r") as f:
    meta = json.load(f)
z_features = meta.get("z_features", [])

# Drop rows without target (safety)
train_df = train_df.dropna(subset=["Budget_Amount"]).copy()
test_eval = test_df.dropna(subset=["Budget_Amount"]).copy()

X_train = train_df[z_features]
y_train = train_df["Budget_Amount"].values

X_test = test_eval[z_features]
y_test = test_eval["Budget_Amount"].values

def mape_pct(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = (y_true != 0) & np.isfinite(y_true) & np.isfinite(y_pred)
    if not np.any(mask):
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100.0

def eval_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    # Version-agnostic RMSE (older sklearn has no `squared` kwarg)
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    mape = mape_pct(y_true, y_pred)
    return mae, rmse, r2, mape

results = []

# 1) Naive(Lag1) using Budget_Lag1 from the engineered panel for Year_End=2024
panel = pd.read_csv(IN_PANEL)
# Ensure numeric
if "Budget_Lag1" in panel.columns:
    panel["Budget_Lag1"] = pd.to_numeric(panel["Budget_Lag1"], errors="coerce")
# Join on (Sector_12, Fiscal_Year) for FY2024 rows only
naive_merge = test_eval[["Sector_12", "Fiscal_Year", "Budget_Amount"]].merge(
    panel[["Sector_12", "Fiscal_Year", "Budget_Lag1"]],
    on=["Sector_12", "Fiscal_Year"],
    how="left"
)
naive_valid = naive_merge.dropna(subset=["Budget_Amount", "Budget_Lag1"]).copy()
if len(naive_valid):
    y_true_nv = naive_valid["Budget_Amount"].values
    y_pred_nv = naive_valid["Budget_Lag1"].values
    mae, rmse, r2, mape = eval_metrics(y_true_nv, y_pred_nv)
    results.append({
        "model": "Naive_Lag1",
        "n_eval": int(len(naive_valid)),
        "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape
    })
else:
    results.append({
        "model": "Naive_Lag1",
        "n_eval": 0, "MAE": np.nan, "RMSE": np.nan, "R2": np.nan, "MAPE_%": np.nan
    })

# 2) Linear Regression
lin = LinearRegression()
lin.fit(X_train, y_train)
pred_lin = lin.predict(X_test)
mae, rmse, r2, mape = eval_metrics(y_test, pred_lin)
results.append({
    "model": "LinearRegression",
    "n_eval": int(len(y_test)),
    "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape
})

# 3) Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
pred_ridge = ridge.predict(X_test)
mae, rmse, r2, mape = eval_metrics(y_test, pred_ridge)
results.append({
    "model": "Ridge",
    "n_eval": int(len(y_test)),
    "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape
})

# 4) Gradient Boosting Regressor (GBM)
gbm = GradientBoostingRegressor(random_state=42)
gbm.fit(X_train, y_train)
pred_gbm = gbm.predict(X_test)
mae, rmse, r2, mape = eval_metrics(y_test, pred_gbm)
results.append({
    "model": "GBM",
    "n_eval": int(len(y_test)),
    "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape
})

metrics_df = pd.DataFrame(results, columns=["model", "n_eval", "MAE", "RMSE", "R2", "MAPE_%"])
metrics_df.to_csv(OUT_METRICS, index=False)

print("\nSaved metrics:", OUT_METRICS)
print(metrics_df)

Train features: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_features_train.csv
Test  features: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_features_test_2024.csv
Panel (for Naive): /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_budget_macro_panel.csv
Metrics will be saved to: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/metrics/sector_baseline_metrics_23_24.csv

Saved metrics: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/metrics/sector_baseline_metrics_23_24.csv
              model  n_eval           MAE          RMSE        R2     MAPE_%
0        Naive_Lag1      11  22696.873636  33404.003003  0.908105  12.774786
1  LinearRegression      11  23256.136970  27133.149205  0.939369  23.561556
2             Ridge      11  24358.801538  28983.020707  0.930820  22.826874
3               GBM      11  13344.972027  16743.237586  0.976913   9.394340


7.Deep learning (residual learning + embeddings, lookback=5)
* Build sequences per sector on z_ features (t-5..t-1 → residual log-growth at t vs Naive(Lag1)).
* Add sector embeddings; train with Huber loss, EarlyStopping, ReduceLROnPlateau; validate on 2023.
* Calibrate blend α on 2023 between DL and Naive, then evaluate FY23-24.
* Train variants: GRU_L5_U64, LSTM_L5_U64, StackedGRU_L5_64_32, BiGRU_L5_U64, TCN_L5_F64_K3; also ensemble average.
* Outputs: data/sector_dl_predictions_23_24.csv (per model), results/fy2425/metrics/* (summaries), results/fy2425/plots/* (DL vs baselines)

In [20]:
# Step 7: Deep learning (residual learning + embeddings, lookback=5)
# - Build sequences per sector on z_ features (t-5..t-1 → residual log-growth at t vs Naive(Lag1)).
# - Sector embeddings; Huber loss; EarlyStopping + ReduceLROnPlateau; validate on 2023.
# - Calibrate blend α on 2023 between DL and Naive; evaluate on FY2024.
# - Train: GRU_L5_U64, LSTM_L5_U64, StackedGRU_L5_64_32, BiGRU_L5_U64, TCN_L5_F64_K3; ensemble average.
# - Save predictions CSV and metrics/plots under data_2.

import os
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

# TF setup
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks, optimizers, losses
except Exception as e:
    raise RuntimeError("TensorFlow is required for Step 7. Install with: pip3 install tensorflow") from e

# Paths
BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
RESULTS_ROOT = ROOT / "results_Data_2"
PLOTS_DIR = RESULTS_ROOT / "plots"
METRICS_DIR = RESULTS_ROOT / "metrics"
RESULTS_ROOT.mkdir(parents=True, exist_ok=True)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# Inputs
IN_TRAIN = DATA / "sector_budget_features_train.csv"
IN_TEST  = DATA / "sector_budget_features_test_2024.csv"
IN_META  = DATA / "feature_columns.json"
IN_PANEL = DATA / "sector_budget_macro_panel.csv"  # has Budget_Lag1, Budget_Log_Diff1
IN_BASELINE = DATA / "metrics" / "sector_baseline_metrics_23_24.csv"

# Outputs
OUT_PRED_CSV = DATA / "sector_dl_predictions_23_24.csv"
OUT_METRICS_CSV = METRICS_DIR / "dl_metrics_23_24.csv"
OUT_METRICS_ALL_CSV = METRICS_DIR / "metrics_all_23_24.csv"

LOOKBACK = 5
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

print("Loading feature metadata and panels...")
with open(IN_META, "r") as f:
    meta = json.load(f)
z_features = meta.get("z_features", [])
if not z_features:
    raise RuntimeError("No z_features found. Run Step 5 first.")

# Load features (train contains train+val(2023); test is 2024)
train_df = pd.read_csv(IN_TRAIN)
test_df  = pd.read_csv(IN_TEST)

# Merge with engineered panel to get targets and naive baseline
# ...existing code...
# Merge with engineered panel to get targets and naive baseline
panel = pd.read_csv(IN_PANEL, dtype={"Sector_12": "string", "Fiscal_Year": "string"})
for c in ["Budget_Amount", "Budget_Lag1", "Budget_Log_Diff1", "Year_End"]:
    if c in panel.columns:
        panel[c] = pd.to_numeric(panel[c], errors="coerce")
panel["Fiscal_Year"] = panel["Fiscal_Year"].astype("string").str.strip()

# ...existing code...
# Merge feature frames to have all years up to 2024
feats_all = pd.concat([train_df, test_df], ignore_index=True)
feats_all["Fiscal_Year"] = feats_all["Fiscal_Year"].astype("string").str.strip()

# Attach target residual and naive baseline (from engineered panel)
# IMPORTANT: do NOT include Budget_Amount from panel to avoid *_x/*_y suffixes
use_cols = ["Sector_12", "Fiscal_Year", "Year_End", "Budget_Lag1", "Budget_Log_Diff1"]
panel_use = panel[use_cols].copy()
feats = feats_all.merge(panel_use, on=["Sector_12", "Fiscal_Year", "Year_End"], how="left")

# If previous runs created *_x/*_y, coalesce to a single column
def _coalesce(df, base):
    if base in df.columns:
        return df
    x, y = f"{base}_x", f"{base}_y"
    if x in df.columns or y in df.columns:
        df[base] = df.get(x).combine_first(df.get(y))
        df.drop(columns=[c for c in [x, y] if c in df.columns], inplace=True)
    return df

feats = _coalesce(feats, "Budget_Amount")

# Final sanity
if "Budget_Amount" not in feats.columns:
    raise RuntimeError(
        "Budget_Amount missing after merge. Ensure Step 5 outputs include Budget_Amount "
        "and re-run Step 5 (splits/scaling) before Step 8."
    )

# Sector index, features, embedding dims
sectors = feats["Sector_12"].astype("string").fillna("UNK").unique().tolist()
# ...existing code...
sector2id = {s: i for i, s in enumerate(sorted(sectors))}
n_sectors = len(sector2id)
n_features = len(z_features)
embed_dim = max(4, min(16, int(round(math.sqrt(n_sectors)))))

def build_sequences(df: pd.DataFrame, lookback: int, target_year: int | None = None):
    X_list, S_list, y_list, keys = [], [], [], []
    A_list, N_list = [], []
    for s, g in df.groupby("Sector_12", sort=False):
        g = g.sort_values("Year_End")
        Z = g[z_features].values
        y = g["Budget_Log_Diff1"].values
        A = g["Budget_Amount"].values
        N = g["Budget_Lag1"].values
        yrs = g["Year_End"].values
        fyears = g["Fiscal_Year"].values

        sid = sector2id.get(s, 0)
        for i in range(lookback, len(g)):
            # keep only desired target year if specified
            if target_year is not None and int(yrs[i]) != int(target_year):
                continue
            if not np.isfinite(y[i]) or not np.isfinite(N[i]) or N[i] <= 0:
                continue
            X_seq = Z[i - lookback:i, :]
            X_list.append(X_seq.astype(np.float32))
            S_list.append(np.int32(sid))
            y_list.append(np.float32(y[i]))
            A_list.append(np.float64(A[i]))
            N_list.append(np.float64(N[i]))
            keys.append((str(s), str(fyears[i]), int(yrs[i])))
    return (
        np.array(X_list, dtype=np.float32),
        np.array(S_list, dtype=np.int32),
        np.array(y_list, dtype=np.float32),
        np.array(A_list, dtype=np.float64),
        np.array(N_list, dtype=np.float64),
        keys,
    )

# Build from history ≤ target year, and filter targets inside
feats_hist_tr = feats[feats["Year_End"] <= 2022].copy()
feats_hist_va = feats[feats["Year_End"] <= 2023].copy()
feats_hist_te = feats[feats["Year_End"] <= 2024].copy()

X_tr, S_tr, y_tr, A_tr, N_tr, K_tr = build_sequences(feats_hist_tr, LOOKBACK, target_year=None)
X_va, S_va, y_va, A_va, N_va, K_va = build_sequences(feats_hist_va, LOOKBACK, target_year=2023)
X_te, S_te, y_te, A_te, N_te, K_te = build_sequences(feats_hist_te, LOOKBACK, target_year=2024)

print(f"Built sequences with lookback={LOOKBACK}:")
print("Train:", X_tr.shape, "Val:", X_va.shape, "Test:", X_te.shape, "n_features:", n_features, "n_sectors:", n_sectors)

if X_tr.shape[0] == 0:
    raise RuntimeError(f"No training sequences with lookback={LOOKBACK}. Reduce lookback or check feature z_ columns.")

def huber_model_core(model_type: str, units_main: int = 64, kernel_size: int = 3):
    seq_in = layers.Input(shape=(LOOKBACK, n_features), name="seq_in")
    sec_in = layers.Input(shape=(), dtype="int32", name="sec_in")
    emb = layers.Embedding(input_dim=n_sectors, output_dim=embed_dim, name="sector_emb")(sec_in)
    emb_v = layers.Flatten()(emb)

    if model_type == "gru":
        enc = layers.GRU(units_main, name="enc_gru")(seq_in)
    elif model_type == "lstm":
        enc = layers.LSTM(units_main, name="enc_lstm")(seq_in)
    elif model_type == "bigru":
        enc = layers.Bidirectional(layers.GRU(units_main, name="enc_bigru_base"), name="enc_bigru")(seq_in)
    elif model_type == "stacked_gru":
        x = layers.GRU(units_main, return_sequences=True, name="enc_gru_1")(seq_in)
        enc = layers.GRU(max(16, units_main // 2), name="enc_gru_2")(x)
    elif model_type == "tcn":
        # lightweight TCN via dilated causal Conv1D blocks
        x = seq_in
        for d in [1, 2, 4]:
            res = x
            x = layers.Conv1D(filters=units_main, kernel_size=kernel_size, padding="causal", dilation_rate=d, activation="relu")(x)
            x = layers.BatchNormalization()(x)
            x = layers.Dropout(0.1)(x)
            # match channels for residual
            if res.shape[-1] != x.shape[-1]:
                res = layers.Conv1D(filters=units_main, kernel_size=1, padding="same")(res)
            x = layers.Add()([res, x])
            x = layers.Activation("relu")(x)
        enc = layers.GlobalAveragePooling1D()(x)
    else:
        raise ValueError(f"Unknown model_type: {model_type}")

    z = layers.Concatenate()([enc, emb_v])
    z = layers.Dense(64, activation="relu")(z)
    z = layers.Dropout(0.2)(z)
    out = layers.Dense(1, name="residual_log_growth")(z)

    model = models.Model(inputs=[seq_in, sec_in], outputs=out)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss=losses.Huber(delta=1.0),
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
    )
    return model

MODEL_SPECS = {
    "GRU_L5_U64":        ("gru",          64, 3),
    "LSTM_L5_U64":       ("lstm",         64, 3),
    "StackedGRU_L5_64_32": ("stacked_gru", 64, 3),
    "BiGRU_L5_U64":      ("bigru",        64, 3),
    "TCN_L5_F64_K3":     ("tcn",          64, 3),
}

es = callbacks.EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True)
rlr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=7, min_lr=1e-5)
cb = [es, rlr]

def metrics_all(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if not np.any(mask):
        return np.nan, np.nan, np.nan, np.nan
    yt, yp = y_true[mask], y_pred[mask]
    mae = np.mean(np.abs(yt - yp))
    rmse = float(np.sqrt(np.mean((yt - yp) ** 2)))
    r2 = 1.0 - (np.sum((yt - yp) ** 2) / np.sum((yt - np.mean(yt)) ** 2) if np.sum((yt - np.mean(yt)) ** 2) > 0 else np.nan)
    mape = np.mean(np.abs((yt - yp) / yt)) * 100.0 if np.all(yt != 0) else np.nan
    return mae, rmse, r2, mape

def best_alpha(naive_lvl, dl_lvl, actual_lvl):
    # Find α in [0,1] minimizing MAE on validation set
    alphas = np.linspace(0.0, 1.0, 101)
    best_a, best_mae = 0.0, float("inf")
    for a in alphas:
        blend = a * dl_lvl + (1 - a) * naive_lvl
        mae = np.mean(np.abs(actual_lvl - blend))
        if mae < best_mae:
            best_mae, best_a = mae, a
    return float(best_a), float(best_mae)

# Train and evaluate
model_preds_test = {}
model_preds_val = {}
model_alphas = {}
dl_metrics_rows = []


print("\nTraining models...")
for name, (mtype, units_main, ksz) in MODEL_SPECS.items():
    print(f"\nModel: {name}")
    tf.keras.backend.clear_session()
    model = huber_model_core(mtype, units_main=units_main, kernel_size=ksz)
    hist = model.fit(
        x=[X_tr, S_tr], y=y_tr,
        validation_data=([X_va, S_va], y_va) if len(X_va) else None,
        epochs=200, batch_size=32, verbose=0, callbacks=cb
    )

    # Predict residuals (guard empty arrays)
    has_val = len(X_va) > 0
    has_test = len(X_te) > 0

    res_val = model.predict([X_va, S_va], verbose=0).reshape(-1) if has_val else np.array([], dtype=np.float32)
    res_te  = model.predict([X_te, S_te], verbose=0).reshape(-1) if has_test else np.array([], dtype=np.float32)

    # Reconstruct levels: B_hat = B_naive * exp(residual)
    dl_val_lvl = (N_va * np.exp(res_val)) if has_val else np.array([], dtype=np.float64)
    dl_te_lvl  = (N_te * np.exp(res_te))  if has_test else np.array([], dtype=np.float64)

    # Calibrate α on 2023 (fallback if no val)
    if has_val:
        alpha, _ = best_alpha(N_va, dl_val_lvl, A_va)
    else:
        alpha = 1.0  # default to DL only when no validation to tune blend
    model_alphas[name] = alpha

    # Blend on val/test
    val_blend = (alpha * dl_val_lvl + (1 - alpha) * N_va) if has_val else np.array([], dtype=np.float64)
    te_blend  = (alpha * dl_te_lvl  + (1 - alpha) * N_te) if has_test else np.array([], dtype=np.float64)

    # Save preds
    model_preds_val[name] = {"dl": dl_val_lvl, "blend": val_blend}
    model_preds_test[name] = {"dl": dl_te_lvl,  "blend": te_blend}

    # Metrics on 2024 (test)
    mae, rmse, r2, mape = metrics_all(A_te, te_blend) if has_test else (np.nan, np.nan, np.nan, np.nan)
    dl_metrics_rows.append({
        "model": f"{name}_Blend",
        "n_eval": int(len(A_te)),
        "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape,
        "alpha": alpha
    })
    mae2, rmse2, r22, mape2 = metrics_all(A_te, dl_te_lvl) if has_test else (np.nan, np.nan, np.nan, np.nan)
    dl_metrics_rows.append({
        "model": f"{name}_DL_Level",
        "n_eval": int(len(A_te)),
        "MAE": mae2, "RMSE": rmse2, "R2": r22, "MAPE_%": mape2,
        "alpha": np.nan
    })
    print(f"  α={alpha:.2f} | 2024 Blend MAE={mae:.2f} RMSE={rmse:.2f} R2={r2:.3f} MAPE%={mape:.2f}")

# Ensemble of blended predictions (mean across models)
if len(model_preds_test):
    te_blends_stack = np.column_stack([model_preds_test[m]["blend"] for m in MODEL_SPECS.keys()])
    ens_blend = te_blends_stack.mean(axis=1) if te_blends_stack.size else np.array([], dtype=np.float64)
    mae, rmse, r2, mape = metrics_all(A_te, ens_blend) if len(A_te) else (np.nan, np.nan, np.nan, np.nan)
    dl_metrics_rows.append({
        "model": "Ensemble_Blend",
        "n_eval": int(len(A_te)),
        "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape,
        "alpha": np.nan
    })

# Build predictions dataframe for FY2024
pred_rows = []
if len(K_te):
    for i, (sec, fy, ye) in enumerate(K_te):
        row = {
            "Sector_12": sec,
            "Fiscal_Year": fy,
            "Year_End": ye,
            "Actual": float(A_te[i]),
            "Naive_Lag1": float(N_te[i]),
        }
        for m in MODEL_SPECS.keys():
            row[f"{m}_DL_Level"] = float(model_preds_test[m]["dl"][i]) if len(model_preds_test[m]["dl"]) else np.nan
            row[f"{m}_Blend"]    = float(model_preds_test[m]["blend"][i]) if len(model_preds_test[m]["blend"]) else np.nan
        if "ens_blend" in locals() and len(ens_blend):
            row["Ensemble_Blend"] = float(ens_blend[i])
        pred_rows.append(row)

# Safely build/save predictions even if empty
if pred_rows:
    pred_df = pd.DataFrame(pred_rows).sort_values(["Sector_12"]).reset_index(drop=True)
else:
    cols = ["Sector_12","Fiscal_Year","Year_End","Actual","Naive_Lag1"] \
           + [f"{m}_DL_Level" for m in MODEL_SPECS.keys()] \
           + [f"{m}_Blend" for m in MODEL_SPECS.keys()]
    if "ens_blend" in locals():
        cols += ["Ensemble_Blend"]
    pred_df = pd.DataFrame(columns=cols)
    print("Warning: No FY2024 sequences available for lookback,"
          " predictions CSV will be empty. Consider reducing LOOKBACK.")

pred_df.to_csv(OUT_PRED_CSV, index=False)
print("\nSaved predictions:", OUT_PRED_CSV)
# ...existing code...

# Save DL metrics
dl_metrics_df = pd.DataFrame(dl_metrics_rows, columns=["model","n_eval","MAE","RMSE","R2","MAPE_%","alpha"])
dl_metrics_df.to_csv(OUT_METRICS_CSV, index=False)
print("Saved DL metrics:", OUT_METRICS_CSV)

# Combine with baseline (Linear/Ridge/GBM) if available
if IN_BASELINE.exists():
    base_metrics = pd.read_csv(IN_BASELINE)
    # Ensure consistent columns
    common_cols = ["model","n_eval","MAE","RMSE","R2","MAPE_%"]
    base_metrics = base_metrics[common_cols]
    all_metrics = pd.concat([base_metrics, dl_metrics_df[common_cols]], ignore_index=True)
else:
    all_metrics = dl_metrics_df.drop(columns=["alpha"])

all_metrics.to_csv(OUT_METRICS_ALL_CSV, index=False)
print("Saved combined metrics:", OUT_METRICS_ALL_CSV)

# Plots: compare MAE/RMSE/R2/MAPE across models
def barplot_metric(df, metric, out_path):
    plt.figure(figsize=(12, 6))
    dfx = df.copy()
    dfx = dfx.sort_values(metric, ascending=(metric not in ["R2"]))
    palette = ["#4c78a8" if "DL" not in m and "Blend" not in m else "#59a14f" for m in dfx["model"]]
    sns.barplot(data=dfx, x="model", y=metric, palette=palette)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

barplot_metric(all_metrics, "MAE",   PLOTS_DIR / "dl_vs_baselines_23_24_MAE.png")
barplot_metric(all_metrics, "RMSE",  PLOTS_DIR / "dl_vs_baselines_23_24_RMSE.png")
barplot_metric(all_metrics, "R2",    PLOTS_DIR / "dl_vs_baselines_23_24_R2.png")
barplot_metric(all_metrics, "MAPE_%",PLOTS_DIR / "dl_vs_baselines_23_24_MAPE_%.png")

print("Saved plots to:", PLOTS_DIR)

# Optional: scatter Actual vs Best (Ensemble_Blend if present, else best blend)
best_series = None
best_name = None
if "ens_blend" in locals():
    best_series = ens_blend
    best_name = "Ensemble_Blend"
else:
    # choose best by lowest MAE among blends
    blend_rows = [r for r in dl_metrics_rows if r["model"].endswith("_Blend")]
    if blend_rows:
        best_name = min(blend_rows, key=lambda r: r["MAE"])["model"]
        mkey = best_name.replace("_Blend","")
        best_series = model_preds_test[mkey]["blend"]

# Guard empty arrays before plotting
if best_series is not None and len(A_te) and len(best_series):
    plt.figure(figsize=(6,6))
    plt.scatter(A_te, best_series, alpha=0.6)
    lim = [0, max(np.max(A_te), np.max(best_series))*1.05]
    plt.plot(lim, lim, "k--", lw=1)
    plt.xlabel("Actual (FY2024)")
    plt.ylabel(f"Predicted ({best_name})")
    plt.title("Actual vs Predicted (FY2024)")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "actual_vs_pred_best.png", dpi=150)
    plt.close()
    print("Saved scatter:", PLOTS_DIR / "actual_vs_pred_best.png")

print("\nStep 7 complete.")

Loading feature metadata and panels...
Built sequences with lookback=5:
Train: (110, 5, 21) Val: (11, 5, 21) Test: (11, 5, 21) n_features: 21 n_sectors: 11

Training models...

Model: GRU_L5_U64
  α=0.64 | 2024 Blend MAE=31179.28 RMSE=45580.17 R2=0.829 MAPE%=17.79

Model: LSTM_L5_U64
  α=0.60 | 2024 Blend MAE=22559.93 RMSE=33410.99 R2=0.908 MAPE%=12.17

Model: StackedGRU_L5_64_32
  α=1.00 | 2024 Blend MAE=26500.98 RMSE=40899.77 R2=0.862 MAPE%=14.34

Model: BiGRU_L5_U64
  α=0.58 | 2024 Blend MAE=32831.00 RMSE=43556.30 R2=0.844 MAPE%=16.36

Model: TCN_L5_F64_K3
  α=0.43 | 2024 Blend MAE=20721.56 RMSE=31558.48 R2=0.918 MAPE%=11.82

Saved predictions: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/data/sector_dl_predictions_23_24.csv
Saved DL metrics: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/metrics/dl_metrics_23_24.csv
Saved combined metrics: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/metrics/metrics_all_23_24.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=dfx, x="model", y=metric, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=dfx, x="model", y=metric, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=dfx, x="model", y=metric, palette=palette)


Saved plots to: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/plots
Saved scatter: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/plots/actual_vs_pred_best.png

Step 7 complete.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=dfx, x="model", y=metric, palette=palette)


8.Forecast next FY (optional FY24-25)
* Create last lookback window per sector from latest year; predict FY24-25 levels via residual+naive reconstruction; attach last actual and growth vs 2024.
* Save ensemble forecast and context; plot preds vs actuals if available.
* Outputs: results/fy2425/sector_dl_forecast_2425.csv, results/fy2425/sector_dl_forecast_2425_with_context.csv, results/fy2425/forecasts/, results/fy2425/plots/

In [22]:
# Step 8: Forecast next FY (FY2024-25) → final_data/data_2/results_Data_2/fy2425/*
# - Create last lookback window (ending 2024) per sector on z_ features
# - Predict residual for 2025, reconstruct level via B_2025 = B_2024 * exp(residual)
# - Calibrate blend α on 2023 (same as Step 7), then produce blended 2025 forecasts
# - Save ensemble forecast and context; plot predictions (and vs actual if FY2025 is available)
#
# Outputs (under results_Data_2/fy2425):
# - sector_dl_forecast_2425.csv
# - sector_dl_forecast_2425_with_context.csv
# - forecasts/ (per-model CSVs)
# - plots/

import os
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks, optimizers, losses
except Exception as e:
    raise RuntimeError("TensorFlow is required for Step 8. Install with: pip3 install tensorflow") from e

# Paths
BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
RESULTS_ROOT = ROOT / "results_Data_2"
FY_DIR = RESULTS_ROOT / "fy2425"
FORECASTS_DIR = FY_DIR / "forecasts"
PLOTS_DIR = FY_DIR / "plots"

FY_DIR.mkdir(parents=True, exist_ok=True)
FORECASTS_DIR.mkdir(parents=True, exist_ok=True)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Inputs
IN_TRAIN = DATA / "sector_budget_features_train.csv"   # ≤2023
IN_TEST  = DATA / "sector_budget_features_test_2024.csv"  # 2024
IN_META  = DATA / "feature_columns.json"
IN_PANEL = DATA / "sector_budget_macro_panel.csv"  # has Budget_Amount, Budget_Lag1, Budget_Log_Diff1

# Outputs
OUT_FORECAST_CSV = FY_DIR / "sector_dl_forecast_2425.csv"
OUT_FORECAST_CTX = FY_DIR / "sector_dl_forecast_2425_with_context.csv"

LOOKBACK = 5
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Load feature meta
with open(IN_META, "r") as f:
    meta = json.load(f)
z_features = meta.get("z_features", [])
if not z_features:
    raise RuntimeError("No z_features found. Run Step 5 first.")

# Load inputs
train_df = pd.read_csv(IN_TRAIN)
test_df  = pd.read_csv(IN_TEST)

panel = pd.read_csv(IN_PANEL, dtype={"Sector_12": "string", "Fiscal_Year": "string"})
for c in ["Budget_Amount", "Budget_Lag1", "Budget_Log_Diff1", "Year_End"]:
    if c in panel.columns:
        panel[c] = pd.to_numeric(panel[c], errors="coerce")
panel["Fiscal_Year"] = panel["Fiscal_Year"].astype("string").str.strip()

# Merge feature frames to have all years up to 2024
feats_all = pd.concat([train_df, test_df], ignore_index=True)
feats_all["Fiscal_Year"] = feats_all["Fiscal_Year"].astype("string").str.strip()

# Attach target residual and naive baseline (from engineered panel)
# IMPORTANT: do NOT include Budget_Amount from panel to avoid *_x/*_y suffixes
use_cols = ["Sector_12", "Fiscal_Year", "Year_End", "Budget_Lag1", "Budget_Log_Diff1"]
panel_use = panel[use_cols].copy()
feats = feats_all.merge(panel_use, on=["Sector_12", "Fiscal_Year", "Year_End"], how="left")

# If previous runs created *_x/*_y, coalesce to a single column
def _coalesce(df, base):
    if base in df.columns:
        return df
    x, y = f"{base}_x", f"{base}_y"
    if x in df.columns or y in df.columns:
        df[base] = df.get(x).combine_first(df.get(y))
        df.drop(columns=[c for c in [x, y] if c in df.columns], inplace=True)
    return df

feats = _coalesce(feats, "Budget_Amount")

# Final sanity
if "Budget_Amount" not in feats.columns:
    raise RuntimeError(
        "Budget_Amount missing after merge. Ensure Step 5 outputs include Budget_Amount "
        "and re-run Step 5 (splits/scaling) before Step 8."
    )

# Sector index, features, embedding dims
sectors = feats["Sector_12"].astype("string").fillna("UNK").unique().tolist()
sector2id = {s: i for i, s in enumerate(sorted(sectors))}
n_sectors = len(sector2id)
n_features = len(z_features)
embed_dim = max(4, min(16, int(round(math.sqrt(n_sectors)))))

# Train/val/test sequences (same protocol as Step 7)
def build_sequences(df: pd.DataFrame, lookback: int, target_year: int | None = None):
    X_list, S_list, y_list, keys = [], [], [], []
    A_list, N_list = [], []
    for s, g in df.groupby("Sector_12", sort=False):
        g = g.sort_values("Year_End")
        Z = g[z_features].values
        y = g["Budget_Log_Diff1"].values
        A = g["Budget_Amount"].values
        N = g["Budget_Lag1"].values
        yrs = g["Year_End"].values
        fyears = g["Fiscal_Year"].values

        sid = sector2id.get(s, 0)
        for i in range(lookback, len(g)):
            if target_year is not None and int(yrs[i]) != int(target_year):
                continue
            if not np.isfinite(y[i]) or not np.isfinite(N[i]) or N[i] <= 0:
                continue
            X_seq = Z[i - lookback:i, :]
            X_list.append(X_seq.astype(np.float32))
            S_list.append(np.int32(sid))
            y_list.append(np.float32(y[i]))
            A_list.append(np.float64(A[i]))
            N_list.append(np.float64(N[i]))
            keys.append((str(s), str(fyears[i]), int(yrs[i])))
    return (
        np.array(X_list, dtype=np.float32),
        np.array(S_list, dtype=np.int32),
        np.array(y_list, dtype=np.float32),
        np.array(A_list, dtype=np.float64),
        np.array(N_list, dtype=np.float64),
        keys,
    )

feats_hist_tr = feats[feats["Year_End"] <= 2022].copy()
feats_hist_va = feats[feats["Year_End"] <= 2023].copy()

X_tr, S_tr, y_tr, A_tr, N_tr, K_tr = build_sequences(feats_hist_tr, LOOKBACK, target_year=None)
X_va, S_va, y_va, A_va, N_va, K_va = build_sequences(feats_hist_va, LOOKBACK, target_year=2023)

if X_tr.shape[0] == 0:
    raise RuntimeError(f"No training sequences with lookback={LOOKBACK}. Reduce lookback or check z_ features.")

# Model core (same as Step 7)
def huber_model_core(model_type: str, units_main: int = 64, kernel_size: int = 3):
    seq_in = layers.Input(shape=(LOOKBACK, n_features), name="seq_in")
    sec_in = layers.Input(shape=(), dtype="int32", name="sec_in")
    emb = layers.Embedding(input_dim=n_sectors, output_dim=embed_dim, name="sector_emb")(sec_in)
    emb_v = layers.Flatten()(emb)

    if model_type == "gru":
        enc = layers.GRU(units_main, name="enc_gru")(seq_in)
    elif model_type == "lstm":
        enc = layers.LSTM(units_main, name="enc_lstm")(seq_in)
    elif model_type == "bigru":
        enc = layers.Bidirectional(layers.GRU(units_main, name="enc_bigru_base"), name="enc_bigru")(seq_in)
    elif model_type == "stacked_gru":
        x = layers.GRU(units_main, return_sequences=True, name="enc_gru_1")(seq_in)
        enc = layers.GRU(max(16, units_main // 2), name="enc_gru_2")(x)
    elif model_type == "tcn":
        x = seq_in
        for d in [1, 2, 4]:
            res = x
            x = layers.Conv1D(filters=units_main, kernel_size=kernel_size, padding="causal", dilation_rate=d, activation="relu")(x)
            x = layers.BatchNormalization()(x)
            x = layers.Dropout(0.1)(x)
            if res.shape[-1] != x.shape[-1]:
                res = layers.Conv1D(filters=units_main, kernel_size=1, padding="same")(res)
            x = layers.Add()([res, x])
            x = layers.Activation("relu")(x)
        enc = layers.GlobalAveragePooling1D()(x)
    else:
        raise ValueError(f"Unknown model_type: {model_type}")

    z = layers.Concatenate()([enc, emb_v])
    z = layers.Dense(64, activation="relu")(z)
    z = layers.Dropout(0.2)(z)
    out = layers.Dense(1, name="residual_log_growth")(z)

    model = models.Model(inputs=[seq_in, sec_in], outputs=out)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss=losses.Huber(delta=1.0),
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
    )
    return model

MODEL_SPECS = {
    "GRU_L5_U64":          ("gru",         64, 3),
    "LSTM_L5_U64":         ("lstm",        64, 3),
    "StackedGRU_L5_64_32": ("stacked_gru", 64, 3),
    "BiGRU_L5_U64":        ("bigru",       64, 3),
    "TCN_L5_F64_K3":       ("tcn",         64, 3),
}

es = callbacks.EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True)
rlr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=7, min_lr=1e-5)
cb = [es, rlr]

def best_alpha(naive_lvl, dl_lvl, actual_lvl):
    alphas = np.linspace(0.0, 1.0, 101)
    best_a, best_mae = 0.0, float("inf")
    for a in alphas:
        blend = a * dl_lvl + (1 - a) * naive_lvl
        mae = np.mean(np.abs(actual_lvl - blend))
        if mae < best_mae:
            best_mae, best_a = mae, a
    return float(best_a), float(best_mae)

# Train models, tune α on 2023
print("\nStep 8: Training models (for forecast) and tuning α on 2023...")
model_objs = {}
model_alphas = {}

for name, (mtype, units_main, ksz) in MODEL_SPECS.items():
    print(f"  Training {name} ...")
    tf.keras.backend.clear_session()
    model = huber_model_core(mtype, units_main=units_main, kernel_size=ksz)
    _ = model.fit(
        x=[X_tr, S_tr], y=y_tr,
        validation_data=([X_va, S_va], y_va) if len(X_va) else None,
        epochs=200, batch_size=32, verbose=0, callbacks=cb
    )
    # Val residuals -> levels
    if len(X_va):
        res_val = model.predict([X_va, S_va], verbose=0).reshape(-1)
        dl_val_lvl = (N_va * np.exp(res_val))
        alpha, _ = best_alpha(N_va, dl_val_lvl, A_va)
    else:
        alpha = 1.0
    model_objs[name] = model
    model_alphas[name] = alpha
    print(f"    α tuned on 2023: {alpha:.2f}")

# Build last windows (ending 2024) for each sector for 2025 inference
feats_2024 = feats[feats["Year_End"] <= 2024].copy()

last_rows = []
X_2025_list, S_2025_list, N_2025_list, keys_2025 = [], [], [], []

for s, g in feats_2024.groupby("Sector_12", sort=False):
    g = g.sort_values("Year_End")
    if len(g) < LOOKBACK:
        continue
    # Need last window ending Year_End=2024
    if (g["Year_End"] == 2024).any():
        # get indices for last LOOKBACK rows ending 2024
        g_idx = g.index.tolist()
        # ensure last row is 2024
        last_idx = g[g["Year_End"] == 2024].index[-1]
        # position of last_idx within g
        pos = g.index.get_loc(last_idx)
        if pos + 1 >= LOOKBACK:
            window_idx = g_idx[pos + 1 - LOOKBACK: pos + 1]
            Z = g.loc[window_idx, z_features].values
            if Z.shape != (LOOKBACK, n_features):
                continue
            # Naive for 2025 is last actual (2024 Budget)
            a_last = g.loc[last_idx, "Budget_Amount"]
            if not np.isfinite(a_last) or a_last <= 0:
                continue
            X_2025_list.append(Z.astype(np.float32))
            S_2025_list.append(np.int32(sector2id.get(s, 0)))
            N_2025_list.append(float(a_last))
            keys_2025.append((str(s), "24-25", 2025, float(a_last)))

X_2025 = np.array(X_2025_list, dtype=np.float32)
S_2025 = np.array(S_2025_list, dtype=np.int32)
N_2025 = np.array(N_2025_list, dtype=np.float64)

print(f"\nBuilt last windows for 2025 forecast: n_sectors={len(keys_2025)}, X_2025.shape={X_2025.shape}")

if X_2025.shape[0] == 0:
    print("Warning: No sectors have sufficient history for the chosen LOOKBACK to forecast 2025. Consider reducing LOOKBACK.")
    # still write empty outputs
    pd.DataFrame(columns=["Sector_12","Fiscal_Year","Year_End","Naive_Lag1_2024","Ensemble_Blend_2025","Growth_vs_2024_%"]).to_csv(OUT_FORECAST_CSV, index=False)
    pd.DataFrame(columns=["Sector_12","Fiscal_Year","Year_End"]).to_csv(OUT_FORECAST_CTX, index=False)
else:
    # Predict per model; reconstruct DL level and blend with α; build ensemble
    model_forecasts = {}
    for name, model in model_objs.items():
        res_pred = model.predict([X_2025, S_2025], verbose=0).reshape(-1)
        dl_level = N_2025 * np.exp(res_pred)  # B_2025_DL
        alpha = model_alphas[name]
        blend = alpha * dl_level + (1 - alpha) * N_2025
        model_forecasts[name] = {"dl": dl_level, "blend": blend}

        # Save per-model forecast CSV
        rows = []
        for i, (sec, fy, ye, naive_2024) in enumerate(keys_2025):
            rows.append({
                "Sector_12": sec,
                "Fiscal_Year": fy,
                "Year_End": ye,
                "Naive_Lag1_2024": naive_2024,
                f"{name}_DL_2025": float(dl_level[i]),
                f"{name}_Blend_2025": float(blend[i]),
                "alpha": float(alpha)
            })
        pd.DataFrame(rows).to_csv(FORECASTS_DIR / f"{name}_forecast_2425.csv", index=False)

    # Ensemble across blended forecasts
    blends_stack = np.column_stack([model_forecasts[m]["blend"] for m in MODEL_SPECS.keys()])
    ens_blend = blends_stack.mean(axis=1)

    # Build compact forecast CSV
    compact_rows = []
    for i, (sec, fy, ye, naive_2024) in enumerate(keys_2025):
        comp = {
            "Sector_12": sec,
            "Fiscal_Year": fy,
            "Year_End": ye,
            "Naive_Lag1_2024": float(naive_2024),
            "Ensemble_Blend_2025": float(ens_blend[i]),
        }
        # Growth vs 2024 in %
        if naive_2024 and np.isfinite(naive_2024):
            comp["Growth_vs_2024_%"] = float((ens_blend[i] - naive_2024) / naive_2024 * 100.0)
        else:
            comp["Growth_vs_2024_%"] = np.nan
        compact_rows.append(comp)
    compact_df = pd.DataFrame(compact_rows).sort_values(["Sector_12"]).reset_index(drop=True)
    compact_df.to_csv(OUT_FORECAST_CSV, index=False)

    # Build context-rich forecast CSV (include per-model DL and Blend, ensemble, alpha per model)
    ctx_rows = []
    for i, (sec, fy, ye, naive_2024) in enumerate(keys_2025):
        row = {
            "Sector_12": sec,
            "Fiscal_Year": fy,
            "Year_End": ye,
            "Naive_Lag1_2024": float(naive_2024),
            "Ensemble_Blend_2025": float(ens_blend[i]),
            "Growth_vs_2024_%": float((ens_blend[i] - naive_2024) / naive_2024 * 100.0) if naive_2024 else np.nan,
        }
        for m in MODEL_SPECS.keys():
            row[f"{m}_DL_2025"] = float(model_forecasts[m]["dl"][i])
            row[f"{m}_Blend_2025"] = float(model_forecasts[m]["blend"][i])
            row[f"{m}_alpha"] = float(model_alphas[m])
        ctx_rows.append(row)
    ctx_df = pd.DataFrame(ctx_rows).sort_values(["Sector_12"]).reset_index(drop=True)
    ctx_df.to_csv(OUT_FORECAST_CTX, index=False)

    # Plot: bar plot of Ensemble forecast vs naive (2024), sorted by growth
    plot_df = compact_df.copy()
    plot_df["Growth_vs_2024_%"] = pd.to_numeric(plot_df["Growth_vs_2024_%"], errors="coerce")
    plot_df = plot_df.dropna(subset=["Growth_vs_2024_%"]).sort_values("Growth_vs_2024_%", ascending=False)

    plt.figure(figsize=(12, 6))
    sns.barplot(data=plot_df, x="Sector_12", y="Growth_vs_2024_%", color="#59a14f")
    plt.title("Forecasted Growth vs 2024 (Ensemble Blend) — FY2024-25")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "forecast_growth_vs_2024.png", dpi=150)
    plt.close()

    # Optional: if actual 2025 exists in panel, plot vs actual
    has_2025 = (panel["Year_End"] == 2025).any()
    if has_2025:
        actual_2025 = (
            panel.loc[panel["Year_End"] == 2025, ["Sector_12", "Budget_Amount"]]
                 .dropna()
                 .groupby("Sector_12", as_index=False)["Budget_Amount"].sum()
        )
        merged = compact_df.merge(actual_2025, on="Sector_12", how="left", suffixes=("", "_Actual_2025"))
        merged = merged.rename(columns={"Budget_Amount": "Actual_2025"})
        merged_nonan = merged.dropna(subset=["Actual_2025"])
        if len(merged_nonan):
            plt.figure(figsize=(6,6))
            plt.scatter(merged_nonan["Actual_2025"], merged_nonan["Ensemble_Blend_2025"], alpha=0.6)
            lim = [0, max(merged_nonan["Actual_2025"].max(), merged_nonan["Ensemble_Blend_2025"].max()) * 1.05]
            plt.plot(lim, lim, "k--", lw=1)
            plt.xlabel("Actual 2025")
            plt.ylabel("Forecast 2025 (Ensemble Blend)")
            plt.title("Actual vs Forecast (FY2024-25)")
            plt.tight_layout()
            plt.savefig(PLOTS_DIR / "actual_vs_forecast_2025.png", dpi=150)
            plt.close()

    print("\nSaved forecast CSVs and plots:")
    print(" -", OUT_FORECAST_CSV)
    print(" -", OUT_FORECAST_CTX)
    print(" -", FORECASTS_DIR)
    print(" -", PLOTS_DIR)


Step 8: Training models (for forecast) and tuning α on 2023...
  Training GRU_L5_U64 ...
    α tuned on 2023: 1.00
  Training LSTM_L5_U64 ...
    α tuned on 2023: 0.00
  Training StackedGRU_L5_64_32 ...
    α tuned on 2023: 0.91
  Training BiGRU_L5_U64 ...
    α tuned on 2023: 0.23
  Training TCN_L5_F64_K3 ...
    α tuned on 2023: 0.65

Built last windows for 2025 forecast: n_sectors=11, X_2025.shape=(11, 5, 21)

Saved forecast CSVs and plots:
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/sector_dl_forecast_2425.csv
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/sector_dl_forecast_2425_with_context.csv
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/forecasts
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/plots


In [23]:
# Step 8 (Ministry): FY2024-25 ministry-wise forecasts (DL vs Linear) → results_Data_2/fy2425/ministries/*
# - Build ministry-level features from sector_ministry_timeseries + sector panel/macros
# - Residual learning on log-growth vs Naive(Lag1); embeddings for Sector and Ministry; validate α on 2023
# - Linear baselines (OLS, Ridge) on flattened lookback windows; same residual reconstruction and α tuning
# - Evaluate on FY2024; forecast FY2025 from windows ending 2024; save combined outputs

import os
import re
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge

# TensorFlow (for DL)
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks, optimizers, losses
except Exception as e:
    raise RuntimeError("TensorFlow is required for this step. Install with: pip3 install tensorflow") from e

# Paths
BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
DATA = ROOT / "data"
RESULTS_ROOT = ROOT / "results_Data_2"

FY_DIR = RESULTS_ROOT / "fy2425" / "ministries"
FORECASTS_DIR = FY_DIR / "forecasts"
PLOTS_DIR = FY_DIR / "plots"
METRICS_DIR = RESULTS_ROOT / "metrics"

FY_DIR.mkdir(parents=True, exist_ok=True)
FORECASTS_DIR.mkdir(parents=True, exist_ok=True)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# Inputs
IN_MIN_TS   = DATA / "sector_ministry_timeseries.csv"       # ministry-level long ts with Sector_12
IN_SECTOR_P = DATA / "sector_budget_macro_panel.csv"        # sector-level engineered panel (Step 4)
IN_MACRO    = DATA / "macro_indicators_wb.csv"              # for safety if needed (not strictly required)

# Outputs
OUT_MIN_COMPACT = FY_DIR / "ministry_forecast_2425.csv"
OUT_MIN_CONTEXT = FY_DIR / "ministry_forecast_2425_with_context.csv"
OUT_MIN_METRICS = METRICS_DIR / "ministry_metrics_23_24.csv"

LOOKBACK = 5
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

def fy_end_year_short(s: str) -> int:
    s = str(s).strip()
    m = re.fullmatch(r"(\d{2})-(\d{2})", s)
    if not m:
        raise ValueError(f"Unexpected Fiscal_Year format: {s}")
    return 2000 + int(m.group(2))

# Load ministry time series
min_ts = pd.read_csv(IN_MIN_TS, dtype={"Sector_12":"string","Base_Ministry":"string","Fiscal_Year":"string"})
min_ts["Fiscal_Year"] = min_ts["Fiscal_Year"].astype("string").str.strip()
for c in ["Budget_Amount","Sector_Total","Ministry_Share_Sector","Year_End"]:
    if c in min_ts.columns:
        min_ts[c] = pd.to_numeric(min_ts[c], errors="coerce")
if "Year_End" not in min_ts.columns:
    min_ts["Year_End"] = min_ts["Fiscal_Year"].map(fy_end_year_short)

# Attach sector-level engineered/macros from panel (Step 4 output)
sec_panel = pd.read_csv(IN_SECTOR_P, dtype={"Sector_12":"string","Fiscal_Year":"string"})
sec_panel["Fiscal_Year"] = sec_panel["Fiscal_Year"].astype("string").str.strip()
# choose useful sector/macros cols if present
sec_keep = [
    "Sector_12","Fiscal_Year","Year_End",
    "Sector_Share_GDP","Sector_Share_Lag1","Sector_Share_Growth",
    "Share_Log_Diff1","Share_LogDiff1_MA3",
    "GDP_Growth_Rate","Inflation_CPI","Exchange_Rate_USD","Fiscal_Deficit_GDP",
    "Global_GDP_Growth","Election_Year","High_Inflation","GDP_Growth_Lag1","Inflation_Lag1",
    "Inflation_x_Election","GDPGrowth_x_Election","Trend"
]
sec_use = [c for c in sec_keep if c in sec_panel.columns]
sec_p = sec_panel[sec_use].copy()
# prefix sector cols to avoid collision
pref = {}
for c in sec_p.columns:
    if c not in ["Sector_12","Fiscal_Year","Year_End"]:
        pref[c] = f"Sec_{c}"
sec_p = sec_p.rename(columns=pref)

# Merge sector/macros into ministry rows
df = min_ts.merge(sec_p, on=["Sector_12","Fiscal_Year","Year_End"], how="left")

# Engineering at ministry level
df = df.sort_values(["Sector_12","Base_Ministry","Year_End"])
# Budget lags and growth
df["Min_Budget_Lag1"] = df.groupby(["Sector_12","Base_Ministry"])["Budget_Amount"].shift(1)
df["Min_Budget_Lag2"] = df.groupby(["Sector_12","Base_Ministry"])["Budget_Amount"].shift(2)
df["Min_Budget_Log"]  = np.log(df["Budget_Amount"].where(df["Budget_Amount"]>0))
df["Min_Budget_Log_Diff1"] = df.groupby(["Sector_12","Base_Ministry"])["Min_Budget_Log"].diff(1)
df["Min_Budget_LogDiff1_MA3"] = df.groupby(["Sector_12","Base_Ministry"])["Min_Budget_Log_Diff1"].transform(lambda s: s.rolling(3, min_periods=1).mean())
df["Min_Budget_Growth_Lag1"] = df["Budget_Amount"]/df["Min_Budget_Lag1"] - 1

# Ministry share lag/growth/log-diff
if "Ministry_Share_Sector" in df.columns:
    df["Min_Share_Lag1"] = df.groupby(["Sector_12","Base_Ministry"])["Ministry_Share_Sector"].shift(1)
    df["Min_Share_Growth"] = df["Ministry_Share_Sector"]/df["Min_Share_Lag1"] - 1
    df["Min_Share_Log"] = np.log(df["Ministry_Share_Sector"].where(df["Ministry_Share_Sector"]>0))
    df["Min_Share_Log_Diff1"] = df.groupby(["Sector_12","Base_Ministry"])["Min_Share_Log"].diff(1)
    df["Min_Share_LogDiff1_MA3"] = df.groupby(["Sector_12","Base_Ministry"])["Min_Share_Log_Diff1"].transform(lambda s: s.rolling(3, min_periods=1).mean())

# Trend within ministry
df["Min_Trend"] = df.groupby(["Sector_12","Base_Ministry"]).cumcount() + 1

# Clean inf artifacts
df.replace([np.inf,-np.inf], np.nan, inplace=True)

# Define feature whitelist (present columns only)
min_feats = [
    "Min_Budget_Lag1","Min_Budget_Lag2","Min_Budget_Growth_Lag1",
    "Min_Budget_Log_Diff1","Min_Budget_LogDiff1_MA3",
    "Ministry_Share_Sector","Min_Share_Lag1","Min_Share_Growth",
    "Min_Share_Log_Diff1","Min_Share_LogDiff1_MA3","Min_Trend"
]
sec_feats = [
    "Sec_Sector_Share_GDP","Sec_Sector_Share_Lag1","Sec_Sector_Share_Growth",
    "Sec_Share_Log_Diff1","Sec_Share_LogDiff1_MA3",
    "Sec_GDP_Growth_Rate","Sec_Inflation_CPI","Sec_Exchange_Rate_USD","Sec_Fiscal_Deficit_GDP",
    "Sec_Global_GDP_Growth","Sec_Election_Year","Sec_High_Inflation",
    "Sec_GDP_Growth_Lag1","Sec_Inflation_Lag1",
    "Sec_Inflation_x_Election","Sec_GDPGrowth_x_Election","Sec_Trend"
]
feat_cols = [c for c in (min_feats + sec_feats) if c in df.columns]

# Split flags: train ≤2023, val=2023, test=2024
if "Year_End" not in df.columns:
    df["Year_End"] = df["Fiscal_Year"].map(fy_end_year_short)
df["Split"] = np.where(df["Year_End"]==2024, "test",
                np.where(df["Year_End"]==2023, "val", "train"))
train_mask = df["Year_End"] <= 2023
val_mask   = df["Year_End"] == 2023
test_mask  = df["Year_End"] == 2024

# Impute+scale on train only
for c in feat_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
imputer = SimpleImputer(strategy="median")
scaler  = StandardScaler()
X_train = df.loc[train_mask, feat_cols]
X_train_imp = imputer.fit_transform(X_train)
_ = scaler.fit(X_train_imp)

X_all_imp = imputer.transform(df[feat_cols])
X_all_std = scaler.transform(X_all_imp)
z_cols = [f"z_{c}" for c in feat_cols]
for i, c in enumerate(z_cols):
    df[c] = X_all_std[:, i]

# Identity mappings
sectors = sorted(df["Sector_12"].dropna().astype("string").unique().tolist())
mins    = sorted(df["Base_Ministry"].dropna().astype("string").unique().tolist())
sector2id = {s:i for i,s in enumerate(sectors)}
min2id    = {m:i for i,m in enumerate(mins)}
n_sectors = len(sector2id)
n_min     = len(min2id)
n_features = len(z_cols)
embed_sec = max(4, min(16, int(round(math.sqrt(n_sectors)))))
embed_min = max(4, min(16, int(round(math.sqrt(n_min)))))

# Sequence builder (group by Sector_12, Base_Ministry)
def build_sequences_min(dfin: pd.DataFrame, lookback: int, target_year: int|None):
    X_list, S_list, M_list, y_list, keys = [], [], [], [], []
    A_list, N_list = [], []
    for (sec, bm), g in dfin.groupby(["Sector_12","Base_Ministry"], sort=False):
        g = g.sort_values("Year_End")
        Z = g[z_cols].values
        y = g["Min_Budget_Log_Diff1"].values
        A = g["Budget_Amount"].values
        N = g["Min_Budget_Lag1"].values
        yrs = g["Year_End"].values
        fyears = g["Fiscal_Year"].values

        sid = np.int32(sector2id.get(str(sec), 0))
        mid = np.int32(min2id.get(str(bm), 0))
        for i in range(lookback, len(g)):
            if target_year is not None and int(yrs[i]) != int(target_year):
                continue
            if not np.isfinite(y[i]) or not np.isfinite(N[i]) or N[i] <= 0:
                continue
            X_list.append(Z[i-lookback:i, :].astype(np.float32))
            S_list.append(sid); M_list.append(mid)
            y_list.append(np.float32(y[i]))
            A_list.append(np.float64(A[i]))
            N_list.append(np.float64(N[i]))
            keys.append((str(sec), str(bm), str(fyears[i]), int(yrs[i])))
    if not len(X_list):
        return (np.zeros((0,lookback,n_features),dtype=np.float32),
                np.zeros((0,),dtype=np.int32),
                np.zeros((0,),dtype=np.int32),
                np.zeros((0,),dtype=np.float32),
                np.zeros((0,),dtype=np.float64),
                np.zeros((0,),dtype=np.float64),
                [])
    return (np.stack(X_list), np.array(S_list), np.array(M_list), np.array(y_list),
            np.array(A_list), np.array(N_list), keys)

# Build split datasets for sequences
df_hist_tr = df[df["Year_End"] <= 2022].copy()
df_hist_va = df[df["Year_End"] <= 2023].copy()
df_hist_te = df[df["Year_End"] <= 2024].copy()

X_tr, S_tr, M_tr, y_tr, A_tr, N_tr, K_tr = build_sequences_min(df_hist_tr, LOOKBACK, None)
X_va, S_va, M_va, y_va, A_va, N_va, K_va = build_sequences_min(df_hist_va, LOOKBACK, 2023)
X_te, S_te, M_te, y_te, A_te, N_te, K_te = build_sequences_min(df_hist_te, LOOKBACK, 2024)

print(f"Ministry sequences (L={LOOKBACK}) — Train:{X_tr.shape} Val:{X_va.shape} Test:{X_te.shape} n_features:{n_features} n_sectors:{n_sectors} n_min:{n_min}")
if X_tr.shape[0]==0:
    raise RuntimeError("No ministry training sequences. Consider reducing LOOKBACK or check features.")

# DL model with sector + ministry embeddings
def dl_model_core(model_type="gru", units=64, kernel=3):
    seq_in = layers.Input(shape=(LOOKBACK, n_features), name="seq_in")
    sec_in = layers.Input(shape=(), dtype="int32", name="sec_in")
    min_in = layers.Input(shape=(), dtype="int32", name="min_in")

    sec_emb = layers.Embedding(n_sectors, embed_sec, name="sec_emb")(sec_in)
    min_emb = layers.Embedding(n_min,     embed_min, name="min_emb")(min_in)
    sec_v, min_v = layers.Flatten()(sec_emb), layers.Flatten()(min_emb)

    if model_type=="gru":
        enc = layers.GRU(units, name="enc_gru")(seq_in)
    elif model_type=="lstm":
        enc = layers.LSTM(units, name="enc_lstm")(seq_in)
    elif model_type=="bigru":
        enc = layers.Bidirectional(layers.GRU(units, name="enc_bigru_base"), name="enc_bigru")(seq_in)
    elif model_type=="stacked_gru":
        x = layers.GRU(units, return_sequences=True, name="enc_gru_1")(seq_in)
        enc = layers.GRU(max(16, units//2), name="enc_gru_2")(x)
    elif model_type=="tcn":
        x = seq_in
        for d in [1,2,4]:
            res = x
            x = layers.Conv1D(units, kernel, padding="causal", dilation_rate=d, activation="relu")(x)
            x = layers.BatchNormalization()(x)
            x = layers.Dropout(0.1)(x)
            if res.shape[-1] != x.shape[-1]:
                res = layers.Conv1D(units, 1, padding="same")(res)
            x = layers.Add()([res, x]); x = layers.Activation("relu")(x)
        enc = layers.GlobalAveragePooling1D()(x)
    else:
        raise ValueError(model_type)

    z = layers.Concatenate()([enc, sec_v, min_v])
    z = layers.Dense(64, activation="relu")(z)
    z = layers.Dropout(0.2)(z)
    out = layers.Dense(1, name="residual_log_growth")(z)

    model = models.Model(inputs=[seq_in, sec_in, min_in], outputs=out)
    model.compile(optimizer=optimizers.Adam(1e-3), loss=losses.Huber(1.0),
                  metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")])
    return model

MODEL_SPECS = {
    "GRU_L5_U64":          ("gru",         64, 3),
    "LSTM_L5_U64":         ("lstm",        64, 3),
    "StackedGRU_L5_64_32": ("stacked_gru", 64, 3),
    "BiGRU_L5_U64":        ("bigru",       64, 3),
    "TCN_L5_F64_K3":       ("tcn",         64, 3),
}

es = callbacks.EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True)
rlr = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=7, min_lr=1e-5)
cb = [es, rlr]

def best_alpha(naive_lvl, dl_lvl, actual_lvl):
    alphas = np.linspace(0.0, 1.0, 101)
    best_a, best_mae = 0.0, float("inf")
    for a in alphas:
        blend = a * dl_lvl + (1 - a) * naive_lvl
        mae = np.mean(np.abs(actual_lvl - blend))
        if mae < best_mae:
            best_mae, best_a = mae, a
    return float(best_a), float(best_mae)

def metrics_all(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float); y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if not np.any(mask): return np.nan, np.nan, np.nan, np.nan
    yt, yp = y_true[mask], y_pred[mask]
    mae = np.mean(np.abs(yt - yp))
    rmse = float(np.sqrt(np.mean((yt - yp)**2)))
    r2 = 1.0 - (np.sum((yt-yp)**2) / np.sum((yt - np.mean(yt))**2) if np.sum((yt - np.mean(yt))**2)>0 else np.nan)
    mape = np.mean(np.abs((yt - yp)/yt)) * 100.0 if np.all(yt != 0) else np.nan
    return mae, rmse, r2, mape

# Train DL models and tune α on 2023
print("\nTraining DL (ministry) and tuning α on 2023...")
dl_models = {}
dl_alphas = {}
dl_preds_test = {}
dl_metrics_rows = []

has_val = len(X_va) > 0
has_test = len(X_te) > 0

for name, (mtype, units, ksz) in MODEL_SPECS.items():
    print(f"  {name}...")
    tf.keras.backend.clear_session()
    model = dl_model_core(mtype, units, ksz)
    _ = model.fit([X_tr, S_tr, M_tr], y_tr,
                  validation_data=([X_va, S_va, M_va], y_va) if has_val else None,
                  epochs=200, batch_size=32, verbose=0, callbacks=cb)
    # Predict residuals
    res_val = model.predict([X_va, S_va, M_va], verbose=0).reshape(-1) if has_val else np.array([],dtype=np.float32)
    res_te  = model.predict([X_te, S_te, M_te], verbose=0).reshape(-1) if has_test else np.array([],dtype=np.float32)
    dl_val_lvl = (N_va * np.exp(res_val)) if has_val else np.array([],dtype=np.float64)
    dl_te_lvl  = (N_te * np.exp(res_te))  if has_test else np.array([],dtype=np.float64)
    # α
    alpha, _ = best_alpha(N_va, dl_val_lvl, A_va) if has_val else (1.0, np.nan)
    dl_alphas[name] = alpha
    te_blend = (alpha * dl_te_lvl + (1 - alpha) * N_te) if has_test else np.array([],dtype=np.float64)
    dl_preds_test[name] = {"dl": dl_te_lvl, "blend": te_blend}
    # metrics
    if has_test:
        mae, rmse, r2, mape = metrics_all(A_te, te_blend)
        dl_metrics_rows.append({"model": f"{name}_Blend", "n_eval": int(len(A_te)), "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape, "alpha": alpha})
        mae2, rmse2, r22, mape2 = metrics_all(A_te, dl_te_lvl)
        dl_metrics_rows.append({"model": f"{name}_DL_Level", "n_eval": int(len(A_te)), "MAE": mae2, "RMSE": rmse2, "R2": r22, "MAPE_%": mape2, "alpha": np.nan})
    dl_models[name] = model
    print(f"    α={alpha:.2f}")

# DL ensemble
if has_test and len(dl_preds_test):
    te_stack = np.column_stack([dl_preds_test[m]["blend"] for m in MODEL_SPECS.keys()])
    ens_blend = te_stack.mean(axis=1)
    mae, rmse, r2, mape = metrics_all(A_te, ens_blend)
    dl_metrics_rows.append({"model":"DL_Ensemble_Blend","n_eval":int(len(A_te)),"MAE":mae,"RMSE":rmse,"R2":r2,"MAPE_%":mape,"alpha":np.nan})

# Linear models on flattened windows (predict residual log-growth)
def flatten_windows(X: np.ndarray) -> np.ndarray:
    return X.reshape((X.shape[0], X.shape[1]*X.shape[2])) if len(X) else np.zeros((0, LOOKBACK*n_features), dtype=np.float32)
X_tr_flat = flatten_windows(X_tr)
X_va_flat = flatten_windows(X_va)
X_te_flat = flatten_windows(X_te)

lin_models = {
    "LinearOLS": LinearRegression(),
    "Ridge":     Ridge(alpha=1.0, random_state=42)
}
lin_alphas = {}
lin_preds_test = {}
for lname, lmod in lin_models.items():
    lmod.fit(X_tr_flat, y_tr)
    res_val = lmod.predict(X_va_flat) if has_val else np.array([],dtype=float)
    res_te  = lmod.predict(X_te_flat) if has_test else np.array([],dtype=float)
    dl_val_lvl = (N_va * np.exp(res_val)) if has_val else np.array([],dtype=float)
    dl_te_lvl  = (N_te * np.exp(res_te))  if has_test else np.array([],dtype=float)
    alpha, _ = best_alpha(N_va, dl_val_lvl, A_va) if has_val else (1.0, np.nan)
    lin_alphas[lname] = alpha
    te_blend = (alpha * dl_te_lvl + (1 - alpha) * N_te) if has_test else np.array([],dtype=float)
    lin_preds_test[lname] = {"dl": dl_te_lvl, "blend": te_blend}
    if has_test:
        mae, rmse, r2, mape = metrics_all(A_te, te_blend)
        dl_metrics_rows.append({"model": f"{lname}_Blend", "n_eval": int(len(A_te)), "MAE":mae,"RMSE":rmse,"R2":r2,"MAPE_%":mape,"alpha":alpha})
        mae2, rmse2, r22, mape2 = metrics_all(A_te, dl_te_lvl)
        dl_metrics_rows.append({"model": f"{lname}_DL_Level", "n_eval": int(len(A_te)), "MAE":mae2,"RMSE":rmse2,"R2":r22,"MAPE_%":mape2,"alpha":np.nan})

# Naive baseline (FY2024 eval)
if has_test:
    mae, rmse, r2, mape = metrics_all(A_te, N_te)
    dl_metrics_rows.append({"model":"Naive_Lag1","n_eval":int(len(A_te)),"MAE":mae,"RMSE":rmse,"R2":r2,"MAPE_%":mape,"alpha":np.nan})

# Save 2024 metrics
metrics_df = pd.DataFrame(dl_metrics_rows, columns=["model","n_eval","MAE","RMSE","R2","MAPE_%","alpha"])
metrics_df.to_csv(OUT_MIN_METRICS, index=False)
print("\nSaved ministry metrics (FY2024):", OUT_MIN_METRICS)

# Build last windows for 2025 forecast (ending 2024)
df_2024 = df[df["Year_End"] <= 2024].copy()

X_2025_list, S_2025_list, M_2025_list, N_2025_list, keys_2025 = [], [], [], [], []
for (sec, bm), g in df_2024.groupby(["Sector_12","Base_Ministry"], sort=False):
    g = g.sort_values("Year_End")
    if len(g) < LOOKBACK: 
        continue
    if (g["Year_End"] == 2024).any():
        last_idx = g[g["Year_End"]==2024].index[-1]
        pos = g.index.get_loc(last_idx)
        if pos + 1 >= LOOKBACK:
            idxs = g.index.tolist()[pos+1-LOOKBACK:pos+1]
            Z = g.loc[idxs, z_cols].values
            if Z.shape != (LOOKBACK, n_features): 
                continue
            a_last = g.loc[last_idx, "Budget_Amount"]
            if not np.isfinite(a_last) or a_last <= 0: 
                continue
            X_2025_list.append(Z.astype(np.float32))
            S_2025_list.append(np.int32(sector2id.get(str(sec), 0)))
            M_2025_list.append(np.int32(min2id.get(str(bm), 0)))
            N_2025_list.append(float(a_last))
            keys_2025.append((str(sec), str(bm), "24-25", 2025, float(a_last)))

X_2025 = np.array(X_2025_list, dtype=np.float32)
S_2025 = np.array(S_2025_list, dtype=np.int32)
M_2025 = np.array(M_2025_list, dtype=np.int32)
N_2025 = np.array(N_2025_list, dtype=np.float64)
print(f"2025 ministry windows: {X_2025.shape[0]} ministries")

if X_2025.shape[0]==0:
    # write empty skeletons
    pd.DataFrame(columns=["Sector_12","Base_Ministry","Fiscal_Year","Year_End","Naive_Lag1_2024","DL_Ensemble_2025","Linear_Ridge_2025","Growth_vs_2024_%"]).to_csv(OUT_MIN_COMPACT, index=False)
    pd.DataFrame(columns=["Sector_12","Base_Ministry","Fiscal_Year","Year_End"]).to_csv(OUT_MIN_CONTEXT, index=False)
else:
    # DL per-model forecasts
    dl_forecasts = {}
    for name, model in dl_models.items():
        res = model.predict([X_2025, S_2025, M_2025], verbose=0).reshape(-1)
        dl_lvl = N_2025 * np.exp(res)
        alpha = dl_alphas[name]
        blend = alpha * dl_lvl + (1 - alpha) * N_2025
        dl_forecasts[name] = {"dl": dl_lvl, "blend": blend}
        # save per-model file
        rows = []
        for i, (sec, bm, fy, ye, naive_2024) in enumerate(keys_2025):
            rows.append({
                "Sector_12": sec, "Base_Ministry": bm, "Fiscal_Year": fy, "Year_End": ye,
                "Naive_Lag1_2024": naive_2024,
                f"{name}_DL_2025": float(dl_lvl[i]), f"{name}_Blend_2025": float(blend[i]), "alpha": float(alpha)
            })
        pd.DataFrame(rows).to_csv(FORECASTS_DIR / f"ministry_{name}_forecast_2425.csv", index=False)

    # DL ensemble
    ens_stack = np.column_stack([dl_forecasts[m]["blend"] for m in MODEL_SPECS.keys()])
    dl_ens = ens_stack.mean(axis=1)

    # Linear forecasts (residual on flattened windows)
    X_2025_flat = X_2025.reshape((X_2025.shape[0], LOOKBACK*n_features))
    lin_forecasts = {}
    for lname, lmod in lin_models.items():
        res = lmod.predict(X_2025_flat).reshape(-1)
        dl_lvl = N_2025 * np.exp(res)
        alpha = lin_alphas[lname]
        blend = alpha * dl_lvl + (1 - alpha) * N_2025
        lin_forecasts[lname] = {"dl": dl_lvl, "blend": blend}
        # save per-model file
        rows = []
        for i, (sec, bm, fy, ye, naive_2024) in enumerate(keys_2025):
            rows.append({
                "Sector_12": sec, "Base_Ministry": bm, "Fiscal_Year": fy, "Year_End": ye,
                "Naive_Lag1_2024": naive_2024,
                f"{lname}_DL_2025": float(dl_lvl[i]), f"{lname}_Blend_2025": float(blend[i]), "alpha": float(alpha)
            })
        pd.DataFrame(rows).to_csv(FORECASTS_DIR / f"ministry_{lname}_forecast_2425.csv", index=False)

    # Compact combined output
    compact_rows, ctx_rows = [], []
    for i, (sec, bm, fy, ye, naive_2024) in enumerate(keys_2025):
        row = {
            "Sector_12": sec, "Base_Ministry": bm, "Fiscal_Year": fy, "Year_End": ye,
            "Naive_Lag1_2024": float(naive_2024),
            "DL_Ensemble_2025": float(dl_ens[i]),
            "Linear_Ridge_2025": float(lin_forecasts["Ridge"]["blend"][i]) if "Ridge" in lin_forecasts else np.nan,
        }
        row["Growth_vs_2024_%"] = float((row["DL_Ensemble_2025"] - naive_2024)/naive_2024*100.0) if np.isfinite(naive_2024) and naive_2024 else np.nan
        compact_rows.append(row)

        ctx = row.copy()
        # add DL per-model and alphas
        for m in MODEL_SPECS.keys():
            ctx[f"{m}_DL_2025"] = float(dl_forecasts[m]["dl"][i])
            ctx[f"{m}_Blend_2025"] = float(dl_forecasts[m]["blend"][i])
            ctx[f"{m}_alpha"] = float(dl_alphas[m])
        # add linear per-model
        for lname in lin_models.keys():
            ctx[f"{lname}_DL_2025"] = float(lin_forecasts[lname]["dl"][i])
            ctx[f"{lname}_Blend_2025"] = float(lin_forecasts[lname]["blend"][i])
            ctx[f"{lname}_alpha"] = float(lin_alphas[lname])
        ctx_rows.append(ctx)

    compact_df = pd.DataFrame(compact_rows).sort_values(["Sector_12","Base_Ministry"]).reset_index(drop=True)
    ctx_df = pd.DataFrame(ctx_rows).sort_values(["Sector_12","Base_Ministry"]).reset_index(drop=True)

    compact_df.to_csv(OUT_MIN_COMPACT, index=False)
    ctx_df.to_csv(OUT_MIN_CONTEXT, index=False)

    # Plot growth bars by ministry (top 30 by magnitude)
    plot_df = compact_df.copy()
    plot_df["Growth_vs_2024_%"] = pd.to_numeric(plot_df["Growth_vs_2024_%"], errors="coerce")
    plot_df = plot_df.dropna(subset=["Growth_vs_2024_%"]).sort_values("Growth_vs_2024_%", ascending=False).head(30)
    plt.figure(figsize=(12,8))
    sns.barplot(data=plot_df, x="Growth_vs_2024_%", y="Base_Ministry", color="#59a14f")
    plt.title("Ministry: Forecasted Growth vs 2024 (DL Ensemble) — FY2024-25")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "ministries_growth_vs_2024.png", dpi=150)
    plt.close()

    # Optional: evaluate 2025 if actuals exist in ministry TS
    has_2025 = (min_ts["Year_End"] == 2025).any()
    if has_2025:
        actual_2025 = (min_ts[min_ts["Year_End"]==2025][["Sector_12","Base_Ministry","Budget_Amount"]]
                       .groupby(["Sector_12","Base_Ministry"], as_index=False).sum())
        merged = compact_df.merge(actual_2025, on=["Sector_12","Base_Ministry"], how="left")
        merged = merged.rename(columns={"Budget_Amount":"Actual_2025"})
        eval_cols = {
            "DL_Ensemble_2025": "DL_Ensemble_2025",
            "Linear_Ridge_2025": "Linear_Ridge_2025",
            "Naive_Lag1_2024": "Naive_Lag1_2024",
        }
        rows = []
        for name, col in eval_cols.items():
            m = merged.dropna(subset=[col,"Actual_2025"])
            if len(m):
                mae, rmse, r2, mape = metrics_all(m["Actual_2025"].values, m[col].values)
                rows.append({"model": f"{name}_2025", "n_eval": int(len(m)), "MAE": mae, "RMSE": rmse, "R2": r2, "MAPE_%": mape})
        if rows:
            m25 = pd.DataFrame(rows)
            m25.to_csv(METRICS_DIR / "ministry_metrics_2025_if_available.csv", index=False)
            print("Saved 2025 ministry metrics (if available):", METRICS_DIR / "ministry_metrics_2025_if_available.csv")

print("\nSaved ministry forecasts:")
print(" -", OUT_MIN_COMPACT)
print(" -", OUT_MIN_CONTEXT)
print(" - per-model files:", FORECASTS_DIR)
print(" - plots:", PLOTS_DIR)

Ministry sequences (L=5) — Train:(610, 5, 28) Val:(61, 5, 28) Test:(61, 5, 28) n_features:28 n_sectors:11 n_min:61

Training DL (ministry) and tuning α on 2023...
  GRU_L5_U64...
    α=1.00
  LSTM_L5_U64...
    α=0.00
  StackedGRU_L5_64_32...
    α=0.87
  BiGRU_L5_U64...
    α=0.21
  TCN_L5_F64_K3...
    α=0.00

Saved ministry metrics (FY2024): /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/metrics/ministry_metrics_23_24.csv
2025 ministry windows: 61 ministries

Saved ministry forecasts:
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/ministry_forecast_2425.csv
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/ministry_forecast_2425_with_context.csv
 - per-model files: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/forecasts
 - plots: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots


In [24]:
# ...existing code...
# Step 9: Evaluate ministry forecasts against FY2024-25 BE (DL vs Linear vs Naive)
import re
import difflib
import numpy as np
import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
RESULTS_ROOT = ROOT / "results_Data_2"
FY_DIR = RESULTS_ROOT / "fy2425" / "ministries"
METRICS_DIR = RESULTS_ROOT / "metrics"
METRICS_DIR.mkdir(parents=True, exist_ok=True)

IN_FORECAST = FY_DIR / "ministry_forecast_2425.csv"
OUT_COMPARE = FY_DIR / "ministry_compare_2425_vs_BE.csv"
OUT_METRICS = METRICS_DIR / "ministry_metrics_2425_vs_BE.csv"

if not IN_FORECAST.exists():
    raise FileNotFoundError(f"Forecast not found: {IN_FORECAST}. Run Step 8 (Ministry) first.")

# Paste your FY2024-25 BE list below (exactly as in your prompt)
BE_TEXT = """
Ministry/Department,2024-25 BE
Agricultural Research and Education,9941.09
Animal Husbandry and Dairying,4327.85
Atomic Energy,36159.93
Chemicals and Petro-Chemicals,(Within Chemicals & Fertilizers) Included in below
Fertilisers,145948.45
Civil Aviation,3113.36
Coal,192.32
Mines,1911.6
Commerce (Commerce & Industry; see also DIPP),5254.58
Industrial Policy and Promotion (now DPIIT),8200.63
Posts,25814.0
Telecommunications,97579.05
Information Technology,16549.04
Company Affairs,756.19
Consumer Affairs,250.66
Food and Public Distribution,205513.94
Culture,115531.79
Defence (Civil estimates),Part of Defence BE
Defence Services,621941
Development of North Eastern Region,5892.0
Environment and Forests,3079.4
External Affairs,18050.0
Economic Affairs (centralised provisions),5901.31
Food Processing Industries,3287.65
Health,90659
AYUSH (Ayurveda, Yoga & Naturopathy, Unani, Siddha, and Homoeopathy),3647.5
Heavy Industry,6171.63
Public Enterprises,33.05
Home Affairs,219643
Elementary Education and Literacy,68804.85
Secondary Education and Higher Education,44094.62
Women and Child Development,25448.75
Information and Broadcasting,4692.0
Labour and Employment,13221.73
Law and Justice,3975.43
Panchayati Raj,1016.42
Parliamentary Affairs,63.0
Personnel, Public Grievances and Pensions,71701.0
Planning,824.39
Power,20671.32
Rural Development,190406
Land Resources,2419.23
Drinking Water Supply (within Jal Shakti),77223.0
Science and Technology,7931.05
Scientific and Industrial Research,5746.51
Biotechnology,2683.86
Shipping,2218.74
Road Transport and Highways,278000
Small Scale Industries,(within MSME)
Social Justice and Empowerment,20671.32
Space,12543.91
Statistics and Programme Implementation,5443.4
Steel,70.15
Textiles,4389.34
Tourism,2400.0
Tribal Affairs,12461.88
Urban Development,76431.6
Water Resources,20054.67
Youth Affairs and Sports,3397.32
Agriculture and Cooperation,(Combined above)
Andaman & Nicobar Islands,192.32
""".strip()

def norm_name(s: str) -> str:
    s = str(s).lower().strip()
    s = s.replace("&", " and ")
    # unify spaces and drop punctuation except spaces and alnum
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    # optional: drop helper phrases that cause mismatch
    s = s.replace(" now dpiit", "").replace(" within msme", "")
    s = s.replace(" within chemicals and fertilizers included in below", "")
    s = s.replace(" part of defence be", "").replace(" combined above", "")
    return s

def parse_be_text(txt: str) -> pd.DataFrame:
    rows = []
    for i, line in enumerate(txt.splitlines()):
        line = line.strip()
        if not line or i == 0:  # skip header
            continue
        if "," not in line:
            continue
        # split on last comma (names may contain commas in parentheses)
        name, val = line.rsplit(",", 1)
        name = name.strip()
        val = val.strip()
        # try numeric; skip non-numeric (notes like "Part of Defence BE")
        try:
            amt = float(val)
        except Exception:
            continue
        rows.append({"Base_Ministry_raw": name, "Actual_2025_BE": amt})
    df = pd.DataFrame(rows)
    if not len(df):
        raise RuntimeError("No numeric BE rows parsed. Check BE_TEXT format.")
    df["bm_norm"] = df["Base_Ministry_raw"].map(norm_name)
    return df

be_df = parse_be_text(BE_TEXT)

# Load forecast
fc = pd.read_csv(IN_FORECAST)
# Keep unique ministry rows
keep_cols = ["Sector_12","Base_Ministry","Fiscal_Year","Year_End","Naive_Lag1_2024","DL_Ensemble_2025","Linear_Ridge_2025"]
present = [c for c in keep_cols if c in fc.columns]
fc = fc[present].copy()
fc["bm_norm"] = fc["Base_Ministry"].map(norm_name)

# Fuzzy match BE -> forecast ministries
fc_names = fc["bm_norm"].dropna().unique().tolist()

def best_match(name: str, choices: list[str], cutoff=0.80):
    if name in choices:
        return name, 1.0
    match = difflib.get_close_matches(name, choices, n=1, cutoff=cutoff)
    if match:
        # compute score explicitly
        score = difflib.SequenceMatcher(None, name, match[0]).ratio()
        return match[0], score
    return None, 0.0

match_rows = []
for _, r in be_df.iterrows():
    m, score = best_match(r["bm_norm"], fc_names, cutoff=0.78)
    match_rows.append({
        "Base_Ministry_raw": r["Base_Ministry_raw"],
        "bm_norm": r["bm_norm"],
        "match_norm": m,
        "match_score": score,
        "Actual_2025_BE": r["Actual_2025_BE"],
    })
match_df = pd.DataFrame(match_rows)
matched = match_df.dropna(subset=["match_norm"]).copy()

# Merge with forecast on matched normalized name
fc_idx = fc.drop_duplicates(subset=["bm_norm"]).set_index("bm_norm")
matched["bm_norm_fc"] = matched["match_norm"]
merged = matched.join(fc_idx, on="bm_norm_fc", how="left", rsuffix="_fc")

# Drop rows without forecasts
merged = merged.dropna(subset=["DL_Ensemble_2025", "Linear_Ridge_2025", "Naive_Lag1_2024"])

def metrics_all(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if not np.any(mask):
        return np.nan, np.nan, np.nan, np.nan
    yt, yp = y_true[mask], y_pred[mask]
    mae = float(np.mean(np.abs(yt - yp)))
    rmse = float(np.sqrt(np.mean((yt - yp) ** 2)))
    denom = np.sum((yt - yt.mean()) ** 2)
    r2 = float(1.0 - (np.sum((yt - yp) ** 2) / denom)) if denom > 0 else np.nan
    mape = float(np.mean(np.abs((yt - yp) / yt)) * 100.0) if np.all(yt != 0) else np.nan
    return mae, rmse, r2, mape

Y = merged["Actual_2025_BE"].values
models = {
    "DL_Ensemble_2025": merged["DL_Ensemble_2025"].values,
    "Linear_Ridge_2025": merged["Linear_Ridge_2025"].values,
    "Naive_Lag1_2024": merged["Naive_Lag1_2024"].values,
}

rows = []
for name, pred in models.items():
    mae, rmse, r2, mape = metrics_all(Y, pred)
    rows.append({
        "model": name,
        "n_eval": int(len(Y)),
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "MAPE_%": mape
    })

metrics_df = pd.DataFrame(rows, columns=["model","n_eval","MAE","RMSE","R2","MAPE_%"])
metrics_df.to_csv(OUT_METRICS, index=False)

# Save detailed comparison per ministry
out_cols = [
    "Base_Ministry_raw","Base_Ministry","Sector_12",
    "Actual_2025_BE","Naive_Lag1_2024","DL_Ensemble_2025","Linear_Ridge_2025",
    "bm_norm","match_norm","match_score","Fiscal_Year","Year_End"
]
present_cols = [c for c in out_cols if c in merged.columns]
merged[present_cols].sort_values("Base_Ministry_raw").to_csv(OUT_COMPARE, index=False)

print(f"Matched ministries: {merged['Base_Ministry_raw'].nunique()} / {be_df['Base_Ministry_raw'].nunique()}")
unmatched = match_df[match_df["match_norm"].isna()]
if len(unmatched):
    print("Unmatched BE ministries (check naming/mapping):")
    print(unmatched["Base_Ministry_raw"].tolist())

print("\nMetrics (FY2024-25 BE):")
print(metrics_df)

print("\nSaved:")
print(" -", OUT_METRICS)
print(" -", OUT_COMPARE)
# ...existing code...

Matched ministries: 55 / 57
Unmatched BE ministries (check naming/mapping):
['Commerce (Commerce & Industry; see also DIPP)', 'Drinking Water Supply (within Jal Shakti)']

Metrics (FY2024-25 BE):
               model  n_eval           MAE          RMSE        R2     MAPE_%
0   DL_Ensemble_2025      55  17334.218035  86115.170025  0.255690  17.797000
1  Linear_Ridge_2025      55  16825.765001  86053.966996  0.256748   9.343098
2    Naive_Lag1_2024      55  16665.533091  86044.164803  0.256917   8.611867

Saved:
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/metrics/ministry_metrics_2425_vs_BE.csv
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/ministry_compare_2425_vs_BE.csv


In [25]:
# ...existing code...
# Step 10: Visuals — Ministry FY2024-25 (BE) comparison: DL vs Linear vs Naive
import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.ticker import FuncFormatter
import seaborn as sns

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
ROOT = BASE / "data_2"
RESULTS_ROOT = ROOT / "results_Data_2"
FY_DIR = RESULTS_ROOT / "fy2425" / "ministries"
PLOTS_DIR = FY_DIR / "plots"
METRICS_DIR = RESULTS_ROOT / "metrics"

COMPARE_CSV = FY_DIR / "ministry_compare_2425_vs_BE.csv"
METRICS_CSV = METRICS_DIR / "ministry_metrics_2425_vs_BE.csv"

PLOTS_DIR.mkdir(parents=True, exist_ok=True)

if not COMPARE_CSV.exists() or not METRICS_CSV.exists():
    raise FileNotFoundError("Missing inputs. Run Step 8 (Ministry) and Step 9 first.")

cmp = pd.read_csv(COMPARE_CSV)
metrics_df = pd.read_csv(METRICS_CSV)

# Clean and compute errors
need = ["Base_Ministry_raw","Actual_2025_BE","DL_Ensemble_2025","Linear_Ridge_2025","Naive_Lag1_2024"]
cmp = cmp[[c for c in need if c in cmp.columns]].dropna()

cmp["AE_DL"]      = (cmp["DL_Ensemble_2025"] - cmp["Actual_2025_BE"]).abs()
cmp["AE_Linear"]  = (cmp["Linear_Ridge_2025"] - cmp["Actual_2025_BE"]).abs()
cmp["AE_Naive"]   = (cmp["Naive_Lag1_2024"] - cmp["Actual_2025_BE"]).abs()

cmp["PE_DL_%"]     = (cmp["DL_Ensemble_2025"] - cmp["Actual_2025_BE"]) / cmp["Actual_2025_BE"] * 100.0
cmp["PE_Linear_%"] = (cmp["Linear_Ridge_2025"] - cmp["Actual_2025_BE"]) / cmp["Actual_2025_BE"] * 100.0
cmp["PE_Naive_%"]  = (cmp["Naive_Lag1_2024"] - cmp["Actual_2025_BE"]) / cmp["Actual_2025_BE"] * 100.0

sns.set_theme(style="whitegrid")
fmt_comma0 = FuncFormatter(lambda x, p: f"{x:,.0f}")

# 1) Metrics bars (MAE/RMSE/R2/MAPE)
def metric_bar(metric, higher_is_better=False, fname="bar.png"):
    df = metrics_df.copy()
    # Order models consistently: DL, Linear, Naive
    order = ["DL_Ensemble_2025","Linear_Ridge_2025","Naive_Lag1_2024"]
    df["model"] = pd.Categorical(df["model"], categories=order, ordered=True)
    df = df.dropna(subset=[metric]).sort_values(
        metric, ascending=(not higher_is_better)
    )
    palette = df["model"].map({
        "DL_Ensemble_2025": "#59a14f",
        "Linear_Ridge_2025": "#4c78a8",
        "Naive_Lag1_2024": "#9c9c9c",
    }).tolist()
    plt.figure(figsize=(8, 5))
    ax = sns.barplot(data=df, x="model", y=metric, palette=palette)
    if metric in ["MAE","RMSE"]:
        ax.yaxis.set_major_formatter(fmt_comma0)
    plt.xlabel("")
    plt.title(f"Ministry FY2024-25 (BE) — {metric}")
    for i, v in enumerate(df[metric].values):
        ax.text(i, v, f"{v:,.0f}" if metric in ["MAE","RMSE"] else f"{v:.3f}",
                ha="center", va="bottom", fontsize=9)
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / fname, dpi=150)
    plt.close()

metric_bar("MAE",  higher_is_better=False, fname="ministry_metrics_2425_BE_MAE.png")
metric_bar("RMSE", higher_is_better=False, fname="ministry_metrics_2425_BE_RMSE.png")
metric_bar("R2",   higher_is_better=True,  fname="ministry_metrics_2425_BE_R2.png")
metric_bar("MAPE_%", higher_is_better=False, fname="ministry_metrics_2425_BE_MAPE.png")

# 2) Top-20 absolute errors by ministry (DL and Linear)
def topN_abs_error(df, col_err, label_pred, fname):
    top = df.nlargest(20, col_err).copy()
    plt.figure(figsize=(10, 8))
    ax = sns.barplot(data=top, x=col_err, y="Base_Ministry_raw", color="#737373")
    ax.xaxis.set_major_formatter(fmt_comma0)
    plt.xlabel("Absolute Error")
    plt.ylabel("")
    plt.title(f"Top-20 Absolute Errors vs BE — {label_pred}")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / fname, dpi=150)
    plt.close()

topN_abs_error(cmp, "AE_DL",     "DL Ensemble",   "ministry_top20_abs_error_DL_Ensemble.png")
topN_abs_error(cmp, "AE_Linear", "Linear Ridge",  "ministry_top20_abs_error_Linear_Ridge.png")

# 3) Scatter: Actual vs Pred (DL, Linear, Naive)
def scatter_with_diag(df, pred_col, title, ax):
    x = df["Actual_2025_BE"].values
    y = df[pred_col].values
    # Metrics inline
    mask = np.isfinite(x) & np.isfinite(y)
    if mask.sum():
        mae = np.mean(np.abs(x[mask]-y[mask]))
        ss_res = np.sum((x[mask]-y[mask])**2)
        ss_tot = np.sum((x[mask]-x[mask].mean())**2)
        r2 = 1 - ss_res/ss_tot if ss_tot>0 else np.nan
    else:
        mae, r2 = np.nan, np.nan
    ax.scatter(x, y, alpha=0.6, s=28)
    lim = [0, max(x.max(), y.max())*1.05]
    ax.plot(lim, lim, "k--", lw=1)
    ax.set_xlim(lim); ax.set_ylim(lim)
    ax.xaxis.set_major_formatter(fmt_comma0)
    ax.yaxis.set_major_formatter(fmt_comma0)
    ax.set_title(f"{title}\nMAE={mae:,.0f}  R2={r2:.3f}")
    ax.set_xlabel("Actual 2024-25 BE")
    ax.set_ylabel("Predicted")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatter_with_diag(cmp, "DL_Ensemble_2025", "DL Ensemble", axes[0])
scatter_with_diag(cmp, "Linear_Ridge_2025","Linear Ridge", axes[1])
scatter_with_diag(cmp, "Naive_Lag1_2024",  "Naive (FY24)", axes[2])
plt.tight_layout()
plt.savefig(PLOTS_DIR / "ministry_actual_vs_pred_scatter_2425.png", dpi=150)
plt.close()

# 4) Percentage error distribution (KDE/Hist)
pe_long = pd.DataFrame({
    "DL Ensemble": cmp["PE_DL_%"],
    "Linear Ridge": cmp["PE_Linear_%"],
    "Naive (FY24)": cmp["PE_Naive_%"],
}).melt(var_name="Model", value_name="Pct_Error_%").dropna()

plt.figure(figsize=(10,6))
sns.histplot(data=pe_long, x="Pct_Error_%", hue="Model", kde=True, element="step", stat="density", common_norm=False)
plt.axvline(0, color="k", lw=1, ls="--")
plt.title("Percentage Error Distribution vs BE (Ministries, FY2024-25)")
plt.xlabel("Percentage Error (%)")
plt.tight_layout()
plt.savefig(PLOTS_DIR / "ministry_pct_error_distribution_2425.png", dpi=150)
plt.close()

print("Saved ministry BE comparison visuals to:", PLOTS_DIR)
for f in [
    "ministry_metrics_2425_BE_MAE.png",
    "ministry_metrics_2425_BE_RMSE.png",
    "ministry_metrics_2425_BE_R2.png",
    "ministry_metrics_2425_BE_MAPE.png",
    "ministry_top20_abs_error_DL_Ensemble.png",
    "ministry_top20_abs_error_Linear_Ridge.png",
    "ministry_actual_vs_pred_scatter_2425.png",
    "ministry_pct_error_distribution_2425.png",
]:
    print(" -", PLOTS_DIR / f)
# ...existing code...


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=df, x="model", y=metric, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=df, x="model", y=metric, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=df, x="model", y=metric, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=df, x="model", y=metric, palette=palette)


Saved ministry BE comparison visuals to: /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots/ministry_metrics_2425_BE_MAE.png
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots/ministry_metrics_2425_BE_RMSE.png
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots/ministry_metrics_2425_BE_R2.png
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots/ministry_metrics_2425_BE_MAPE.png
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots/ministry_top20_abs_error_DL_Ensemble.png
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots/ministry_top20_abs_error_Linear_Ridge.png
 - /Users/vvmohith/Desktop/PROJECT/final_data/data_2/results_Data_2/fy2425/ministries/plots/ministry_a