In [9]:
# Create CSV table from screenshot sector data (% of GDP)

import pandas as pd
from pathlib import Path

# Output location for the sector-share table.
# NOTE(review): BASE is an absolute, machine-specific path; adjust it before
# running this cell on another machine.
BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
DATA = BASE / "data"
OUT  = DATA / "sector_shares_gdp.csv"

# Fiscal-year labels "2005-06" ... "2024-25" (20 labels): four-digit start year,
# a hyphen, then the last two digits of the following year.
years = [f"{y:04d}-{(y+1)%100:02d}" for y in range(2005, 2025)]

# Sector shares (described in the cell header as % of GDP), transcribed by hand
# from screenshots. Each key becomes one row of the CSV; each list holds one
# value per entry of `years`, in the same order (index 0 -> 2005-06,
# index 19 -> 2024-25), so every list must contain exactly len(years) numbers.
sector_data = {
    "agriculture_forestry_fishing": [
        18.2, 17.9, 17.7, 17.7, 17.6, 17.6, 17.5, 17.1, 17.0, 16.5, 
        16.1, 15.6, 15.0, 14.6, 14.2, 14.5, 14.3, 14.0, 15.5, 15.0
    ],
    "trade_hotels_transport_communication_broadcasting": [
        16.7, 17.0, 17.1, 17.3, 17.4, 17.6, 17.7, 17.8, 17.8, 17.5,
        17.8, 18.1, 18.2, 18.3, 18.2, 18.6, 18.6, 18.5, 17.5, 17.5
    ],
    "defense_security": [
        1.8, 1.8, 1.8, 1.9, 1.9, 2.0, 2.0, 2.0, 2.1, 2.1,
        2.0, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9, 1.9
    ],
    "economic_services": [
        4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.1, 5.2, 5.2,
        5.3, 5.3, 5.3, 5.3, 5.3, 5.1, 5.2, 5.3, 5.3, 5.3
    ],
    "energy_natural_resources": [
        3.5, 3.5, 3.5, 3.6, 3.6, 3.7, 3.8, 3.8, 3.8, 3.9,
        3.9, 4.0, 4.0, 4.0, 4.1, 4.1, 4.2, 4.2, 4.3, 4.3
    ],
    "food_distribution": [
        1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.6, 1.6, 1.6, 1.6,
        1.6, 1.6, 1.6, 1.6, 1.6, 1.8, 1.8, 1.7, 1.7, 1.7
    ],
    "governance_administration": [
        5.3, 5.3, 5.2, 5.2, 5.2, 5.2, 5.1, 5.1, 5.1, 5.0,
        5.0, 5.0, 5.0, 5.0, 5.0, 5.3, 5.4, 5.3, 5.3, 5.3
    ],
    "infrastructure_transport": [
        2.8, 2.9, 2.9, 3.0, 3.0, 3.0, 3.0, 3.1, 3.1, 3.1,
        3.2, 3.2, 3.3, 3.3, 3.4, 3.4, 3.5, 3.5, 3.5, 3.5
    ],
    "regional_development": [
        2.8, 3.0, 3.2, 3.5, 4.0, 4.2, 4.5, 4.3, 4.1, 4.0,
        3.8, 3.6, 3.4, 3.2, 3.0, 3.1, 3.2, 3.3, 3.3, 3.1
    ],
    "science_innovation": [
        0.74, 0.77, 0.77, 0.82, 0.82, 0.75, 0.84, 0.82, 0.80, 0.78,
        0.69, 0.75, 0.70, 0.66, 0.66, 0.64, 0.64, 0.65, 0.65, 0.64
    ],
    "social_services": [
        5.0, 5.1, 5.2, 5.2, 5.3, 5.3, 5.4, 5.5, 5.6, 5.7,
        5.9, 6.0, 6.1, 6.1, 6.4, 8.3, 8.6, 7.2, 6.5, 6.0
    ]
}

# Create DataFrame
# The lists in sector_data are typed in by hand, so check them before use:
# a short list would otherwise fail with a bare IndexError, and a long one
# would have its trailing values dropped without any warning.
bad_lengths = {
    name: len(values)
    for name, values in sector_data.items()
    if len(values) != len(years)
}
if bad_lengths:
    raise ValueError(
        f"Expected {len(years)} values per sector (one per fiscal year); "
        f"got {bad_lengths}"
    )

# Wide layout: one row per sector, one column per fiscal year.
df_data = {"sector": list(sector_data.keys())}
for i, year in enumerate(years):
    df_data[year] = [values[i] for values in sector_data.values()]

df = pd.DataFrame(df_data)
df.to_csv(OUT, index=False)

print(f"Saved: {OUT}")
print("Preview:")
print(df.head())

Saved: /Users/vvmohith/Desktop/PROJECT/final_data/data/sector_shares_gdp.csv
Preview:
                                              sector  2005-06  2006-07  \
0                       agriculture_forestry_fishing     18.2     17.9   
1  trade_hotels_transport_communication_broadcasting     16.7     17.0   
2                                   defense_security      1.8      1.8   
3                                  economic_services      4.5      4.6   
4                           energy_natural_resources      3.5      3.5   

   2007-08  2008-09  2009-10  2010-11  2011-12  2012-13  2013-14  ...  \
0     17.7     17.7     17.6     17.6     17.5     17.1     17.0  ...   
1     17.1     17.3     17.4     17.6     17.7     17.8     17.8  ...   
2      1.8      1.9      1.9      2.0      2.0      2.0      2.1  ...   
3      4.7      4.8      4.9      5.0      5.1      5.1      5.2  ...   
4      3.5      3.6      3.6      3.7      3.8      3.8      3.8  ...   

   2015-16  2016-17  2017-18  

In [1]:
# Update ministry mapping to match uniform sector names from screenshots

import pandas as pd
from pathlib import Path

BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
DATA = BASE / "data"
MAP_FILE = DATA / "ministry_to_sector12.csv"

# Old Sector_12 label -> new label. Two old labels ("Communication & Technology"
# and "Culture & Tourism") collapse into the same new label. Values that are
# not keys of this table pass through Series.replace unchanged.
# NOTE(review): "toursim" looks like a typo for "tourism"; it is kept as-is
# because the saved file (and anything reading it) may already use this spelling.
sector_renames = {
    "Agriculture & Rural": "agriculture forestry fishing",
    "Communication & Technology": "communication broadcasting culture and toursim",
    "Culture & Tourism": "communication broadcasting culture and toursim",
    "Defense & Security": "defense security",
    "Economic Services": "economic services",
    "Energy & Natural Resources": "energy and natural resources",
    "Food & Distribution": "food distribution",
    "Governance & Administration": "governance and administration",
    "Infrastructure & Transport": "infrastructure and transport",
    "Regional Development": "regional and development",
    "Science & Innovation": "science and innovation",
    "Social Services": "social and services",
}

# Read the mapping, relabel the sector column, and write it back to the same file.
df = pd.read_csv(MAP_FILE)
df["Sector_12"] = df["Sector_12"].replace(sector_renames)
df.to_csv(MAP_FILE, index=False)

# Ministries per sector label after the rename.
print("Updated sector mapping:")
print(df["Sector_12"].value_counts().sort_index())
print(f"\nSaved updated mapping to: {MAP_FILE}")

Updated sector mapping:
Sector_12
agriculture forestry fishing                       7
communication broadcasting culture and toursim     6
defense security                                   3
economic services                                 12
energy and natural resources                       7
food distribution                                  3
governance and administration                      5
infrastructure and transport                       4
regional and development                           3
science and innovation                             4
social and services                                7
Name: count, dtype: int64

Saved updated mapping to: /Users/vvmohith/Desktop/PROJECT/final_data/data/ministry_to_sector12.csv


In [2]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Directory holding the macro-indicator table.
DATA = Path("/Users/vvmohith/Desktop/PROJECT/final_data/data")
SRC = DATA.joinpath("macro_indicators_wb.csv")
# Destination is the source itself: the cleaned table replaces the input file.
DST = SRC

# Robust FY parsers
def fy_full(fy) -> str:
    """Normalise a fiscal-year label to the form "YYYY-YY" (e.g. "2005-06").

    En/em dashes are treated as hyphens and all whitespace is ignored.
    Three spellings are recognised: "05-06", "2005-06" and "2005-2006";
    two-digit years are taken to be 20xx. Missing input, or text in any
    other shape, yields NaN.
    """
    if pd.isna(fy):
        return np.nan

    text = str(fy).strip()
    for dash in ("–", "—"):
        text = text.replace(dash, "-")
    text = re.sub(r"\s+", "", text)

    # (pattern, amount added to the start group, amount added to the end group)
    shapes = (
        (r"(\d{2})-(\d{2})", 2000, 2000),      # 05-06
        (r"(20\d{2})-(\d{2})", 0, 2000),       # 2005-06
        (r"(20\d{2})-(20\d{2})", 0, 0),        # 2005-2006
    )
    for pattern, start_base, end_base in shapes:
        hit = re.fullmatch(pattern, text)
        if hit is None:
            continue
        first_year = start_base + int(hit.group(1))
        last_year = end_base + int(hit.group(2))
        # Only the last two digits of the end year appear in the label.
        return f"{first_year}-{str(last_year)[2:]}"

    return np.nan

def fy_prev_full(fy_full_s: str) -> str:
    """Return the label of the fiscal year before `fy_full_s`.

    The first four characters are read as the start year, so "2005-06"
    gives "2004-05". Missing input yields NaN.
    """
    if not pd.isna(fy_full_s):
        opening_year = int(str(fy_full_s)[:4])
        # The previous FY ends in the year the given FY starts.
        return "-".join((str(opening_year - 1), str(opening_year)[2:]))
    return np.nan

def end_calendar_year(fy_full_s: str) -> int:
    """Return the calendar year in which the fiscal year ends.

    The first four characters are read as the start year and one is added,
    so "2005-06" gives 2006.
    """
    start_year = int(str(fy_full_s)[:4])
    return start_year + 1

# Lookup tables for the replaced macro series.
# NOTE(review): the source of these figures is not recorded in this cell; add provenance.

# CPI inflation (%), keyed by fiscal-year label "YYYY-YY".
# This table starts at 2004-05, one year before the other fiscal-year tables;
# that extra entry is the value Inflation_Lag1 resolves to for 2005-06.
inflation = {
    "2004-05": 3.8, "2005-06": 4.2, "2006-07": 6.6, "2007-08": 6.2, "2008-09": 9.1,
    "2009-10": 12.3, "2010-11": 10.5, "2011-12": 8.4, "2012-13": 10.2, "2013-14": 9.4,
    "2014-15": 5.9, "2015-16": 4.9, "2016-17": 4.5, "2017-18": 3.6, "2018-19": 3.4,
    "2019-20": 4.8, "2020-21": 6.2, "2021-22": 5.5, "2022-23": 6.7, "2023-24": 5.4,
    "2024-25": 4.6,
}

# Exchange rate (INR per USD), keyed by fiscal-year label "YYYY-YY".
ex_rate = {
    "2005-06": 44.27, "2006-07": 45.28, "2007-08": 40.24, "2008-09": 45.92, "2009-10": 47.42,
    "2010-11": 45.58, "2011-12": 47.92, "2012-13": 53.21, "2013-14": 60.50, "2014-15": 61.14,
    "2015-16": 65.46, "2016-17": 67.07, "2017-18": 64.45, "2018-19": 69.92, "2019-20": 74.53,
    "2020-21": 73.23, "2021-22": 74.57, "2022-23": 81.35, "2023-24": 81.94, "2024-25": 86.83,
}

# Fiscal deficit (% of GDP), keyed by fiscal-year label "YYYY-YY".
# Written to the table as the new column Fiscal_Deficit_GDP.
fiscal_def = {
    "2005-06": 4.0, "2006-07": 3.4, "2007-08": 2.7, "2008-09": 2.5, "2009-10": 6.4,
    "2010-11": 6.5, "2011-12": 5.9, "2012-13": 5.9, "2013-14": 4.5, "2014-15": 4.1,
    "2015-16": 3.9, "2016-17": 3.5, "2017-18": 3.5, "2018-19": 3.4, "2019-20": 4.6,
    "2020-21": 9.2, "2021-22": 6.7, "2022-23": 6.4, "2023-24": 5.6, "2024-25": 4.8,
}

# Global GDP growth, keyed by integer calendar year (not by fiscal-year label).
# The loader looks this up with end_calendar_year(FY_Full), so fiscal year
# 2005-06 takes the 2006 value, 2006-07 takes the 2007 value, and so on.
global_gdp = {
    2005: 4.5, 2006: 4.1, 2007: 4.3, 2008: 3.0, 2009: -1.3,
    2010: 4.6, 2011: 3.3, 2012: 3.5, 2013: 3.4, 2014: 3.6,
    2015: 3.4, 2016: 3.3, 2017: 3.8, 2018: 3.6, 2019: 2.8,
    2020: -2.9, 2021: 6.2, 2022: 3.4, 2023: 2.8, 2024: 3.2, 2025: 3.0,
}

# Load, clean, and align
df = pd.read_csv(SRC)

# Fiscal_Year keeps its stored text; FY_Full is a normalised "YYYY-YY" key
# used only for the lookups below and removed again before saving.
df = df.dropna(subset=["Fiscal_Year"]).copy()
df["Fiscal_Year"] = df["Fiscal_Year"].astype(str)
df["FY_Full"] = df["Fiscal_Year"].map(fy_full)

# Rows whose label could not be parsed are reported once, then discarded.
unparsed = df["FY_Full"].isna()
bad_fy = df.loc[unparsed, "Fiscal_Year"].unique().tolist()
if bad_fy:
    print("Dropping rows with bad Fiscal_Year values:", bad_fy)
df = df.loc[~unparsed].copy()

# Replace series as requested (keep GDP_Growth_Rate, Election_Year, High_Inflation as-is).
# Fiscal years that are missing from a table come out as NaN.
fy_key = df["FY_Full"]
df["Inflation_CPI"] = fy_key.map(inflation)
df["Exchange_Rate_USD"] = fy_key.map(ex_rate)
df["Fiscal_Deficit_GDP"] = fy_key.map(fiscal_def)
# Global growth is keyed by calendar year: use the year in which the FY ends.
df["Global_GDP_Growth"] = fy_key.map(end_calendar_year).map(global_gdp)

# Lags: look up the previous fiscal year's value by label, so the result
# does not depend on the row order of the file.
prev_fy = fy_key.map(fy_prev_full)
gdp_map = df.set_index("FY_Full")["GDP_Growth_Rate"].to_dict()
df["GDP_Growth_Lag1"] = prev_fy.map(gdp_map)
df["Inflation_Lag1"] = prev_fy.map(inflation)

# Enforce numeric types (anything unparseable becomes NaN)
numeric_cols = [
    "GDP_Growth_Rate", "Inflation_CPI", "Exchange_Rate_USD", "Fiscal_Deficit_GDP",
    "Global_GDP_Growth", "GDP_Growth_Lag1", "Inflation_Lag1",
]
for c in numeric_cols:
    if c not in df.columns:
        continue
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Column order for saving: the preferred columns first (those that exist),
# then every other column in its current order, without the FY_Full helper.
preferred = [
    "Fiscal_Year", "GDP_Growth_Rate", "Inflation_CPI", "Exchange_Rate_USD",
    "Fiscal_Deficit_GDP", "Global_GDP_Growth", "Election_Year", "High_Inflation",
    "GDP_Growth_Lag1", "Inflation_Lag1",
]
leading = [c for c in preferred if c in df.columns]
trailing = [c for c in df.columns if c not in preferred and c != "FY_Full"]
ordered = leading + trailing
df = df[ordered]

df.to_csv(DST, index=False)
print("Updated and saved:", DST)
print(df.head(5))

Updated and saved: /Users/vvmohith/Desktop/PROJECT/final_data/data/macro_indicators_wb.csv
  Fiscal_Year  GDP_Growth_Rate  Inflation_CPI  Exchange_Rate_USD  \
0       05-06         8.060733            4.2              44.27   
1       06-07         7.660815            6.6              45.28   
2       07-08         3.086698            6.2              40.24   
3       08-09         7.861889            9.1              45.92   
4       09-10         8.497585           12.3              47.42   

   Fiscal_Deficit_GDP  Global_GDP_Growth  Election_Year  High_Inflation  \
0                 4.0                4.1            0.0             0.0   
1                 3.4                4.3            0.0             1.0   
2                 2.7                3.0            0.0             1.0   
3                 2.5               -1.3            0.0             1.0   
4                 6.4                4.6            1.0             1.0   

   GDP_Growth_Lag1  Inflation_Lag1  Current_Accou

In [1]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Project data directory and the files this cell reads and writes.
BASE = Path("/Users/vvmohith/Desktop/PROJECT/final_data")
DATA = BASE.joinpath("data")

# Inputs: wide budget table and the ministry -> sector lookup.
IN_BUDGETS = DATA.joinpath("standardized_budget_time_series.csv")
IN_MAP = DATA.joinpath("ministry_to_sector12.csv")

# Output: drill-down dataset (one row per ministry and fiscal year).
OUT_MIN_TS = DATA.joinpath("sector_ministry_timeseries.csv")

def tidy_ministry(s: str) -> str:
    """Collapse every run of whitespace in a ministry name to one space and trim the ends.

    Missing scalars (None, NaN, pd.NA) are returned unchanged. Passing them
    through str() would turn them into the literal text "None", "nan" or
    "<NA>", which then looks like a real ministry name to later steps.
    Any other value is converted with str() before cleaning.
    """
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if pd.api.types.is_scalar(s) and pd.isna(s):
        return s
    return re.sub(r"\s+", " ", str(s)).strip()

# Load and clean
dfb = pd.read_csv(IN_BUDGETS, dtype={"Base_Ministry": "string"})
dfb["Base_Ministry"] = dfb["Base_Ministry"].map(tidy_ministry)

# FY columns like 05-06, 06-07, ... sorted by the year in which each FY ends
fy_cols = [c for c in dfb.columns if re.fullmatch(r"\d{2}-\d{2}", str(c))]
if not fy_cols:
    raise RuntimeError("No fiscal-year columns found in standardized_budget_time_series.csv")
fy_cols = sorted(fy_cols, key=lambda c: 2000 + int(str(c).split("-")[1]))

# Numeric budgets: strip thousands separators and whitespace, treat empty
# cells as missing, and coerce anything still unparseable to NaN.
dfb[fy_cols] = (dfb[fy_cols]
                .replace(r"[,\s]", "", regex=True)
                .replace("", np.nan)
                .apply(pd.to_numeric, errors="coerce"))

# Merge sector mapping (keep ministry granularity)
map_df = pd.read_csv(IN_MAP, dtype={"Base_Ministry":"string","Sector_12":"string"})
map_df["Base_Ministry"] = map_df["Base_Ministry"].map(tidy_ministry)
map_df["Sector_12"] = map_df["Sector_12"].str.strip()
# A ministry listed twice with the same sector (possible once names are
# tidied) would duplicate its budget rows in the merge and inflate Sector_Total.
map_df = map_df.drop_duplicates(subset=["Base_Ministry", "Sector_12"])

dfb = dfb.merge(map_df, on="Base_Ministry", how="left")

# Ministries with no sector are excluded; say which ones instead of dropping
# them silently, so gaps in the mapping file are visible.
unmapped = dfb.loc[dfb["Sector_12"].isna(), "Base_Ministry"].drop_duplicates().tolist()
if unmapped:
    print("Dropping ministries with no sector mapping:", unmapped)
dfb = dfb.dropna(subset=["Sector_12"])

# Long format per ministry: one row per (ministry, fiscal year) with a budget value
long = dfb.melt(
    id_vars=["Base_Ministry","Sector_12"],
    value_vars=fy_cols,
    var_name="Fiscal_Year",
    value_name="Budget_Amount"
).dropna(subset=["Budget_Amount"])

# Year end key: "05-06" -> 2006
long["Year_End"] = long["Fiscal_Year"].map(lambda s: 2000 + int(str(s).split("-")[1])).astype("Int64")

# Sector totals and ministry share within sector (for stacked/percent charts)
tot = (long.groupby(["Sector_12","Fiscal_Year"], as_index=False)
            .agg(Sector_Total=("Budget_Amount","sum")))
long = long.merge(tot, on=["Sector_12","Fiscal_Year"], how="left")
long["Ministry_Share_Sector"] = long["Budget_Amount"] / long["Sector_Total"]

# Save for dashboard drill-down (the saved file is sorted; `long` itself is not)
cols = ["Sector_12","Base_Ministry","Fiscal_Year","Year_End","Budget_Amount","Sector_Total","Ministry_Share_Sector"]
long.sort_values(["Sector_12","Base_Ministry","Year_End"]).to_csv(OUT_MIN_TS, index=False, columns=cols)

print(f"Saved: {OUT_MIN_TS}")
print(long.head(8))

Saved: /Users/vvmohith/Desktop/PROJECT/final_data/data/sector_ministry_timeseries.csv
                         Base_Ministry                     Sector_12  \
0  Agricultural Research and Education  agriculture forestry fishing   
1        Animal Husbandry and Dairying  agriculture forestry fishing   
2                        Atomic Energy  energy and natural resources   
3        Chemicals and Petro-Chemicals             economic services   
4                          Fertilisers  energy and natural resources   
5                       Civil Aviation  infrastructure and transport   
6                                 Coal  energy and natural resources   
7                                Mines  energy and natural resources   

  Fiscal_Year  Budget_Amount  Year_End  Sector_Total  Ministry_Share_Sector  
0       05-06        1942.00      2006      27237.86               0.071298  
1       05-06         710.69      2006      27237.86               0.026092  
2       05-06        4995.86   