In [None]:
# import os
# import pandas as pd

# BASE_DIR = "output_csv"
# sample_df = pd.read_csv("Sample.csv")
# sample_cols = sample_df.columns.tolist()

# # Remove metadata columns from column matching
# metadata_cols = ["Year", "Region", "Province", "Municipality"]
# data_cols = [col for col in sample_cols if col not in metadata_cols]

# all_data = []

# for year in os.listdir(BASE_DIR):
#     year_path = os.path.join(BASE_DIR, year)
#     if not os.path.isdir(year_path):
#         continue

#     for region in os.listdir(year_path):
#         region_path = os.path.join(year_path, region)
#         if not os.path.isdir(region_path):
#             continue

#         for province in os.listdir(region_path):
#             province_path = os.path.join(region_path, province)
#             if not os.path.isdir(province_path):
#                 continue

#             for municipality in os.listdir(province_path):
#                 mun_path = os.path.join(province_path, municipality)
#                 if not os.path.isdir(mun_path):
#                     continue

#                 csv_name = f"{municipality}.csv"
#                 csv_path = os.path.join(mun_path, csv_name)

#                 if os.path.exists(csv_path):
#                     try:
#                         df = pd.read_csv(csv_path)

#                         # Keep only expected columns, ignore extras
#                         df = df[[col for col in data_cols if col in df.columns]]

#                         # Add folder-derived metadata
#                         df["Year"] = year
#                         df["Region"] = region
#                         df["Province"] = province
#                         df["Municipality"] = municipality

#                         # Reorder columns to match Sample.csv
#                         df = df[sample_cols]

#                         all_data.append(df)
#                     except Exception as e:
#                         print(f"Failed to read {csv_path}: {e}")

# final_df = pd.concat(all_data, ignore_index=True)
# # Drop rows where Ownership is 'TOTAL' (case-sensitive match)
# final_df = final_df[final_df["Ownership"] != "TOTAL"]
# final_df.to_csv("Final.csv", index=False)
# print("Saved Final.csv")

import os
import glob
import re
import pandas as pd

# Root folder that process_all() walks through.
BASE_DIR = "output_csv"

# The header of Sample.csv fixes the canonical column order of every output.
sample_df = pd.read_csv("Sample.csv")
sample_cols = list(sample_df.columns)

# Columns filled in from folder names (metadata) versus columns expected
# inside each CSV (data).
metadata_cols = ["Year", "Region", "Province", "Municipality"]
data_cols = [name for name in sample_cols if name not in metadata_cols]

# -------------------------------
# Cleaners used ONLY for final labels
# -------------------------------
def clean_name(val, is_region=False):
    """Turn a raw folder-style name into a display label.

    Args:
        val: Raw name (e.g. ``"01_-_Some_Region"``). Missing values
            (``None`` / NaN, as detected by ``pd.isna``) are returned
            unchanged.
        is_region: When true, a leading two-digit ``XX_-_`` prefix is
            stripped before underscores are replaced.

    Returns:
        ``str(val)`` with underscores replaced by spaces (and the region
        prefix removed when requested), or ``val`` itself when it is missing.
    """
    if pd.isna(val):
        return val
    # Coerce first: re.sub raises TypeError on non-string input, so the
    # conversion must happen before the prefix is stripped, not after.
    text = str(val)
    if is_region:
        text = re.sub(r'^\d{2}_-_', '', text)  # Remove 'XX_-_' prefix
    return text.replace('_', ' ')

# -------------------------------
# Helpers
# -------------------------------
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* restricted to the expected data columns.

    Any column listed in ``data_cols`` that *df* lacks is added and filled
    with 0; columns not listed in ``data_cols`` are dropped. The result's
    columns follow the order of ``data_cols``. The input frame is not
    modified.
    """
    result = df.copy()
    absent = [name for name in data_cols if name not in result.columns]
    for name in absent:
        result[name] = 0
    # Every expected column exists now, so selecting by data_cols both drops
    # the extras and fixes the order.
    return result[list(data_cols)]

def add_labels(level, mode, region, province=None, municipality=None):
    """Build the (Region, Province, Municipality) labels for one CSV.

    Args:
        level: 'region' | 'province' | 'municipality'. Any other value is
            treated like 'municipality' (kept for backward compatibility).
        mode: How levels below ``level`` are filled and whether names are
            cleaned:
            - 'dirty': raw names; missing lower levels repeat the deepest
              available name.
            - 'with_dupes': names passed through ``clean_name``; missing
              lower levels repeat the deepest available name.
            - 'NA': names passed through ``clean_name``; missing lower
              levels are the string "N/A".
        region: Raw region name.
        province: Raw province name (used for province/municipality level).
        municipality: Raw municipality name (used for municipality level).

    Returns:
        A 3-tuple ``(region_label, province_label, municipality_label)``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported modes.
    """
    # Validate up front so a bad mode is reported as such, before any
    # cleaning work is attempted on the names.
    if mode not in ("with_dupes", "NA", "dirty"):
        raise ValueError(f"Unknown mode: {mode}")

    if mode == "dirty":
        if level == "region":
            return (region, region, region)
        if level == "province":
            return (region, province, province)
        return (region, province, municipality)

    # Cleaned label logic ('with_dupes' and 'NA' differ only in the filler
    # used for levels that do not exist at this depth).
    duplicate = mode == "with_dupes"
    region_val = clean_name(region, is_region=True)
    if level == "region":
        filler = region_val if duplicate else "N/A"
        return (region_val, filler, filler)

    province_val = clean_name(province)
    if level == "province":
        return (region_val, province_val, province_val if duplicate else "N/A")

    return (region_val, province_val, clean_name(municipality))

def read_csv_safe(path):
    """Read *path* with ``pd.read_csv``, tolerating failures.

    Returns the parsed DataFrame on success. If reading raises any
    ``Exception``, a message is printed and ``None`` is returned so the
    caller can skip the file.
    """
    try:
        frame = pd.read_csv(path)
    except Exception as err:
        print(f"Failed to read {path}: {err}")
        return None
    return frame

def _subdirs(parent):
    """Yield ``(name, path)`` for each sub-directory of *parent*, sorted by name."""
    for name in sorted(os.listdir(parent)):
        path = os.path.join(parent, name)
        if os.path.isdir(path):
            yield name, path


def _labelled_frames(folder, year, level, mode, region,
                     province=None, municipality=None):
    """Yield one labelled, column-normalized frame per readable CSV in *folder*.

    Only ``*.csv`` files directly inside *folder* are considered, in sorted
    order; files that fail to load are skipped.
    """
    # Escape the folder part: glob would otherwise interpret '[', ']', '*'
    # and '?' occurring in directory names as pattern metacharacters.
    pattern = os.path.join(glob.escape(folder), "*.csv")
    for csv_path in sorted(glob.glob(pattern)):
        df = read_csv_safe(csv_path)
        if df is None:
            continue
        df = normalize_columns(df)
        r_val, p_val, m_val = add_labels(level, mode, region, province, municipality)
        df["Year"] = year
        df["Region"] = r_val
        df["Province"] = p_val
        df["Municipality"] = m_val
        # Reorder to the canonical Sample.csv layout.
        yield df[sample_cols]


def process_all(mode: str) -> pd.DataFrame:
    """Combine every CSV under ``BASE_DIR`` into one labelled DataFrame.

    The tree is walked as ``BASE_DIR/<year>/<region>/<province>/<municipality>``;
    CSV files found directly in a region, province or municipality folder are
    loaded, normalized to ``data_cols``, tagged with Year/Region/Province/
    Municipality and concatenated in sorted traversal order (a region's own
    CSVs first, then each province's CSVs followed by its municipalities).

    Args:
        mode: Labelling mode forwarded to ``add_labels``
            ('with_dupes' | 'NA' | 'dirty').

    Returns:
        The concatenated frame with columns in ``sample_cols`` order, with
        rows whose ``Ownership`` equals "TOTAL" removed. If no CSV was
        loaded, an empty frame with the ``sample_cols`` columns.
    """
    rows = []

    for year, year_path in _subdirs(BASE_DIR):
        for region, region_path in _subdirs(year_path):
            # --- Region-level CSVs ---
            rows.extend(_labelled_frames(region_path, year, "region", mode, region))

            for province, province_path in _subdirs(region_path):
                # --- Province-level CSVs ---
                rows.extend(
                    _labelled_frames(province_path, year, "province", mode,
                                     region, province)
                )

                # --- Municipality-level CSVs ---
                for municipality, mun_path in _subdirs(province_path):
                    rows.extend(
                        _labelled_frames(mun_path, year, "municipality", mode,
                                         region, province, municipality)
                    )

    if not rows:
        return pd.DataFrame(columns=sample_cols)

    final_df = pd.concat(rows, ignore_index=True)
    if "Ownership" in final_df.columns:
        # Drop aggregate rows (exact, case-sensitive match on "TOTAL").
        final_df = final_df[final_df["Ownership"] != "TOTAL"]
    return final_df

# --------- Build three versions ----------
# All three frames are built before anything is written to disk.
final_with_dupes, final_with_na, final_dirty = (
    process_all(label_mode) for label_mode in ("with_dupes", "NA", "dirty")
)

for frame, filename in (
    (final_with_dupes, "Final_with_dupes.csv"),
    (final_with_na, "Final_with_NA.csv"),
    (final_dirty, "Final_dirty.csv"),
):
    frame.to_csv(filename, index=False)

print("Saved Final_with_dupes.csv, Final_with_NA.csv, and Final_dirty.csv")


In [None]:
# import pandas as pd
# import re

# # Read the CSV
# df = pd.read_csv("Final.csv")

# # Function to clean Region, Province, Municipality
# def clean_name(val, is_region=False):
#     if pd.isna(val):
#         return val
#     if is_region:
#         val = re.sub(r'^\d{2}_-_', '', val)  # Remove 'XX_-_' from start
#     return val.replace('_', ' ')

# # Apply transformations
# df["Region"] = df["Region"].apply(lambda x: clean_name(x, is_region=True))
# df["Province"] = df["Province"].apply(clean_name)
# df["Municipality"] = df["Municipality"].apply(clean_name)

# # Save cleaned version if needed
# df.to_csv("Final_cleaned.csv", index=False)
# print("Saved Final_cleaned.csv")
