In [None]:
import pandas as pd
import yaml
import numpy as np

# --------------------------------------------------
# Identifier columns: these are codes, not quantities, so they are
# always read and reported as strings
# --------------------------------------------------
ID_COLS = ["ent_id", "unique_id", "st_code", "dist_code", "sbd_code"]

# --------------------------------------------------
# Read the cleaned census CSV; every identifier column is pinned to
# the pandas "string" dtype, all other columns are inferred
# --------------------------------------------------
df = pd.read_csv("census_clean.csv", dtype=dict.fromkeys(ID_COLS, "string"))

# --------------------------------------------------
# Load variable metadata
# --------------------------------------------------
# `meta` maps each dataset column name to a mapping with the keys
# "description", "unit" and "notes" (these are the keys read when the
# codebook rows are built).
#
# The file is decoded as UTF-8 explicitly so that non-ASCII text in the
# metadata is read the same way on every platform instead of depending on
# the locale's default encoding.
with open("variable_metadata.yaml", "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so later lookups fail per column rather than on `None`.
    meta = yaml.safe_load(f) or {}

# --------------------------------------------------
# Helper functions
# --------------------------------------------------
# Translation table applied in ONE pass over the input. Chained
# str.replace calls would re-escape the braces that belong to the
# "\textbackslash{}" replacement itself (yielding "\textbackslash\{\}").
_LATEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "_": r"\_",
    "#": r"\#",
    "{": r"\{",
    "}": r"\}",
    "$": r"\$",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def latex_escape(text):
    """Return *text* as a string that is safe to place in a LaTeX table cell.

    Parameters
    ----------
    text : object
        Value to render. ``None`` and float NaN are treated as "no value";
        anything else is converted with ``str()``.

    Returns
    -------
    str
        Empty string for ``None``/NaN, otherwise ``str(text)`` with the LaTeX
        special characters ``\\ & % _ # { } $ ~ ^`` replaced by their escaped
        forms.

    Notes
    -----
    Pass raw, unescaped text: a string that already contains LaTeX escapes
    (for example ``\\%``) will have its backslash escaped again.
    """
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""
    return str(text).translate(_LATEX_ESCAPES)

def dtype_label(series, colname):
    """Return the human-readable type label shown in the codebook.

    Columns named in ``ID_COLS`` are always labelled as string identifiers,
    whatever their dtype. Otherwise the label is chosen from the dtype of
    *series*: integer, float, or the catch-all categorical/string label.
    """
    if colname in ID_COLS:
        # Identifier override takes precedence over dtype inspection.
        label = "String (Identifier)"
    elif pd.api.types.is_integer_dtype(series):
        label = "Integer"
    elif pd.api.types.is_float_dtype(series):
        label = "Float"
    else:
        label = "Categorical / String"
    return label

# --------------------------------------------------
# Build rows
# --------------------------------------------------
# One row per dataframe column. Cell order matches the table header:
# variable, description, unit, type, range/categories, missingness, notes.
# Every cell is passed through latex_escape exactly once, so all text
# handed to it must be raw (unescaped).
rows = []
n = len(df)

for col in df.columns:
    s = df[col]
    # A column without a metadata entry (or with only some fields filled
    # in) gets blank cells instead of aborting the run with a KeyError.
    m = (meta or {}).get(col) or {}

    dtype = dtype_label(s, col)
    missing = int(s.isna().sum())
    # Guard the percentage against an empty dataset (n == 0).
    miss_pct = round((missing / n) * 100, 2) if n else 0.0

    present = s.dropna()
    if present.empty:
        # Nothing observed: leave the cell blank rather than "nan -- nan".
        vrange = ""
    elif pd.api.types.is_numeric_dtype(s):
        vrange = f"{present.min()} -- {present.max()}"
    else:
        cats = present.unique()
        if len(cats) <= 6:
            # Sort only the short lists that are actually printed.
            try:
                cats = sorted(cats)
            except TypeError:
                # Values of mixed types are not mutually comparable;
                # order them by their printed form instead.
                cats = sorted(cats, key=str)
            vrange = ", ".join(map(str, cats))
        else:
            vrange = "See notes"

    rows.append([
        latex_escape(col),
        latex_escape(m.get("description")),
        latex_escape(m.get("unit")),
        latex_escape(dtype),
        latex_escape(vrange),
        # Raw "%" here: latex_escape adds the backslash. Pre-escaping it
        # would get the backslash itself escaped and corrupt the cell.
        latex_escape(f"{missing} ({miss_pct}%)"),
        latex_escape(m.get("notes")),
    ])

# --------------------------------------------------
# Write FINAL codebook.tex
# --------------------------------------------------
# Emits a seven-column `longtblr` table: a fixed preamble and header row,
# then one line per entry of `rows` (cells joined with " & " and terminated
# with "\\"), then the closing \end{longtblr}.
#
# The encoding is fixed to UTF-8 so the output does not depend on the
# platform's locale encoding and non-ASCII metadata text cannot raise
# UnicodeEncodeError while writing.
with open("codebook.tex", "w", encoding="utf-8") as f:
    f.write(r"""
\footnotesize
\begin{longtblr}[
  caption = {Codebook for Cleaned Indian Census Population Dataset},
  label = {tab:codebook},
]{
  width = \textwidth,
  leftsep = 0pt,
  rightsep = 0pt,
  colsep = 6pt,
  rowsep = 10pt,
  colspec = {
    Q[l,wd=0.11\textwidth]
    Q[l,wd=0.11\textwidth]
    Q[l,wd=0.11\textwidth]
    Q[c,wd=0.08\textwidth]
    Q[l,wd=0.10\textwidth]
    Q[c,wd=0.08\textwidth]
    Q[l,wd=0.28\textwidth]
  },
  rowhead = 1,
  hlines,
}
Variable &
Description &
Unit of Observation &
Type &
Valid Range / Categories &
Missingness &
Notes \\
""")

    # Cells were already LaTeX-escaped when `rows` was built.
    for r in rows:
        f.write(" & ".join(r) + r" \\" + "\n")

    f.write(r"\end{longtblr}")
