In [3]:
import pandas as pd
from pathlib import Path

# Locate the repository root by walking up from the kernel's working directory
# until a directory that contains data/raw is found. This keeps the notebook
# runnable whether the kernel starts in notebooks/ or at the repo root.
# If no such directory exists, fall back to the parent of the working directory
# (i.e. assume the kernel was started inside notebooks/).
_start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "data" / "raw").is_dir()
    ),
    _start_dir.parent,
)
path = project_root / "data" / "raw" / "default_of_credit_card_clients__normalized.csv"

print("Loading:", path)
df = pd.read_csv(path)

# Quick sanity check: frame dimensions plus the first ten column names.
df.shape, df.columns.tolist()[:10]


Loading: /Users/marcelorech/Documents/niw-risk-ai/credit-risk-default-prediction/data/raw/default_of_credit_card_clients__normalized.csv


((30000, 25),
 ['ID',
  'LIMIT_BAL',
  'SEX',
  'EDUCATION',
  'MARRIAGE',
  'AGE',
  'PAY_0',
  'PAY_2',
  'PAY_3',
  'PAY_4'])

In [4]:
# Two quick data-quality checks shown together as a tuple:
#   1. counts per value of the target column (missing values counted too)
#   2. total number of missing cells across the whole frame
(
    df["default.payment.next.month"].value_counts(dropna=False),
    df.isna().sum().sum(),
)


(default.payment.next.month
 0    23364
 1     6636
 Name: count, dtype: int64,
 np.int64(0))

In [5]:
from pathlib import Path

# Build a Markdown data dictionary for `df` (one table row per column) and
# write it to reports/data_dictionary.md under the project root.
out_path = project_root / "reports" / "data_dictionary.md"
out_path.parent.mkdir(parents=True, exist_ok=True)

# Each entry is a chunk of Markdown that carries its own trailing newline;
# the chunks are concatenated as-is when the file is written.
rows = []
rows.append("# Data Dictionary - UCI Credit Card Default\n")
rows.append(f"- Rows: {df.shape[0]}\n- Columns: {df.shape[1]}\n")
rows.append("\n## Target\n")
rows.append("- `default.payment.next.month`: 1 = default, 0 = non-default\n")
rows.append("\n## Columns\n")
rows.append("| column | dtype | example | notes |\n|---|---:|---:|---|\n")

for col in df.columns:
    dtype = str(df[col].dtype)
    example = df[col].iloc[0]  # value from the first row serves as the example
    # Short notes for key known columns (optional baseline); others stay blank.
    notes = ""
    if col == "ID":
        notes = "Row identifier (drop from modeling)."
    # PAY_AMT must be tested before the broader PAY_ prefix: a name such as
    # "PAY_AMT1" also starts with "PAY_", so the reverse order would label
    # payment-amount columns as repayment-status columns.
    elif col.startswith("PAY_AMT"):
        notes = "Payment amount by month."
    elif col.startswith("PAY_"):
        notes = "Repayment status by month (categorical/ordinal)."
    elif col.startswith("BILL_AMT"):
        notes = "Bill statement amount by month."
    elif col in ["SEX", "EDUCATION", "MARRIAGE"]:
        notes = "Categorical code (check code meanings in UCI description)."
    rows.append(f"| {col} | {dtype} | {example} | {notes} |\n")

out_path.write_text("".join(rows), encoding="utf-8")
print("Wrote:", out_path)


Wrote: /Users/marcelorech/Documents/niw-risk-ai/credit-risk-default-prediction/reports/data_dictionary.md
