In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# Config
# Directory that is globbed for the input CSV files.
DATA_PATH = Path("..") / "data" / "bronze"

# Standard column name -> alternative spellings accepted in the input files.
# The loading cell compares these against the stripped column names with an
# exact (not prefix) match.
COLUMN_ALIASES = dict(
    lag=["Defasagem", "Defas"],
    math=["Matem", "Mat"],
    portuguese=["Portug", "Por"],
)

# Columns retained in the combined frame, in output order.
COLUMNS_TO_KEEP = [
    "RA",
    "IAA",
    "IEG",
    "IPS",
    "math",
    "portuguese",
    "lag",
    "year",
]

In [4]:
# Gather every "*.csv" directly under DATA_PATH in sorted path order, and
# stop right away when the directory yields none.
csv_files = list(DATA_PATH.glob("*.csv"))
csv_files.sort()

if len(csv_files) == 0:
    raise FileNotFoundError(f"No CSV files found in {DATA_PATH}")

In [5]:
# Load each CSV, tag its rows with the year taken from the file name, and map
# the per-file column spellings onto the standard names from COLUMN_ALIASES.
#
# Raises ValueError when a file name is not an integer year, or when a file
# ends up with the same standard column name more than once after renaming.

# Reverse lookup (source spelling -> standard name), built once instead of
# re-scanning every alias list for every column of every file.
alias_to_standard = {
    alias: standard_name
    for standard_name, aliases in COLUMN_ALIASES.items()
    for alias in aliases
}

dfs = []

for file in csv_files:
    df = pd.read_csv(file)

    # Clean column names
    df.columns = df.columns.str.strip()

    # Add year: the file stem itself is parsed as the year (e.g. "2022.csv").
    try:
        df["year"] = int(file.stem)
    except ValueError as exc:
        raise ValueError(
            f"Cannot derive a year from file name {file.name!r}; "
            "expected an integer stem such as '2022.csv'"
        ) from exc

    # Standardize columns (exact match on the stripped column name)
    rename_map = {
        col: alias_to_standard[col]
        for col in df.columns
        if col in alias_to_standard
    }

    df = df.rename(columns=rename_map)

    # Two aliases of the same standard name in one file (or an alias next to
    # the standard name itself) would leave duplicate column labels, which
    # break the later concat / column selection with an unrelated-looking
    # error. Fail here instead, naming the file.
    renamed_labels = list(df.columns)
    clashing = [name for name in COLUMN_ALIASES if renamed_labels.count(name) > 1]
    if clashing:
        raise ValueError(
            f"{file.name}: more than one source column maps to {clashing}; "
            f"applied renames: {rename_map}"
        )

    print(f"Loaded {file.name} → {df.shape}")
    dfs.append(df)

Loaded 2022.csv → (860, 43)
Loaded 2023.csv → (1014, 49)
Loaded 2024.csv → (1156, 51)


In [6]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Validate required columns
# NOTE(review): the check runs on the combined frame, so a column absent from
# only some of the files still passes; concat's default outer join fills it
# with NaN for those files' rows.
missing_cols = set(COLUMNS_TO_KEEP).difference(combined_df.columns)
if len(missing_cols) > 0:
    raise ValueError(f"Missing columns: {missing_cols}")

# Keep only the required columns, in the order given by COLUMNS_TO_KEEP.
combined_df = combined_df.loc[:, COLUMNS_TO_KEEP]

In [7]:
# Preview the first rows of the combined frame (bare last expression, so the
# notebook renders it as the cell output).
combined_df.head()

Unnamed: 0,RA,IAA,IEG,IPS,math,portuguese,lag,year
0,RA-1,83,41,56,27,35,-1,2022
1,RA-2,88,52,63,63,45,0,2022
2,RA-3,0,79,56,58,40,0,2022
3,RA-4,88,45,56,28,35,0,2022
4,RA-5,79,86,56,70,29,0,2022
