Setup

In [1]:
from pathlib import Path
import pandas as pd

# Identifiers used to build every folder and file name below.
SITE_ID = "MAO"
ZONE = "DORM1"

# Data folders; the processed folder is created if it does not exist yet.
DATA_DIR = Path("../data")
INTERIM = DATA_DIR.joinpath("interim", SITE_ID)
PROCESSED = DATA_DIR.joinpath("processed", SITE_ID)
PROCESSED.mkdir(parents=True, exist_ok=True)

# Input files (adjust if the names differ).
F_MET = INTERIM.joinpath("met_aligned.csv.gz")                 # EPW + flags/metrics (without Feb 29)
F_VN = INTERIM.joinpath("eplus_vn.csv.gz")                     # EnergyPlus (natural ventilation)
F_AC = INTERIM.joinpath("eplus_ac.csv.gz")                     # EnergyPlus (air conditioning), if it exists
F_JOS3 = PROCESSED.joinpath(f"JOS3_output_{ZONE}.csv.gz")      # JOS-3 (t_core, t_skin_mean, w_mean)

# Output: base path without extension; each save step appends its own suffix.
OUT_BASE = PROCESSED.joinpath(f"{SITE_ID}_{ZONE}_1991-2023_FULL_PIPELINE")

Helpers

In [2]:
def read_csv_timeset(path: Path) -> pd.DataFrame:
    """Read a gzip-compressed CSV and return it ordered by a clean ``timeset``.

    The ``timeset`` column is parsed to datetime; rows whose value cannot be
    parsed are dropped, the frame is sorted by ``timeset``, and for repeated
    timestamps only the row that appears first in the file is kept.

    Parameters
    ----------
    path : Path or str
        Location of the gzip-compressed CSV file.

    Returns
    -------
    pd.DataFrame
        Frame sorted by ``timeset`` with a fresh 0..n-1 index and no
        duplicated timestamps.

    Raises
    ------
    ValueError
        If the file has no ``timeset`` column.
    """
    # Accept plain strings too; `.name` is needed for the error message.
    path = Path(path)
    df = pd.read_csv(path, compression="gzip", low_memory=False)
    if "timeset" not in df.columns:
        raise ValueError(f"{path.name} não possui coluna 'timeset'.")

    # Unparseable timestamps become NaT and are discarded right after.
    df["timeset"] = pd.to_datetime(df["timeset"], errors="coerce")
    df = df.dropna(subset=["timeset"])

    # A stable sort keeps rows with equal timestamps in file order, so that
    # "keep first" below deterministically means "first in the file".
    df = df.sort_values("timeset", kind="mergesort")
    df = df.drop_duplicates(subset="timeset", keep="first")
    return df.reset_index(drop=True)

def pref(df: pd.DataFrame, prefix: str, keep=("timeset",)) -> pd.DataFrame:
    """Return a copy of ``df`` with ``prefix`` prepended to its column names.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are renamed; it is not modified.
    prefix : str
        Text placed in front of every renamed column.
    keep : str or collection of column names
        Columns left untouched. A single name may be passed as a plain string.

    Returns
    -------
    pd.DataFrame
        New frame with the renamed columns.
    """
    # A bare string would turn `col in keep` into a substring test
    # (e.g. "time" in "timeset"), so wrap it into a 1-tuple first.
    if isinstance(keep, str):
        keep = (keep,)
    renamed = {col: (col if col in keep else f"{prefix}{col}") for col in df.columns}
    return df.rename(columns=renamed)

Carregar bases

In [3]:
# Load the four inputs in a fixed order:
# met  -> time base (EPW + flags + metrics)
# vn   -> natural-ventilation outputs
# jos3 -> JOS-3 outputs
# ac   -> air-conditioning outputs
met, vn, jos3, ac = (read_csv_timeset(src) for src in (F_MET, F_VN, F_JOS3, F_AC))

# Report the covered time span and row count of each input.
for label, frame in (("met:", met), ("vn :", vn), ("jos3:", jos3), ("ac :", ac)):
    stamps = frame["timeset"]
    print(label, stamps.min(), "→", stamps.max(), "len:", len(frame))

met: 1991-01-01 01:00:00 → 2024-01-01 00:00:00 len: 289080
vn : 1991-01-01 01:00:00 → 2024-01-01 00:00:00 len: 289080
jos3: 1991-01-01 01:00:00 → 2024-01-01 00:00:00 len: 289080
ac : 1991-01-01 01:00:00 → 2024-01-01 00:00:00 len: 289080


Prefixar e unir

In [4]:
# Prefix every non-key column so the origin of each variable stays visible
# after the join.
vn_p, jos3_p, ac_p = (
    pref(frame, tag) for frame, tag in ((vn, "vn_"), (jos3, "jos3_"), (ac, "ac_"))
)

# Left-join each prefixed table onto the meteorological time base.
full = met
for right in (vn_p, jos3_p, ac_p):
    full = full.merge(right, on="timeset", how="left")

full = full.sort_values("timeset").reset_index(drop=True)

# Sanity report: time span, row count and number of repeated timestamps.
full_stamps = full["timeset"]
print("full:", full_stamps.min(), "→", full_stamps.max(), "len:", len(full))
print("dups full:", full_stamps.duplicated().sum())


full: 1991-01-01 01:00:00 → 2024-01-01 00:00:00 len: 289080
dups full: 0


QC básico

In [5]:
# NaNs per block (sample): group the merged columns by their name prefix.
all_columns = list(full.columns)
cols_flags = [name for name in all_columns if name.startswith(("HW_", "INMET_", "OUZ_"))]
cols_vn = [name for name in all_columns if name.startswith("vn_")]
cols_jos3 = [name for name in all_columns if name.startswith("jos3_")]
cols_ac = [name for name in all_columns if name.startswith("ac_")]

def nan_ratio(df, cols, label):
    """Print the average share of missing values over the columns ``cols``.

    The NaN fraction is computed per column and then averaged across the
    columns, rounded to three decimals. Nothing is printed when ``cols`` is
    empty.
    """
    if not cols:
        return
    per_column = df[cols].isna().mean()
    print(label, "NaN:", per_column.mean().round(3))

# Report the missing-value share of each column group, in a fixed order.
for group_label, group_cols in (
    ("Flags", cols_flags),
    ("VN", cols_vn),
    ("JOS3", cols_jos3),
    ("AC", cols_ac),
):
    nan_ratio(full, group_cols, group_label)

Flags NaN: 0.763
VN NaN: 0.0
JOS3 NaN: 0.0
AC NaN: 0.0


Salvar

In [6]:
# Write the merged table as a gzip-compressed CSV next to the other outputs.
out_csv_gz = f"{OUT_BASE}.csv.gz"
full.to_csv(out_csv_gz, index=False, compression="gzip")
print("Saved:", out_csv_gz)

Saved: ..\data\processed\MAO\MAO_DORM1_1991-2023_FULL_PIPELINE.csv.gz


In [7]:
# Write the merged table as a plain (uncompressed) CSV.
out_csv = f"{OUT_BASE}.csv"
full.to_csv(out_csv, index=False)
print("Saved:", out_csv)

Saved: ..\data\processed\MAO\MAO_DORM1_1991-2023_FULL_PIPELINE.csv


In [8]:
# Write the merged table as Parquet, without the index.
out_parquet = f"{OUT_BASE}.parquet"
full.to_parquet(out_parquet, index=False)
print("Saved:", out_parquet)

Saved: ..\data\processed\MAO\MAO_DORM1_1991-2023_FULL_PIPELINE.parquet
