# Configuração do Ambiente e Conexão

In [1]:
import os
import re
import unicodedata
from pathlib import Path

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

# ETL

## Extract

In [2]:
# Extract: read the raw CSV into a DataFrame.
# The path is relative to the notebook's working directory.
raw_csv_path = "../Data Layer/raw/dados_brutos.csv"

print("Lendo dados da camada BRONZE (CSV)...")
df = pd.read_csv(raw_csv_path)

Lendo dados da camada BRONZE (CSV)...


## Transform

Funções de limpeza e padronização

In [3]:
def remove_special_chars(value):
    """Return an ASCII-only, lower-cased, trimmed version of ``value``.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Any other input is converted with ``str`` and then:

    1. decomposed with NFKD so accented letters split into base letter plus
       combining mark, and the non-ASCII part is dropped;
    2. stripped of every character that is not an ASCII letter, a digit,
       whitespace or a hyphen;
    3. trimmed of surrounding whitespace and lower-cased.
    """
    if not pd.isna(value):
        decomposed = unicodedata.normalize('NFKD', str(value))
        ascii_only = decomposed.encode('ASCII', 'ignore').decode('ASCII')
        cleaned = re.sub(r'[^a-zA-Z0-9\s\-]', '', ascii_only)
        return cleaned.strip().lower()
    return value

def round_float_columns(df):
    """Round every float column of ``df`` to two decimal places.

    The frame passed in is modified in place and that same object is
    returned, so the function works both as a statement and in an
    assignment. Columns of any other dtype are left untouched.
    """
    targets = df.select_dtypes(include='float').columns
    df[targets] = df[targets].round(2)
    return df

def normalize_column_name(col):
    """Convert a column label into an ASCII ``snake_case`` identifier.

    Accents are removed via NFKD decomposition, the text is lower-cased and
    trimmed, every run of characters other than ``a-z``/``0-9`` collapses
    into a single underscore, and leading/trailing underscores are dropped.
    """
    decomposed = unicodedata.normalize('NFKD', col)
    ascii_name = decomposed.encode('ascii', 'ignore').decode('utf-8')
    snake = re.sub(r'[^a-z0-9]+', '_', ascii_name.lower().strip())
    return snake.strip('_')

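Aplicação das funções de padronização: normalização dos nomes das colunas, conversão de `date` para datetime, limpeza das colunas de texto, criação das colunas `month` e `season` e arredondamento das colunas float.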

In [4]:
# Standardize column labels to ASCII snake_case before anything refers to
# them by name.
df.columns = [normalize_column_name(c) for c in df.columns]

# errors='coerce' turns values that cannot be parsed as dates into NaT
# instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Clean every text (object dtype) column. 'date' is already datetime here,
# so it is not part of this selection.
string_cols = df.select_dtypes(include='object').columns

for col in string_cols:
    df[col] = df[col].apply(remove_special_chars)

# Month number taken from the parsed date; missing where the date is NaT.
df['month'] = df['date'].dt.month

# Months 6-8 are labelled 'verao'; every other known month is
# 'outras_estacoes'.
df['season'] = np.where(
    df['month'].isin([6, 7, 8]),
    'verao',
    'outras_estacoes'
)

# A row whose date failed to parse has no month, and isin() reports False for
# it, which would silently label it 'outras_estacoes'. Keep the season
# missing for those rows so unknown dates are not reported as a real season.
df['season'] = df['season'].where(df['date'].notna())

df = round_float_columns(df)


## Load

In [5]:
# Load: write the cleaned frame to the `silver` schema in PostgreSQL.
#
# The connection URL is read from the HOUSETS_DB_URL environment variable so
# real credentials never have to be stored in the notebook. When the variable
# is not set, the original local development URL is used, which keeps
# existing setups working unchanged.
db_url = os.environ.get(
    "HOUSETS_DB_URL",
    "postgresql://postgres:postgres@localhost:5432/housets",
)
engine = create_engine(db_url)

# engine.begin() opens a transaction that commits when the block exits
# normally and rolls back if it raises.
with engine.begin() as conn:
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS silver;"))

# if_exists="replace" drops any existing silver.silver_houses table and
# recreates it from the current frame on every run.
df.to_sql(
    name="silver_houses",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False
)

print("RAW to SILVER concluído com sucesso.")

RAW to SILVER concluído com sucesso.
