In [4]:
import os
import re
import unicodedata
from pathlib import Path

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text


# SQLAlchemy engine for the target PostgreSQL database.
# The connection URL can be overridden with the HOUSETS_DATABASE_URL
# environment variable so real credentials do not have to be committed to
# the source; when the variable is unset, the original local-development
# URL is used, so existing setups keep working unchanged.
engine = create_engine(
    os.environ.get(
        "HOUSETS_DATABASE_URL",
        "postgresql://postgres:postgres@localhost:5432/housets",
    )
)


def remove_special_chars(value):
    """Return *value* as lowercase ASCII text stripped of special characters.

    Values that ``pd.isna`` reports as missing are returned unchanged.
    Anything else is converted with ``str()``, reduced to ASCII, filtered
    down to letters, digits, whitespace and hyphens, then lowercased and
    trimmed of leading/trailing whitespace.
    """
    if pd.isna(value):
        return value

    # NFKD decomposition separates accented letters into a base letter plus
    # a combining mark; the ASCII round-trip then drops the mark (and any
    # other non-ASCII character) while keeping the base letter.
    decomposed = unicodedata.normalize('NFKD', str(value))
    ascii_text = decomposed.encode('ASCII', 'ignore').decode('ASCII')

    # Remove everything that is not a letter, digit, whitespace or hyphen.
    filtered = re.sub(r'[^a-zA-Z0-9\s\-]', '', ascii_text)
    return filtered.lower().strip()


def round_float_columns(df):
    """Round every float column of *df* to two decimal places.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or in an assignment.
    Non-float columns are left untouched.
    """
    targets = df.select_dtypes(include='float').columns
    rounded = df[targets].round(2)
    df[targets] = rounded
    return df

# Make sure the target schema exists before any table is written to it.
# engine.begin() opens a transaction that is committed when the block exits.
create_schema_stmt = text("CREATE SCHEMA IF NOT EXISTS silver;")
with engine.begin() as conn:
    conn.execute(create_schema_stmt)


# ---------------------------------------------------------------------------
# RAW (bronze CSV) -> SILVER (PostgreSQL) pipeline.
# Steps: load the CSV, normalize headers, parse dates, drop incomplete rows,
# clean text columns, deduplicate, derive month/season, round floats, and
# write the result to the silver.silver_houses table.
# ---------------------------------------------------------------------------
print("Lendo dados da camada BRONZE (CSV)...")

# The path is relative to the current working directory.
df = pd.read_csv("../Data Layer/raw/dados_brutos.csv")

# Normalize header names so the column lookups below do not depend on the
# casing or surrounding whitespace used in the CSV header.
df.columns = (
    df.columns
    .str.lower()
    .str.strip()
)

# Unparseable dates become NaT and are removed by the dropna below.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Discard rows missing any of the required fields.
df = df.dropna(subset=['price', 'date', 'city_full'])

# Clean every text (object-dtype) column. 'date' is already datetime at this
# point, so it is not affected.
string_cols = df.select_dtypes(include='object').columns

for col in string_cols:
    df[col] = df[col].apply(remove_special_chars)

# Deduplicate only AFTER text normalization: rows that differed solely by
# case, accents or punctuation are identical once cleaned, and deduplicating
# earlier would let them through as duplicates in the silver table.
df = df.drop_duplicates()

# Derived calendar features. Months 6-8 are labelled 'verao'; every other
# month is labelled 'outras_estacoes'.
df['month'] = df['date'].dt.month

df['season'] = np.where(
    df['month'].isin([6, 7, 8]),
    'verao',
    'outras_estacoes'
)

df = round_float_columns(df)

# Replace the silver table on every run so the load is a full refresh.
df.to_sql(
    name="silver_houses",
    schema="silver",
    con=engine,
    if_exists="replace",
    index=False
)

print("RAW → SILVER concluído com sucesso.")


Lendo dados da camada BRONZE (CSV)...
RAW → SILVER concluído com sucesso.
