### 1. Configuração Inicial

**Objetivo:** Importar as bibliotecas e definir os caminhos e nomes de tabelas.

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, input_file_name, current_timestamp

# --- CONFIGURAÇÃO DOS CAMINHOS E NOMES ---
data_source = "sinasc"
root_path = "/lakehouse/health_insights_brasil"
# Apontamos para o diretório da AMOSTRA que criamos no notebook anterior
landing_zone_path = f"{root_path}/landing_zone/{data_source}" 
bronze_path = f"{root_path}/bronze/{data_source}"
table_name = "bronze_sinasc"

print("✅ Configuração concluída.")
print(f"   - Lendo de: {landing_zone_path}")
print(f"   - Escrevendo em: {bronze_path}")

✅ Configuração concluída.
   - Lendo de: /lakehouse/health_insights_brasil/landing_zone/sinasc
   - Escrevendo em: /lakehouse/health_insights_brasil/bronze/sinasc


### 2. Definição Explícita do Schema

**Objetivo:** Criar o "contrato" dos nossos dados. Definimos cada coluna e seu tipo. Para a camada Bronze, é uma boa prática ler a maioria dos campos como `StringType` para evitar erros de parse. A conversão para números ou datas será feita na camada Silver.

In [0]:
# Schema explícito para os dados do SINASC 
bronze_schema = StructType([
    StructField("ORIGEM", StringType(), True),
    StructField("CODESTAB", StringType(), True),
    StructField("CODMUNNASC", StringType(), True),
    StructField("NUMERODN", StringType(), True), 
    StructField("LOCNASC", StringType(), True),
    StructField("IDADEMAE", StringType(), True),
    StructField("ESTCIVMAE", StringType(), True),
    StructField("ESCMAE", StringType(), True),
    StructField("CODOCUPMAE", StringType(), True),
    StructField("QTDFILVIVO", StringType(), True),
    StructField("QTDFILMORT", StringType(), True),
    StructField("CODMUNRES", StringType(), True),
    StructField("GESTACAO", StringType(), True),
    StructField("GRAVIDEZ", StringType(), True),
    StructField("PARTO", StringType(), True),
    StructField("CONSULTAS", StringType(), True),
    StructField("DTNASC", StringType(), True),
    StructField("HORANASC", StringType(), True),
    StructField("SEXO", StringType(), True),
    StructField("APGAR1", StringType(), True),
    StructField("APGAR5", StringType(), True),
    StructField("RACACOR", StringType(), True),
    StructField("PESO", StringType(), True),
    StructField("IDANOMAL", StringType(), True),
    StructField("DTCADASTRO", StringType(), True),
    StructField("CODANOMAL", StringType(), True),
    StructField("NUMEROLOTE", StringType(), True),
    StructField("VERSAOSIST", StringType(), True),
    StructField("DTRECEBIM", StringType(), True),
    StructField("DIFDATA", StringType(), True),
    StructField("DTRECORIGA", StringType(), True),
    StructField("NATURALMAE", StringType(), True),
    StructField("CODMUNNATU", StringType(), True),
    StructField("CODUFNATU", StringType(), True),
    StructField("ESCMAE2010", StringType(), True),
    StructField("SERIESCMAE", StringType(), True),
    StructField("DTNASCMAE", StringType(), True),
    StructField("RACACORMAE", StringType(), True),
    StructField("QTDGESTANT", StringType(), True),
    StructField("QTDPARTNOR", StringType(), True),
    StructField("QTDPARTCES", StringType(), True),
    StructField("IDADEPAI", StringType(), True),
    StructField("DTULTMENST", StringType(), True),
    StructField("SEMAGESTAC", StringType(), True),
    StructField("TPMETESTIM", StringType(), True),
    StructField("CONSPRENAT", StringType(), True),
    StructField("MESPRENAT", StringType(), True),
    StructField("TPAPRESENT", StringType(), True),
    StructField("STTRABPART", StringType(), True),
    StructField("STCESPARTO", StringType(), True),
    StructField("TPNASCASSI", StringType(), True),
    StructField("TPFUNCRESP", StringType(), True),
    StructField("TPDOCRESP", StringType(), True),
    StructField("DTDECLARAC", StringType(), True),
    StructField("ESCMAEAGR1", StringType(), True),
    StructField("STDNEPIDEM", StringType(), True),
    StructField("STDNNOVA", StringType(), True),
    StructField("CODPAISRES", StringType(), True),
    StructField("TPROBSON", StringType(), True),
    StructField("PARIDADE", StringType(), True),
    StructField("KOTELCHUCK", StringType(), True),
    StructField("CONTADOR", StringType(), True),
    StructField("munResUf", StringType(), True),
    StructField("munResNome", StringType(), True),
    StructField("munResLat", StringType(), True),
    StructField("munResLon", StringType(), True),
    StructField("munResAlt", StringType(), True),
    StructField("munResArea", StringType(), True)
])

print("✅ Schema da camada Bronze corrigido com a coluna NUMERODN.")

✅ Schema da camada Bronze corrigido com a coluna NUMERODN.


### 3. Leitura do CSV e Escrita da Tabela Bronze

**Objetivo:** Ler o CSV da Landing Zone forçando o uso do nosso schema, adicionar colunas de metadados e salvar a tabela final no formato Delta.

In [0]:
# --- LIMPEZA PARA UMA CARGA COMPLETA ---
print("Limpando diretórios e tabela antiga para garantir uma carga completa...")
dbutils.fs.rm(bronze_path, recurse=True)
spark.sql(f"DROP TABLE IF EXISTS default.{table_name}")

# --- PROCESSAMENTO EM LOTE ---
print("Iniciando o processamento em lote...")

# Leitura do CSV aplicando nosso schema explícito
df_raw = (spark.read
          .format("csv")
          .option("header", "true")
          .option("sep", ";")
          .schema(bronze_schema) # <-- APLICAÇÃO DO SCHEMA
          .load(landing_zone_path)
         )

# Adiciona as colunas de metadados para rastreabilidade
df_bronze = (df_raw
             .withColumn("_ingestion_timestamp", current_timestamp())
             .withColumn("_source_file", input_file_name())
            )

# Escrita dos dados no formato Delta
print(f"Salvando {df_bronze.count()} registros no formato Delta em: {bronze_path}")
(df_bronze.write
 .format("delta")
 .mode("overwrite")
 .save(bronze_path)
)

# Criação da tabela externa no Metastore
print(f"Registrando a tabela externa '{table_name}'...")
spark.sql(f"""
    CREATE TABLE default.{table_name}
    USING DELTA
    LOCATION '{bronze_path}'
""")

print(f"\n✅ Tabela '{table_name}' criada com sucesso na camada Bronze.")

Limpando diretórios e tabela antiga para garantir uma carga completa...
Iniciando o processamento em lote...
Salvando 10000 registros no formato Delta em: /lakehouse/health_insights_brasil/bronze/sinasc
Registrando a tabela externa 'bronze_sinasc'...

✅ Tabela 'bronze_sinasc' criada com sucesso na camada Bronze.


### 4. Verificação Final

**Objetivo:** Confirmar visualmente que a tabela foi criada e que os dados estão estruturados corretamente com as colunas que definimos.

In [0]:
%sql
-- A verificação final que agora deve mostrar os dados corretamente preenchidos
SELECT * FROM default.bronze_sinasc LIMIT 10;

ORIGEM,CODESTAB,CODMUNNASC,NUMERODN,LOCNASC,IDADEMAE,ESTCIVMAE,ESCMAE,CODOCUPMAE,QTDFILVIVO,QTDFILMORT,CODMUNRES,GESTACAO,GRAVIDEZ,PARTO,CONSULTAS,DTNASC,HORANASC,SEXO,APGAR1,APGAR5,RACACOR,PESO,IDANOMAL,DTCADASTRO,CODANOMAL,NUMEROLOTE,VERSAOSIST,DTRECEBIM,DIFDATA,DTRECORIGA,NATURALMAE,CODMUNNATU,CODUFNATU,ESCMAE2010,SERIESCMAE,DTNASCMAE,RACACORMAE,QTDGESTANT,QTDPARTNOR,QTDPARTCES,IDADEPAI,DTULTMENST,SEMAGESTAC,TPMETESTIM,CONSPRENAT,MESPRENAT,TPAPRESENT,STTRABPART,STCESPARTO,TPNASCASSI,TPFUNCRESP,TPDOCRESP,DTDECLARAC,ESCMAEAGR1,STDNEPIDEM,STDNNOVA,CODPAISRES,TPROBSON,PARIDADE,KOTELCHUCK,CONTADOR,munResUf,munResNome,munResLat,munResLon,munResAlt,munResArea,_ingestion_timestamp,_source_file
1,1,2516500,110001,1,24,1,3,999992,1,0,110001,5,1,2,2,14022024,845,1,8,9,4,3120,2,5032024,,20240003,3.2.50,5032024,20,20,5032024,811,110028,11,2,5.0,11012000,4,1,0,1,,17052023.0,38,8,2,3,1,2,2,1,2,3,14022024,3,0,1,1,5,1,2,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
2,1,2516500,110001,1,29,2,5,999992,0,0,110001,5,1,2,4,17042024,850,1,8,9,4,3564,2,9052024,,20240005,3.2.50,9052024,22,22,9052024,811,110001,11,5,,22111994,4,0,0,0,41.0,17072023.0,39,8,8,5,1,2,1,1,2,3,17042024,8,0,1,1,2,0,2,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
3,1,2516500,110001,1,30,5,4,622020,2,0,110001,5,1,2,4,29052024,1006,1,8,9,4,2816,2,3062024,,20240006,3.2.50,3062024,5,5,3062024,811,110001,11,3,2.0,12081993,4,2,0,2,35.0,27082023.0,39,8,8,2,1,2,1,1,2,3,29052024,5,0,1,1,5,1,5,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
4,1,2516500,110001,1,14,5,4,999991,0,0,110001,5,1,2,4,27052024,900,1,8,9,1,3126,2,3062024,,20240006,3.2.50,3062024,7,7,3062024,811,110001,11,2,8.0,12102009,1,0,0,0,17.0,28082023.0,38,8,10,3,1,2,2,1,2,3,27052024,4,0,1,1,1,0,5,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
5,1,2516500,110001,1,24,2,4,999992,1,0,110001,5,1,2,4,13052024,835,1,8,9,4,3622,2,3062024,,20240006,3.2.50,3062024,21,21,3062024,850,500270,50,3,1.0,23091999,4,1,0,1,29.0,8082023.0,39,8,11,2,1,2,2,1,2,3,13052024,5,0,1,1,5,1,5,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
6,1,2516500,110001,1,29,5,5,999992,0,0,110001,5,1,2,4,8052024,915,2,8,9,2,2935,2,3062024,,20240006,3.2.50,3062024,26,26,3062024,811,110014,11,4,,5121994,2,0,0,0,30.0,1082023.0,40,8,7,4,1,2,2,1,2,3,8052024,7,0,1,1,1,0,2,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
7,1,2679477,110001,1,18,9,3,999992,1,0,110001,5,1,1,3,1052024,110,1,8,9,4,3365,2,3062024,,20240007,3.2.50,12062024,42,33,3062024,811,110001,11,1,4.0,28052005,4,1,1,0,,5082023.0,38,8,5,1,1,2,3,1,5,5,1052024,2,0,1,1,3,1,3,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
8,1,2516500,110001,1,29,1,4,999992,2,1,110001,5,1,2,4,5062024,1005,1,8,9,4,3298,2,2072024,,20240008,3.2.50,20082024,76,76,20082024,811,110001,11,3,3.0,10111994,4,3,0,2,,5092023.0,39,8,7,2,1,2,1,1,2,3,5062024,6,0,1,1,5,1,5,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
9,1,2494299,110002,1,20,5,4,999992,0,0,110002,5,1,1,4,1012024,516,1,8,9,1,3240,2,3012024,,20240001,3.2.50,9012024,8,8,9012024,811,110002,11,3,3.0,24062003,1,0,0,0,,5042023.0,38,8,10,2,1,2,3,2,2,3,1012024,6,1,1,1,1,0,5,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
10,1,2494299,110002,1,40,1,5,252305,4,1,110110,5,1,2,4,1012024,2123,1,9,9,4,3960,2,3012024,,20240001,3.2.50,9012024,8,8,9012024,812,120040,12,5,,2061983,4,5,1,3,,,38,1,7,4,1,2,2,1,2,3,1012024,8,1,1,1,5,1,2,,,,,,,2025-08-14T21:31:20.819+0000,dbfs:/lakehouse/health_insights_brasil/landing_zone/sinasc/SINASC_2024_SAMPLE_10000.csv
