In [3]:
# stage_ciha
# Builds the stage layer of the CIHA dataset: registers the table in the catalog and stores the data in Delta format.

In [3]:
pip install delta-spark

Collecting delta-spark
  Obtaining dependency information for delta-spark from https://files.pythonhosted.org/packages/3b/d8/265a93d22ae79262cdff701496a6f5676926a342153f3855ae6060430660/delta_spark-4.0.0-py3-none-any.whl.metadata
  Downloading delta_spark-4.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting pyspark>=4.0.0 (from delta-spark)
  Downloading pyspark-4.0.0.tar.gz (434.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.1/434.1 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting py4j==0.10.9.9 (from pyspark>=4.0.0->delta-spark)
  Obtaining dependency information for py4j==0.10.9.9 from https://files.pythonhosted.org/packages/bd/db/ea0203e495be491c85af87b66e37acfd3bf756fd985f87e46fc5e3bf022c/py4j-0.10.9.9-py2.py3-none-any.whl.metadata
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading delta_spark-4.0.0-py3-none-any.whl (39 kB)
Downloading py4j-0.10.9.9-p

In [1]:
# Execute the shared bootstrap script in this notebook's namespace.
# NOTE(review): presumably this is where `spark`, `udf`, `StringType` and `count` come
# from - the cells below use those names without any visible import. Confirm against
# ../spark-default.py.
%run ../spark-default.py

In [2]:
from delta.tables import DeltaTable
import uuid

In [3]:
# Surrogate key generator: returns one random UUID4 string per call.
# The UDF is explicitly flagged as non-deterministic because Spark treats UDFs as
# deterministic by default, which allows the optimizer to eliminate duplicate
# invocations or to invoke the function more often than it appears in the query -
# neither is acceptable for a function that must yield a fresh value on every row.
uuid_udf = udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()

# Read the raw CIHA parquet files and append the generated key as column "pk".
ciha = spark.read.parquet("s3a://datalake/raw/ciha/").withColumn("pk", uuid_udf())

In [4]:
# List the namespaces currently registered in the catalog, without truncating names.
(
    spark.sql("show databases")
    .show(truncate=False)
)

+---------+
|namespace|
+---------+
|default  |
|stage    |
+---------+



In [6]:
# Register stage.ciha as a Delta table unless it already exists.
# The column definitions are taken from the `ciha` DataFrame schema and the table is
# partitioned by ano_cmpt. The builder's result is the cell's displayed value.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("stage.ciha")
    .addColumns(ciha.schema)
    .partitionedBy("ano_cmpt")
    .execute()
)

<delta.tables.DeltaTable at 0x7d0230330e50>

In [7]:
# Print the schema of the `ciha` DataFrame so it can be compared by eye with the
# catalog's description of stage.ciha.
ciha.printSchema()

root
 |-- mes_cmpt: string (nullable = true)
 |-- espec: string (nullable = true)
 |-- cgc_hosp: string (nullable = true)
 |-- munic_res: string (nullable = true)
 |-- nasc: date (nullable = true)
 |-- sexo: string (nullable = true)
 |-- uti_mes_to: string (nullable = true)
 |-- uti_int_to: string (nullable = true)
 |-- proc_rea: string (nullable = true)
 |-- qt_proc: string (nullable = true)
 |-- dt_atend: date (nullable = true)
 |-- dt_saida: date (nullable = true)
 |-- diag_princ: string (nullable = true)
 |-- diag_secun: string (nullable = true)
 |-- cobranca: string (nullable = true)
 |-- natureza: string (nullable = true)
 |-- gestao: string (nullable = true)
 |-- munic_mov: string (nullable = true)
 |-- cod_idade: string (nullable = true)
 |-- idade: string (nullable = true)
 |-- dias_perm: string (nullable = true)
 |-- morte: string (nullable = true)
 |-- nacional: string (nullable = true)
 |-- car_int: string (nullable = true)
 |-- homonimo: string (nullable = true)
 |-- cnes:

In [8]:
# Show how the catalog describes stage.ciha.
(
    spark.sql("describe stage.ciha")
    .show(999)  # 999 is only a row ceiling, high enough to print every column entry
)

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|            mes_cmpt|   string|   null|
|               espec|   string|   null|
|            cgc_hosp|   string|   null|
|           munic_res|   string|   null|
|                nasc|     date|   null|
|                sexo|   string|   null|
|          uti_mes_to|   string|   null|
|          uti_int_to|   string|   null|
|            proc_rea|   string|   null|
|             qt_proc|   string|   null|
|            dt_atend|     date|   null|
|            dt_saida|     date|   null|
|          diag_princ|   string|   null|
|          diag_secun|   string|   null|
|            cobranca|   string|   null|
|            natureza|   string|   null|
|              gestao|   string|   null|
|           munic_mov|   string|   null|
|           cod_idade|   string|   null|
|               idade|   string|   null|
|           dias_perm|   string|   null|
|               

In [5]:
# Manual reset statements, intentionally left commented out because both are destructive:
# the first removes every row from stage.ciha, the second drops the table itself.
# NOTE(review): presumably meant to be uncommented only to rebuild the table from
# scratch before re-running the load - confirm, or delete this cell.
# spark.sql("delete from stage.ciha").show()
# spark.sql("drop table stage.ciha").show()

++
||
++
++



In [9]:
# Load stage.ciha one ano_cmpt value at a time (2011 through 2025), printing each year
# as a progress marker.
# NOTE(review): every pass appends, so running this cell a second time inserts the same
# rows again. insertInto matches columns by position, which lines up here because the
# table was created from this DataFrame's schema.
for ano in range(2011, 2026):
    print(ano)
    year_rows = ciha.filter(f"ano_cmpt = '{ano}'")
    year_rows.write.mode("append").insertInto("stage.ciha")

2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025


In [10]:
# From here on `ciha` refers to the persisted stage.ciha table, not the raw parquet read.
# NOTE(review): this rebinds the name used by the load loop; re-running that loop after
# this cell would read from stage.ciha and append the table's rows to itself.
ciha = spark.table("stage.ciha")

In [11]:
# Peek at ten rows of the table with full, untruncated cell values.
(
    ciha
    .limit(10)
    .show(truncate=False)
)

+--------+-----+--------------+---------+----------+----+----------+----------+----------+-------+----------+----------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+--------+-------+--------+-------+-----+----------+----------+---------------------------------------------+---------+--------+------------------------------------+
|mes_cmpt|espec|cgc_hosp      |munic_res|nasc      |sexo|uti_mes_to|uti_int_to|proc_rea  |qt_proc|dt_atend  |dt_saida  |diag_princ|diag_secun|cobranca|natureza|gestao|munic_mov|cod_idade|idade|dias_perm|morte|nacional|car_int|homonimo|cnes   |fonte|cgc_consor|modalidade|input_file_name                              |estado_uf|ano_cmpt|pk                                  |
+--------+-----+--------------+---------+----------+----+----------+----------+----------+-------+----------+----------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+--------+-------+--------+-------+-----+---

In [12]:
# Row count per ano_cmpt, sorted by year, as a sanity check on the load.
(
    ciha
    .groupBy("ano_cmpt")
    .agg(count("*"))
    .orderBy("ano_cmpt")
    .show(truncate=False)
)

+--------+--------+
|ano_cmpt|count(1)|
+--------+--------+
|2011    |10010212|
|2012    |11234203|
|2013    |12388257|
|2014    |16011188|
|2015    |14029854|
|2016    |16218264|
|2017    |17707297|
|2018    |16095906|
|2019    |17008127|
|2020    |13105052|
|2021    |16309069|
|2022    |17818647|
|2023    |18017246|
|2024    |15655354|
|2025    |4209491 |
+--------+--------+



In [13]:
# Row count per estado_uf; up to 100 rows are printed and no ordering is applied.
(
    ciha
    .groupBy("estado_uf")
    .agg(count("*"))
    .show(100, truncate=False)
)

+---------+--------+
|estado_uf|count(1)|
+---------+--------+
|SP       |92096240|
|RS       |39060781|
|MG       |24358412|
|PR       |14761366|
|BA       |8609573 |
|RJ       |7356377 |
|SC       |7550733 |
|PE       |4890151 |
|ES       |1988321 |
|CE       |2278173 |
|PA       |3771294 |
|AM       |1080146 |
|MS       |1303271 |
|AL       |1137548 |
|RN       |980335  |
|TO       |517835  |
|GO       |963343  |
|PB       |491967  |
|PI       |470368  |
|MT       |666845  |
|MA       |350928  |
|AP       |664216  |
|RO       |124731  |
|SE       |192684  |
|DF       |104108  |
|AC       |48421   |
+---------+--------+



In [14]:
# Total number of rows in stage.ciha (displayed as the cell's value).
ciha.count()

215818167

In [15]:
# Uniqueness check for the surrogate key: count the pk values that occur more than once.
# The invariant is asserted instead of only being read off the output, so a run with
# duplicated keys fails loudly; the count is still displayed as the cell's value.
duplicated_pk_count = (
    ciha
    .groupBy("pk")
    .agg(count("*").alias("count"))
    .filter("count > 1")
    .count()
)
assert duplicated_pk_count == 0, (
    f"stage.ciha has {duplicated_pk_count} pk values that appear more than once"
)
duplicated_pk_count

0

In [16]:
# Shut down the Spark session now that the load and its checks are finished.
spark.stop()