In [None]:
# raw_ciha
# This notebook was used to convert the CIHA dataset from CSV to Parquet, creating the raw layer.

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Spark session for this notebook: connects to the standalone cluster, enables Hive
# support against an external metastore, and loads the hadoop-aws / AWS SDK jars that
# provide the s3a:// file system used to reach MinIO.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")  # Spark cluster master
    .appName("lab")  # application name shown by the cluster
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")  # external Hive Metastore
    # NOTE(review): in an s3a:// URI the authority part is the bucket name, so the three
    # "s3a://minio:9000/datalake/" values below do not look like the "s3a://datalake/..."
    # paths used for data in this notebook. Values are left unchanged; confirm intent.
    .config("hive.metastore.warehouse.dir", "s3a://minio:9000/datalake/")  # legacy warehouse dir key
    .config("spark.sql.warehouse.dir", "s3a://minio:9000/datalake/")  # default warehouse dir
    .config("hive.metastore.schema.verification", "false")  # skip metastore schema version check
    .config("fs.defaultFS", "s3a://minio:9000/datalake/")  # default file system (an s3a URI, not HDFS)
    .config(
        "spark.jars",
        "/opt/bitnami/spark/jars_external/hadoop-aws-3.3.4.jar,"
        "/opt/bitnami/spark/jars_external/aws-java-sdk-bundle-1.12.588.jar",
    )
    .config("spark.sql.repl.eagerEval.enabled", True)  # render DataFrames when a cell ends with one
    # This is a Spark SQL setting; it used to be written into the Hadoop configuration
    # below, where Spark does not look for it.
    .config("spark.sql.debug.maxToStringFields", "100")
    .config("spark.driver.memory", "16g")  # driver memory
    .config("spark.executor.memory", "32g")  # memory per executor
    .config("spark.executor.cores", "10")  # cores per executor
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext

# Hadoop S3A connector settings for the MinIO endpoint.
# Credentials are read from the environment when available so they do not have to live
# in the notebook; the fallbacks keep the previous values for the local lab setup.
hdp_configs = {
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "minio"),
    "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
    "fs.s3a.connection.timeout": "600000",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # NOTE(review): SSL is enabled here while the endpoint above is plain http — confirm.
    "fs.s3a.connection.ssl.enabled": "true",
}

# Look the Hadoop configuration up once and apply every S3A setting to it.
hadoop_conf = sc._jsc.hadoopConfiguration()
for key, value in hdp_configs.items():
    hadoop_conf.set(key, value)


In [2]:
# Session-wide setting: with "dynamic", overwriting a partitioned output replaces only
# the partitions that receive new data instead of clearing the whole target directory.
spark.conf.set(
    "spark.sql.sources.partitionOverwriteMode",
    "dynamic",
)

In [3]:
# Read every CSV file under the source prefix, using the first line of each file as the
# header. No schema is supplied, so the CSV columns are read as strings.
ciha_raw = (
    spark.read
    .option("header", True)
    .csv("s3a://datalake/source/ciha_csv/")
)

# Source columns, in output order. Each one is exposed under its lower-cased name.
source_columns = [
    "ANO_CMPT", "MES_CMPT", "ESPEC", "CGC_HOSP", "MUNIC_RES", "NASC", "SEXO",
    "UTI_MES_TO", "UTI_INT_TO", "PROC_REA", "QT_PROC", "DT_ATEND", "DT_SAIDA",
    "DIAG_PRINC", "DIAG_SECUN", "COBRANCA", "NATUREZA", "GESTAO", "MUNIC_MOV",
    "COD_IDADE", "IDADE", "DIAS_PERM", "MORTE", "NACIONAL", "CAR_INT", "HOMONIMO",
    "CNES", "FONTE", "CGC_CONSOR", "MODALIDADE",
]

# Columns parsed from "yyyyMMdd" text into dates; all other columns pass through as-is.
date_columns = {"NASC", "DT_ATEND", "DT_SAIDA"}

projection = [
    (to_date(name, "yyyyMMdd") if name in date_columns else col(name)).alias(name.lower())
    for name in source_columns
]
# Keep the originating file path of every row.
projection.append(input_file_name().alias("input_file_name"))

# estado_uf is cut out of the file path: element 5 of the "/"-split path, then two
# characters starting at position 5 of that element.
# NOTE(review): this assumes the files sit directly under s3a://datalake/source/ciha_csv/
# so that element 5 is the file name — confirm against the bucket layout.
path_segment = split(col("input_file_name"), "/")[5]

ciha = (
    ciha_raw
    .select(*projection)
    .withColumn("estado_uf", substring(path_segment, 5, 2))
)

ciha.printSchema()

ciha.show()

root
 |-- ano_cmpt: string (nullable = true)
 |-- mes_cmpt: string (nullable = true)
 |-- espec: string (nullable = true)
 |-- cgc_hosp: string (nullable = true)
 |-- munic_res: string (nullable = true)
 |-- nasc: date (nullable = true)
 |-- sexo: string (nullable = true)
 |-- uti_mes_to: string (nullable = true)
 |-- uti_int_to: string (nullable = true)
 |-- proc_rea: string (nullable = true)
 |-- qt_proc: string (nullable = true)
 |-- dt_atend: date (nullable = true)
 |-- dt_saida: date (nullable = true)
 |-- diag_princ: string (nullable = true)
 |-- diag_secun: string (nullable = true)
 |-- cobranca: string (nullable = true)
 |-- natureza: string (nullable = true)
 |-- gestao: string (nullable = true)
 |-- munic_mov: string (nullable = true)
 |-- cod_idade: string (nullable = true)
 |-- idade: string (nullable = true)
 |-- dias_perm: string (nullable = true)
 |-- morte: string (nullable = true)
 |-- nacional: string (nullable = true)
 |-- car_int: string (nullable = true)
 |-- homon

In [5]:
# Number of rows for each ano_cmpt value, ordered by ano_cmpt.
rows_per_ano_cmpt = ciha.groupBy("ano_cmpt").agg(count("*"))
rows_per_ano_cmpt.orderBy("ano_cmpt").show(truncate=False)

+--------+--------+
|ano_cmpt|count(1)|
+--------+--------+
|2011    |10642684|
|2012    |12857677|
|2013    |14974908|
|2014    |18555742|
|2015    |16620815|
|2016    |18869474|
|2017    |20327035|
|2018    |18744844|
|2019    |19447461|
|2020    |15165019|
|2021    |18263073|
|2022    |19742407|
|2023    |17944281|
|2024    |8686891 |
+--------+--------+



In [5]:
# Number of rows for each estado_uf value (derived from the file path), ordered by estado_uf.
rows_per_estado_uf = ciha.groupBy("estado_uf").agg(count("*"))
rows_per_estado_uf.orderBy("estado_uf").show(truncate=False)

+---------+--------+
|estado_uf|count(1)|
+---------+--------+
|AC       |46547   |
|AL       |1409098 |
|AM       |1170521 |
|AP       |801518  |
|BA       |9966235 |
|CE       |2416704 |
|DF       |132599  |
|ES       |2375047 |
|GO       |1168715 |
|MA       |406895  |
|MG       |27502206|
|MS       |1512187 |
|MT       |737912  |
|PA       |4130110 |
|PB       |469368  |
|PE       |5190979 |
|PI       |507707  |
|PR       |13891970|
|RJ       |8428685 |
|RN       |871089  |
+---------+--------+
only showing top 20 rows



In [6]:
# Earliest value of each parsed date column (NASC, DT_ATEND, DT_SAIDA) among the rows
# with ano_cmpt = '2023'.
# `min` is the Spark aggregate brought in by the wildcard import of
# pyspark.sql.functions, not Python's builtin.
earliest_dates = [
    min(column_name).alias(f"min_{column_name}")
    for column_name in ("NASC", "DT_ATEND", "DT_SAIDA")
]

ciha.filter("ano_cmpt = '2023'").select(*earliest_dates)

min_NASC,min_DT_ATEND,min_DT_SAIDA
0195-10-05,1931-04-05,1899-12-30


In [8]:
# Inspect the 2023 rows whose parsed birth date falls before the year 1500.
implausible_nasc_2023 = ciha.filter("ano_cmpt = '2023' and nasc < '1500-01-01'")
implausible_nasc_2023

ano_cmpt,mes_cmpt,espec,cgc_hosp,munic_res,nasc,sexo,uti_mes_to,uti_int_to,proc_rea,qt_proc,dt_atend,dt_saida,diag_princ,diag_secun,cobranca,natureza,gestao,munic_mov,cod_idade,idade,dias_perm,morte,nacional,car_int,homonimo,cnes,fonte,cgc_consor,modalidade,input_file_name,estado_uf
2023,1,,92815000000834,43090,0196-06-01,0,0,0,209020016,1,2023-01-04,2023-01-04,C670,,18,,M,430920,0,99,0,0,,,,2232049,6,0,1,s3a://datalake/so...,RS
2023,8,,76591049000128,42054,0195-10-05,0,0,0,301010072,1,2023-08-14,2023-08-14,,,18,,D,410690,0,99,0,0,,,,15644,1,0,1,s3a://datalake/so...,PR
2023,8,,76591049000128,42054,0195-10-05,0,0,0,301010072,2,2023-08-09,2023-08-09,,,18,,D,410690,0,99,0,0,,,,15644,1,0,1,s3a://datalake/so...,PR
2023,8,,76591049000128,42054,0195-10-05,0,0,0,301010072,1,2023-08-11,2023-08-11,,,18,,D,410690,0,99,0,0,,,,15644,1,0,1,s3a://datalake/so...,PR
2023,8,,76591049000128,42054,0195-10-05,0,0,0,301010072,1,2023-08-04,2023-08-04,,,18,,D,410690,0,99,0,0,,,,15644,1,0,1,s3a://datalake/so...,PR
2023,8,,76591049000128,42054,0195-10-05,0,0,0,211020036,1,2023-08-02,2023-08-02,,,18,,D,410690,0,99,0,0,,,,15644,1,0,1,s3a://datalake/so...,PR
2023,8,,76591049000128,42054,0195-10-05,0,0,0,301010072,1,2023-08-02,2023-08-02,,,18,,D,410690,0,99,0,0,,,,15644,1,0,1,s3a://datalake/so...,PR


In [7]:
# Write the raw layer as Parquet partitioned by ano_cmpt, one year per iteration.
#
# Only rows with nasc on or after 1500-01-01 are written.
# NOTE(review): a comparison against NULL is not true, so rows whose nasc is NULL
# (missing or unparseable in the CSV) are excluded as well — confirm this is intended.
ciha_valid_nasc = ciha.filter("nasc >= '1500-01-01' ")  # same for every year, built once

for ano in range(2011, 2025):  # 2011 through 2024 inclusive
    print(ano)
    (
        ciha_valid_nasc
        .filter(f"ano_cmpt = '{ano}'")
        .write
        .mode("overwrite")
        # Set on the writer so the overwrite replaces only the partition being written,
        # even if the session-level partitionOverwriteMode has not been set to dynamic.
        # Without it, each iteration would clear the previously written years.
        .option("partitionOverwriteMode", "dynamic")
        .partitionBy("ano_cmpt")
        .parquet("s3a://datalake/raw/ciha/")
    )

2023
2024
