In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Local Spark session with two worker threads; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName('tcc1')
    .getOrCreate()
)

In [4]:
# Read every CSV under the directory, taking column names from the header row.
# No schema is supplied and inference is not enabled, so columns load as strings.
ciha = spark.read.csv("datasus/ciha/csv/", header=True)

In [6]:
# Total number of rows in the raw CSV data (triggers a full scan of the input).
ciha.count()

214492919

In [7]:
# Print the column names and data types of the raw frame.
ciha.printSchema()

root
 |-- ANO_CMPT: string (nullable = true)
 |-- MES_CMPT: string (nullable = true)
 |-- ESPEC: string (nullable = true)
 |-- CGC_HOSP: string (nullable = true)
 |-- MUNIC_RES: string (nullable = true)
 |-- NASC: string (nullable = true)
 |-- SEXO: string (nullable = true)
 |-- UTI_MES_TO: string (nullable = true)
 |-- UTI_INT_TO: string (nullable = true)
 |-- PROC_REA: string (nullable = true)
 |-- QT_PROC: string (nullable = true)
 |-- DT_ATEND: string (nullable = true)
 |-- DT_SAIDA: string (nullable = true)
 |-- DIAG_PRINC: string (nullable = true)
 |-- DIAG_SECUN: string (nullable = true)
 |-- COBRANCA: string (nullable = true)
 |-- NATUREZA: string (nullable = true)
 |-- GESTAO: string (nullable = true)
 |-- MUNIC_MOV: string (nullable = true)
 |-- COD_IDADE: string (nullable = true)
 |-- IDADE: string (nullable = true)
 |-- DIAS_PERM: string (nullable = true)
 |-- MORTE: string (nullable = true)
 |-- NACIONAL: string (nullable = true)
 |-- CAR_INT: string (nullable = true)
 |--

In [5]:
# Preview the first rows of the raw frame without truncating long cell values.
ciha.show(truncate=False)

+--------+--------+-----+--------------+---------+----+----+----------+----------+----------+-------+--------+--------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+--------+-------+--------+-------+-----+----------+
|ANO_CMPT|MES_CMPT|ESPEC|CGC_HOSP      |MUNIC_RES|NASC|SEXO|UTI_MES_TO|UTI_INT_TO|PROC_REA  |QT_PROC|DT_ATEND|DT_SAIDA|DIAG_PRINC|DIAG_SECUN|COBRANCA|NATUREZA|GESTAO|MUNIC_MOV|COD_IDADE|IDADE|DIAS_PERM|MORTE|NACIONAL|CAR_INT|HOMONIMO|CNES   |FONTE|MODALIDADE|
+--------+--------+-----+--------------+---------+----+----+----------+----------+----------+-------+--------+--------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+--------+-------+--------+-------+-----+----------+
|2011    |05      |null |55344337000108|null     |null|null|0         |0         |0202060330|1      |null    |null    |null      |null      |null    |11      |D     |354140   |null     |null |null     |null |null    |nul

In [9]:
# Build one aggregate per column that counts the rows whose value looks missing:
# contains 'None' or 'NULL', is the empty string, is null, or is NaN.
# NOTE(review): contains() is a substring match, so values that merely include
# 'None' or 'NULL' are counted as missing too -- confirm that is intended.
missing_counts = []
for col_name in ciha.columns:
    cell = col(col_name)
    looks_missing = (
        cell.contains('None')
        | cell.contains('NULL')
        | (cell == '')
        | cell.isNull()
        | isnan(col_name)
    )
    # when() without otherwise() yields null for non-matching rows, and count()
    # skips nulls, so this counts exactly the rows flagged as missing.
    missing_counts.append(count(when(looks_missing, col_name)).alias(col_name))

df2 = ciha.select(missing_counts)

In [10]:
# Display the per-column missing-value counts held in df2 (a single-row frame).
df2.show(truncate=False)

+--------+--------+---------+--------+---------+--------+--------+----------+----------+--------+-------+--------+--------+----------+----------+---------+---------+------+---------+---------+--------+---------+--------+---------+---------+---------+----+------+----------+
|ANO_CMPT|MES_CMPT|ESPEC    |CGC_HOSP|MUNIC_RES|NASC    |SEXO    |UTI_MES_TO|UTI_INT_TO|PROC_REA|QT_PROC|DT_ATEND|DT_SAIDA|DIAG_PRINC|DIAG_SECUN|COBRANCA |NATUREZA |GESTAO|MUNIC_MOV|COD_IDADE|IDADE   |DIAS_PERM|MORTE   |NACIONAL |CAR_INT  |HOMONIMO |CNES|FONTE |MODALIDADE|
+--------+--------+---------+--------+---------+--------+--------+----------+----------+--------+-------+--------+--------+----------+----------+---------+---------+------+---------+---------+--------+---------+--------+---------+---------+---------+----+------+----------+
|0       |0       |214492919|8719116 |27282664 |27282664|27282664|0         |2457596   |0       |0      |27282664|27282664|82022537  |208124582 |182108088|131909720|0     |0     

In [14]:
# Columns carried forward from the raw frame, in their original order.
# Compared with the raw header shown earlier, this leaves out ESPEC, NACIONAL,
# CAR_INT and HOMONIMO.
selected_cols = [
    "ANO_CMPT", "MES_CMPT", "CGC_HOSP", "MUNIC_RES", "NASC",
    "SEXO", "UTI_MES_TO", "UTI_INT_TO", "PROC_REA", "QT_PROC",
    "DT_ATEND", "DT_SAIDA", "DIAG_PRINC", "DIAG_SECUN", "COBRANCA",
    "NATUREZA", "GESTAO", "MUNIC_MOV", "COD_IDADE", "IDADE",
    "DIAS_PERM", "MORTE", "CNES", "FONTE", "MODALIDADE",
]

# Stage 1: projection of the raw frame onto the retained columns.
st1 = ciha.select(*selected_cols)

In [17]:
# Columns that must be populated for a row to be kept. Relative to
# selected_cols this leaves out UTI_MES_TO, UTI_INT_TO, DIAG_SECUN, COBRANCA,
# NATUREZA and FONTE, so a null in any of those does not discard the row.
important_cols = [
    "ANO_CMPT",
    "MES_CMPT",
    "CGC_HOSP",
    "MUNIC_RES",
    "NASC",
    "SEXO",
    "PROC_REA",
    "QT_PROC",
    "DT_ATEND",
    "DT_SAIDA",
    "DIAG_PRINC",
    "GESTAO",
    "MUNIC_MOV",
    "COD_IDADE",
    "IDADE",
    "DIAS_PERM",
    "MORTE",
    "CNES",
    "MODALIDADE"
]

# Stage 2: keep only rows that have a non-null value in every required column.
# dropna(how="any", subset=...) expresses the same "<col> is not null AND ..."
# predicate without assembling a SQL string from column names.
st2 = st1.dropna(how="any", subset=important_cols)

# Persist the cleaned data as Parquet. The default save mode raises an error
# when the target path already exists, which made this cell fail on every
# re-run; overwrite mode replaces the previous output so the notebook survives
# Restart & Run All.
st2.write.mode("overwrite").parquet("datasus/ciha/parquet/")

In [18]:
# Reload the cleaned data from the Parquet output written by the previous stage.
df = spark.read.format("parquet").load("datasus/ciha/parquet/")

In [21]:
# Number of rows in the Parquet data loaded into df.
df.count()

127272146

In [20]:
# Per-column tally of values that look missing in the reloaded frame: the value
# contains 'None' or 'NULL', equals the empty string, is null, or is NaN.
# NOTE(review): contains() matches substrings, not whole values -- confirm that
# counting any value that merely includes 'None'/'NULL' is intended.
missing_exprs = [
    count(
        when(
            col(c).contains('None')
            | col(c).contains('NULL')
            | (col(c) == '')
            | col(c).isNull()
            | isnan(c),
            c,
        )
    ).alias(c)
    for c in df.columns
]
df.select(missing_exprs).show(truncate=False)

+--------+--------+--------+---------+----+----+----------+----------+--------+-------+--------+--------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+----+-----+----------+
|ANO_CMPT|MES_CMPT|CGC_HOSP|MUNIC_RES|NASC|SEXO|UTI_MES_TO|UTI_INT_TO|PROC_REA|QT_PROC|DT_ATEND|DT_SAIDA|DIAG_PRINC|DIAG_SECUN|COBRANCA|NATUREZA|GESTAO|MUNIC_MOV|COD_IDADE|IDADE|DIAS_PERM|MORTE|CNES|FONTE|MODALIDADE|
+--------+--------+--------+---------+----+----+----------+----------+--------+-------+--------+--------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+----+-----+----------+
|0       |0       |0       |0        |0   |0   |0         |1352318   |0       |0      |0       |0       |0         |121074132 |96128118|76086356|0     |0        |0        |0    |0        |0    |0   |0    |0         |
+--------+--------+--------+---------+----+----+----------+----------+--------+-------+--------+--------+----------+----------+-----

In [30]:
# Preview the first rows of the cleaned frame without truncating cell values.
df.show(truncate=False)

+--------+--------+--------------+---------+--------+----+----------+----------+----------+-------+--------+--------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+-------+-----+----------+
|ANO_CMPT|MES_CMPT|CGC_HOSP      |MUNIC_RES|NASC    |SEXO|UTI_MES_TO|UTI_INT_TO|PROC_REA  |QT_PROC|DT_ATEND|DT_SAIDA|DIAG_PRINC|DIAG_SECUN|COBRANCA|NATUREZA|GESTAO|MUNIC_MOV|COD_IDADE|IDADE|DIAS_PERM|MORTE|CNES   |FONTE|MODALIDADE|
+--------+--------+--------------+---------+--------+----+----------+----------+----------+-------+--------+--------+----------+----------+--------+--------+------+---------+---------+-----+---------+-----+-------+-----+----------+
|2019    |02      |60194990000763|354990   |19820217|3   |0         |000       |0411010034|1      |20190226|20190228|O821      |null      |61      |null    |M     |354990   |4        |37   |2        |0    |0009539|06   |02        |
|2019    |02      |60194990000763|354990   |19831215|3   |0         |000

In [32]:
# Row count per ANO_CMPT value, listed in ascending ANO_CMPT order.
(
    df.groupBy("ANO_CMPT")
    .count()
    .orderBy("ANO_CMPT")
    .show()
)

+--------+--------+
|ANO_CMPT|   count|
+--------+--------+
|    2011|13455228|
|    2012| 7493554|
|    2013|13539741|
|    2014| 9157732|
|    2015| 8976001|
|    2016|10116122|
|    2017|11664889|
|    2018|11277506|
|    2019|11150677|
|    2020| 8660219|
|    2021| 9908547|
|    2022|10433583|
|    2023| 1438347|
+--------+--------+

