# Projeto Final Semantix

In [None]:
# Importar bibliotecas
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [None]:
# Criar diretório e enviar os dados para o HDFS

!hdfs dfs -mkdir -p /user/milena/covid
!hdfs dfs -put /input/dados /user/milena/covid

In [2]:
# List the contents of the HDFS directory the data was uploaded to.
!hdfs dfs -ls /user/milena/covid/dados

Found 5 items
-rw-r--r--   3 root supergroup   62493275 2022-08-18 00:33 /user/milena/covid/dados/HIST_PAINEL_COVIDBR_2020_Parte1_14ago2022.csv
-rw-r--r--   3 root supergroup   76520606 2022-08-18 00:33 /user/milena/covid/dados/HIST_PAINEL_COVIDBR_2020_Parte2_14ago2022.csv
-rw-r--r--   3 root supergroup   91120853 2022-08-18 00:33 /user/milena/covid/dados/HIST_PAINEL_COVIDBR_2021_Parte1_14ago2022.csv
-rw-r--r--   3 root supergroup   93414239 2022-08-18 00:33 /user/milena/covid/dados/HIST_PAINEL_COVIDBR_2021_Parte2_14ago2022.csv
-rw-r--r--   3 root supergroup   91807916 2022-08-18 00:33 /user/milena/covid/dados/HIST_PAINEL_COVIDBR_2022_Parte1_14ago2022.csv


In [3]:
# Bind `sc` to the active SparkContext instance (one is created if none exists).
# Assigning the bare class would leave `sc` unusable for instance calls
# such as sc.textFile(...) or sc.parallelize(...).
sc = SparkContext.getOrCreate()

In [4]:
# Get the active Spark session, or create one if none exists, through the
# documented `SparkSession.builder` entry point.
spark_session = SparkSession.builder.getOrCreate()

In [13]:
# Read every CSV file in the HDFS data folder into a single Spark DataFrame.
# The files are ';'-separated with a header row; column types are inferred
# from the data and leading whitespace in values is ignored.
df_spark = (
    spark_session.read
    .options(sep=";", inferSchema=True, header=True, ignoreLeadingWhiteSpace=True)
    .csv('/user/milena/covid/dados')
)

In [14]:
# Preview the first 10 rows of the DataFrame.
df_spark.show(n=10)

+------+------+---------+-----+------+--------------+---------------+-------------------+---------+----------------+--------------+----------+---------------+-----------+----------------+---------------------+----------------------+
|regiao|estado|municipio|coduf|codmun|codRegiaoSaude|nomeRegiaoSaude|               data|semanaEpi|populacaoTCU2019|casosAcumulado|casosNovos|obitosAcumulado|obitosNovos|Recuperadosnovos|emAcompanhamentoNovos|interior/metropolitana|
+------+------+---------+-----+------+--------------+---------------+-------------------+---------+----------------+--------------+----------+---------------+-----------+----------------+---------------------+----------------------+
|Brasil|  null|     null|   76|  null|          null|           null|2021-07-01 00:00:00|       26|       210147125|      18622304|     65163|         520095|       2029|        16858632|              1180443|                  null|
|Brasil|  null|     null|   76|  null|          null|           null

In [16]:
# Show the structure (column names, types, nullability) of the DataFrame.
# printSchema() writes the tree to stdout itself and returns None, so it must
# not be wrapped in print() -- that only appends a stray "None" to the output.
df_spark.printSchema()

root
 |-- regiao: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- coduf: integer (nullable = true)
 |-- codmun: integer (nullable = true)
 |-- codRegiaoSaude: integer (nullable = true)
 |-- nomeRegiaoSaude: string (nullable = true)
 |-- data: timestamp (nullable = true)
 |-- semanaEpi: integer (nullable = true)
 |-- populacaoTCU2019: integer (nullable = true)
 |-- casosAcumulado: decimal(10,0) (nullable = true)
 |-- casosNovos: integer (nullable = true)
 |-- obitosAcumulado: integer (nullable = true)
 |-- obitosNovos: integer (nullable = true)
 |-- Recuperadosnovos: integer (nullable = true)
 |-- emAcompanhamentoNovos: integer (nullable = true)
 |-- interior/metropolitana: integer (nullable = true)

None


In [11]:
# Check the number of rows in the DataFrame.
df_spark.count()

4642162

In [17]:
# Persist the DataFrame as the table "covid_p", partitioned by the
# "municipio" column; "overwrite" mode replaces any existing table data.
# NOTE(review): this yields one partition per distinct municipio value --
# confirm that this many partitions is intended.
(
    df_spark.write
    .mode("overwrite")
    .partitionBy("municipio")
    .saveAsTable("covid_p")
)

In [18]:
# List the contents of the Hive warehouse directory for the `milena` database.
# NOTE(review): the recorded output lists only `dados`, not `covid_p` --
# confirm which database the table above was actually written to.
!hdfs dfs -ls /user/hive/warehouse/milena.db

Found 1 items
drwxrwxr-x   - root supergroup          0 2022-08-16 13:28 /user/hive/warehouse/milena.db/dados


In [19]:
# List the first 20 partitions of the covid_p table without truncating names.
# Uses `spark_session`, the session created earlier in this notebook, rather
# than `spark`, which is never defined here.
spark_session.sql("show partitions covid_p").show(20, truncate=False)

+-----------------------------+
|partition                    |
+-----------------------------+
|municipio=Abadia de Goiás    |
|municipio=Abadia dos Dourados|
|municipio=Abadiânia          |
|municipio=Abaetetuba         |
|municipio=Abaeté             |
|municipio=Abaiara            |
|municipio=Abaré              |
|municipio=Abatiá             |
|municipio=Abaíra             |
|municipio=Abdon Batista      |
|municipio=Abel Figueiredo    |
|municipio=Abelardo Luz       |
|municipio=Abre Campo         |
|municipio=Abreu e Lima       |
|municipio=Abreulândia        |
|municipio=Acaiaca            |
|municipio=Acajutiba          |
|municipio=Acarape            |
|municipio=Acaraú             |
|municipio=Acari              |
+-----------------------------+
only showing top 20 rows

