In [11]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# MinIO connection settings. Credentials are read from the environment when set;
# the fallbacks are the development values this notebook used before.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# In an s3a:// URI the authority is the bucket name; the MinIO host and port
# belong in fs.s3a.endpoint, not in the URI. The other cells of this notebook
# already address the bucket as s3a://datalake/...
DATALAKE_URI = "s3a://datalake/"

spark = (SparkSession.builder
             .master("spark://spark-master:7077") # Points to the Spark Cluster
             .appName('lab') # Name the app
             .config("hive.metastore.uris", "thrift://hive-metastore:9083") # Set external Hive Metastore
             .config("hive.metastore.warehouse.dir", DATALAKE_URI) # Default warehouse dir (legacy Hive key)
             .config("spark.sql.warehouse.dir", DATALAKE_URI) # Default warehouse dir
             .config("hive.metastore.schema.verification", "false") # Skip the metastore schema-version check
             .config("fs.defaultFS", DATALAKE_URI) # Default file system: the datalake bucket on MinIO
             .config("spark.jars", "/opt/bitnami/spark/jars_external/hadoop-aws-3.3.4.jar,/opt/bitnami/spark/jars_external/aws-java-sdk-bundle-1.12.588.jar")
             .config("spark.sql.debug.maxToStringFields", "100") # Spark SQL option; it has no effect as a Hadoop property
             .config("spark.sql.repl.eagerEval.enabled", True) # Render a DataFrame when it is a cell's last expression
             .enableHiveSupport()
             .getOrCreate())

sc = spark.sparkContext

# S3A (Hadoop) client settings for the MinIO endpoint.
hdp_configs = {
    "fs.s3a.endpoint": MINIO_ENDPOINT,
    "fs.s3a.access.key": MINIO_ACCESS_KEY,
    "fs.s3a.secret.key": MINIO_SECRET_KEY,
    "fs.s3a.connection.timeout": "600000",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Keep the SSL flag consistent with the endpoint scheme (plain http by default).
    "fs.s3a.connection.ssl.enabled": "true" if MINIO_ENDPOINT.startswith("https://") else "false",
}

# Applied to the live Hadoop configuration so that a session returned by
# getOrCreate() picks the settings up even if it already existed.
hadoop_conf = sc._jsc.hadoopConfiguration()
for k, v in hdp_configs.items():
    hadoop_conf.set(k, v)


In [10]:
# Stop the active Spark session (and its underlying SparkContext).
# NOTE(review): the cells below this one all use `spark`; executed top to bottom,
# this call leaves them without a live session. Presumably a manual "reset" cell
# that was run before re-creating the session - consider moving it to the end.
spark.stop()

In [20]:
# List the databases registered in the metastore. As the cell's last expression,
# the resulting DataFrame is rendered as a table (eager evaluation is enabled).
spark.sql("show databases")

namespace
bronze
default
gold
silver
source


In [14]:
# Register the `source` database at its bucket location; nothing happens if it already exists.
create_source_db_sql = "CREATE DATABASE IF NOT EXISTS source LOCATION 's3a://datalake/source/'"
spark.sql(create_source_db_sql)

In [19]:
# Tables currently registered in the `source` database.
source_tables = spark.sql("show tables from source")
source_tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   source|    final|      false|
+---------+---------+-----------+



In [16]:
def _nullable_string_struct(field_names):
    """Build a StructType whose fields are all nullable strings, in the given order."""
    return StructType([StructField(field_name, StringType(), True) for field_name in field_names])


# Field names shared by the `diag_princ_detalhes` and `diag_secun_detalhes` structs.
diag_detalhes_field_names = [
    'sub_cat', 'classificacao', 'restr_sexo', 'causa_obito',
    'descricao', 'desc_abrev', 'refer', 'excluidos',
]

# Field names of each element of the `feriado_info` array.
feriado_info_field_names = [
    'data', 'nome', 'tipo', 'descricao', 'uf', 'municipio', 'cod_municipio',
]

# (column name, data type) pairs in table order; every column is nullable.
column_specs = [
    ('ano_cmpt', StringType()),
    ('mes_cmpt', StringType()),
    ('cgc_hosp', StringType()),
    ('munic_res', StringType()),
    ('nasc', DateType()),
    ('sexo', StringType()),
    ('uti_mes_to', StringType()),
    ('uti_int_to', StringType()),
    ('proc_rea', StringType()),
    ('qt_proc', StringType()),
    ('dt_atend', DateType()),
    ('dt_saida', DateType()),
    ('diag_princ', StringType()),
    ('diag_secun', StringType()),
    ('cobranca', StringType()),
    ('natureza', StringType()),
    ('gestao', StringType()),
    ('munic_mov', StringType()),
    ('cod_idade', StringType()),
    ('idade', StringType()),
    ('dias_perm', StringType()),
    ('morte', StringType()),
    ('cnes', StringType()),
    ('fonte', StringType()),
    ('modalidade', StringType()),
    ('nome_uf', StringType()),
    ('nome_municipio', StringType()),
    ('regiao', StringType()),
    ('idhm', FloatType()),
    ('populacao_residente', IntegerType()),
    ('area_unidade_territorial', FloatType()),
    ('diag_princ_desc', StringType()),
    ('diag_secun_desc', StringType()),
    ('diag_princ_detalhes', _nullable_string_struct(diag_detalhes_field_names)),
    ('diag_secun_detalhes', _nullable_string_struct(diag_detalhes_field_names)),
    ('feriado', StringType()),
    ('distancia_feriado', IntegerType()),
    ('feriado_info', ArrayType(_nullable_string_struct(feriado_info_field_names), True)),
    ('sigla', StringType()),
]

schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in column_specs
])

# An empty DataFrame carrying the schema; used only to obtain the column type strings.
df_final = spark.createDataFrame([], schema=schema)

# `sigla` is left out of the column list because it is declared as the partition column below.
ddl_columns = [
    f"{column_name} {type_string}"
    for column_name, type_string in df_final.drop("sigla").dtypes
]
schema_str = ", ".join(ddl_columns)

create_table_sql = f"CREATE EXTERNAL TABLE IF NOT EXISTS source.final ({schema_str}) USING PARQUET PARTITIONED BY (sigla string) LOCATION 's3a://datalake/source/final/'"
spark.sql(create_table_sql).show()

++
||
++
++



In [17]:
# Run MSCK REPAIR so partitions present at the table location are registered in the metastore.
repair_result = spark.sql("msck repair table source.final")
repair_result.show()

++
||
++
++



In [18]:
# Number of rows for each value of the `sigla` column of source.final.
rows_per_sigla = spark.table("source.final").groupBy("sigla").count()
rows_per_sigla.show()

+-----+--------+
|sigla|   count|
+-----+--------+
|   SP|67515831|
|   RS| 9961350|
|   MG| 8143352|
|   BA| 5017618|
|   SC| 2861516|
|   CE|  906485|
|   MS|  659648|
|   AL|  522891|
|   PA|  567822|
|   GO|  503891|
|   ES|  342183|
|   RN|  760980|
|   MT|  319959|
|   PI|  385786|
|   PE|  289395|
|   PB|  177462|
|   SE|   92552|
|   RO|   19372|
|   TO|  114201|
+-----+--------+



In [None]:
# Column set of the gold table: the source schema without the three nested columns.
# Relies on `schema` defined in the source-table cell.
df_final_clean = spark.createDataFrame([], schema = schema).drop("diag_princ_detalhes", "diag_secun_detalhes", "feriado_info")

# Only the column names are needed for a CTAS. The column DDL string that used to be
# rebuilt here was never used and overwrote the `schema_str` of the source-table cell.
select_str = ", ".join(df_final_clean.columns)

# NOTE(review): unlike source.final, this statement has no USING / STORED AS clause, so the
# storage format is whatever the session defaults to - confirm that is the intended format.
spark.sql(f"CREATE EXTERNAL TABLE IF NOT EXISTS gold.final LOCATION 's3a://datalake/gold/final/' AS (SELECT {select_str} FROM source.final) ")

In [22]:
# source.final without its nested struct/array columns.
df = spark.table("source.final").drop("diag_princ_detalhes", "diag_secun_detalhes", "feriado_info")

# DataFrame has no `createOrReplaceGlobalView`; the method is `createOrReplaceGlobalTempView`.
# Temporary view names cannot carry a database prefix, so the view is registered as
# `gold_final`. Global temp views live in the `global_temp` database: query it as
# `global_temp.gold_final`. It exists only for the lifetime of this Spark application.
df.createOrReplaceGlobalTempView("gold_final")

In [9]:
# Bind `df` to the full source.final table, with no columns dropped.
# NOTE(review): this reuses the name `df`, which the cell above bound to the
# projection without the nested columns.
df = spark.table("source.final")

In [11]:
# Preview the table: as the cell's last expression the DataFrame is rendered inline.
# NOTE(review): the rendered rows appear to be record-level data including birth dates
# (`nasc`) and diagnosis codes (`diag_princ`) - consider clearing this output before sharing.
df

ano_cmpt,mes_cmpt,cgc_hosp,munic_res,nasc,sexo,uti_mes_to,uti_int_to,proc_rea,qt_proc,dt_atend,dt_saida,diag_princ,diag_secun,cobranca,natureza,gestao,munic_mov,cod_idade,idade,dias_perm,morte,cnes,fonte,modalidade,nome_uf,nome_municipio,regiao,idhm,populacao_residente,area_unidade_territorial,diag_princ_desc,diag_secun_desc,diag_princ_detalhes,diag_secun_detalhes,feriado,distancia_feriado,feriado_info,sigla
2018,2,60975174000363,355030,1994-07-31,3,0,0,301060002,1,2018-02-06,2018-02-06,Z014,,,,M,355030,4,23,0,0,2080796,1,1,São Paulo,São Paulo,Sudeste,0.805,11253503,1521.101,Exame ginecológic...,,"{Z014, null, F, n...","{null, null, null...",False,6,"[{2018-02-12, Car...",SP
2019,10,60979457000707,353440,2016-07-15,3,0,0,301070121,1,2019-10-11,2019-10-11,G822,,,,M,353440,4,3,0,0,5493943,1,1,São Paulo,Osasco,Sudeste,0.776,666740,64.954,Paraplegia não es...,,"{G822, null, null...","{null, null, null...",False,1,"[{2019-10-12, Nos...",SP
2018,2,60975174000363,355030,1959-10-02,3,0,0,302050027,1,2018-02-06,2018-02-06,A300,,,,M,355030,4,58,0,0,2080796,1,1,São Paulo,São Paulo,Sudeste,0.805,11253503,1521.101,Hanseníase [lepra...,,"{A300, null, null...","{null, null, null...",False,6,"[{2018-02-12, Car...",SP
2019,10,60979457000707,353440,2005-04-01,3,0,0,301070121,1,2019-10-11,2019-10-11,G822,,,,M,353440,4,14,0,0,5493943,1,1,São Paulo,Osasco,Sudeste,0.776,666740,64.954,Paraplegia não es...,,"{G822, null, null...","{null, null, null...",False,1,"[{2019-10-12, Nos...",SP
2018,2,60975174000363,355030,1997-07-20,3,0,0,301060002,1,2018-02-06,2018-02-06,Z349,,,,M,355030,4,20,0,0,2080796,1,1,São Paulo,São Paulo,Sudeste,0.805,11253503,1521.101,Supervisão de gra...,,"{Z349, null, F, n...","{null, null, null...",False,6,"[{2018-02-12, Car...",SP
2019,10,60979457000707,353440,2016-07-21,1,0,0,301070121,1,2019-10-11,2019-10-11,G808,,,,M,353440,4,3,0,0,5493943,1,1,São Paulo,Osasco,Sudeste,0.776,666740,64.954,Outras formas de ...,,"{G808, null, null...","{null, null, null...",False,1,"[{2019-10-12, Nos...",SP
2018,2,60975174000363,355030,1960-12-06,3,0,0,301060002,1,2018-02-06,2018-02-06,K219,,,,M,355030,4,57,0,0,2080796,1,1,São Paulo,São Paulo,Sudeste,0.805,11253503,1521.101,Doença de refluxo...,,"{K219, null, null...","{null, null, null...",False,6,"[{2018-02-12, Car...",SP
2019,10,60979457000707,353440,1972-02-14,3,0,0,302050019,1,2019-10-11,2019-10-11,M510,,,,M,353440,4,47,0,0,5493943,1,1,São Paulo,Osasco,Sudeste,0.776,666740,64.954,Transtornos de di...,,"{M510, +, null, n...","{null, null, null...",False,1,"[{2019-10-12, Nos...",SP
2018,2,60975174000363,355030,1929-07-01,3,0,0,301060002,1,2018-02-06,2018-02-06,R54,,,,M,355030,4,88,0,0,2080796,1,1,São Paulo,São Paulo,Sudeste,0.805,11253503,1521.101,Senilidade,,"{R54, null, null,...","{null, null, null...",False,6,"[{2018-02-12, Car...",SP
2019,10,60979457000707,353440,2017-10-10,1,0,0,301070121,3,2019-10-11,2019-10-11,G808,,,,M,353440,4,2,0,0,5493943,1,1,São Paulo,Osasco,Sudeste,0.776,666740,64.954,Outras formas de ...,,"{G808, null, null...","{null, null, null...",False,1,"[{2019-10-12, Nos...",SP
