In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

import os  # used only in this cell, to read the MinIO credentials from the environment

# Build (or reuse) the Hive-enabled Spark session used by every cell below.
# NOTE(review): in an s3a:// URI the authority part names the bucket, yet the
# three "s3a://minio:9000/datalake/" values below put the MinIO host:port there,
# while the table LOCATION used later in this notebook is "s3a://datalake/...".
# Values are left untouched here — confirm the intended bucket before changing them.
spark = (SparkSession.builder
             .master("spark://spark-master:7077") # Points to the Spark Cluster
             .appName('lab') # Name the app
             .config("hive.metastore.uris", "thrift://hive-metastore:9083") # Set external Hive Metastore
             .config("hive.metastore.warehouse.dir", "s3a://minio:9000/datalake/") # Set default warehouse dir (legacy) users/hive/warehouse
             .config("spark.sql.warehouse.dir", "s3a://minio:9000/datalake/") # Set default warehouse dir
             .config("hive.metastore.schema.verification", "false") # Prevent some errors
             .config("fs.defaultFS", "s3a://minio:9000/datalake/") # Set default file system to the S3A (MinIO) object store
             .config("spark.jars", "/opt/bitnami/spark/jars_external/hadoop-aws-3.3.4.jar,/opt/bitnami/spark/jars_external/aws-java-sdk-bundle-1.12.588.jar")
             .enableHiveSupport()
             .getOrCreate())

sc = spark.sparkContext

# This is a Spark SQL option, not a Hadoop one: setting it on the Hadoop
# configuration has no effect, so it is applied to the session's runtime conf.
spark.conf.set("spark.sql.debug.maxToStringFields", "100")

# Hadoop S3A connector settings for the MinIO endpoint.
hdp_configs = {
    "fs.s3a.endpoint": "http://minio:9000",
    # Credentials come from the environment when available; the previous
    # hardcoded lab values remain as the fallback so existing setups keep working.
    "fs.s3a.access.key": os.environ.get("AWS_ACCESS_KEY_ID", "minio"),
    "fs.s3a.secret.key": os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin"),
    "fs.s3a.connection.timeout": "600000",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # The endpoint above is plain http://, so SSL is disabled to match it.
    "fs.s3a.connection.ssl.enabled": "false"
}

# Fetch the JVM Hadoop configuration once and apply every S3A setting to it.
hadoop_conf = sc._jsc.hadoopConfiguration()
for k, v in hdp_configs.items():
    hadoop_conf.set(k, v)


In [3]:
# Stop the active Spark session.
# NOTE(review): this cell sits between the session-creation cell and the cells
# that query through `spark`; its execution count (In [3]) predates the session
# cell's (In [4]), so it looks like a leftover from an earlier run. Running the
# notebook top to bottom would stop the session before the cells below — skip
# this cell (or move it to the end) when doing so.
spark.stop()

In [5]:
# List the tables in the `source` database.
(
    spark
    .sql("show tables from source")
    .show()
)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   source|    final|      false|
|   source|  persons|      false|
|   source|    test2|      false|
+---------+---------+-----------+



In [36]:
def _string_struct(*names):
    """Return a StructType whose fields are all nullable strings, in the given order."""
    return StructType([StructField(name, StringType(), True) for name in names])


# Field names of the nested layout shared by both diagnosis-detail columns.
_diag_detail_names = (
    "sub_cat", "classificacao", "restr_sexo", "causa_obito",
    "descricao", "desc_abrev", "refer", "excluidos",
)

# Field names of each element of the `feriado_info` array.
_holiday_info_names = (
    "data", "nome", "tipo", "descricao", "uf", "municipio", "cod_municipio",
)

# (column name, Spark type) pairs in table order; every column is nullable.
_columns = [
    ("ano_cmpt", StringType()),
    ("mes_cmpt", StringType()),
    ("cgc_hosp", StringType()),
    ("munic_res", StringType()),
    ("nasc", DateType()),
    ("sexo", StringType()),
    ("uti_mes_to", StringType()),
    ("uti_int_to", StringType()),
    ("proc_rea", StringType()),
    ("qt_proc", StringType()),
    ("dt_atend", DateType()),
    ("dt_saida", DateType()),
    ("diag_princ", StringType()),
    ("diag_secun", StringType()),
    ("cobranca", StringType()),
    ("natureza", StringType()),
    ("gestao", StringType()),
    ("munic_mov", StringType()),
    ("cod_idade", StringType()),
    ("idade", StringType()),
    ("dias_perm", StringType()),
    ("morte", StringType()),
    ("cnes", StringType()),
    ("fonte", StringType()),
    ("modalidade", StringType()),
    ("nome_uf", StringType()),
    ("nome_municipio", StringType()),
    ("regiao", StringType()),
    ("idhm", FloatType()),
    ("populacao_residente", IntegerType()),
    ("area_unidade_territorial", FloatType()),
    ("diag_princ_desc", StringType()),
    ("diag_secun_desc", StringType()),
    ("diag_princ_detalhes", _string_struct(*_diag_detail_names)),
    ("diag_secun_detalhes", _string_struct(*_diag_detail_names)),
    ("feriado", StringType()),
    ("distancia_feriado", IntegerType()),
    ("feriado_info", ArrayType(_string_struct(*_holiday_info_names), True)),
    ("sigla", StringType()),
]

schema = StructType([StructField(name, dtype, True) for name, dtype in _columns])

# Empty frame with the schema above; its dtypes supply each column's type string for the DDL.
df_final = spark.createDataFrame([], schema=schema)

# `sigla` is declared in PARTITIONED BY below, so it is left out of the column list.
column_ddl = [f"{name} {dtype}" for name, dtype in df_final.drop("sigla").dtypes]
schema_str = ", ".join(column_ddl)

# Register the partitioned Parquet table over the existing object-store location.
create_stmt = (
    f"CREATE EXTERNAL TABLE IF NOT EXISTS source.final ({schema_str}) "
    "USING PARQUET PARTITIONED BY (sigla string) "
    "LOCATION 's3a://datalake/source/final/'"
)
spark.sql(create_stmt).show()

++
||
++
++



In [31]:
# Run MSCK REPAIR on source.final so partitions present under the table
# location are registered for the table.
(
    spark
    .sql("msck repair table source.final")
    .show()
)

++
||
++
++



In [38]:
# Row count per `sigla` value in source.final.
(
    spark.table("source.final")
    .groupBy("sigla")
    .count()
    .show()
)

+-----+--------+
|sigla|   count|
+-----+--------+
|   SP|67515831|
|   RS| 9961350|
|   MG| 8143352|
|   BA| 5017618|
|   SC| 2861516|
|   CE|  906485|
|   MS|  659648|
|   AL|  522891|
|   PA|  567822|
|   GO|  503891|
|   ES|  342183|
|   RN|  760980|
|   MT|  319959|
|   PI|  385786|
|   PE|  289395|
|   PB|  177462|
|   SE|   92552|
|   RO|   19372|
|   TO|  114201|
+-----+--------+



In [2]:
# View of source.final without its nested struct/array columns.
nested_columns = ("diag_princ_detalhes", "diag_secun_detalhes", "feriado_info")
df = spark.table("source.final").drop(*nested_columns)

# Save at most 1000 rows as source.test2, overwriting any existing table contents.
(
    df.limit(1000)
    .write
    .mode("overwrite")
    .saveAsTable("source.test2")
)