In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import col, when
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType

# Default storage location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session options, applied to the builder in the order listed:
# warehouse directory, Hive metastore endpoint, Delta SQL extension and
# catalog, and the Delta package coordinates to load.
session_options = {
    "spark.sql.warehouse.dir": warehouse_location,
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
}

builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
)
for option_key, option_value in session_options.items():
    builder = builder.config(option_key, option_value)
builder = builder.enableHiveSupport()

# Alternative kept for reference (not executed):
#   spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark = builder.getOrCreate()

In [2]:
# HDFS URI of the source CSV file (stored under projeto/bronze); read by the next cell.
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/Hiv_NY.csv"

In [4]:
# Load the raw CSV. The first line of the file is used as the header, so the
# columns below are addressed by their original names. No schema is supplied
# to the reader; the numeric columns are cast explicitly further down.
hiv = spark.read.csv(hdfs_path, header=True)

# Columns that are not kept in the output table.
unused_columns = (
    "Concurrent diagnoses",
    "% linked to care within 3 months",
    "AIDS diagnosis rate",
    "PLWDHI prevalence",
    "Death rate",
    "Non-HIV-related death rate",
)
for column_name in unused_columns:
    hiv = hiv.drop(col(column_name))

# Replace missing values in these descriptive columns with "Unknown".
for column_name in ("Borough", "UHF", "Race"):
    current_value = col(column_name)
    hiv = hiv.withColumn(
        column_name,
        when(current_value.isNull(), "Unknown").otherwise(current_value),
    )

# Drop rows that have a null in any of the key measure columns.
# NOTE(review): this filter runs before the numeric casts below, so a value
# that is present but fails to parse would turn into null only afterwards and
# would not be removed here -- confirm whether that can occur in the source data.
required_columns = (
    "HIV diagnosis rate",
    "HIV diagnoses",
    "AIDS diagnoses",
    "% viral suppression",
    "HIV-related death rate",
)
for column_name in required_columns:
    hiv = hiv.filter(col(column_name).isNotNull())

# Normalise Gender: "Man" -> "Male", "Woman" -> "Female", null -> "Unknown";
# every other value is kept as it is.
gender = col("Gender")
normalized_gender = (
    when(gender == "Man", "Male")
    .when(gender == "Woman", "Female")
    .when(gender.isNull(), "Unknown")
    .otherwise(gender)
)
hiv = hiv.withColumn("Gender", normalized_gender)

# Cast the numeric columns to their target types (still under the original names).
numeric_types = {
    "Year": "int",
    "HIV diagnoses": "int",
    "HIV diagnosis rate": "double",
    "AIDS diagnoses": "int",
    "% viral suppression": "double",
    "Deaths": "int",
    "HIV-related death rate": "double",
}
for column_name, type_name in numeric_types.items():
    hiv = hiv.withColumn(column_name, col(column_name).cast(type_name))

# Rename columns containing spaces, "%" or "-" to underscore-separated names.
renamed_columns = {
    "HIV diagnoses": "HIV_diagnoses",
    "HIV diagnosis rate": "HIV_diagnosis_rate",
    "AIDS diagnoses": "AIDS_diagnoses",
    "% viral suppression": "viral_suppression_percent",
    "HIV-related death rate": "HIV_related_death_rate",
}
for old_name, new_name in renamed_columns.items():
    hiv = hiv.withColumnRenamed(old_name, new_name)

# Inspect the resulting schema and a sample of rows.
# Tip: hiv.toPandas() gives a nicer preview of the table.
hiv.printSchema()
hiv.show()

# Schema describing the cleaned table (all fields nullable).
# It is not referenced anywhere else in this cell.
customSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Year", IntegerType()),
        ("Borough", StringType()),
        ("UHF", StringType()),
        ("Gender", StringType()),
        ("Age", StringType()),
        ("Race", StringType()),
        ("HIV_diagnoses", IntegerType()),
        ("HIV_diagnosis_rate", DoubleType()),
        ("AIDS_diagnoses", IntegerType()),
        ("viral_suppression_percent", DoubleType()),
        ("Deaths", IntegerType()),
        ("HIV_related_death_rate", DoubleType()),
    )
])

# Write the cleaned DataFrame in Delta format, replacing both the data and the
# schema of whatever already exists at the target path.
(
    hiv.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("hdfs://hdfs-nn:9000/warehouse/projeto.db/Hiv_NY")
)

root
 |-- Year: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- UHF: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- HIV_diagnoses: integer (nullable = true)
 |-- HIV_diagnosis_rate: double (nullable = true)
 |-- AIDS_diagnoses: integer (nullable = true)
 |-- viral_suppression_percent: double (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- HIV_related_death_rate: double (nullable = true)

+----+-------+---+-----------+-------+--------------------+-------------+------------------+--------------+-------------------------+------+----------------------+
|Year|Borough|UHF|     Gender|    Age|                Race|HIV_diagnoses|HIV_diagnosis_rate|AIDS_diagnoses|viral_suppression_percent|Deaths|HIV_related_death_rate|
+----+-------+---+-----------+-------+--------------------+-------------+------------------+--------------+-------------------------+------+-------------

In [5]:
# Query the projeto.Hiv_NY table and print its first rows.
# NOTE(review): presumably this table is backed by the Delta path written in
# the previous cell -- confirm that it is registered in the metastore.
hiv_ny_query = """
    SELECT * FROM projeto.Hiv_NY
    """
spark.sql(hiv_ny_query).show()

+----+-------+---+-----------+-------+--------------------+-------------+------------------+--------------+-------------------------+------+----------------------+
|Year|Borough|UHF|     Gender|    Age|                Race|HIV_diagnoses|HIV_diagnosis_rate|AIDS_diagnoses|viral_suppression_percent|Deaths|HIV_related_death_rate|
+----+-------+---+-----------+-------+--------------------+-------------+------------------+--------------+-------------------------+------+----------------------+
|2011|    All|All|        All|    All|                 All|         3379|              48.3|          2366|                     71.0|  2040|                   5.8|
|2011|    All|All|       Male|    All|                 All|         2595|              79.1|          1712|                     72.0|  1423|                   5.7|
|2011|    All|All|     Female|    All|                 All|          733|              21.1|           622|                     68.0|   605|                   6.0|
|2011|    All|Al