In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col, avg, format_number  
from pyspark.sql.types import FloatType, StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, sum, round, lit, concat, when, count, coalesce, upper, udf
from pyspark.sql import functions as F

# Root directory of the Spark SQL warehouse on HDFS. It points at the same
# namenode endpoint (hdfs-nn:9000) that the Delta write at the end of this
# notebook targets, so managed tables and the gold output share one filesystem.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build a local, Hive-enabled session with the Delta Lake SQL extension and
# catalog registered. The chain is parenthesized so that no trailing
# line-continuation can accidentally swallow the following statement.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# getOrCreate() returns the already-running session when one exists, so a single
# call is sufficient.
# Alternative when Delta is installed through pip:
#   spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark = builder.getOrCreate()

In [2]:
# Read the source data from the silver layer: the "Education_State" table of
# the "projeto" database, resolved through the session's catalog.
education = spark.table("projeto.Education_State")

In [3]:
# Tag every row with a constant Country value.
education = education.withColumn("Country", F.lit("United States"))

# For each attainment level, add Difference_<level> = Men_<level> - Women_<level>.
# The tuple order is the order in which the new columns are appended.
for level in (
    "Less_than_High_School_Diploma",
    "High_School_Diploma",
    "College_or_Associate_Diploma",
    "Bachelor_Diploma_or_Higher",
):
    education = education.withColumn(
        f"Difference_{level}",
        F.col(f"Men_{level}") - F.col(f"Women_{level}"),
    )



In [4]:
# Sanity-check the enriched frame: column names and types first, then a preview
# of the rows.
education.printSchema()
# Defaults spelled out: show the first 20 rows, truncating long cell values.
education.show(n=20, truncate=True)

root
 |-- State: string (nullable = true)
 |-- Men_Less_than_High_School_Diploma: float (nullable = true)
 |-- Women_Less_than_High_School_Diploma: float (nullable = true)
 |-- Men_High_School_Diploma: float (nullable = true)
 |-- Women_High_School_Diploma: float (nullable = true)
 |-- Men_College_or_Associate_Diploma: float (nullable = true)
 |-- Women_College_or_Associate_Diploma: float (nullable = true)
 |-- Men_Bachelor_Diploma_or_Higher: float (nullable = true)
 |-- Women_Bachelor_Diploma_or_Higher: float (nullable = true)
 |-- Country: string (nullable = false)
 |-- Difference_Less_than_High_School_Diploma: float (nullable = true)
 |-- Difference_High_School_Diploma: float (nullable = true)
 |-- Difference_College_or_Associate_Diploma: float (nullable = true)
 |-- Difference_Bachelor_Diploma_or_Higher: float (nullable = true)

+--------------------+---------------------------------+-----------------------------------+-----------------------+-------------------------+-------------

In [5]:
# Declared layout for the gold table: two nullable string keys followed by
# nullable float measures (men/women values per level, then the differences).
# NOTE(review): customSchema is not passed to the writer below, so the table is
# written with the DataFrame's own schema and column order — confirm whether it
# was meant to be applied.
customSchema = StructType(
    [StructField(key_name, StringType(), True) for key_name in ("Country", "State")]
    + [
        StructField(measure_name, FloatType(), True)
        for measure_name in (
            "Men_Less_than_High_School_Diploma",
            "Women_Less_than_High_School_Diploma",
            "Men_High_School_Diploma",
            "Women_High_School_Diploma",
            "Men_College_or_Associate_Diploma",
            "Women_College_or_Associate_Diploma",
            "Men_Bachelor_Diploma_or_Higher",
            "Women_Bachelor_Diploma_or_Higher",
            "Difference_Bachelor_Diploma_or_Higher",
            "Difference_College_or_Associate_Diploma",
            "Difference_High_School_Diploma",
            "Difference_Less_than_High_School_Diploma",
        )
    ]
)

# Persist the frame as a Delta table at a fixed HDFS path. Overwrite mode with
# overwriteSchema enabled replaces both the data and the stored schema.
(
    education.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/EducationUS")
)