In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col, avg, format_number  
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType
from pyspark.sql.functions import col, sum, round, lit, concat, when, count, coalesce, upper, udf
from pyspark.sql import functions as F

# Root of the Spark SQL warehouse on HDFS.
# The previous value used a "heducations://heducations-nn" URI, which is not a
# filesystem scheme/host used anywhere else in this notebook; the gold table
# below is written under hdfs://hdfs-nn:9000/warehouse, so the same namenode
# and scheme are used here.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: local master with 2 threads, Hive metastore over thrift,
# and the Delta Lake SQL extension + catalog so format("delta") is available.
# The chain is parenthesised instead of using trailing backslashes, so it no
# longer depends on a dangling "\" being followed by a blank line.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Alternative when Delta is installed through pip:
#   spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark = builder.getOrCreate()

In [2]:
# Load the silver-layer US earnings table through the session catalog.
SILVER_EARNINGS_TABLE = "projeto.EarningsUS"
earnings = spark.table(SILVER_EARNINGS_TABLE)

In [3]:
# Add the derived columns in a single chained expression:
#   Sum        -> Male + Female
#   Difference -> Male - Female
#   country    -> constant "United States" for every row
# Male/Female are string columns in the printed schema; the arithmetic relies
# on Spark's implicit numeric cast, which is why Sum/Difference come out double.
earnings = (
    earnings
    .withColumn("Sum", F.col("Male") + F.col("Female"))
    .withColumn("Difference", F.col("Male") - F.col("Female"))
    .withColumn("country", F.lit("United States"))
)

In [4]:
# Sanity check after adding the derived columns: print the column types,
# then preview the first rows (20 rows, long cells truncated: the defaults).
earnings.printSchema()
earnings.show(n=20, truncate=True)

root
 |-- State: string (nullable = true)
 |-- Male: string (nullable = true)
 |-- Female: string (nullable = true)
 |-- Sum: double (nullable = true)
 |-- Difference: double (nullable = true)
 |-- country: string (nullable = false)

+--------------------+-----+------+--------+----------+-------------+
|               State| Male|Female|     Sum|Difference|      country|
+--------------------+-----+------+--------+----------+-------------+
|             Alabama|50019| 37281| 87300.0|   12738.0|United States|
|              Alaska|61173| 50853|112026.0|   10320.0|United States|
|             Arizona|50069| 41617| 91686.0|    8452.0|United States|
|            Arkansas|45014| 35541| 80555.0|    9473.0|United States|
|          California|57475| 50313|107788.0|    7162.0|United States|
|            Colorado|60574| 48381|108955.0|   12193.0|United States|
|         Connecticut|66959| 55743|122702.0|   11216.0|United States|
|            Delaware|57200| 47227|104427.0|    9973.0|United Stat

In [5]:
# Target schema of the gold table.
# The last field is named "country" (lower case) to match the column actually
# present on `earnings`, so the written column name stays what it was.
customSchema = StructType([
    StructField("State", StringType(), True),
    StructField("Male", DoubleType(), True),
    StructField("Female", DoubleType(), True),
    StructField("Sum", DoubleType(), True),
    StructField("Difference", DoubleType(), True),
    StructField("country", StringType(), True)
])

# Previously customSchema was defined but never applied, so Male and Female
# were written as strings. Project the DataFrame onto the schema (same column
# order, explicit casts) so the Delta table stores the declared types.
earnings_gold = earnings.select(
    *[
        col(field.name).cast(field.dataType).alias(field.name)
        for field in customSchema.fields
    ]
)

# Overwrite the gold Delta table; overwriteSchema lets the stored schema be
# replaced when column types differ from a previous write.
(
    earnings_gold.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/EarningsUS")
)