In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import expr, round, col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType

warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = builder.getOrCreate() #spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
hdfs_path = "hdfs://hdfs-nn:9000/projeto/bronze/EarningsUS.csv"

In [3]:
earnings = spark.read.option("header", True) \
.csv(hdfs_path)

In [4]:
earnings.printSchema()
earnings.show()

root
 |-- State: string (nullable = true)
 |-- Male: string (nullable = true)
 |-- Female: string (nullable = true)

+--------------------+-----+------+
|               State| Male|Female|
+--------------------+-----+------+
|             Alabama|50019| 37281|
|              Alaska|61173| 50853|
|             Arizona|50069| 41617|
|            Arkansas|45014| 35541|
|          California|57475| 50313|
|            Colorado|60574| 48381|
|         Connecticut|66959| 55743|
|            Delaware|57200| 47227|
|District of Columbia|88992| 72960|
|             Florida|45187| 37482|
|             Georgia|50547| 40548|
|              Hawaii|53353| 46331|
|               Idaho|49116| 37784|
|            Illinois|59074| 46077|
|             Indiana|51448| 38988|
|                Iowa|52146| 40739|
|              Kansas|51687| 40941|
|            Kentucky|49345| 38774|
|           Louisiana|51976| 37050|
|               Maine|51143| 40906|
+--------------------+-----+------+
only showing top 20

In [5]:
customSchema = StructType([
    StructField("State", StringType(), True),        
    StructField("Male", DoubleType(), True),
    StructField("Female", DoubleType(), True)
])

earnings \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save("hdfs://hdfs-nn:9000/warehouse/projeto.db/EarningsUS")



In [6]:
spark.sql(
    """
    SELECT * FROM projeto.EarningsUS
    """
).show()

+--------------------+-----+------+
|               State| Male|Female|
+--------------------+-----+------+
|             Alabama|50019| 37281|
|              Alaska|61173| 50853|
|             Arizona|50069| 41617|
|            Arkansas|45014| 35541|
|          California|57475| 50313|
|            Colorado|60574| 48381|
|         Connecticut|66959| 55743|
|            Delaware|57200| 47227|
|District of Columbia|88992| 72960|
|             Florida|45187| 37482|
|             Georgia|50547| 40548|
|              Hawaii|53353| 46331|
|               Idaho|49116| 37784|
|            Illinois|59074| 46077|
|             Indiana|51448| 38988|
|                Iowa|52146| 40739|
|              Kansas|51687| 40941|
|            Kentucky|49345| 38774|
|           Louisiana|51976| 37050|
|               Maine|51143| 40906|
+--------------------+-----+------+
only showing top 20 rows

