In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *


# Root HDFS directory handed to Spark as its SQL warehouse directory.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: Hive metastore as the catalog backend, Delta Lake as the
# table format. The chain is wrapped in parentheses instead of being joined
# with trailing backslashes, so no dangling continuation can accidentally
# absorb whatever statement follows it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (called below) presumably
    # sets spark.jars.packages itself, which would make this line redundant
    # -- confirm against the installed delta-spark version before removing.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then start the
# session (or attach to one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Make sure the gold-layer database exists, stored in its own directory
# under the HDFS warehouse. IF NOT EXISTS keeps the cell safe to re-run.
create_gold_db_sql = """
    CREATE DATABASE IF NOT EXISTS projeto_gold LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto_gold.db'
    """
spark.sql(create_gold_db_sql)

DataFrame[]

In [3]:
# List the tables currently registered in the gold database and render the
# listing as a pandas DataFrame for notebook display.
gold_tables = spark.sql(
    """
    SHOW TABLES FROM projeto_gold
    """
)
gold_tables.toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,projeto_gold,earningsus,False
1,projeto_gold,educationny,False
2,projeto_gold,educationus,False
3,projeto_gold,gender_difference_hiv_diagnosis_rate,False
4,projeto_gold,shootingsus,False


In [4]:
# Remove any previous registration of the table so that the CREATE below
# starts from a clean catalog entry.
# NOTE(review): for an EXTERNAL table the drop presumably leaves the files at
# LOCATION in place -- confirm if a full reset of the data is expected.
drop_hiv_ny_sql = """
    DROP TABLE IF EXISTS projeto_gold.Gender_Difference_Hiv_NY
     """
spark.sql(drop_hiv_ny_sql)

# Register the Delta table holding female/male HIV diagnosis rates and
# their difference, keyed by State / Country / Borough.
create_hiv_ny_sql = """
    CREATE EXTERNAL TABLE projeto_gold.Gender_Difference_Hiv_NY (
        State String,
        Country String,
        Borough String,
        Female_HIV_diagnosis_rate Double,
        Male_HIV_diagnosis_rate Double,
        Gender_Difference_HIV_diagnosis_rate Double
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/Gender_Difference_Hiv_NY'
"""
spark.sql(create_hiv_ny_sql)


DataFrame[]

In [5]:
# Remove any previous registration of the table so that the CREATE below
# starts from a clean catalog entry.
drop_education_us_sql = """
    DROP TABLE IF EXISTS projeto_gold.EducationUS
     """
spark.sql(drop_education_us_sql)

# Register the Delta table of education-level figures per Country / State:
# one Men_* and one Women_* column per attainment level, plus a
# Difference_* column for each level.
# NOTE(review): these columns are FLOAT while the other gold tables in this
# notebook use Double -- confirm the narrower type is intentional.
create_education_us_sql = """
CREATE EXTERNAL TABLE projeto_gold.EducationUS (
    Country STRING,
    State STRING,
    Men_Less_than_High_School_Diploma FLOAT,
    Women_Less_than_High_School_Diploma FLOAT,
    Men_High_School_Diploma FLOAT,
    Women_High_School_Diploma FLOAT,
    Men_College_or_Associate_Diploma FLOAT,
    Women_College_or_Associate_Diploma FLOAT,
    Men_Bachelor_Diploma_or_Higher FLOAT,
    Women_Bachelor_Diploma_or_Higher FLOAT,
    Difference_Bachelor_Diploma_or_Higher FLOAT,
    Difference_College_or_Associate_Diploma FLOAT,
    Difference_High_School_Diploma FLOAT,
    Difference_Less_than_High_School_Diploma FLOAT
)
USING DELTA
LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/EducationUS/'
"""
spark.sql(create_education_us_sql)


DataFrame[]

In [6]:
# Remove any previous registration of the table so that the CREATE below
# starts from a clean catalog entry.
drop_education_ny_sql = """
    DROP TABLE IF EXISTS projeto_gold.EducationNY
"""
spark.sql(drop_education_ny_sql)

# Register the Delta table of average dropout and graduation rates, split
# by female/male, per Borough (with State and Country columns alongside).
create_education_ny_sql = """
    CREATE EXTERNAL TABLE projeto_gold.EducationNY (
        Borough String,
        Female_Average_Dropout_Rate Double,
        Female_Average_Graduation_Rate Double,
        Male_Average_Dropout_Rate Double,
        Male_Average_Graduation_Rate Double,
        State String,
        Country String
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/EducationNY/'
"""
spark.sql(create_education_ny_sql)



DataFrame[]

In [7]:
# Remove any previous registration of the table so that the CREATE below
# starts from a clean catalog entry.
drop_shootings_us_sql = """
    DROP TABLE IF EXISTS projeto_gold.ShootingsUS
"""
spark.sql(drop_shootings_us_sql)

# Register the Delta table with integer Female / Male / Unknown columns per
# Borough (with State and Country columns alongside).
# NOTE(review): the table is named *US but is keyed by Borough -- confirm
# the name matches the data it is meant to hold.
create_shootings_us_sql = """
    CREATE EXTERNAL TABLE projeto_gold.ShootingsUS (
        Borough String,
        Female Integer,
        Male Integer,
        Unknown Integer,
        State String,
        Country String
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/ShootingsUS'
"""
spark.sql(create_shootings_us_sql)



DataFrame[]

In [8]:
# Remove any previous registration of the table so that the CREATE below
# starts from a clean catalog entry.
drop_earnings_us_sql = """
    DROP TABLE IF EXISTS projeto_gold.EarningsUS
    """
spark.sql(drop_earnings_us_sql)

# Register the Delta table with Male / Female figures per State, together
# with their Sum and Difference columns and a Country column.
create_earnings_us_sql = """
    CREATE EXTERNAL TABLE projeto_gold.EarningsUS (
        State String,
        Male Double,
        Female Double,
        Sum Double,
        Difference Double,
        Country String
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/projeto_gold.db/EarningsUS/'
    """
spark.sql(create_earnings_us_sql)

DataFrame[]