In [None]:
# Install dependencies: the `hdfs` package is installed with pip, which is run
# through the interpreter reported by sys.executable.

import sys
!{sys.executable} -m pip install hdfs

In [None]:
pip install delta-spark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/questao1'

# The builder chain is wrapped in parentheses instead of using trailing
# backslashes: the previous version ended in a dangling "\" that only parsed
# because a blank line happened to follow it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    # Thrift endpoint of the Hive metastore.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (below) presumably sets
    # spark.jars.packages itself from the installed delta-spark version, which
    # would override this value - confirm and drop one of the two if so.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Create (or reuse) the session from the Delta-aware builder.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# (Re)create the gold database: drop any existing Projeto1 database together
# with its tables (CASCADE), then create it again at its HDFS location.
drop_database_sql = """
    DROP DATABASE IF EXISTS Projeto1 CASCADE
    """
create_database_sql = """
    create database Projeto1 location 'hdfs://hdfs-nn:9000/Projeto/questao1/Projeto1.db'
    """

spark.sql(drop_database_sql)
spark.sql(create_database_sql)

In [None]:
# Create the gold table for the fountains ("Fontes"): remove any previous
# definition, then declare the Delta table at its HDFS location.
drop_fontes_sql = """
    DROP TABLE IF EXISTS Projeto1.gold_Fontes
    """
create_fontes_sql = """
    CREATE EXTERNAL TABLE Projeto1.gold_Fontes (
        Position STRING,
        BOROUGH STRING,
        YEAR INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/questao1/Projeto1.db/gold_Fontes/'
    """

spark.sql(drop_fontes_sql)
spark.sql(create_fontes_sql)

In [None]:
from pyspark.sql.functions import substring, avg, sum

# Load the DrinkingFountains silver data from HDFS into a DataFrame.
# NOTE(review): no format is specified, so Spark's default data source is
# used - confirm it matches how the silver table was written.
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/DrinkingFountains"

gold_Fontes = spark.read.load(hdfs_path)

# Preview the rows and print the schema that was read.
gold_Fontes.show()
gold_Fontes.printSchema()

In [None]:
# Persist the three gold columns in Delta format, replacing whatever is
# already stored at the target location.
gold_fontes_writer = (
    gold_Fontes
    .select("Position", "BOROUGH", "YEAR")
    .write
    .format("delta")
    .mode("overwrite")
)
gold_fontes_writer.save("hdfs://hdfs-nn:9000/Projeto/questao1/Projeto1.db/gold_Fontes/")

In [None]:
# Check the results in the table: print a preview, then return the row count.
# The visible cells create and populate gold_Fontes in the Projeto1 database;
# the previous "ProjetoGold" qualifier does not match any database they create.
spark.table("Projeto1.gold_Fontes").show()
spark.table("Projeto1.gold_Fontes").count()

In [None]:
# Create the gold table for the tanks ("Tanques"): remove any previous
# definition, then declare the Delta table at its HDFS location.
drop_tanques_sql = """
    DROP TABLE IF EXISTS Projeto1.gold_Tanques
    """
create_tanques_sql = """
    CREATE EXTERNAL TABLE Projeto1.gold_Tanques (
        BOROUGH STRING,
        REPORTING_YEAR INT
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/questao1/Projeto1.db/gold_Tanques/'
    """

spark.sql(drop_tanques_sql)
spark.sql(create_tanques_sql)

In [None]:
from pyspark.sql.functions import substring, avg, sum

# Load the Harbor_Water_Quality silver data (the "Tanques" source) from HDFS.
# NOTE(review): no format is specified, so Spark's default data source is
# used - confirm it matches how the silver table was written.
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Harbor_Water_Quality"

gold_Tanques = spark.read.load(hdfs_path)

# Preview the rows and print the schema that was read.
gold_Tanques.show()
gold_Tanques.printSchema()

In [None]:
# Persist the two gold columns in Delta format, replacing whatever is already
# stored at the target location.
gold_tanques_writer = (
    gold_Tanques
    .select("BOROUGH", "REPORTING_YEAR")
    .write
    .format("delta")
    .mode("overwrite")
)
gold_tanques_writer.save("hdfs://hdfs-nn:9000/Projeto/questao1/Projeto1.db/gold_Tanques/")

In [None]:
# Inner-join the fountains with the gold_Tanques rows that share the same
# borough and year.
# Fixes over the previous one-liner: the right-hand DataFrame is now passed to
# join(), the parentheses are balanced, the join type is spelled "inner", and
# the year column uses the REPORTING_YEAR name used elsewhere in this notebook.
join_condition = (
    (gold_Tanques["BOROUGH"] == gold_Fontes["BOROUGH"])
    & (gold_Tanques["REPORTING_YEAR"] == gold_Fontes["YEAR"])
)

Fontes_Tanques = (
    gold_Fontes
    .join(gold_Tanques, join_condition, "inner")
    # Drop the gold_Tanques-side join keys so that "BOROUGH" and "YEAR" resolve
    # unambiguously in later selects. One Column per drop() call, because older
    # PySpark versions accept only a single Column argument.
    .drop(gold_Tanques["BOROUGH"])
    .drop(gold_Tanques["REPORTING_YEAR"])
)


In [None]:
# Write the joined BOROUGH/YEAR pairs in Delta format and register the result
# in the metastore so it can be queried as Projeto1.Fontes_Tanques.
# Fixed: the previous target URI repeated the scheme ("hdfs://hdfs://...").
fontes_tanques_path = "hdfs://hdfs-nn:9000/Projeto/questao1/Projeto1.db/Fontes_Tanques/"

(
    Fontes_Tanques
    .select("BOROUGH", "YEAR")
    .write
    .format("delta")
    .mode("overwrite")
    .save(fontes_tanques_path)
)

# Unlike gold_Fontes and gold_Tanques, no CREATE TABLE is issued for this
# dataset in the visible cells, so spark.table("Projeto1.Fontes_Tanques") had
# nothing to resolve. No column list is given: Delta takes the schema from the
# data already written at the location.
spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS Projeto1.Fontes_Tanques
    USING DELTA
    LOCATION '{fontes_tanques_path}'
    """
)

In [None]:
# Inspect the joined table: print a preview of its rows, then return the
# total row count as the cell's result.
fontes_tanques_table = "Projeto1.Fontes_Tanques"
spark.table(fontes_tanques_table).show()
spark.table(fontes_tanques_table).count()