In [1]:
pip install delta-spark==2.1.1


Note: you may need to restart the kernel to use updated packages.


In [2]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9870/Projeto/silver/'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.1") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
spark.sql(
    """
    DROP DATABASE Projeto CASCADE
    """
)

DataFrame[]

In [4]:

spark.sql(
    """
    create database Projeto location 'hdfs://hdfs-nn:9000/Projeto/silver/Projeto.db'
    """
)

DataFrame[]

In [5]:
spark.sql(
    """
    SHOW TABLES FROM Projeto
    """
).show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [6]:
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/ProjetoGreen/DEPGreenInfraestructure.csv"

customSchema = StructType([
    StructField("the_geom", StringType(), True),        
    StructField("Asset_ID", IntegerType(), True),
    StructField("GI_ID", StringType(), True),
    StructField("DEP_Contra", IntegerType(), True),
    StructField("DEP_Cont_1", IntegerType(), True),
    StructField("Project_Ty", StringType(), True),
    StructField("ROW_Onsite", StringType(), True), 
    StructField("Project_Na", StringType(), True), 
    StructField("Asset_Type", StringType(), True), 
    StructField("Status", StringType(), True), 
    StructField("Asset_X_Co", StringType(), True),
    StructField("Asset_Y_Co", StringType(), True),
    StructField("Borough", StringType(), True),
    StructField("Sewer_Type", StringType(), True),
    StructField("Outfall", StringType(), True),
    StructField("Waterbody", StringType(), True),
    StructField("Street_Add", StringType(), True),
    StructField("Nearest_In", StringType(), True),
    StructField("BBL", IntegerType(), True),
    StructField("Secondary_", StringType(), True),
    StructField("Community_", IntegerType(), True),
    StructField("City_Counc", IntegerType(), True),
    StructField("Assembly_D", StringType(), True),
    StructField("Asset_Leng", FloatType(), True),
    StructField("Asset_Width", FloatType(), True),
    StructField("Asset_Area", StringType(), True),
    StructField("GI_Feature", StringType(), True),
    StructField("Tree_Latin", StringType(), True),
    StructField("Tree_Commo", StringType(), True),
    StructField("Constructi", StringType(), True),
    StructField("Construc_1", StringType(), True),
    StructField("Status_Gro", StringType(), True)

])

projeto_green = spark \
            .read\
            .option("delimiter",",")\
            .option("header","false")\
            .schema(customSchema) \
            .csv(hdfs_path)
projeto_green.show()
projeto_green.printSchema()

+--------------------+--------+-------+----------+----------+----------+----------+--------------------+----------+--------------------+--------------+------------+-------+----------+-------+--------------------+--------------------+--------------------+----+----------+----------+----------+----------+----------+-----------+----------+--------------------+--------------------+--------------------+----------+----------+------------+
|            the_geom|Asset_ID|  GI_ID|DEP_Contra|DEP_Cont_1|Project_Ty|ROW_Onsite|          Project_Na|Asset_Type|              Status|    Asset_X_Co|  Asset_Y_Co|Borough|Sewer_Type|Outfall|           Waterbody|          Street_Add|          Nearest_In| BBL|Secondary_|Community_|City_Counc|Assembly_D|Asset_Leng|Asset_Width|Asset_Area|          GI_Feature|          Tree_Latin|          Tree_Commo|Constructi|Construc_1|  Status_Gro|
+--------------------+--------+-------+----------+----------+----------+----------+--------------------+----------+-------------

In [7]:
replaced_projeto_green = projeto_green.drop("Secondary_")
replaced_projeto_green.toPandas()

Unnamed: 0,the_geom,Asset_ID,GI_ID,DEP_Contra,DEP_Cont_1,Project_Ty,ROW_Onsite,Project_Na,Asset_Type,Status,...,Assembly_D,Asset_Leng,Asset_Width,Asset_Area,GI_Feature,Tree_Latin,Tree_Commo,Constructi,Construc_1,Status_Gro
0,the_geom,,GI_ID,,,Project_Ty,Row_Onsite,Project_Na,Asset_Type,Status,...,Assembly_D,,,Asset_Area,GI_Feature,Tree_Latin,Tree_Commo,Constructi,Construc_1,Status_Gro
1,POINT (-73.7427580393534 40.701059914707095),103565.0,1032A,,1.0,External,ROW,South East Queens Demonstration Area 1,ROWB,100% Design Accepted,...,Not Found,10.0,4.0,40,Standard,No Tree,,,,Final Design
2,POINT (-73.74274024719452 40.701098796136186),103566.0,1032B,,1.0,External,ROW,South East Queens Demonstration Area 1,ROWB,100% Design Accepted,...,Not Found,10.0,4.0,40,Standard,Quercus prinus,Chestnut Oak,,,Final Design
3,POINT (-73.74267216865954 40.70124766221339),103567.0,1032C,,1.0,External,ROW,South East Queens Demonstration Area 1,ROWB,100% Design Accepted,...,Not Found,13.0,4.0,52,Standard,No Tree,,,,Final Design
4,POINT (-73.74265073276783 40.70129431593821),103568.0,1032D,,1.0,External,ROW,South East Queens Demonstration Area 1,ROWB,100% Design Accepted,...,Not Found,13.0,4.0,52,Standard,Quercus prinus,Chestnut Oak,,,Final Design
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14266,POINT (-73.93719223100973 40.65332552444603),185675.0,5079B,,1.0,Area-Wide,ROW,EDC CI005 Phase 1,ROWB,In Construction,...,58,13.0,6.0,78,Type C - SW Chamber,No Tree,,GKCI05-1A,2,In Construction
14267,POINT (-73.9427174189671 40.65298274171678),185961.0,4071A,,1.0,Area-Wide,ROW,EDC CI005 Phase 1,ROWB,In Construction,...,58,13.0,4.0,52,Standard,Cercis reniformis 'Oklahoma',Oklahoma Redbud,GKCI05-1A,3,In Construction
14268,POINT (-73.94290382417749 40.65130949009753),185962.0,4076A,,1.0,Area-Wide,ROW,EDC CI005 Phase 1,ROWB,In Construction,...,58,20.0,4.0,80,Standard,Ulmus parvifolia 'Allee',Allée Lacebark Elm,GKCI05-1A,3,In Construction
14269,POINT (-73.90194401543289 40.76384699170711),186977.0,TLC-1,,,External,Onsite,TLC Woodside Facility Renovation,Rain Garden,In Construction,...,,0.0,0.0,56331,,,,,,In Construction


In [8]:
replaced_projeto_green.show()

+--------------------+--------+-------+----------+----------+----------+----------+--------------------+----------+--------------------+--------------+------------+-------+----------+-------+--------------------+--------------------+--------------------+----+----------+----------+----------+----------+-----------+----------+--------------------+--------------------+--------------------+----------+----------+------------+
|            the_geom|Asset_ID|  GI_ID|DEP_Contra|DEP_Cont_1|Project_Ty|ROW_Onsite|          Project_Na|Asset_Type|              Status|    Asset_X_Co|  Asset_Y_Co|Borough|Sewer_Type|Outfall|           Waterbody|          Street_Add|          Nearest_In| BBL|Community_|City_Counc|Assembly_D|Asset_Leng|Asset_Width|Asset_Area|          GI_Feature|          Tree_Latin|          Tree_Commo|Constructi|Construc_1|  Status_Gro|
+--------------------+--------+-------+----------+----------+----------+----------+--------------------+----------+--------------------+--------------

In [9]:
from pyspark.sql.functions import when, col, concat, lit

replaced2_projeto_green = replaced_projeto_green.withColumn(
    "DEP_Contra",
    when(
        (col("DEP_Contra").isNull() | (col("DEP_Contra") == None)), 
        "Sem Informacao"
    ).otherwise(col("DEP_Contra")))

In [10]:
from pyspark.sql.functions import when, col, concat, lit

replaced3_projeto_green = replaced2_projeto_green.withColumn(
    "DEP_Cont_1",
    when(
        (col("DEP_Cont_1").isNull() | (col("DEP_Cont_1") == None)), 
        "Null"
    ).otherwise(col("DEP_Cont_1")))

In [11]:
from pyspark.sql.functions import when, col, concat, lit

replaced4_projeto_green = replaced3_projeto_green.withColumn(
    "Nearest_In",
    when(
        (col("Nearest_In").isNull() | (col("Nearest_In") == None)), 
        "Sem Informacao"
    ).otherwise(col("Nearest_In")))

In [12]:
from pyspark.sql.functions import when, col, concat, lit

replaced5_projeto_green = replaced4_projeto_green.withColumn(
    "BBL",
    when(
        (col("BBL").isNull() | (col("BBL") == None)), 
        "Null"
    ).otherwise(col("BBL")))

In [13]:
from pyspark.sql.functions import when, col, concat, lit

replaced6_projeto_green = replaced5_projeto_green.withColumn(
    "Community_",
    when(
        (col("Community_").isNull() | (col("Community_") == None)), 
        "Null"
    ).otherwise(col("Community_")))

In [14]:
from pyspark.sql.functions import when, col, concat, lit

replaced7_projeto_green = replaced6_projeto_green.withColumn(
    "City_Counc",
    when(
        (col("City_Counc").isNull() | (col("City_Counc") == None)), 
        "Null"
    ).otherwise(col("City_Counc")))

In [15]:
from pyspark.sql.functions import when, col, concat, lit

replaced8_projeto_green = replaced7_projeto_green.withColumn(
    "Assembly_D",
    when(
        (col("Assembly_D").isNull() | (col("Assembly_D") == None)), 
        "Sem Informacao"
    ).otherwise(col("Assembly_D")))

In [16]:
from pyspark.sql.functions import when, col, concat, lit

replaced9_projeto_green = replaced8_projeto_green.withColumn(
    "Asset_Leng",
    when(
        (col("Asset_Leng").isNull() | (col("Asset_Leng") == None)), 
        "Null"
    ).otherwise(col("Asset_Leng")))

In [17]:
from pyspark.sql.functions import when, col, concat, lit

replaced10_projeto_green = replaced9_projeto_green.withColumn(
    "GI_Feature",
    when(
        (col("GI_Feature").isNull() | (col("GI_Feature") == None)), 
        "Sem Informacao"
    ).otherwise(col("GI_Feature")))

In [18]:
from pyspark.sql.functions import when, col, concat, lit

replaced11_projeto_green = replaced10_projeto_green.withColumn(
    "Tree_Latin",
    when(
        (col("Tree_Latin").isNull() | (col("Tree_Latin") == None)), 
        "Sem Informacao"
    ).otherwise(col("Tree_Latin")))

In [19]:
from pyspark.sql.functions import when, col, concat, lit

replaced12_projeto_green = replaced11_projeto_green.withColumn(
    "Tree_Commo",
    when(
        (col("Tree_Commo").isNull() | (col("Tree_Commo") == "N/A")), 
        "Sem Informacao"
    ).otherwise(col("Tree_Commo")))

In [20]:
from pyspark.sql.functions import when, col, concat, lit

replaced13_projeto_green = replaced12_projeto_green.withColumn(
    "Constructi",
    when(
        (col("Constructi").isNull() | (col("Constructi") == None)), 
        "Sem Informacao"
    ).otherwise(col("Constructi")))

In [21]:
from pyspark.sql.functions import when, col, concat, lit

replaced14_projeto_green = replaced13_projeto_green.withColumn(
    "Construc_1",
    when(
        (col("Construc_1").isNull() | (col("Construc_1") == None)), 
        "Sem Informacao"
    ).otherwise(col("Construc_1")))

In [22]:
from pyspark.sql.functions import when, col, concat, lit

replaced15_projeto_green = replaced14_projeto_green.withColumn(
    "Street_Add",
    when(
        (col("Street_Add").isNull() | (col("Street_Add") == None)), 
        "Sem Informacao"
    ).otherwise(col("Street_Add")))

In [None]:
replaced15_projeto_green.toPandas()

In [24]:
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.DepGreenInfraestructure
    """
)


DataFrame[]

In [25]:
spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto.DepGreenInfraestructure (
        the_geom VARCHAR(50),
        Asset_ID INT,
        GI_ID VARCHAR(50),
        DEP_Contra VARCHAR(50),
        DEP_Cont_1 INT,
        Project_Ty VARCHAR(50),
        Row_Onsite VARCHAR(50),
        Project_Na VARCHAR(50),
        Asset_Type VARCHAR(50),
        Status VARCHAR(50),
        Asset_X_Co VARCHAR(50),
        Asset_Y_Co VARCHAR(50),
        Sewer_Type VARCHAR(50),
        Outfall VARCHAR(50),
        Waterbody VARCHAR(50),
        Nearest_In VARCHAR(50),
        BBL INT,
        Community_ INT,
        City_Counc INT,
        Assembly_D VARCHAR(50),
        Asset_Leng FLOAT,
        Asset_Width FLOAT,
        Asset_Area VARCHAR(50),
        GI_Feature VARCHAR(50),
        Tree_Latin VARCHAR(50),
        Tree_Commo VARCHAR(50),
        Constructi VARCHAR(50),
        Construc_1 VARCHAR(50),
        Status_Gro VARCHAR(50)
    )
     USING DELTA
    PARTITIONED BY (
        Borough VARCHAR(50),
        Street_Add VARCHAR(50)
    )
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/silver/Projeto.db/DepGreenInfraestructure'
    """
)

       

DataFrame[]

In [26]:
replaced15_projeto_green \
    .select("the_geom", "Asset_ID", "GI_ID", "DEP_Contra", "DEP_Cont_1", "Project_Ty", "Row_Onsite", "Project_Na", "Asset_Type","Status","Asset_X_Co","Asset_Y_Co","Borough","Sewer_Type","Outfall","Waterbody","Street_Add","Nearest_In","BBL","Community_","City_Counc","Assembly_D","Asset_Leng","Asset_Width","Asset_Area","GI_Feature","Tree_Latin","Tree_Commo","Constructi","Construc_1","Status_Gro")\
    .write \
    .mode("overwrite") \
    .partitionBy("Borough", "Street_Add") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/Projeto/silver/Projeto.db/deltalake_table")

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Jav

Py4JError: An error occurred while calling o254.save