In [12]:
import sys
!{sys.executable} -m pip install hdfs

from os import PathLike
from hdfs import InsecureClient
client = InsecureClient("http://hdfs-nn:9870/", user="anonymous")
from_path ="./Self-Reported_Drinking_Water_Tank_Inspection_Results.csv"
to_path ="/Projeto/Bronze/Self-Reported_Drinking_Water_Tank_Inspection_Results.csv"
client.delete(to_path)
client.upload(to_path, from_path)



'/Projeto/Bronze/Self-Reported_Drinking_Water_Tank_Inspection_Results.csv'

In [1]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/Silver'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [25]:
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Bronze/Self-Reported_Drinking_Water_Tank_Inspection_Results.csv"

df_final = spark.read\
    .option("header", "true")\
    .option("delimiter",",")\
    .csv(hdfs_path)

df = df_final.select ("BIN", "BOROUGH", "REPORTING_YEAR", "HOUSE_NUM", "STREET_NAME", "BLOCK", "LOT", "LAB_NAME", "COLIFORM", "ECOLI", "MEET_STANDARDS", "BATCH_DATE", "TANK_NUM", "LATITUDE", "LONGITUDE")

In [26]:
spark.sql(
    """
    SHOW TABLES FROM Projeto
    """
).show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  projeto|drinking_water_qu...|      false|
|  projeto|   drinkingfountains|      false|
+---------+--------------------+-----------+



In [3]:
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Bronze/Self-Reported_Drinking_Water_Tank_Inspection_Results.csv"

customSchema = StructType([
    StructField("_BIN", IntegerType(), True),        
    StructField("BOROUGH", StringType(), True),
    StructField("REPORTING_YEAR", IntegerType(), True),
    StructField("HOUSE_NUM", StringType(), True),
    StructField("STREET_NAME", StringType(), True),
    StructField("BLOCK", StringType(), True),
    StructField("LOT", StringType(), True), 
    StructField("LAB_NAME", StringType(), True),
    StructField("COLIFORM", StringType(), True),
    StructField("ECOLI", StringType(), True),
    StructField("MEET_STANDARDS", StringType(), True),
    StructField("BATCH_DATE", StringType(), True),
    StructField("TANK_NUM", IntegerType(), True),
    StructField("LATITUDE", FloatType(), True),
    StructField("LONGITUDE", FloatType(), True),


])

AguaTanques = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)
AguaTanques.show()
AguaTanques.printSchema()

+-------+---------+--------------+---------+--------------------+-----+----+-------------+--------+-----+--------------------+----------+--------+--------+---------+
|   _BIN|  BOROUGH|REPORTING_YEAR|HOUSE_NUM|         STREET_NAME|BLOCK| LOT|     LAB_NAME|COLIFORM|ECOLI|      MEET_STANDARDS|BATCH_DATE|TANK_NUM|LATITUDE|LONGITUDE|
+-------+---------+--------------+---------+--------------------+-----+----+-------------+--------+-----+--------------------+----------+--------+--------+---------+
|1041902|MANHATTAN|         10022|      118|    East 60th Street|01394|0007|WTI3239575893|    2021|    1|Rosenwach Tank Co...|         Y|    null|    null|     null|
|4442431|   QUEENS|         11432|    82-68|        164th Street|06858|0001|WTI8679117896|    2022|    2|               Nalco|         Y|    null|    null|     null|
|1071989|MANHATTAN|         10022|      345|      EAST 56 STREET|01349|7501|WTI6238013576|    2021|    1|     ISSEKS BROS INC|         Y|    null|    null|     null|
|209

In [28]:
replaced_AguaTanques = AguaTanques.withColumn(
    "MEET_STANDARDS",
    when(
        (col("MEET_STANDARDS").isNull()), 
        "Sem informação"
    ).otherwise(col("MEET_STANDARDS")))

In [17]:
replaced_AguaTanques.toPandas()

Unnamed: 0,_BIN,BOROUGH,REPORTING_YEAR,HOUSE_NUM,STREET_NAME,BLOCK,LOT,LAB_NAME,COLIFORM,ECOLI,MEET_STANDARDS,BATCH_DATE,TANK_NUM,LATITUDE,LONGITUDE
0,1041902.0,MANHATTAN,10022.0,118,East 60th Street,01394,0007,WTI3239575893,2021,1,Rosenwach Tank Co. LLC,Y,,,
1,4442431.0,QUEENS,11432.0,82-68,164th Street,06858,0001,WTI8679117896,2022,2,Nalco,Y,,,
2,1071989.0,MANHATTAN,10022.0,345,EAST 56 STREET,01349,7501,WTI6238013576,2021,1,ISSEKS BROS INC,Y,,,
3,2097466.0,BRONX,10475.0,2049,BARTOW AVENUE,5141,100,WTI4549961683,2018,7,Rosenwach Tank Co. LLC,Y,,,
4,2045910.0,BRONX,10461.0,1730,MULFORD AVE,04161,0001,WTI3703539820,2021,1,ISSEKS BROS INC,Y,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39178,3392969.0,BROOKLYN,11201.0,345,Adams Street,00140,7503,WTI0991844172,2020,1,DCAS,Y,,,
39179,1083600.0,MANHATTAN,10001.0,315,West 25th Street,00749,0024,WTI1664941601,2014,1,Isseks Bros,Y,,,
39180,1087126.0,MANHATTAN,10065.0,1188,York Avenue,01480,0010,WTI5712193546,2020,2,Rosenwach Tank Co. LLC,Y,,,
39181,1086515.0,MANHATTAN,10016.0,462,First Avenue,00962,0100,WTI5929880104,2018,1,Nalco,Y,,,


In [29]:
replaced_AguaTanques2 = replaced_AguaTanques.withColumn(
    "LAB_NAME",
    when(
        (col("LAB_NAME").isNull()), 
        "Sem informação"
    ).otherwise(col("LAB_NAME")))


In [30]:
replaced_AguaTanques3 = replaced_AguaTanques2.withColumn(
    "ECOLI",
    when(
        (col("ECOLI").isNull()), 
        "Sem informação"
    ).otherwise(col("ECOLI")))

In [31]:
replaced_AguaTanques4 = replaced_AguaTanques3.withColumn(
    "COLIFORM",
    when(
        (col("COLIFORM").isNull()), 
        "Sem informação"
    ).otherwise(col("COLIFORM")))

In [47]:
spark.sql(
    
    """
    CREATE EXTERNAL TABLE Projeto.Self_Reported_Drinking_Water_Tank_Inspection_Results (
    _BIN INT,
    HOUSE_NUM VARCHAR(50),
    STREET_NAME VARCHAR(50),
    BLOCK VARCHAR(50),
    LOT VARCHAR(50),
    LAB_NAME VARCHAR(50),
    COLIFORM VARCHAR(1),
    ECOLI VARCHAR(1),
    MEET_STANDARDS VARCHAR(50),
    BATCH_DATE VARCHAR(50),
    TANK_NUM INT,
    LATITUDE FLOAT,
    LONGITUDE FLOAT
    ) 
    USING DELTA
    PARTITIONED BY (
        REPORTING_YEAR INT,
        BOROUGH VARCHAR(50)
    

    )
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Self_Reported_Drinking_Water_Tank_Inspection_Results'
    """
)

DataFrame[]

In [46]:
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.Self_Reported_Drinking_Water_Tank_Inspection_Results
    """
)

DataFrame[]

In [48]:
replaced_AguaTanques4 \
    .select("_BIN", "HOUSE_NUM", "STREET_NAME", "BLOCK", "LOT", "LAB_NAME", "COLIFORM", "ECOLI", "MEET_STANDARDS", "BATCH_DATE", "TANK_NUM", "LATITUDE", "LONGITUDE", "REPORTING_YEAR", "BOROUGH") \
    .write \
    .mode("overwrite") \
    .partitionBy("BOROUGH", "REPORTING_YEAR") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Self_Reported_Drinking_Water_Tank_Inspection_Results/deltalake_table/")