###### Description: In this notebook we read apartment state rows from incoming csv files into a streaming dataframe, transform (clean, cast, rename) the data, and add/update the latest state in a Databricks Delta table
###### Objective: (incoming csv files) --> "apartment_streamingDF" --> "apartment_df" --> "apartment_data"

In [2]:
import requests
import json
import optimus as op
import phonenumbers 
import re
import datetime
import pandas as pd

from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col
import time

In [3]:
# Schema applied to the incoming apartment files; every column is nullable.
apartment_schema = (
    StructType()
    .add("Building_id", IntegerType(), True)
    .add("Apartment_number", IntegerType(), True)
    .add("Type", StringType(), True)
    .add("Rent_fee", StringType(), True)
    .add("Building_name", StringType(), True)
    .add("Appt_details", StringType(), True)
    .add("event_time", TimestampType(), True)
    .add("fetch_time", StringType(), True)
)

###### Description: Read the apartment csv files as the streaming DataFrame "apartment_streamingDF", process it on the fly, and obtain the transformed stream "apartment_df"
###### Objective: (incoming csv files) --> "apartment_streamingDF" --> "apartment_df"

In [5]:
# Get the apartment streaming DataFrame from csv files

# Streaming starts here by reading the input files.
# The reader applies apartment_schema, treats the first line as a header,
# allows multi-line records, and sets maxFilesPerTrigger to 1.
apartment_Path = "/FileStore/apartment/apartment/inprogress/"
apartment_streamingDF = (
  spark
    .readStream
    .schema(apartment_schema)
    .option("maxFilesPerTrigger", "1")
    .option("header", "true")
    .option("multiLine", "true")
    .csv(apartment_Path)
)

# Keep only the rows that carry a Building_id
apartment_df = apartment_streamingDF.select("*").where("Building_id IS NOT NULL")
# Instantiation of DataTransformer class:
transformer = op.DataFrameTransformer(apartment_df)
# NOTE(review): the return values of the transformer calls below are discarded
# and apartment_df is never reassigned -- confirm that these transformations
# actually reach apartment_df (it is the frame written to the sink later on).
# Replace NA with 0.0 in all columns
transformer.replace_na(0.0, columns="*")
# Clear accents in all columns
transformer.clear_accents(columns='*')
# Remove special characters from the four listed columns only
transformer.remove_special_chars(columns=['Type', 'Rent_fee', 'Building_name', 'Appt_details'])


- ###### Now "apartment_df" contains pre-processed apartment state rows
- ###### After this point we need comparison
- ###### Stream-Stream subtraction is not supported
- ###### So we dump the incoming data to a query result "apartment_datalake" which will give updated results upon request
- ###### "apartment_datalake" is not streaming but it will give updated results upon request
- ###### From "apartment_datalake" we filter out the unseen rows to "apartment_landlord_df"

In [7]:
# Start a streaming query that writes apartment_df to the in-memory sink
# registered under the query name "apartment_datalake".
apartment_datalake_query = (
  apartment_df
    .writeStream
    .format("memory")
    .queryName("apartment_datalake")
    .start()
)

## Enter batch mode

In [9]:
def getDelta_df(entity):
  """Return the rows of `<entity>_datalake` whose fetch_time has not been tracked yet.

  The function works on a snapshot of `<entity>_datalake` saved as the table
  `<entity>_temp`. Every distinct `fetch_time` that has been handed out is
  recorded in the `sequence` column of `<entity>_tracker`; rows of the
  snapshot whose `fetch_time` is already in the tracker are subtracted.
  The result is stored in (and read back from) the table `<entity>_delta`.

  On the first call (no `<entity>_tracker` table yet) the whole snapshot is
  the delta and the tracker table is created.

  Args:
    entity: table-name prefix, e.g. "apartment".

  Returns:
    A DataFrame selecting all rows of `<entity>_delta`.
  """
  # Save a snapshot of the data into a Hive table so that every step below
  # sees the same rows even while the streaming sink keeps growing.
  spark.sql("select * from " + entity + "_datalake").write.mode("overwrite").saveAsTable(entity + "_temp")
  datalake_snapshot = spark.sql("select * from " + entity + "_temp")

  tracker_exists = len(spark.sql("show tables like '" + entity + "_tracker'").collect()) == 1
  if tracker_exists:
    seq_tracker = spark.sql("select * from " + entity + "_tracker")

    # Materialise the snapshot rows whose fetch_time is already tracked.
    # "overwrite" keeps this step working even if a previous run failed
    # before it could drop temp_data.
    (datalake_snapshot
       .join(seq_tracker, seq_tracker.sequence == datalake_snapshot.fetch_time)
       .drop("sequence")
       .write.mode("overwrite").saveAsTable("temp_data"))

    spark.sql("refresh table temp_data")

    # Unseen rows = snapshot minus the rows that were already tracked.
    delta_df = datalake_snapshot.subtract(spark.sql("select * from temp_data"))

    delta_df.write.mode("overwrite").saveAsTable(entity + "_delta")

    # Remember the fetch_time values that have now been handed out.
    delta_df.select(col("fetch_time").alias("sequence")).distinct().write.insertInto(entity + "_tracker")

    spark.sql("drop table if exists temp_data")
  else:
    # First run: everything in the snapshot is new. "overwrite" tolerates a
    # stale <entity>_delta table left behind without a tracker.
    datalake_snapshot.write.mode("overwrite").saveAsTable(entity + "_delta")
    datalake_snapshot.select(col("fetch_time").alias("sequence")).distinct().write.saveAsTable(entity + "_tracker")

  return spark.sql("select * from " + entity + "_delta")

In [10]:
def resetTrackingData(entity):
  """Drop the `<entity>_delta`, `<entity>_temp` and `<entity>_tracker` tables.

  Each table is dropped only when `show tables like` reports exactly one match.

  Args:
    entity: table-name prefix, e.g. "apartment".
  """
  for suffix in ("_delta", "_temp", "_tracker"):
    table_name = entity + suffix
    matches = spark.sql("show tables like '" + table_name + "'").collect()
    if len(matches) == 1:
      spark.sql("drop table " + table_name)

In [11]:
def getLastApartmentState_df():
  """Return the most recent state row of every apartment in the current delta.

  Takes the unseen rows from `getDelta_df("apartment")`, drops `fetch_time`,
  and keeps, per (Building_id, Apartment_number), the row with the greatest
  `event_time`.

  Returns:
    A DataFrame with at most one row per (Building_id, Apartment_number).
  """
  entity = "apartment"
  key_cols = ["Building_id", "Apartment_number"]

  delta_df = getDelta_df(entity).drop("fetch_time")

  # Latest event_time per apartment; columns are suffixed with "1" so they do
  # not clash with the columns of delta_df in the join below.
  temp_state_df = ( delta_df.groupBy(*key_cols).agg(F.max(delta_df.event_time))
                   .select(col("Building_id").alias("Building_id1"),
                           col("Apartment_number").alias("Apartment_number1"),
                           col("max(event_time)").alias("event_time1")))

  delta_state_df = ( delta_df.join(temp_state_df,(delta_df.Building_id == temp_state_df.Building_id1)
                                           &(delta_df.Apartment_number == temp_state_df.Apartment_number1)
                                                & (delta_df.event_time == temp_state_df.event_time1))
                    .drop("Building_id1")
                    .drop("Apartment_number1")
                    .drop("event_time1")
                    # Rows that differed only in fetch_time (or that tie on the
                    # latest event_time) would leave several rows per apartment,
                    # and a MERGE keyed on these columns rejects a source with
                    # multiple rows matching the same target row.
                    .dropDuplicates(key_cols))

  return delta_state_df

In [12]:
def updateApartment(new_State_df):
  """Upsert the given apartment states into the Delta table `/delta/apartment/apartment_data`.

  If the Delta table is empty, `new_State_df` simply overwrites it. Otherwise
  `new_State_df` is staged at `/delta/apartment/apartment_data_temp` and merged
  into the target on (Building_id, Apartment_number): matching rows are
  updated, the others are inserted.

  Args:
    new_State_df: DataFrame with the columns Building_id, Apartment_number,
      Type, Rent_fee, Building_name, Appt_details and event_time.

  Returns:
    A DataFrame selecting all rows of the Delta table after the update.
  """
  # head(1) fetches at most one row, so the emptiness check does not pull the
  # whole table to the driver.
  target_is_empty = len(spark.sql("select * from delta.`/delta/apartment/apartment_data`").head(1)) == 0

  if target_is_empty:
      new_State_df.write.format("delta").mode("overwrite").save("/delta/apartment/apartment_data")
  else:
    # Stage the function argument as the MERGE source (the original read the
    # notebook-level variable `new_state_df` here instead of the parameter).
    new_State_df.write.format("delta").mode("overwrite").save("/delta/apartment/apartment_data_temp")
    
    query_str = "MERGE INTO delta.`/delta/apartment/apartment_data` AS apartment_data \
    USING delta.`/delta/apartment/apartment_data_temp` AS apartment_data_temp \
    ON apartment_data.Building_id = apartment_data_temp.Building_id \
    AND apartment_data.Apartment_number = apartment_data_temp.Apartment_number \
    WHEN MATCHED THEN \
      UPDATE SET \
        apartment_data.Type = apartment_data_temp.Type, \
        apartment_data.Rent_fee = apartment_data_temp.Rent_fee, \
        apartment_data.Building_name = apartment_data_temp.Building_name, \
        apartment_data.Appt_details = apartment_data_temp.Appt_details, \
        apartment_data.event_time = apartment_data_temp.event_time \
    WHEN NOT MATCHED \
      THEN INSERT (Building_id, \
      Apartment_number, Type, Rent_fee, Building_name, Appt_details, event_time) VALUES (apartment_data_temp.Building_id, \
      apartment_data_temp.Apartment_number, apartment_data_temp.Type, \
      apartment_data_temp.Rent_fee, apartment_data_temp.Building_name, apartment_data_temp.Appt_details, apartment_data_temp.event_time)"
    
    spark.sql(query_str)
  return spark.sql("select * from delta.`/delta/apartment/apartment_data`")

Delta Table operation execution

In [14]:
# Overwrite the Delta table with zero rows: the "where false" query keeps only
# the columns of apartment_datalake, from which fetch_time is dropped.
(
  spark.sql("select * from apartment_datalake where false")
    .drop("fetch_time")
    .write
    .format("delta")
    .mode("overwrite")
    .save("/delta/apartment/apartment_data")
)

In [15]:
# Show the streaming DataFrame that is fed into the "apartment_datalake" sink
display(apartment_df)

In [16]:
# Show everything currently in "apartment_datalake", ordered by apartment key
display(spark.sql("select * from apartment_datalake").orderBy("Building_id", "Apartment_number"))

In [17]:
# Latest state per apartment among the rows that have not been processed yet
new_state_df = getLastApartmentState_df()

In [18]:
# Number of apartment states about to be written
new_state_df.count()

In [19]:
# Upsert the new states into the Delta table and read the table back
apartment_current_data_df = updateApartment(new_state_df)

In [20]:
# Show the current content of the Delta table, ordered by apartment key
display(apartment_current_data_df.orderBy("Building_id", "Apartment_number"))

In [21]:
# Number of rows currently in the Delta table
apartment_current_data_df.count()

##### Uncomment and execute below cell to reset database state

In [23]:
# resetTrackingData("apartment")
# spark.sql("delete from delta.`/delta/apartment/apartment_data`")