###### Description: In this notebook we read apartment state rows from incoming csv files into a streaming dataframe, transform (clean, cast, rename) the data, and add/update the latest state in a static hive table
###### Objective: (incoming csv files) --> "apartment_streamingDF" --> "apartment_df" --> "apartment_data"

In [2]:
import requests
import json
import optimus as op
import phonenumbers 
import re
import datetime
import pandas as pd

from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType, DoubleType, StructType, StructField
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.functions import rank, col
import time

In [3]:
# Explicit schema applied to the incoming apartment records.
# Every field is nullable; Rent_fee and fetch_time are read as plain strings.
apartment_schema = (
    StructType()
    .add("Building_id", IntegerType(), True)
    .add("Apartment_number", IntegerType(), True)
    .add("Type", StringType(), True)
    .add("Rent_fee", StringType(), True)
    .add("Building_name", StringType(), True)
    .add("Appt_details", StringType(), True)
    .add("event_time", TimestampType(), True)
    .add("fetch_time", StringType(), True)
)

###### Description: Get apartment csv files as a streaming "apartment_streamingDF", process it on the fly and get the transformed stream "apartment_df"
###### Objective: (incoming csv files) --> "apartment_streamingDF" --> "apartment_df"

In [5]:
# Get the apartment streaming DataFrame from csv files

# Streaming starts here by reading the input files (header row expected,
# multi-line values allowed, at most one file consumed per trigger)
apartment_Path = "/FileStore/apartment/apartment/inprogress/"
apartment_streamingDF = (
  spark
    .readStream
    .schema(apartment_schema)
    .option("maxFilesPerTrigger", "1")
    .option("header", "true")
    .option("multiLine", "true")
    .csv(apartment_Path)
)

# Pass every column through unchanged; apartment_df is the frame handed to the transformer below
apartment_df = apartment_streamingDF.select("*")
# landlord_df = landlord_df.select(landlord_df.Landlord_id, landlord_df.Password, landlord_df.Landlord_name, landlord_df.Address_line_1,  landlord_df.City, landlord_df.PostCode, landlord_df.Region, landlord_df.Seen)
# Instantiation of DataTransformer class:
# NOTE(review): none of the transformer calls below is assigned back to apartment_df;
# presumably the transformer keeps its own resulting frame -- confirm that the
# cleaned data (and not the untouched apartment_df) is what gets used afterwards.
transformer = op.DataFrameTransformer(apartment_df)
# Replace NA with 0.0 in every column ("*")
transformer.replace_na(0.0, columns="*")
# Clear accents: applied to every column ("*"), not only to name columns
transformer.clear_accents(columns='*')
# Remove special characters: only from the listed string columns
transformer.remove_special_chars(columns=['Type', 'Rent_fee', 'Building_name', 'Appt_details'])


In [6]:
# Render the raw streaming frame as read from the csv files
# (this is apartment_streamingDF itself, not the transformer output).
display(apartment_streamingDF)

- ###### Now "apartment_df" contains pre-processed apartment state rows
- ###### After this point we need comparison
- ###### Stream-Stream subtraction is not supported
- ###### So we dump the incoming data to a query result "apartment_datalake" which will give updated results upon request
- ###### "apartment_datalake" is not streaming but it will give updated results upon request
- ###### From "apartment_datalake" we filter out the unseen rows to "apartment_delta"

In [8]:
# Continuously dump the stream into the in-memory table "apartment_datalake",
# which the batch functions query with plain SQL.
apartment_datalake_query = (
    apartment_df.writeStream
    .format("memory")
    .queryName("apartment_datalake")
    .start()
)

## Enter batch mode

###### Take a snapshot of "apartment_datalake" and keep only the rows that have not been seen (processed) yet
###### Add the fetch times of those rows to the hive table "apartment_tracker"

In [11]:
def getDelta_df(entity):
  """Return the rows of "<entity>_datalake" whose fetch_time was not processed before.

  The function works on hive tables derived from ``entity``:

  * "<entity>_temp"    -- snapshot of "<entity>_datalake" taken on every call
  * "<entity>_tracker" -- one "sequence" column holding every fetch_time already handled
  * "<entity>_delta"   -- the unprocessed rows found by this call (overwritten each time)
  * "temp_data"        -- scratch table holding the already-processed rows; always dropped

  Args:
    entity: table name prefix, e.g. "apartment".

  Returns:
    A DataFrame selecting everything from "<entity>_delta".
  """
  #   Save snapshot of data into hive table to work with
  spark.sql("select * from " + entity + "_datalake").write.mode("overwrite").saveAsTable(entity + "_temp")
  #   Take snapshot
  datalake_snapshot = spark.sql("select * from " + entity + "_temp")

  tracker_exists = len(spark.sql("show tables like '" + entity + "_tracker'").collect()) == 1

  if tracker_exists:
    seq_tracker = spark.sql("select * from " + entity + "_tracker")
    try:
      # Materialise the snapshot rows whose fetch_time is already tracked.
      # "overwrite" keeps a scratch table left behind by an interrupted run
      # from failing this write with "table already exists".
      (datalake_snapshot
        .join(seq_tracker, seq_tracker.sequence == datalake_snapshot.fetch_time)
        .drop("sequence")
        .write.mode("overwrite").saveAsTable("temp_data"))

      spark.sql("refresh table temp_data")

      # Everything in the snapshot that is not already processed is the delta
      delta_df = datalake_snapshot.subtract(spark.sql("select * from temp_data"))

      delta_df.write.mode("overwrite").saveAsTable(entity + "_delta")

      # Remember the fetch times handled by this call
      delta_df.select(col("fetch_time").alias("sequence")).distinct().write.insertInto(entity + "_tracker")
    finally:
      # Drop the scratch table even when one of the steps above failed
      spark.sql("drop table if exists temp_data")
  else:
    # First run: the whole snapshot is new. "overwrite" tolerates a delta table
    # that survived without its tracker.
    datalake_snapshot.write.mode("overwrite").saveAsTable(entity + "_delta")
    datalake_snapshot.select(col("fetch_time").alias("sequence")).distinct().write.saveAsTable(entity + "_tracker")

  return spark.sql("select * from " + entity + "_delta")

In [12]:
def resetTrackingData(entity):
  """Drop the "<entity>_delta", "<entity>_temp" and "<entity>_tracker" tables that exist.

  Args:
    entity: table name prefix, e.g. "apartment".
  """
  for suffix in ("_delta", "_temp", "_tracker"):
    table_name = entity + suffix
    matches = spark.sql("show tables like '" + table_name + "'").collect()
    # Only drop when exactly one table matches the name
    if len(matches) == 1:
      spark.sql("drop table " + table_name)

In [13]:
def getLastApartmentState_df():
  """Return, for each apartment in the current delta, the row(s) with the newest event_time.

  The delta comes from getDelta_df("apartment") with the fetch_time column removed.
  An apartment is identified by (Building_id, Apartment_number).

  Returns:
    A DataFrame with the delta columns, restricted to the rows whose event_time
    equals the maximum event_time of their apartment.
  """
  entity = "apartment"
  delta_df = getDelta_df(entity).drop("fetch_time")

  # Newest event_time per apartment; the "1" suffix keeps these columns
  # distinguishable from the delta columns in the join below.
  newest_per_apartment = (
      delta_df
      .groupBy("Building_id", "Apartment_number")
      .agg(F.max(delta_df.event_time).alias("event_time1"))
      .select(col("Building_id").alias("Building_id1"),
              col("Apartment_number").alias("Apartment_number1"),
              col("event_time1"))
  )

  is_newest_row = (
      (delta_df.Building_id == newest_per_apartment.Building_id1)
      & (delta_df.Apartment_number == newest_per_apartment.Apartment_number1)
      & (delta_df.event_time == newest_per_apartment.event_time1)
  )

  # Keep only the delta columns in the result
  return (delta_df
          .join(newest_per_apartment, is_newest_row)
          .drop("Building_id1", "Apartment_number1", "event_time1"))

In [14]:
def replaceApartmentRows(update_df):
  """Replace the rows of "apartment_data" that share a key with ``update_df``.

  For every distinct (Building_id, Apartment_number) in ``update_df`` the matching
  rows are deleted from "apartment_data"; afterwards all rows of ``update_df``
  are appended to that table.

  Args:
    update_df: DataFrame with the apartment columns, including Building_id and
      Apartment_number.
  """
  # NOTE(review): "delete from" presumably requires a table format with row-level
  # delete support -- confirm that apartment_data is stored in such a format.
  # Only the key columns are collected to the driver, one delete per distinct key.
  for row in update_df.select("Building_id", "Apartment_number").distinct().collect():
    spark.sql("delete from apartment_data where Building_id=" + str(row.Building_id) + " AND Apartment_number=" + str(row.Apartment_number))
  update_df.write.mode("append").saveAsTable("apartment_data")

In [15]:
def updateApartment(new_State_df):
  """Store the given apartment state rows in the hive table "apartment_data".

  When the table does not exist yet it is created (parquet) from ``new_State_df``.
  Otherwise the rows of ``new_State_df`` are inserted into the existing table.

  Args:
    new_State_df: DataFrame with the columns Building_id, Apartment_number, Type,
      Rent_fee, Building_name, Appt_details and event_time.
  """
  if (len(spark.sql("show tables like 'apartment_data'").collect()) == 0):
    new_State_df.write.format("parquet").saveAsTable("apartment_data")
  else:
    # Stored state with every column suffixed by "1" so that it can be told apart
    # from the incoming columns after the join.
    apartment_data_df = (spark.sql("select * from apartment_data")
                        .select(col("Building_id").alias("Building_id1"),
                                col("Apartment_number").alias("Apartment_number1"),
                                col("Type").alias("Type1"),
                                col("Rent_fee").alias("Rent_fee1"),
                                col("Building_name").alias("Building_name1"),
                                col("Appt_details").alias("Appt_details1"),
                                col("event_time").alias("event_time1")))

    # True where the incoming row is more recent than the stored one
    incoming_is_newer = new_State_df.event_time > apartment_data_df.event_time1

    # Per apartment, take the incoming values when they are newer, else the stored ones.
    # The join previously referenced the undefined name `state_df`.
    update_rows_df = (apartment_data_df.join(new_State_df, (new_State_df.Building_id == apartment_data_df.Building_id1)
                                             &(new_State_df.Apartment_number == apartment_data_df.Apartment_number1),  'outer')
               .select(new_State_df.Building_id,
                       new_State_df.Apartment_number,
                       F.when(incoming_is_newer, new_State_df.Type)
                       .otherwise(apartment_data_df.Type1).alias("Type"),

                       F.when(incoming_is_newer, new_State_df.Rent_fee)
                       .otherwise(apartment_data_df.Rent_fee1).alias("Rent_fee"),

                       F.when(incoming_is_newer, new_State_df.Building_name)
                       .otherwise(apartment_data_df.Building_name1).alias("Building_name"),

                       F.when(incoming_is_newer, new_State_df.Appt_details)
                       .otherwise(apartment_data_df.Appt_details1).alias("Appt_details"),

                       F.when(incoming_is_newer, new_State_df.event_time)
                       .otherwise(apartment_data_df.event_time1).alias("event_time")))

    # Keys present in the incoming state but absent from update_rows_df
    new_ids_df = (new_State_df.select("Building_id", "Apartment_number").subtract(update_rows_df.select("Building_id", "Apartment_number"))
                  .distinct().select(col("Building_id").alias("Building_id1"), col("Apartment_number").alias("Apartment_number1")))

    new_rows_df = (new_State_df.join(new_ids_df, (new_State_df.Building_id == new_ids_df.Building_id1)
                                             &(new_State_df.Apartment_number == new_ids_df.Apartment_number1), "outer")
                   .drop("Building_id1", "Apartment_number1")
                   .distinct())

    # TODO(review): update_rows_df and new_rows_df are built but never written anywhere;
    # only the plain insert below takes effect, so an apartment that already exists in
    # apartment_data ends up with an additional row. Wiring update_rows_df into
    # replaceApartmentRows looks like the intended next step -- confirm.
    new_State_df.write.insertInto("apartment_data")

In [16]:
# Newest row(s) per apartment among the datalake rows not processed before
new_state_df = getLastApartmentState_df()

In [17]:
# Persist the new state into the hive table "apartment_data"
updateApartment(new_state_df)

In [18]:
# Show the new state ordered by its key columns
display(new_state_df.orderBy("Building_id", "Apartment_number"))

In [19]:
# Manual statement: sets Type to 'haha' for every row of building 1 in apartment_data.
# NOTE(review): looks like leftover experimentation; SQL UPDATE presumably needs a table
# format with row-level update support -- confirm before keeping this cell.
spark.sql("update apartment_data set Type='haha' where Building_id=1")

In [20]:
# Clean-up: drop the apartment delta/temp/tracker tables and then the apartment_data table.
# The final drop raises when apartment_data does not exist.
resetTrackingData("apartment")
spark.sql("drop table apartment_data")