# 0. Set the default lakehouse for the notebook run

In [None]:
%%configure
{ 
    "defaultLakehouse": { 
        "name": {
                  "parameterName": "lakehouseName",
                  "defaultValue": "defaultlakehousename"
        }
    }
}

# 1. Initialize Parameters

In [None]:
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit
import java.util.UUID
import java.text.SimpleDateFormat
import java.time.{LocalDate, LocalDateTime, Period}
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit
import java.util.Calendar
import io.delta.tables._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{coalesce, lit, sum, col, _}
import org.apache.spark.sql.types.{StructField, _}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

// ---- Notebook parameters ---------------------------------------------------
// These top-level vals are read by the cells below.

// Run identifier; hard-coded here to the all-zero GUID.
val runId = "00000000-0000-0000-0000-000000000000"

// Workspace / lakehouse identity taken from the Spark session configuration.
val workspaceId = spark.conf.get("trident.workspace.id")
val lakehouseId = spark.conf.get("trident.lakehouse.id")
val lakehouseName = spark.conf.get("trident.lakehouse.name")
val workspaceName = "LakeHouseTesting"

// Delta table names: staging tables are the merge sources, final tables the targets.
val sitesStagingTableName = "Sites_Staging"
val filesStagingTableName = "Files_Staging"
val sitesFinalTableName = "Sites"
val filesFinalTableName = "Files"

// Resolve column/table identifiers case-sensitively for this session.
spark.conf.set("spark.sql.caseSensitive", true)




# 2. Read Sites Dataset from Staging Table

In [None]:
// Look the lakehouse up by name and re-derive its id and the current
// workspace name from the notebook runtime.
val lakehouse = mssparkutils.lakehouse.get(lakehouseName)
val lakehouseId = lakehouse.id
val workspaceName = notebookutils.runtime.context("currentWorkspaceName")

println("Started reading Sites dataset")

// OneLake path of the Sites staging Delta table.
val sitesStagingLocation =
  s"abfss://${workspaceId}@onelake.dfs.fabric.microsoft.com/${lakehouseId}/Tables/${sitesStagingTableName}"

// Staging rows as a DataFrame (loading is lazy; no Spark action is run here).
val dfSitesStaging = {
  val deltaReader = spark.read.format("delta")
  deltaReader.load(sitesStagingLocation)
}

println("Completed reading Sites dataset")

# 3. Read Files Dataset from Staging Table

In [None]:
println("Started reading Files dataset")

// OneLake path of the Files staging Delta table.
val filesStagingLocation =
  s"abfss://${workspaceId}@onelake.dfs.fabric.microsoft.com/${lakehouseId}/Tables/${filesStagingTableName}"

// Staging rows as a DataFrame (loading is lazy; no Spark action is run here).
val dfFilesStaging = {
  val deltaReader = spark.read.format("delta")
  deltaReader.load(filesStagingLocation)
}

println("Completed reading Files dataset")

# 4. Check whether the final tables exist

In [None]:
import io.delta.tables.DeltaTable

// OneLake paths of the final (merge target) Delta tables.
val sitesFinalLocation = s"abfss://${workspaceId}@onelake.dfs.fabric.microsoft.com/${lakehouseId}/Tables/${sitesFinalTableName}"
val filesFinalLocation = s"abfss://${workspaceId}@onelake.dfs.fabric.microsoft.com/${lakehouseId}/Tables/${filesFinalTableName}"

// Make sure the final Sites table exists. When it is missing, create it empty
// with the staging schema so that the later merge has a target to write to.
val sitesFinalTableExists = DeltaTable.isDeltaTable(spark, sitesFinalLocation)
if (sitesFinalTableExists) {
    println("Final Sites table exists already")
} else {
    println("Final Sites table not exists. Creating final Sites table with schema only")
    // An always-false predicate keeps the staging schema and drops every row.
    val emptySitesFrame = dfSitesStaging.filter("1=2")
    emptySitesFrame.write.format("delta").mode("overwrite").save(sitesFinalLocation)
    println("Final Sites table created")
}



// Make sure the final Files table exists. When it is missing, create it empty
// with the staging schema so that the later merge has a target to write to.
val filesFinalTableExists = DeltaTable.isDeltaTable(spark, filesFinalLocation)
if (filesFinalTableExists) {
    println("Final Files table exists already")
} else {
    println("Final Files table not exists. Creating final Files table with schema only")
    // An always-false predicate keeps the staging schema and drops every row.
    val emptyFilesFrame = dfFilesStaging.filter("1=2")
    emptyFilesFrame.write.format("delta").mode("overwrite").save(filesFinalLocation)
    println("Final Files table created")
}


# 5. Merge Sites Data from Staging table to Final table

In [None]:
import io.delta.tables._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{coalesce, lit, sum, col, _}
import org.apache.spark.sql.types.{StructField, _}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

// Source = Sites staging table, target = final Sites table.
val deltaTableSource = DeltaTable.forPath(spark, sitesStagingLocation)
val deltaTableTarget = DeltaTable.forPath(spark, sitesFinalLocation)

import spark.implicits._
val dfSource = deltaTableSource.toDF

// Upsert staging rows into the final table, keyed on Id:
//   - matched and Operation = 'Deleted'      -> delete the target row
//   - matched and Operation != 'Deleted'     -> overwrite the target row
//   - not matched and Operation != 'Deleted' -> insert the source row
// NOTE(review): a source row whose Operation is NULL satisfies none of these
// conditions and is therefore skipped — confirm staging never carries NULLs.
println("Merging Sites dataset from current staging table")
val sitesMerge = deltaTableTarget.as("target").merge(dfSource.as("source"), "source.Id = target.Id")
sitesMerge
  .whenMatched("source.Operation = 'Deleted'").delete()
  .whenMatched("source.Operation != 'Deleted'").updateAll()
  .whenNotMatched("source.Operation != 'Deleted'").insertAll()
  .execute()
println("Merging of Sites dataset completed")

# 6. Merge Files Data from Staging table to Final table

In [None]:
import io.delta.tables._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{coalesce, lit, sum, col, _}
import org.apache.spark.sql.types.{StructField, _}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

// Source = Files staging table, target = final Files table.
val deltaTableFilesSource = DeltaTable.forPath(spark, filesStagingLocation)
val deltaTableFilesTarget = DeltaTable.forPath(spark, filesFinalLocation)

import spark.implicits._
val dfFilesSource = deltaTableFilesSource.toDF

println("Merging Files dataset from current staging table")

// Both passes join staging to final on the (SiteId, ItemId) key.
val filesKeyMatch = """source.SiteId = target.SiteId and source.ItemId = target.ItemId """

// Step 1: remove every final-table row whose key appears in staging.
// NOTE(review): presumably a separate pass so that staging may hold several
// rows per key — confirm; the two passes are separate commits, not one
// atomic operation.
println("Started: Cleaned up older files")
val filesCleanup = deltaTableFilesTarget.as("target").merge(dfFilesSource.as("source"), filesKeyMatch)
filesCleanup.whenMatched().delete().execute()
println("Complted: Cleaned up older files")

// Step 2: apply staging to the final table. Rows with Operation = 'Deleted'
// are removed or never inserted; all other rows are updated or inserted.
val filesUpsert = deltaTableFilesTarget.as("target").merge(dfFilesSource.as("source"), filesKeyMatch)
filesUpsert
  .whenMatched("source.Operation = 'Deleted'").delete()
  .whenMatched("source.Operation != 'Deleted'").updateAll()
  .whenNotMatched("source.Operation != 'Deleted'").insertAll()
  .execute()
println("Merging of Files dataset completed")
  

# 7. Read Sites and Files dataset - Sample TOP 10 Rows

In [None]:
// Preview the ten most recent rows (by SnapshotDate) of each final table.
// sqlQuery stays a var because it is reused for the second query.

// Sites
var sqlQuery =
  s"SELECT * FROM ${lakehouseName}.${sitesFinalTableName} order by SnapshotDate DESC LIMIT 10"
val dfSitesAll = spark.sql(sqlQuery)
display(dfSitesAll)

// Files
sqlQuery =
  s"SELECT * FROM ${lakehouseName}.${filesFinalTableName}  order by SnapshotDate DESC LIMIT 10"
val dfFilesAll = spark.sql(sqlQuery)
display(dfFilesAll)

# 8. Check and create the Files Agg table if it does not already exist

In [None]:
import io.delta.tables._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{coalesce, lit, sum, col, _}
import org.apache.spark.sql.types.{StructField, _}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

// Name and OneLake path of the per-site / per-extension aggregate table.
val filesAggTableName = filesFinalTableName + "_Aggs"
val filesAggLocation = s"abfss://${workspaceId}@onelake.dfs.fabric.microsoft.com/${lakehouseId}/Tables/${filesAggTableName}"

// Create the aggregate table empty, with an explicit schema, when it does not
// exist yet, so that later cells can open it with DeltaTable.forPath.
val filesAggsTableExists = DeltaTable.isDeltaTable(spark, filesAggLocation)
if (!filesAggsTableExists) {
    // The table IS created with a schema (declared just below); the previous
    // log text claimed the opposite.
    println("Files Agg table not exists. Creating Files Agg table with explicit schema")

    // One row per (SiteId, Extension): file count and summed size in bytes.
    val fileAggsSchema =
    StructType(Array(
        StructField("SiteId", StringType, nullable = true),
        StructField("Extension", StringType, nullable = true),
        StructField("FileCount_BySite", LongType, nullable = false),
        StructField("SizeInBytes", LongType, nullable = true))
    )

    // Built from an empty RDD, so the frame already has zero rows; no extra
    // always-false filter is needed before writing.
    val dfFileAggsEmpty = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], fileAggsSchema)
    dfFileAggsEmpty.write.format("delta").mode("overwrite").save(filesAggLocation)

    println("Files Agg table created")
} else {
    println("Files Agg table exists already")
}


# 9. Generate File Aggs

In [None]:
import io.delta.tables._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{coalesce, lit, sum, col, _}
import org.apache.spark.sql.types.{StructField, _}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

// Open the final Files table (input) and the aggregate table (output location).
// forPath fails if either path is not a Delta table.
val filesDetailSource = DeltaTable.forPath(spark, filesFinalLocation)
val deltaTableFileAggsTarget = DeltaTable.forPath(spark, filesAggLocation)

// Step 1: expose the final Files table as a DataFrame.
println("Started: Reading Files data")
import spark.implicits._
val dfFilesDetailSource = filesDetailSource.toDF

// Step 2: one row per (SiteId, Extension) with the file count and total size.
// NOTE(review): count("SiteId") counts non-null SiteId values only, so a group
// whose SiteId is NULL would report 0 files — confirm SiteId is never NULL.
println("Started: Aggregating Files data at Site")
val dfFileAggs = {
  val fileCount = count("SiteId").alias("FileCount_BySite")
  val totalBytes = sum("SizeInBytes").alias("SizeInBytes")
  dfFilesDetailSource.groupBy("SiteId", "Extension").agg(fileCount, totalBytes)
}

// Step 3: replace the aggregate table contents with the fresh result. This
// write is the Spark action that actually runs the aggregation.
println("Started: Writing Files data at Site and Extension (Memory Intesive Operation). Expect to run for longer time based on data in files table")
dfFileAggs.write.format("delta").mode("overwrite").save(filesAggLocation)
println("Completed: Writing Files data at Site and Extension")