# 0. Set the default lakehouse so the notebook can be run from a pipeline

In [None]:
%%configure
{ 
    "defaultLakehouse": { 
        "name": {
                  "parameterName": "lakehouseName",
                  "defaultValue": "defaultlakehousename"
        }
    }
}

# 1. Initialize Parameters

In [None]:
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit
import java.util.UUID
import java.text.SimpleDateFormat
import java.time.{LocalDate, LocalDateTime, Period}
import java.time.format.DateTimeFormatter
import java.time.temporal.ChronoUnit
import java.util.Calendar
import java.sql.Timestamp
import io.delta.tables._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.{coalesce, lit, sum, col, _}
import org.apache.spark.sql.types.{StructField, _}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

// Notebook parameters. The literal values below look like development defaults
// (presumably overridden by the calling pipeline -- TODO confirm this cell is
// marked as the parameters cell).
val runId  = "00000000-0000-0000-0000-000000000000"
// Workspace id of the running session, read from the Spark configuration.
val workspaceId =  spark.conf.get("trident.workspace.id")
// NOTE: workspaceName and lakehouseId are declared again in the cell that checks
// the tables, using values resolved from the lakehouse named below.
val workspaceName =  "LakeHouseTesting"
val lakehouseId = spark.conf.get("trident.lakehouse.id")
val lakehouseName =   "IMAXDefault"
// Names of the staging and final tables handled by this notebook.
val sitesStagingTableName = "Sites_Staging"
val sitesFinalTableName = "Sites"
val filesStagingTableName = "Files_Staging"
val filesFinalTableName = "Files"
// Used as the snapshot date whenever a final table is missing or has no
// SnapshotDate yet, and returned to the pipeline as "EndSnapshotDate".
val endTime  = "2024-11-15T00:00:00Z"
spark.conf.set("spark.sql.caseSensitive", true)// Make Spark SQL identifier resolution case sensitive for this session


# 2. Check whether the required final and staging tables exist

In [None]:
// Resolve the lakehouse by name; its id and the current workspace name replace
// the values declared with the parameters.
val lakehouse  = mssparkutils.lakehouse.get(lakehouseName)
val lakehouseId  = lakehouse.id
val workspaceName = notebookutils.runtime.context("currentWorkspaceName")

// OneLake ABFSS path of a table stored under the resolved lakehouse.
def tableLocation(tableName: String): String =
  s"abfss://${workspaceId}@onelake.dfs.fabric.microsoft.com/${lakehouseId}/Tables/${tableName}"

val filesStagingLocation = tableLocation(filesStagingTableName)
val sitesStagingLocation = tableLocation(sitesStagingTableName)
val sitesFinalLocation = tableLocation(sitesFinalTableName)
val filesFinalLocation = tableLocation(filesFinalTableName)

// A lakehouse must be attached to the notebook before listing the catalog.
val tables = spark.catalog.listTables()

// Number of catalog entries with the given name whose namespace contains the lakehouse.
def catalogTableCount(tableName: String): Long = {
  val sameName = col("name") === lit(tableName)
  val inLakehouse = array_contains(col("namespace"), lakehouseName)
  tables.filter(sameName && inLakehouse).count()
}

val siteTableCount = catalogTableCount(sitesFinalTableName)
val filesTableCount = catalogTableCount(filesFinalTableName)
val siteStagingTableCount = catalogTableCount(sitesStagingTableName)
val filesStagingTableCount = catalogTableCount(filesStagingTableName)


# 3. Getting Snapshot dates from last successful extracts

In [None]:
import org.apache.spark.sql.functions.{col, _}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

// Parser for the text form of the stored SnapshotDate ("yyyy-MM-dd HH:mm:ss" followed
// by an optional fraction). It accepts zero to nine fractional digits: a single-"S"
// pattern only accepts exactly one digit and fails on values such as
// "2024-11-15 10:20:30.123".
val dtCurrentDateFormatt = new java.time.format.DateTimeFormatterBuilder()
  .appendPattern("yyyy-MM-dd HH:mm:ss")
  .appendFraction(java.time.temporal.ChronoField.NANO_OF_SECOND, 0, 9, true)
  .toFormatter()
// Format handed back to the pipeline; the fractional part is dropped.
val dtRequiredtDateFormatt = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")

/**
 * Looks up the most recent SnapshotDate of a final table.
 *
 * @param tableCount number of catalog entries found for the table; the table is
 *                   queried only when this is exactly 1
 * @param tableName  table name, qualified with `lakehouseName` in the query
 * @return the latest snapshot date formatted with `dtRequiredtDateFormatt`, or
 *         `None` when the table is absent or MAX(SnapshotDate) is null
 */
def latestSnapshotDate(tableCount: Long, tableName: String): Option[String] =
  if (tableCount == 1) {
    val maxRow: Row = spark
      .sql(s"SELECT MAX(SnapshotDate) AS SnapshotDate FROM ${lakehouseName}.${tableName} ")
      .select("SnapshotDate")
      .head()
    // Option(...) turns the null produced by MAX over no rows into None.
    Option(maxRow.get(0)).map { raw =>
      LocalDateTime.parse(raw.toString, dtCurrentDateFormatt).format(dtRequiredtDateFormatt)
    }
  } else {
    None
  }

// Sites: fall back to endTime when there is no previous snapshot.
val siteLatestSnapshot = latestSnapshotDate(siteTableCount, sitesFinalTableName)
var siteDataExists: Boolean = siteLatestSnapshot.isDefined
if (siteDataExists) println(s"Sites data Exists: ${siteDataExists}")
val siteSnapshotDate = siteLatestSnapshot.getOrElse(endTime)

// Files: same rule as for sites.
val filesLatestSnapshot = latestSnapshotDate(filesTableCount, filesFinalTableName)
var filesDataExists: Boolean = filesLatestSnapshot.isDefined
if (filesDataExists) println(s"files data Exists: ${filesDataExists}")
val filesSnapshotDate = filesLatestSnapshot.getOrElse(endTime)



# 4. Generate View Script for Sites

In [None]:
// View script for the Sites table, flattened to a single line so it can be embedded
// in the value returned to the pipeline. It adds StorageQuotaFriendly and
// StorageUsedFriendly columns: the byte count is divided by the largest power of
// 1024 whose next power it stays below, rounded up, and suffixed with KB/MB/GB/TB/PB.
// NOTE(review): the syntax (CREATE OR ALTER VIEW, [alias] = expr) looks like T-SQL,
// so this is presumably executed outside Spark -- confirm against the pipeline.
val sitesView: String = s"""
CREATE OR ALTER VIEW vw${sitesFinalTableName}   
AS
SELECT  *,[StorageQuotaFriendly] =  (case 
                when StorageQuota < 1048576 then concat(ceiling(StorageQuota / 1024.0), ' KB')
                when StorageQuota < 1073741824 then concat(ceiling(StorageQuota / 1048576.0), ' MB')
                when StorageQuota < 1099511627776  then concat(ceiling(StorageQuota / 1073741824.0), ' GB')
                when StorageQuota < 1125899906842624  then concat(ceiling(StorageQuota / 1099511627776.0), ' TB')
                else concat(ceiling(StorageQuota / 1125899906842624.0), ' PB')
            end )
        ,[StorageUsedFriendly] =  (case 
                when StorageUsed < 1048576 then concat(ceiling(StorageUsed / 1024.0), ' KB')
                when StorageUsed < 1073741824 then concat(ceiling(StorageUsed / 1048576.0), ' MB')
                when StorageUsed < 1099511627776  then concat(ceiling(StorageUsed / 1073741824.0), ' GB')
                when StorageUsed < 1125899906842624  then concat(ceiling(StorageUsed / 1099511627776.0), ' TB')
                else concat(ceiling(StorageUsed / 1125899906842624.0), ' PB')
            end )            
  FROM ${sitesFinalTableName}
""".replace('\n', ' ').replace('\r', ' ')
println(sitesView)

# 5. Generate View Script for Files

In [None]:
// View script exposing every column of the Files table, flattened to a single line
// (each line break becomes a space) so it can be embedded in the pipeline return value.
val filesView: String = s"""
CREATE OR ALTER VIEW vw${filesFinalTableName}         
    AS      
SELECT * FROM ${filesFinalTableName}
""".replace('\n', ' ').replace('\r', ' ')
println(filesView)

# 6. Generate View Script for File Aggs

In [None]:
// View script exposing every column of the "<files table>_Aggs" table, flattened to a
// single line (each line break becomes a space) for the pipeline return value.
val fileAggsView: String = s"""
CREATE OR ALTER VIEW vw${filesFinalTableName}_Aggs  
    AS      
SELECT * FROM ${filesFinalTableName}_Aggs 
""".replace('\n', ' ').replace('\r', ' ')
println(fileAggsView)

# 7. Truncate the Staging tables from previous runs if data already exists

In [None]:
/**
 * Removes all rows left in a staging table by a previous run.
 *
 * @param tableCount number of catalog entries found for the table; rows are deleted
 *                   only when this is exactly 1, otherwise the table is reported
 *                   as not found
 * @param tableName  staging table name, qualified with `lakehouseName`
 */
def clearStagingTable(tableCount: Long, tableName: String): Unit = {
  val qualifiedName = s"${lakehouseName}.${tableName}"
  if (tableCount == 1) {
    spark.sql(s"DELETE FROM ${qualifiedName} ")
    println(s"Staging table deleted: ${qualifiedName}")
  } else {
    // Report the staging table that was actually looked up, not the final table.
    println(s"Staging table ${qualifiedName} not found")
  }
}

clearStagingTable(siteStagingTableCount, sitesStagingTableName)
clearStagingTable(filesStagingTableCount, filesStagingTableName)

# 8. Return snapshot dates back to Pipeline

In [None]:
import mssparkutils.notebook

/**
 * Escapes a value so it can be placed between double quotes in a JSON document.
 * Double quotes, backslashes and control characters are escaped; every other
 * character is copied unchanged, so values that need no escaping are returned as is.
 *
 * @param raw text to embed
 * @return the escaped text, without surrounding quotes
 */
def jsonEscape(raw: String): String = {
  val out = new StringBuilder(raw.length)
  raw.foreach {
    case '"'  => out.append("\\\"")
    case '\\' => out.append("\\\\")
    case '\n' => out.append("\\n")
    case '\r' => out.append("\\r")
    case '\t' => out.append("\\t")
    // Remaining control characters use the four-hex-digit JSON escape.
    case ch if ch < ' ' => out.append("\\").append(f"u${ch.toInt}%04x")
    case ch => out.append(ch)
  }
  out.toString
}

// JSON document handed back to the pipeline as the notebook exit value. The layout
// of keys and separators is kept as is; only the interpolated text values are
// escaped so that an unusual table name or view script cannot break the JSON.
// lakehouseId is interpolated unescaped (presumably a GUID -- TODO confirm).
val returnData= s"""{\"LakehouseId\": \"${lakehouseId}\", \"SitesStagingTableName\": \"${jsonEscape(sitesStagingTableName)}\", \"SitesFinalTableName\": \"${jsonEscape(sitesFinalTableName)}\",  \"SitesSnapshotDate\": \"${jsonEscape(siteSnapshotDate)}\", \"SitesDataExists\": ${siteDataExists}, \"SitesView\": \"${jsonEscape(sitesView)}\",  \"FilesStagingTableName\": \"${jsonEscape(filesStagingTableName)}\", \"FilesFinalTableName\": \"${jsonEscape(filesFinalTableName)}\", \"FilesSnapshotDate\": \"${jsonEscape(filesSnapshotDate)}\", \"EndSnapshotDate\": \"${jsonEscape(endTime)}\", \"FilesDataExists\": ${filesDataExists}, \"FilesView\": \"${jsonEscape(filesView)}\", \"FileAggsView\": \"${jsonEscape(fileAggsView)}\"}"""
println(returnData)
mssparkutils.notebook.exit(returnData)