# Introduction to DLT
DLT works with three types of Datasets:
* Streaming Tables (permanent or temporary) - used to append data from a source incrementally
* Materialized Views - used for transformations, aggregations or computations
* Views - used for intermediate transformations; not stored in the target schema

In [0]:

import dlt

# Streaming table due to spark.readStream

@dlt.table(
  name="plans",
  table_properties={"quality": "bronze"},
  comment="Plans bronze table",
)
def func():
  """Ingest the plans JSON source as a stream using Auto Loader.

  Returns:
    A streaming DataFrame loaded from the plans source path, with column
    types pinned through schema hints and schema evolution turned off.
  """
  source_path = "/Volumes/magnusp_catalog/training/source/plans.json"
  schema_location = "/Volumes/magnusp_catalog/training/source/schemas/plans"
  # Column types to apply instead of relying on inference.
  schema_hints = "plan_id integer,plan_name string,cost_per_mb decimal(5,3), cost_per_message decimal(5,3), cost_per_minute decimal(5,3),ld_cost_per_minute decimal(5,3),intl_cost_per_minute decimal(5,3)"

  return (
    spark.readStream
    # Use the documented format name, matching the "cloudFiles." option prefix.
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_location)
    .option("cloudFiles.schemaHints", schema_hints)
    .option("cloudFiles.schemaEvolutionMode", "none")
    .load(source_path)
  )

In [0]:
import dlt
from pyspark.sql.functions import expr

# Streaming table: declared with @dlt.table and fed by spark.readStream
# sources, so this is a table rather than a view.
@dlt.table(
  name="customer_plan_and_events",
  table_properties={"quality": "silver"},
)
def func():
  """Join customers to their plan and attach device events.

  Customers and plans are read as streams and joined on
  ``c.plan = p.plan_id``; device events are read as a batch table and
  left-outer joined on ``device_id``.

  Returns:
    The joined DataFrame.
  """
  customers = spark.readStream.table("customers").alias("c")
  plans = spark.readStream.table("plans").alias("p")
  events = spark.read.table("device_events").alias("e")

  return (
    customers
    .join(plans, on=expr("c.plan = p.plan_id"))
    # Left outer join on device_id against the batch events table.
    .join(events, on=["device_id"], how="LEFT_OUTER")
  )

In [0]:
# Materialized view 
@dlt.table(
  name="device_events",
  table_properties={"quality": "bronze"},
  comment="Events bronze table",
)
def func():
  """Expose the source events table as the bronze ``device_events`` dataset.

  Returns:
    A batch DataFrame read from ``magnusp_catalog.training.events``.
  """
  return spark.read.table("magnusp_catalog.training.events")

In [0]:
import dlt
from pyspark.sql.functions import count,sum,col,lower

@dlt.table(
  name="sms_costs",
  table_properties={"quality": "gold"},
  comment="Cube costs SMS",
)
def func():
  """Build a cube of SMS cost by customer, phone number and device.

  SMS events are selected case-insensitively on ``event_type``, counted per
  customer / phone number / device / message rate, and the count multiplied
  by the rate is summed over every cube combination of the three keys.

  Returns:
    A DataFrame with the cube keys and a ``total_cost`` column.
  """
  cube_keys = ["customer_id", "phone_number", "device_id"]
  is_sms = lower(col("event_type")) == "sms"

  sms_events = spark.read.table("customer_plan_and_events").filter(is_sms)

  # Count messages per key combination and rate before cubing.
  counts_per_rate = sms_events.groupBy(*cube_keys, "cost_per_message").agg(
    count("event_ts").alias("sms_cnt")
  )

  message_cost = col("sms_cnt") * col("cost_per_message")
  return counts_per_rate.cube(*cube_keys).agg(
    sum(message_cost).alias("total_cost")
  )

In [0]:
import dlt
from pyspark.sql.functions import count,sum,col,lower

@dlt.table(
  name="internet_costs",
  table_properties={"quality": "gold"},
  comment="Cube costs Internet",
)
def func():
  """Build a cube of internet usage cost by customer, phone number and device.

  Internet events are selected case-insensitively on ``event_type``, the
  transferred bytes are summed per customer / phone number / device / rate,
  and the resulting cost is summed over every cube combination of the three
  keys.

  Returns:
    A DataFrame with the cube keys and a ``total_cost`` column.
  """
  # Usage is recorded in bytes while cost_per_mb is a per-megabyte rate, so
  # the volume must be converted to megabytes before applying the rate.
  # NOTE(review): assumes 1 MB = 1024 * 1024 bytes - confirm against the
  # billing definition (decimal megabytes would use 1_000_000).
  bytes_per_mb = 1024 * 1024

  cube_keys = ["customer_id", "phone_number", "device_id"]
  is_internet = lower(col("event_type")) == "internet"

  internet_events = spark.read.table("customer_plan_and_events").where(is_internet)

  # Total bytes per key combination and rate before cubing.
  usage_per_rate = internet_events.groupBy(*cube_keys, "cost_per_mb").agg(
    sum("bytes_transferred").alias("bytes_transferred")
  )

  # Multiply before dividing so the arithmetic stays on the decimal rate.
  usage_cost = col("bytes_transferred") * col("cost_per_mb") / bytes_per_mb
  return usage_per_rate.cube(*cube_keys).agg(
    sum(usage_cost).alias("total_cost")
  )