In [0]:
# Point the session at yogurt.silver, creating the schema if it is missing.
for statement in (
    "USE CATALOG yogurt",
    "CREATE SCHEMA IF NOT EXISTS silver",
    "USE SCHEMA silver",
):
    spark.sql(statement)

# Show the active catalog and schema so the context can be checked at a glance.
session_context = spark.sql("SELECT current_catalog() AS catalog, current_database() AS schema")
display(session_context)

In [0]:
%sql
-- Column definitions for the eight silver Delta tables.
-- Table names are fully qualified so the tables are always created in
-- yogurt.silver, whatever the session's current catalog/schema happens to be;
-- the cells further down address them by the same three-part names.
-- Every statement is CREATE OR REPLACE, so the cell can be re-run safely.

CREATE OR REPLACE TABLE yogurt.silver.region (
  r_regionkey INT,
  r_name      STRING,
  r_comment   STRING
) USING DELTA;

CREATE OR REPLACE TABLE yogurt.silver.nation (
  n_nationkey INT,
  n_name      STRING,
  n_regionkey INT,
  n_comment   STRING
) USING DELTA;

CREATE OR REPLACE TABLE yogurt.silver.part (
  p_partkey     BIGINT,
  p_name        STRING,
  p_mfgr        STRING,
  p_brand       STRING,
  p_type        STRING,
  p_size        INT,
  p_container   STRING,
  p_retailprice DECIMAL(12,2),
  p_comment     STRING
) USING DELTA;

CREATE OR REPLACE TABLE yogurt.silver.supplier (
  s_suppkey   BIGINT,
  s_name      STRING,
  s_address   STRING,
  s_nationkey INT,
  s_phone     STRING,
  s_acctbal   DECIMAL(12,2),
  s_comment   STRING
) USING DELTA;

CREATE OR REPLACE TABLE yogurt.silver.partsupp (
  ps_partkey    BIGINT,
  ps_suppkey    BIGINT,
  ps_availqty   INT,
  ps_supplycost DECIMAL(12,2),
  ps_comment    STRING
) USING DELTA;

CREATE OR REPLACE TABLE yogurt.silver.customer (
  c_custkey    BIGINT,
  c_name       STRING,
  c_address    STRING,
  c_nationkey  INT,
  c_phone      STRING,
  c_acctbal    DECIMAL(12,2),
  c_mktsegment STRING,
  c_comment    STRING
) USING DELTA;

CREATE OR REPLACE TABLE yogurt.silver.orders (
  o_orderkey      BIGINT,
  o_custkey       BIGINT,
  o_orderstatus   STRING,
  o_totalprice    DECIMAL(12,2),
  o_orderdate     DATE,
  o_orderpriority STRING,
  o_clerk         STRING,
  o_shippriority  INT,
  o_comment       STRING
) USING DELTA;

CREATE OR REPLACE TABLE yogurt.silver.lineitem (
  l_orderkey      BIGINT,
  l_partkey       BIGINT,
  l_suppkey       BIGINT,
  l_linenumber    INT,
  l_quantity      DECIMAL(12,2),
  l_extendedprice DECIMAL(12,2),
  l_discount      DECIMAL(12,2),
  l_tax           DECIMAL(12,2),
  l_returnflag    STRING,
  l_linestatus    STRING,
  l_shipdate      DATE,
  l_commitdate    DATE,
  l_receiptdate   DATE,
  l_shipinstruct  STRING,
  l_shipmode      STRING,
  l_comment       STRING
) USING DELTA;

In [0]:
# Silver tables to populate; each one is read from the same-named table in yogurt.bronze.
tables = ["region","nation","part","supplier","partsupp","customer","orders","lineitem"]

for t in tables:
    silver_name = f"yogurt.silver.{t}"

    # Build the projection from the silver table's declared schema so the typed
    # DDL is honoured. A CREATE OR REPLACE ... AS SELECT * here would replace the
    # table definition with whatever column types the bronze table has.
    # Assumes the silver table already exists and that bronze exposes the same
    # column names -- TODO confirm against the bronze schema.
    projection = ", ".join(
        f"CAST(`{field.name}` AS {field.dataType.simpleString()}) AS `{field.name}`"
        for field in spark.table(silver_name).schema.fields
    )

    # INSERT OVERWRITE swaps the data but keeps the existing table definition,
    # so re-running the cell reloads the rows without touching the schema.
    spark.sql(f"INSERT OVERWRITE TABLE {silver_name} SELECT {projection} FROM yogurt.bronze.{t}")

print("Silver tables loaded from yogurt.bronze")

In [0]:
from pyspark.sql import Row

def _count_rows(layer, table):
    """Return COUNT(*) for yogurt.<layer>.<table> as a plain Python value."""
    return spark.sql(f"SELECT COUNT(*) AS c FROM yogurt.{layer}.{table}").first()[0]


# One summary row per table: bronze count, silver count, and whether they match.
rows = []
for t in ["region","nation","part","supplier","partsupp","customer","orders","lineitem"]:
    bronze_count = _count_rows("bronze", t)
    silver_count = _count_rows("silver", t)
    if bronze_count == silver_count:
        status = "yes"
    else:
        status = "no"
    rows.append(Row(table=t, bronze=bronze_count, silver=silver_count, status=status))

# Sort by table name so the comparison reads the same on every run.
reconciliation = spark.createDataFrame(rows).orderBy("table")
display(reconciliation)

In [0]:
%sql
-- Q1: total number of items bought by customer 446044.
-- Sums l_quantity over every line item that belongs to one of the customer's orders.
SELECT
  SUM(item.l_quantity) AS total_items
FROM yogurt.silver.lineitem AS item
INNER JOIN yogurt.silver.orders AS ord
  ON ord.o_orderkey = item.l_orderkey
WHERE ord.o_custkey = 446044;


In [0]:
%sql
-- Q2: date of the first purchase made by customer 564787.
-- The earliest o_orderdate among the customer's orders.
SELECT
  MIN(ord.o_orderdate) AS first_purchase_date
FROM yogurt.silver.orders AS ord
WHERE ord.o_custkey = 564787;


In [0]:
%sql
-- Q3: number of orders placed by customer 85909.
-- One row in orders is one order, so a plain row count is enough.
SELECT
  COUNT(*) AS orders_count
FROM yogurt.silver.orders AS ord
WHERE ord.o_custkey = 85909;


In [0]:
%sql
-- Q4: nationality of the customer with the most orders.
-- Step 1 counts orders per customer, step 2 keeps the single top customer,
-- and the final SELECT resolves that customer's nation name.
WITH orders_per_customer AS (
  SELECT o_custkey, COUNT(*) AS order_count
  FROM yogurt.silver.orders
  GROUP BY o_custkey
),
top_customer AS (
  -- If several customers tie on order_count, the smallest o_custkey wins.
  -- Without this second sort key LIMIT 1 may return any of the tied customers,
  -- so the answer could change between runs.
  SELECT o_custkey, order_count
  FROM orders_per_customer
  ORDER BY order_count DESC, o_custkey ASC
  LIMIT 1
)
SELECT n.n_name        AS nationality,
       tc.order_count  AS total_orders,
       tc.o_custkey    AS customer_id
FROM top_customer tc
JOIN yogurt.silver.customer cust ON cust.c_custkey = tc.o_custkey
JOIN yogurt.silver.nation   n    ON n.n_nationkey = cust.c_nationkey;
