In [0]:
%sql
-- Reset the final-project catalog so the notebook can be rerun from scratch.
-- DROP CATALOG ... CASCADE removes every schema in the catalog together with the
-- tables and volumes inside them, so separate per-volume / per-schema drops are
-- not needed.
-- NOTE(review): issuing only the catalog-level drop also avoids referencing
-- child objects (bronze.landing, bronze, silver, gold) of a catalog that may not
-- exist yet on a first run — confirm how IF EXISTS behaves in that case on your
-- runtime if you reintroduce the finer-grained drops.
DROP CATALOG IF EXISTS cscie103_catalog_final CASCADE;

In [0]:
%sql
-- Create the final-project catalog and its medallion layout.
-- Every CREATE uses IF NOT EXISTS so the cell can be rerun safely even when the
-- cleanup cell was skipped; existing objects (and their comments) are left as is.
CREATE CATALOG IF NOT EXISTS cscie103_catalog_final
COMMENT 'cscie103 final catalog with medallion architecture';

-- One schema per medallion layer: bronze (raw), silver (cleaned), gold (curated).
CREATE SCHEMA IF NOT EXISTS cscie103_catalog_final.bronze
COMMENT 'Prosumers Bronze layer schema for raw/landing data';

CREATE SCHEMA IF NOT EXISTS cscie103_catalog_final.silver
COMMENT 'Prosumers Silver layer schema for cleaned and validated data';

CREATE SCHEMA IF NOT EXISTS cscie103_catalog_final.gold
COMMENT 'Prosumers Gold layer schema for curated analytics data';

-- Managed volume in the Bronze schema used as the landing zone for data files.
CREATE VOLUME IF NOT EXISTS cscie103_catalog_final.bronze.landing
COMMENT 'Managed landing zone for Prosumers data files';

-- Managed volume in the Silver schema for stream-processing checkpoints.
CREATE VOLUME IF NOT EXISTS cscie103_catalog_final.silver.checkpoints
COMMENT 'Managed stream checkpoints for Prosumers';

-- Managed volume in the Gold schema for reports.
CREATE VOLUME IF NOT EXISTS cscie103_catalog_final.gold.reports
COMMENT 'Managed report checkpoints for Prosumers';

-- Grant all privileges on the catalog to the listed user.
GRANT ALL PRIVILEGES ON CATALOG cscie103_catalog_final TO `lil658@g.harvard.edu`;

In [0]:
# Switch the session's current catalog to the final-project catalog so that
# two-part names such as "silver.county_mapping" resolve inside it.
spark.sql("USE CATALOG cscie103_catalog_final")

In [0]:
# Lookup from numeric county id to county name (upper-case).
# Id 12 is the placeholder entry "UNKNOWN".
county_name_mapping = {
    0: "HARJUMAA",
    1: "HIIUMAA",
    2: "IDA-VIRUMAA",
    3: "JÄRVAMAA",
    4: "JÕGEVAMAA",
    5: "LÄÄNE-VIRUMAA",
    6: "LÄÄNEMAA",
    7: "PÄRNUMAA",
    8: "PÕLVAMAA",
    9: "RAPLAMAA",
    10: "SAAREMAA",
    11: "TARTUMAA",
    12: "UNKNOWN",
    13: "VALGAMAA",
    14: "VILJANDIMAA",
    15: "VÕRUMAA",
}

# Lookup from numeric product id to product type label.
product_type_mapping = {
    0: "Combined",
    1: "Fixed",
    2: "General service",
    3: "Spot",
}
# Backward-compatible alias: the original (misspelled) name is kept so that any
# cell still referring to `product_tyupe_mapping` continues to work.
product_tyupe_mapping = product_type_mapping

# Persist the county and product lookups as tables in the silver schema.
# Each Python dict is turned into a two-column Spark DataFrame and written with
# overwrite mode, so rerunning the cell replaces the previous table contents.
# NOTE(review): presumably later cells join client data against these tables —
# that join is not part of this cell.
def _write_lookup_table(mapping, column_names, table_name):
    """Build a DataFrame from `mapping`'s (key, value) pairs and save it as `table_name`.

    Returns the DataFrame so the caller can keep using it.
    """
    lookup_df = spark.createDataFrame(mapping.items(), column_names)
    lookup_df.write.mode("overwrite").saveAsTable(table_name)
    return lookup_df


county_df = _write_lookup_table(
    county_name_mapping,
    ["county_id", "county_name"],
    "silver.county_mapping",
)
product_df = _write_lookup_table(
    product_tyupe_mapping,
    ["product_id", "product_type"],
    "silver.product_mapping",
)