In [0]:
#Michael McGeachy - mjm2xmm - Final DSS Project
#Limits were put in display functions for this submission to make it easier to grade. both are uploaded for transparency

import os
import json
import pymongo
import pyspark.pandas as pd  # This uses Koalas that is included in PySpark version 3.2 or newer.
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BinaryType
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, DecimalType

#### 2.0. Instantiate Global Variables

In [0]:
# Azure MySQL Server Connection Information ###################
jdbc_hostname = "ds2000-mjm"
jdbc_port = 3306
src_database = "sakila_dw"

connection_properties = {
  "user" : "mmcgeachy",
  "password" : "Jack2020!",
  "driver" : "org.mariadb.jdbc.Driver"
}

# MongoDB Atlas Connection Information ########################
atlas_cluster_name = "cluster0.wrb1goe"
atlas_database_name = "sakila_dw"
atlas_user_name = "mjm2xmm"
atlas_password = "mjm2xmm"

# Data Files (JSON) Information ###############################
dst_database = "sakila_dlh"

base_dir = "dbfs:/FileStore/final_data"
database_dir = f"{base_dir}/{dst_database}"

data_dir = f"{base_dir}/rentals"
batch_dir = f"{data_dir}/batch"
stream_dir = f"{data_dir}/stream"

orders_stream_dir = f"{stream_dir}/rental"
payments_stream_dir = f"{stream_dir}/payments"
#inventory_trans_stream_dir = f"{stream_dir}/inventory_transactions"

orders_output_bronze = f"{database_dir}/fact_orders/bronze"
orders_output_silver = f"{database_dir}/fact_orders/silver"
orders_output_gold   = f"{database_dir}/fact_orders/gold"

purchase_orders_output_bronze = f"{database_dir}/fact_purchase_orders/bronze"
purchase_orders_output_silver = f"{database_dir}/fact_purchase_orders/silver"
purchase_orders_output_gold   = f"{database_dir}/fact_purchase_orders/gold"

inventory_trans_output_bronze = f"{database_dir}/fact_inventory_transactions/bronze"
inventory_trans_output_silver = f"{database_dir}/fact_inventory_transactions/silver"
inventory_trans_output_gold   = f"{database_dir}/fact_inventory_transactions/gold"

# Delete the Streaming Files ################################## 
dbutils.fs.rm(f"{database_dir}/fact_orders", True) 
dbutils.fs.rm(f"{database_dir}/fact_purchase_orders", True) 
dbutils.fs.rm(f"{database_dir}/fact_inventory_transactions", True)

# Delete the Database Files ###################################
dbutils.fs.rm(database_dir, True)

True

#### 3.0. Define Global Functions

In [0]:
##################################################################################################################
# Use this Function to Fetch a DataFrame from the MongoDB Atlas database server Using PyMongo.
##################################################################################################################
def get_mongo_dataframe(user_id, pwd, cluster_name, db_name, collection, conditions, projection, sort):
    '''Create a client connection to MongoDB'''
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mongodb.net/{db_name}"
    
    client = pymongo.MongoClient(mongo_uri)

    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = client[db_name]
    if conditions and projection and sort:
        dframe = pd.DataFrame(list(db[collection].find(conditions, projection).sort(sort)))
    elif conditions and projection and not sort:
        dframe = pd.DataFrame(list(db[collection].find(conditions, projection)))
    else:
        dframe = pd.DataFrame(list(db[collection].find()))

    client.close()
    
    return dframe

##################################################################################################################
# Use this Function to Create New Collections by Uploading JSON file(s) to the MongoDB Atlas server.
##################################################################################################################
def set_mongo_collection(user_id, pwd, cluster_name, db_name, src_file_path, json_files):
    '''Create a client connection to MongoDB'''
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mongodb.net/{db_name}"
    client = pymongo.MongoClient(mongo_uri)
    db = client[db_name]
    
    '''Read in a JSON file, and Use It to Create a New Collection'''
    for file in json_files:
        db.drop_collection(file)
        json_file = os.path.join(src_file_path, json_files[file])
        with open(json_file, 'r') as openfile:
            json_object = json.load(openfile)
            file = db[file]
            result = file.insert_many(json_object)

    client.close()
    
    return result

### Section II: Populate Dimensions by Ingesting Reference (Cold-path) Data 
#### 1.0. Fetch Reference Data From an Azure MySQL Database
##### 1.1. Create a New Databricks Metadata Database.

In [0]:
%sql
DROP DATABASE IF EXISTS sakila_dlh CASCADE;

In [0]:
%sql
CREATE DATABASE IF NOT EXISTS sakila_dlh
COMMENT "DS-2002 Final Project Database"
LOCATION "dbfs:/FileStore/final_data/sakila_dlh"
WITH DBPROPERTIES (contains_pii = true, purpose = "DS-2002 Final Project");

##### 1.2. Create a New Table that Sources Date Dimension Data from a Table in an Azure MySQL database. 

In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW view_date
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://ds2000-mjm.mysql.database.azure.com:3306/sakila_dw", --Replace with your Server Name
  dbtable "dim_date",
  user "mmcgeachy",    
  password "Jack2020!" 
)

In [0]:
# Dim Date table from MySQL

%sql
USE DATABASE sakila_dlh;

CREATE OR REPLACE TABLE sakila_dlh.dim_date
COMMENT "Date Dimension Table"
LOCATION "dbfs:/FileStore/final_data/sakila_dlh/dim_date"
AS SELECT * FROM view_date

num_affected_rows,num_inserted_rows


In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_date;

col_name,data_type,comment
date_key,int,
full_date,date,
date_name,varchar(11),
date_name_us,varchar(11),
date_name_eu,varchar(11),
day_of_week,int,
day_name_of_week,varchar(10),
day_of_month,int,
day_of_year,int,
weekday_weekend,varchar(10),


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_date LIMIT 5

date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,week_of_year,month_name,month_of_year,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
20000101,2000-01-01,2000/01/01,01/01/2000,01/01/2000,7,Saturday,1,1,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000102,2000-01-02,2000/01/02,01/02/2000,02/01/2000,1,Sunday,2,2,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000103,2000-01-03,2000/01/03,01/03/2000,03/01/2000,2,Monday,3,3,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000104,2000-01-04,2000/01/04,01/04/2000,04/01/2000,3,Tuesday,4,4,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000105,2000-01-05,2000/01/05,01/05/2000,05/01/2000,4,Wednesday,5,5,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3


##### 1.3. Create a New Table that Sources Product Dimension Data from an Azure MySQL database.

In [0]:
%sql
-- Create a Temporary View named "view_product" that extracts data from your MySQL Northwind database.

CREATE OR REPLACE TEMPORARY VIEW view_staff
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://ds2000-mjm.mysql.database.azure.com:3306/sakila_dw", --Replace with your Server Name
  dbtable "dim_staff",
  user "mmcgeachy",    --Replace with your User Name
  password "Jack2020!"  --Replace with you password
)

In [0]:
%sql
USE DATABASE sakila_dlh;

-- Create a new table named "northwind_dlh.dim_product" using data from the view named "view_product"

CREATE OR REPLACE TABLE sakila_dlh.dim_staff
COMMENT "Staff Table"
LOCATION "dbfs:/FileStore/final_data/sakila_dlh/dim_staff"
AS SELECT * FROM view_staff

num_affected_rows,num_inserted_rows


In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_staff;

col_name,data_type,comment
staff_id,bigint,
first_name,varchar(65535),
last_name,varchar(65535),
email,varchar(65535),
store_id,bigint,
username,varchar(65535),
password,varchar(65535),
,,
# Delta Statistics Columns,,
Column Names,"first_name, email, username, store_id, last_name, staff_id, password",


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_staff LIMIT 5

staff_id,first_name,last_name,email,store_id,username,password
1,Mike,Hillyer,Mike.Hillyer@sakilastaff.com,1,Mike,8cb2237d0679ca88db6464eac60da96345513964
2,Jon,Stephens,Jon.Stephens@sakilastaff.com,2,Jon,


#### 2.0. Fetch Reference Data from a MongoDB Atlas Database
##### 2.1. View the Data Files on the Databricks File System

In [0]:
#Mongo Source for Payments Table

display(dbutils.fs.ls(batch_dir))  # '/dbfs/FileStore/final_data/rentals/batch'

path,name,size,modificationTime
dbfs:/FileStore/final_data/rentals/batch/payments.json,payments.json,1742725,1701374388000
dbfs:/FileStore/final_data/rentals/batch/rental.csv,rental.csv,803707,1701985689000
dbfs:/FileStore/final_data/rentals/batch/rental.json,rental.json,1525645,1701374754000


##### 2.2. Create a New MongoDB Database, and Load JSON Data Into a New MongoDB Collection
**NOTE:** The following cell **can** be run more than once because the **set_mongo_collection()** function **is** idempotent.

In [0]:
source_dir = '/dbfs/FileStore/final_data/rentals/batch'
json_files = {"payments" : 'payments.json'
            }

set_mongo_collection(atlas_user_name, atlas_password, atlas_cluster_name, atlas_database_name, source_dir, json_files) 

<pymongo.results.InsertManyResult at 0x7f367cb62ac0>

##### 2.3.1. Fetch Customer Dimension Data from the New MongoDB Collection

In [0]:
%scala
import com.mongodb.spark._

val userName = "mjm2xmm"
val pwd = "mjm2xmm"
val clusterName = "cluster0.wrb1goe"
val atlas_uri = s"mongodb+srv://$userName:$pwd@$clusterName.mongodb.net/?retryWrites=true&w=majority"

In [0]:
%scala

val df_payments = spark.read.format("com.mongodb.spark.sql.DefaultSource")
.option("spark.mongodb.input.uri", atlas_uri)
.option("database", "sakila_dw")
.option("collection", "payments").load()
.select("payment_key","staff_id","rental_id","amount","payment_date")

display(df_payments.limit(100))

/*limit put in for grading practicality

payment_key,staff_id,rental_id,amount,payment_date
1,1,76,2.99,2005-05-25 11:30:37
2,1,573,0.99,2005-05-28 10:35:23
3,1,1185,5.99,2005-06-15 00:54:12
4,2,1422,0.99,2005-06-15 18:02:53
5,2,1476,9.99,2005-06-15 21:08:46
6,1,1725,4.99,2005-06-16 15:18:57
7,1,2308,4.99,2005-06-18 08:41:48
8,2,2363,0.99,2005-06-18 13:33:59
9,1,3284,3.99,2005-06-21 06:24:45
10,2,4526,5.99,2005-07-08 03:17:05


In [0]:
%scala
df_payments.printSchema()

##### 2.3.2. Use the Spark DataFrame to Create a New Customer Dimension Table in the Databricks Metadata Database (northwind_dlh)

In [0]:
%scala
df_payments.write.format("delta").mode("overwrite").saveAsTable("sakila_dlh.dim_payments")

In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_payments

col_name,data_type,comment
payment_key,int,
staff_id,int,
rental_id,int,
amount,double,
payment_date,string,
,,
# Delta Statistics Columns,,
Column Names,"rental_id, payment_date, amount, payment_key, staff_id",
Column Selection Method,first-32,
,,


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_payments LIMIT 5

payment_key,staff_id,rental_id,amount,payment_date
1,1,76,2.99,2005-05-25 11:30:37
2,1,573,0.99,2005-05-28 10:35:23
3,1,1185,5.99,2005-06-15 00:54:12
4,2,1422,0.99,2005-06-15 18:02:53
5,2,1476,9.99,2005-06-15 21:08:46


#### 3.0. Fetch Data from a File System
##### 3.1. Use PySpark to Read From a CSV File

In [0]:
#File Source for Rental Table

rental_csv = f"{batch_dir}/rental.csv"

df_rental = spark.read.format('csv').options(header='true', inferSchema='true').load(rental_csv)
display(df_rental.limit(100))

#limit put in for practical grading

rental_id,rental_date,return_date
1,2005-05-24T22:53:30Z,2005-05-26T22:04:30Z
2,2005-05-24T22:54:33Z,2005-05-28T19:40:33Z
3,2005-05-24T23:03:39Z,2005-06-01T22:12:39Z
4,2005-05-24T23:04:41Z,2005-06-03T01:43:41Z
5,2005-05-24T23:05:21Z,2005-06-02T04:33:21Z
6,2005-05-24T23:08:07Z,2005-05-27T01:32:07Z
7,2005-05-24T23:11:53Z,2005-05-29T20:34:53Z
8,2005-05-24T23:31:46Z,2005-05-27T23:33:46Z
9,2005-05-25T00:00:40Z,2005-05-28T00:22:40Z
10,2005-05-25T00:02:21Z,2005-05-31T22:44:21Z


In [0]:
df_rental.printSchema()

root
 |-- rental_id: integer (nullable = true)
 |-- rental_date: timestamp (nullable = true)
 |-- return_date: timestamp (nullable = true)



In [0]:
df_rental.write.format("delta").mode("overwrite").saveAsTable("sakila_dlh.dim_rental")

In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.dim_rental;

col_name,data_type,comment
rental_id,int,
rental_date,timestamp,
return_date,timestamp,
,,
# Delta Statistics Columns,,
Column Names,"rental_id, rental_date, return_date",
Column Selection Method,first-32,
,,
# Detailed Table Information,,
Catalog,spark_catalog,


In [0]:
%sql
SELECT * FROM sakila_dlh.dim_rental LIMIT 5;

rental_id,rental_date,return_date
1,2005-05-24T22:53:30Z,2005-05-26T22:04:30Z
2,2005-05-24T22:54:33Z,2005-05-28T19:40:33Z
3,2005-05-24T23:03:39Z,2005-06-01T22:12:39Z
4,2005-05-24T23:04:41Z,2005-06-03T01:43:41Z
5,2005-05-24T23:05:21Z,2005-06-02T04:33:21Z


##### Verify Dimension Tables

In [0]:
%sql
USE sakila_dlh;
SHOW TABLES

database,tableName,isTemporary
sakila_dlh,dim_date,False
sakila_dlh,dim_payments,False
sakila_dlh,dim_rental,False
sakila_dlh,dim_staff,False
,view_date,True
,view_staff,True


### Section III: Integrate Reference Data with Real-Time Data
#### 6.0. Use AutoLoader to Process Streaming (Hot Path) Orders Fact Data 
##### 6.1. Bronze Table: Process 'Raw' JSON Data

In [0]:
# Uses 3 source files for fact table from streaming folder

(spark.readStream
 .format("cloudFiles")
 .option("cloudFiles.format", "json")
 .option("cloudFiles.schemaHints", "staff_id BIGINT")
 .option("cloudFiles.schemaHints", "first_name STRING")
 .option("cloudFiles.schemaHints", "last_name STRING")
 .option("cloudFiles.schemaHints", "rental_id BIGINT") 
 .option("cloudFiles.schemaHints", "amount DECIMAL")
 .option("cloudFiles.schemaHints", "rental_date_key BIGINT")
 .option("cloudFiles.schemaLocation", orders_output_bronze)
 .option("cloudFiles.inferColumnTypes", "true")
 .option("multiLine", "true")
 .load(orders_stream_dir)
 .createOrReplaceTempView("orders_raw_tempview"))



In [0]:
%sql
/* Add Metadata for Traceability */
CREATE OR REPLACE TEMPORARY VIEW orders_bronze_tempview AS (
  SELECT *, current_timestamp() receipt_time, input_file_name() source_file
  FROM orders_raw_tempview
)

In [0]:
%sql
SELECT * FROM orders_bronze_tempview limit 100

/* limit put for practical grading purposes

amount,first_name,last_name,rental_date_key,rental_id,staff_id,_rescued_data,receipt_time,source_file
1.99,Mike,Hillyer,20050528,663,1,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
0.99,Jon,Stephens,20050528,664,2,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
4.99,Jon,Stephens,20050528,665,2,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
5.99,Mike,Hillyer,20050528,666,1,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
4.99,Jon,Stephens,20050528,667,2,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
2.99,Jon,Stephens,20050528,668,2,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
4.99,Jon,Stephens,20050528,669,2,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
6.99,Mike,Hillyer,20050528,670,1,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
2.99,Mike,Hillyer,20050528,671,1,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
6.99,Jon,Stephens,20050528,672,2,,2023-12-07T23:17:45.502Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json


In [0]:
(spark.table("orders_bronze_tempview")
      .writeStream
      .format("delta")
      .option("checkpointLocation", f"{orders_output_bronze}/_checkpoint")
      .outputMode("append")
      .table("fact_orders_bronze"))

<pyspark.sql.streaming.query.StreamingQuery at 0x7f367c3ea860>

##### 6.2. Silver Table: Include Reference Data

In [0]:
(spark.readStream
  .table("fact_orders_bronze")
  .createOrReplaceTempView("orders_silver_tempview"))

In [0]:
%sql
SELECT * FROM orders_silver_tempview limit 100 

/*limit put in place for practical reasons

amount,first_name,last_name,rental_date_key,rental_id,staff_id,_rescued_data,receipt_time,source_file
1.99,Mike,Hillyer,20050528,663,1,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
0.99,Jon,Stephens,20050528,664,2,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
4.99,Jon,Stephens,20050528,665,2,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
5.99,Mike,Hillyer,20050528,666,1,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
4.99,Jon,Stephens,20050528,667,2,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
2.99,Jon,Stephens,20050528,668,2,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
4.99,Jon,Stephens,20050528,669,2,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
6.99,Mike,Hillyer,20050528,670,1,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
2.99,Mike,Hillyer,20050528,671,1,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json
6.99,Jon,Stephens,20050528,672,2,,2023-12-07T23:17:58.785Z,dbfs:/FileStore/final_data/rentals/stream/rental/fact_orders_3.json


In [0]:
%sql
DESCRIBE EXTENDED orders_silver_tempview

col_name,data_type,comment
amount,double,
first_name,string,
last_name,string,
rental_date_key,bigint,
rental_id,bigint,
staff_id,bigint,
_rescued_data,string,
receipt_time,timestamp,
source_file,string,


In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW fact_orders_silver_tempview AS (
  SELECT o.rental_id,
      o.staff_id,
      o.rental_date_key,
      e.last_name AS employee_last_name,
      e.first_name AS employee_first_name,
      e.email AS employee_email,
      c.amount as payment,
      c.payment_date,
      r.rental_date,
      rd.day_name_of_week AS order_day_name_of_week,
      rd.day_of_month AS order_day_of_month,
      rd.weekday_weekend AS order_weekday_weekend,
      rd.month_name AS order_month_name,
      rd.calendar_quarter AS order_quarter,
      rd.calendar_year AS order_year
      

  FROM orders_silver_tempview AS o
  INNER JOIN sakila_dlh.dim_staff AS e
  ON e.staff_id = o.staff_id
  INNER JOIN sakila_dlh.dim_payments AS c
  ON c.rental_id = o.rental_id
  INNER JOIN sakila_dlh.dim_rental AS r
  ON r.rental_id = o.rental_id
  LEFT OUTER JOIN sakila_dlh.dim_date AS rd
  ON rd.date_key = o.rental_date_key
)

In [0]:
(spark.table("fact_orders_silver_tempview")
      .writeStream
      .format("delta")
      .option("checkpointLocation", f"{orders_output_silver}/_checkpoint")
      .outputMode("append")
      .table("fact_orders_silver"))

<pyspark.sql.streaming.query.StreamingQuery at 0x7f367c3ea830>

In [0]:
%sql
SELECT * FROM fact_orders_silver LIMIT 5

rental_id,staff_id,rental_date_key,employee_last_name,employee_first_name,employee_email,payment,payment_date,rental_date,order_day_name_of_week,order_day_of_month,order_weekday_weekend,order_month_name,order_quarter,order_year
663,1,20050528,Hillyer,Mike,Mike.Hillyer@sakilastaff.com,1.99,2005-05-28 21:23:02,2005-05-28T21:23:02Z,Saturday,28,Weekend,May,2,2005
664,2,20050528,Stephens,Jon,Jon.Stephens@sakilastaff.com,0.99,2005-05-28 21:31:08,2005-05-28T21:31:08Z,Saturday,28,Weekend,May,2,2005
665,2,20050528,Stephens,Jon,Jon.Stephens@sakilastaff.com,4.99,2005-05-28 21:38:39,2005-05-28T21:38:39Z,Saturday,28,Weekend,May,2,2005
666,1,20050528,Hillyer,Mike,Mike.Hillyer@sakilastaff.com,5.99,2005-05-28 21:48:51,2005-05-28T21:48:51Z,Saturday,28,Weekend,May,2,2005
667,2,20050528,Stephens,Jon,Jon.Stephens@sakilastaff.com,4.99,2005-05-28 21:49:02,2005-05-28T21:49:02Z,Saturday,28,Weekend,May,2,2005


Databricks data profile. Run in Databricks to view.

In [0]:
%sql
DESCRIBE EXTENDED sakila_dlh.fact_orders_silver

col_name,data_type,comment
rental_id,bigint,
staff_id,bigint,
rental_date_key,bigint,
employee_last_name,varchar(65535),
employee_first_name,varchar(65535),
employee_email,varchar(65535),
payment,double,
payment_date,string,
rental_date,timestamp,
order_day_name_of_week,varchar(10),


In [0]:
#gold table with query that shows sales and number of orders in June 2005 for each salesmen
# As for its business use, this would be good for comparing performance over a given time
%sql
    SELECT   
          t.last_name AS staff_name,
          SUM(orders.payment) AS total_sales,
          COUNT(st.rental_id) AS numer_of_orders
    FROM sakila_dlh.fact_orders_silver AS orders 
    JOIN sakila_dlh.dim_rental st ON orders.rental_id=st.rental_id
    JOIN sakila_dlh.dim_staff t ON orders.staff_id=t.staff_id
    WHERE (orders.rental_date_key >= 20050601) and (orders.rental_date_key <= 20050630)
    GROUP BY t.last_name

staff_name,total_sales,numer_of_orders
Stephens,997.5400000000018,246
Hillyer,1051.410000000002,259
