In [0]:
%sql

-- Preview the raw orders files through the text data source.
SELECT *
FROM text.`/public/retail_db/orders`

In [0]:
%sql

-- Temporary view over the comma-separated orders files, with an explicit schema.
CREATE OR REPLACE TEMPORARY VIEW orders (
    order_id            INT,
    order_date          STRING,
    order_customer_id   INT,
    order_status        STRING
)
USING CSV
OPTIONS (
    path = '/public/retail_db/orders',
    sep = ','
)

In [0]:
%sql

-- Browse the rows exposed by the orders temporary view.
SELECT *
FROM orders

In [0]:
%sql

-- Preview the raw order_items files through the text data source.
SELECT *
FROM text.`/public/retail_db/order_items`

In [0]:
%sql

-- Temporary view over the comma-separated order_items files, with an explicit schema.
-- The monetary columns are DECIMAL(10, 2) rather than FLOAT so that subtotals and
-- prices are held exactly and SUM() over them does not accumulate binary
-- floating-point rounding error.
CREATE OR REPLACE TEMPORARY VIEW order_items (
    order_item_id               INT,
    order_item_order_id         INT,
    order_item_product_id       INT,
    order_item_quantity         INT,
    order_item_subtotal         DECIMAL(10, 2),
    order_item_product_price    DECIMAL(10, 2)
)
USING CSV
OPTIONS (
    path = '/public/retail_db/order_items',
    sep = ','
)

In [0]:
%sql

-- Browse the rows exposed by the order_items temporary view.
SELECT *
FROM order_items

In [0]:
%sql

-- Show the column definitions of the orders view.
DESCRIBE TABLE orders

In [0]:
%sql

-- Show the column definitions of the order_items view.
DESCRIBE TABLE order_items

In [0]:
%sql

-- Revenue per order date and product, counting only CLOSED and COMPLETE orders.
-- Grouping and ordering use column names instead of positions so the query
-- keeps its meaning if the select list is ever reordered.
SELECT
    o.order_date,
    oi.order_item_product_id,
    ROUND(SUM(oi.order_item_subtotal), 2) AS revenue
FROM orders AS o
INNER JOIN order_items AS oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('CLOSED', 'COMPLETE')
GROUP BY
    o.order_date,
    oi.order_item_product_id
ORDER BY
    o.order_date,
    revenue

In [0]:
%sql

-- Persist revenue per order date and product (CLOSED and COMPLETE orders only)
-- as Parquet files. The target folder lives under retail_db; the listing and
-- read-back cells below use the same folder.
INSERT OVERWRITE DIRECTORY 'dbfs:/public/retail_db/daily_product_revenue'
USING PARQUET
SELECT
    o.order_date,
    oi.order_item_product_id,
    ROUND(SUM(oi.order_item_subtotal), 2) AS revenue
FROM orders AS o
INNER JOIN order_items AS oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('CLOSED', 'COMPLETE')
GROUP BY
    o.order_date,
    oi.order_item_product_id
ORDER BY
    o.order_date,
    revenue

In [0]:
%fs ls dbfs:/public/retail_db/daily_product_revenue

In [0]:
%sql

-- Read the exported Parquet files back, highest revenue first within each date.
SELECT
    order_date,
    order_item_product_id,
    revenue
FROM parquet.`/public/retail_db/daily_product_revenue/`
ORDER BY
    order_date,
    revenue DESC

In [0]:
%sql

-- Preview the schemas.json file through the text data source.
SELECT *
FROM text.`dbfs:/public/retail_db/schemas.json`

In [0]:
# Load schemas.json with wholetext=True and keep the `value` of the first row
# as a plain Python string.
schemas_text = (
    spark.read
    .text('dbfs:/public/retail_db/schemas.json', wholetext=True)
    .first()
    .value
)

In [0]:
import json

In [0]:
column_details = json.loads(schemas_text)['orders']

In [0]:
column_details

In [0]:
columns = [col['column_name'] for col in sorted(column_details, key=lambda col: col['column_position'])]

In [0]:
%sql

-- Preview the orders files through the csv data source (no schema supplied).
SELECT *
FROM csv.`dbfs:/public/retail_db/orders`

In [0]:
# Read the orders CSV files with inferred column types, then rename the
# columns positionally using the names held in `columns`.
orders = (
    spark.read
    .csv('dbfs:/public/retail_db/orders', inferSchema=True)
    .toDF(*columns)
)

In [0]:
from pyspark.sql.functions import count, col

# Number of orders per status, most frequent status first.
(
    orders
    .groupBy('order_status')
    .agg(count('*').alias('order_count'))
    .orderBy(col('order_count').desc())
    .show()
)

In [0]:
def get_column_names(schemas, tableName):
    """Return the column names of one dataset, ordered by column position.

    Args:
        schemas: Path of the JSON schemas file to read, for example
            'dbfs:/public/retail_db/schemas.json'.
        tableName: Key of the dataset inside the schemas file.

    Returns:
        A list with the 'column_name' of every column entry of the dataset,
        sorted by ascending 'column_position'.

    Raises:
        KeyError: If tableName is not a key of the schemas file.
    """
    # Read from the path the caller supplied rather than a hard-coded location.
    # wholetext=True is used so the file content is taken from the first row's
    # `value` as one string that json.loads can parse.
    schemas_json = spark.read.text(schemas, wholetext=True).first().value
    column_details = json.loads(schemas_json)[tableName]
    ordered_details = sorted(column_details, key=lambda detail: detail['column_position'])
    return [detail['column_name'] for detail in ordered_details]

In [0]:
# Try the helper on the orders dataset.
get_column_names(
    'dbfs:/public/retail_db/schemas.json',
    'orders',
)

In [0]:
# Dataset names processed by the loops in the following cells.
ds_list = [
    'departments',
    'categories',
    'products',
    'customers',
    'orders',
    'order_items',
]

In [0]:
base_dir = 'dbfs:/public/retail_db'

In [0]:
for ds in ds_list:
    print(f'Processing {ds} data')
    columns = get_column_names(f'{base_dir}/schemas.json', ds)
    print(columns)

In [0]:
# For every dataset: read its CSV files with inferred types, apply the column
# names from schemas.json, and overwrite the Parquet copy under
# '<base_dir>_parquet/<dataset>'.
for ds in ds_list:
    print(f'Processing {ds} data')
    columns = get_column_names(f'{base_dir}/schemas.json', ds)
    df = (
        spark.read
        .csv(f'{base_dir}/{ds}', inferSchema=True)
        .toDF(*columns)
    )
    (
        df.write
        .mode('overwrite')
        .parquet(f'{base_dir}_parquet/{ds}')
    )

In [0]:
%fs ls dbfs:/public/retail_db_parquet/categories