In [0]:
%sql
-- Make banana the session's current database; later SQL cells refer to its tables by unqualified name.
use banana

In [0]:
%fs ls dbfs:/public/retail_db

path,name,size,modificationTime
dbfs:/public/retail_db/README.md,README.md,806,1695484802091
dbfs:/public/retail_db/categories/,categories/,0,1695484787983
dbfs:/public/retail_db/create_db.sql,create_db.sql,10303297,1695484791659
dbfs:/public/retail_db/create_db_tables_pg.sql,create_db_tables_pg.sql,1748,1695484792227
dbfs:/public/retail_db/customers/,customers/,0,1695484792610
dbfs:/public/retail_db/departments/,departments/,0,1695484793900
dbfs:/public/retail_db/load_db_tables_pg.sql,load_db_tables_pg.sql,10297372,1695484796882
dbfs:/public/retail_db/order_items/,order_items/,0,1695484798843
dbfs:/public/retail_db/orders/,orders/,0,1695484797238
dbfs:/public/retail_db/products/,products/,0,1695484800954


In [0]:
# Read the orders file with column names and types fixed up front by a DDL
# schema string instead of being inferred from the data.
orders_path = 'dbfs:/public/retail_db/orders/part-00000'
orders_schema = 'order_id int, order_date date, order_customer_id int, order_status string'
orders = spark.read.csv(orders_path, schema=orders_schema)

In [0]:
# Expose the orders DataFrame to SQL cells as the temporary view orders_v
# (replacing any view already registered under that name).
orders.createOrReplaceTempView('orders_v')

In [0]:
%sql

-- Sanity-check the temp view: peek at the first 10 rows.
select * from orders_v limit 10

order_id,order_date,order_customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE
6,2013-07-25,7130,COMPLETE
7,2013-07-25,4530,COMPLETE
8,2013-07-25,2911,PROCESSING
9,2013-07-25,5657,PENDING_PAYMENT
10,2013-07-25,5648,PENDING_PAYMENT


In [0]:
# Order count per status, expressed in SQL against the orders_v temp view and
# sorted by the second select column (order_count), largest first.
status_counts_sql = '''
          select order_status,
          count(1) order_count
          from orders_v
          group by order_status
          order by 2 desc
    '''
display(spark.sql(status_counts_sql))

order_status,order_count
COMPLETE,22899
PENDING_PAYMENT,15030
PROCESSING,8275
PENDING,7610
CLOSED,7556
ON_HOLD,3798
SUSPECTED_FRAUD,1558
CANCELED,1428
PAYMENT_REVIEW,729


In [0]:
from pyspark.sql.functions import count, lit, col, round, sum

In [0]:
# DataFrame-API counterpart of the SQL aggregation: one row per order_status
# with its row count (no explicit ordering is applied here).
display(
    orders
    .groupBy('order_status')
    .agg(count(lit(1)).alias('order_count'))
)

order_status,order_count
PENDING_PAYMENT,15030
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PROCESSING,8275
CLOSED,7556
SUSPECTED_FRAUD,1558
PENDING,7610
CANCELED,1428


In [0]:
# Keep the per-status counts in a named DataFrame so the write cells below
# can reuse it.
order_count_by_status = (
    orders
    .groupBy('order_status')
    .agg(count(lit(1)).alias('order_count'))
)

In [0]:
# saveAsTable() without a mode raises if the target table already exists, and
# banana.order_count_by_status is a persistent table that survives the session.
# Drop any copy left over from a previous run so this cell stays re-runnable.
spark.sql('drop table if exists banana.order_count_by_status')
order_count_by_status.write.saveAsTable('banana.order_count_by_status')

In [0]:
# saveAsTable(name, format=None, mode=None, partitionBy=None, **options)
#   format: storage format of the table, e.g. 'delta' or 'parquet'
#   mode:   'error' / 'errorifexists' (default), 'append', 'overwrite', 'ignore'
# The placeholder strings 'parquet/delta' and 'append/overwrite/etc' are not
# valid values and make the call fail, so a concrete, runnable combination is
# used here instead.
order_count_by_status.write.saveAsTable(
    'banana.order_count_by_status',
    format='delta',
    mode='overwrite',
)

In [0]:
%sql

-- Select banana as the current database (already done in the first cell; repeated before the table queries).
use banana

In [0]:
%sql

-- Verify the table written by saveAsTable: show up to 10 rows.
select * from order_count_by_status limit 10

order_status,order_count
PENDING_PAYMENT,15030
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PROCESSING,8275
CLOSED,7556
SUSPECTED_FRAUD,1558
PENDING,7610
CANCELED,1428


In [0]:
# Append mode: the rows are added on top of what the table already holds,
# so every execution of this cell grows the table.
order_count_by_status.write.mode('append').saveAsTable('banana.order_count_by_status')

In [0]:
%sql

-- No LIMIT here, so the rows duplicated by the append are visible in the result.
select * from order_count_by_status 

order_status,order_count
PENDING_PAYMENT,15030
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PROCESSING,8275
CLOSED,7556
SUSPECTED_FRAUD,1558
PENDING,7610
CANCELED,1428
PENDING_PAYMENT,15030


In [0]:
# Overwrite mode: the table's previous contents are replaced by this DataFrame.
order_count_by_status.write.mode('overwrite').saveAsTable('banana.order_count_by_status')

In [0]:
%sql

-- Check the table after the overwrite: show up to 10 rows.
select * from order_count_by_status limit 10

order_status,order_count
PENDING_PAYMENT,15030
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PROCESSING,8275
CLOSED,7556
SUSPECTED_FRAUD,1558
PENDING,7610
CANCELED,1428


In [0]:
%sql

-- Remove the table so it can be recreated with an explicit schema below.
-- IF EXISTS keeps the cell re-runnable when the table has already been dropped.
drop table if exists order_count_by_status

In [0]:
%sql

-- Recreate the table empty, with an explicit two-column schema, so the next cells can load it with insertInto.
create table order_count_by_status (
  order_status string,
  -- NOTE(review): the DataFrame's count column is presumably bigint; confirm int is the intended width.
  order_count int
)

In [0]:
# Load the aggregated rows into the table created by the preceding CREATE TABLE cell.
# NOTE(review): insertInto presumably matches columns by position rather than by
# name; confirm the DataFrame column order (order_status, order_count) matches the DDL.
order_count_by_status.write.insertInto('banana.order_count_by_status')

In [0]:
%sql

-- Check the rows loaded by insertInto: show up to 10 rows.
select * from order_count_by_status limit 10

order_status,order_count
PENDING_PAYMENT,15030
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PROCESSING,8275
CLOSED,7556
SUSPECTED_FRAUD,1558
PENDING,7610
CANCELED,1428


In [0]:
# Insert again, this time replacing the rows already in the table instead of
# adding to them.
order_count_by_status.write.mode('overwrite').insertInto('banana.order_count_by_status')

In [0]:
%sql

-- Check the table after the overwriting insert: show up to 10 rows.
select * from order_count_by_status limit 10

order_status,order_count
PENDING_PAYMENT,15030
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PROCESSING,8275
CLOSED,7556
SUSPECTED_FRAUD,1558
PENDING,7610
CANCELED,1428


In [0]:
# Rebind `orders` from the CSV-backed DataFrame to the catalog table.
# The name is database-qualified, like the saveAsTable/insertInto calls in this
# notebook, so the cell does not depend on an earlier `use banana` having run.
orders = spark.read.table('banana.orders')

In [0]:
# Read the order_items catalog table. The name is database-qualified, like the
# saveAsTable/insertInto calls in this notebook, so the cell does not depend on
# an earlier `use banana` having run.
order_items = spark.read.table('banana.order_items')

In [0]:
# Preview the table-backed orders. Unlike the CSV-backed frame shown earlier,
# order_date is rendered here with a time component.
display(orders.limit(10))

order_id,order_date,order_customer_id,order_status
1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT


In [0]:
# Revenue per (order_date, product) for COMPLETE/CLOSED orders only.
# Note: `round` and `sum` here are the pyspark.sql.functions versions imported
# above, not the Python built-ins.
billable_orders = orders.filter("order_status in ('COMPLETE', 'CLOSED')")
billable_items = billable_orders.join(
    order_items,
    orders['order_id'] == order_items['order_item_order_id'],
)
revenue_by_day_product = (
    billable_items
    .groupBy('order_date', 'order_item_product_id')
    .agg(round(sum('order_item_subtotal'), 2).alias('revenue'))
)

# Earliest dates first, highest revenue first within a date; show the top 10 rows.
display(
    revenue_by_day_product
    .orderBy('order_date', col('revenue').desc())
    .limit(10)
)

order_date,order_item_product_id,revenue
2013-07-25 00:00:00.0,1004,5599.72
2013-07-25 00:00:00.0,191,5099.49
2013-07-25 00:00:00.0,957,4499.7
2013-07-25 00:00:00.0,365,3359.44
2013-07-25 00:00:00.0,1073,2999.85
2013-07-25 00:00:00.0,1014,2798.88
2013-07-25 00:00:00.0,403,1949.85
2013-07-25 00:00:00.0,502,1650.0
2013-07-25 00:00:00.0,627,1079.73
2013-07-25 00:00:00.0,226,599.99


In [0]:
# List the catalog entries visible for the banana database; each entry carries
# name, tableType (e.g. MANAGED, EXTERNAL) and isTemporary.
spark.catalog.listTables('banana')

[Table(name='crud_demo', catalog='spark_catalog', namespace=['banana'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='crud_demo_stg', catalog='spark_catalog', namespace=['banana'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='daily_product_revenue', catalog='spark_catalog', namespace=['banana'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='daily_revenue', catalog='spark_catalog', namespace=['banana'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='daily_revenue_stg', catalog='spark_catalog', namespace=['banana'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='departments', catalog='spark_catalog', namespace=['banana'], description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='order_count_by_status', catalog='spark_catalog', namespace=['banana'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='order_items', catalog='spar

In [0]:
# Print only the name of each catalog entry, one per line.
banana_table_names = [entry.name for entry in spark.catalog.listTables('banana')]
for table_name in banana_table_names:
    print(table_name)

crud_demo
crud_demo_stg
daily_product_revenue
daily_revenue
daily_revenue_stg
departments
order_count_by_status
order_items
orders
orders_single_columns
sales
sales_fact
teste
users
orders_v


In [0]:
# Narrow the catalog listing to entries whose tableType is 'TEMPORARY'
# (the temp views) and print their names.
temp_view_names = [
    entry.name
    for entry in spark.catalog.listTables('banana')
    if entry.tableType == 'TEMPORARY'
]
for view_name in temp_view_names:
    print(view_name)

orders_v


In [0]:
# Clean up: print and drop every entry the catalog reports as 'TEMPORARY'.
# The names are collected first, then each view is dropped in listing order.
temp_view_names = [
    entry.name
    for entry in spark.catalog.listTables('banana')
    if entry.tableType == 'TEMPORARY'
]
for view_name in temp_view_names:
    print(view_name)
    spark.catalog.dropTempView(view_name)

orders_v


In [0]:
# Re-run the temp-view listing after the cleanup; any name printed here is a
# temp view that is still registered.
remaining_temp_views = (
    entry
    for entry in spark.catalog.listTables('banana')
    if entry.tableType == 'TEMPORARY'
)
for leftover in remaining_temp_views:
    print(leftover.name)