In [0]:
# Read the orders CSV files using an explicit DDL schema instead of letting
# Spark infer column names and types.
orders_df = (
    spark.read
    .schema('''
    order_id INT, 
    order_date DATE, 
    order_customer_id INT, 
    order_status STRING
    ''')
    .csv('dbfs:/public/retail_db/orders')
)

# Read the order line items with an explicit DDL schema; no header option is
# set, so the column names and types come entirely from this schema string.
# The quantity column is spelled `order_item_quantity` (it was previously
# misspelled, which would make any lookup by the correct name fail).
order_items_df = spark.read.csv(
    'dbfs:/public/retail_db/order_items',
    schema='''
        order_item_id INT,
        order_item_order_id INT,
        order_item_product_id INT,
        order_item_quantity INT,
        order_item_subtotal FLOAT,
        order_item_product_price FLOAT
    '''
)

In [0]:
from pyspark.sql.functions import *

In [0]:
# TODO: this cell held an unfinished assignment (`daily_revenue = ` with no
# right-hand side), which is a SyntaxError and stops a full re-run of the
# notebook. The revenue aggregation is built as `daily_product_revenue_df`
# in a later cell, so nothing is assigned here.

In [0]:
# List every distinct order status present in the orders data.
(
    orders_df
    .select(orders_df['order_status'])
    .distinct()
    .show()
)

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+



In [0]:
# Revenue per (order_date, order): keep orders whose status is COMPLETE or
# CLOSED, join them to their line items, and total the item subtotals.
# `round` and `sum` are the pyspark.sql.functions versions pulled in by the
# star import, not the Python builtins.
# NOTE(review): despite "product" in the name, the grouping key is
# order_item_order_id (an order), not order_item_product_id - confirm intent.
daily_product_revenue_df = (
    orders_df
    .where("order_status IN ('COMPLETE','CLOSED')")
    .join(
        order_items_df,
        on=orders_df['order_id'] == order_items_df['order_item_order_id'],
    )
    .groupBy(orders_df['order_date'], order_items_df['order_item_order_id'])
    .agg(round(sum('order_item_subtotal'), 2).alias('revenue'))
)

In [0]:
# Preview the aggregated revenue rows in date order.
display(daily_product_revenue_df.sort('order_date'))

order_date,order_item_order_id,revenue
2013-07-25,57,637.9
2013-07-25,1,299.98
2013-07-25,18,449.96
2013-07-25,65,299.98
2013-07-25,56,699.89
2013-07-25,45,499.85
2013-07-25,35,129.99
2013-07-25,72,463.92
2013-07-25,95,199.95
2013-07-25,63,899.92


In [0]:
# Global ranks - use orderBy only
# Ranks within each partition or group - use partitionBy and orderBy
# df.select(rank().over(Window.orderBy(col('revenue').desc())))
# df.select(rank().over(Window.partitionBy('order_date').orderBy(col('revenue').desc())))

In [0]:
# Revenue rows for 2014-01-01 only, highest revenue first.
display(
    daily_product_revenue_df
    .where("order_date = '2014-01-01'")
    .sort('order_date', col('revenue').desc())
)

order_date,order_item_order_id,revenue
2014-01-01,25948,1699.88
2014-01-01,25958,1429.86
2014-01-01,61907,1299.91
2014-01-01,25939,1259.89
2014-01-01,25913,1159.91
2014-01-01,25936,1127.89
2014-01-01,25984,899.88
2014-01-01,25959,899.85
2014-01-01,25882,879.93
2014-01-01,61913,879.87


In [0]:
from pyspark.sql.window import Window

In [0]:
# Top 5 revenue values for 2014-01-01. The window has no partitionBy, so the
# dense rank is computed across all rows that survive the date filter.
display(
    daily_product_revenue_df
    .where("order_date = '2014-01-01'")
    .withColumn(
        'drnk',
        dense_rank().over(Window.orderBy(col('revenue').desc())),
    )
    .where("drnk <=5")
    .sort('order_date', col('revenue').desc())
)

order_date,order_item_order_id,revenue,drnk
2014-01-01,25948,1699.88,1
2014-01-01,25958,1429.86,2
2014-01-01,61907,1299.91,3
2014-01-01,25939,1259.89,4
2014-01-01,25913,1159.91,5


In [0]:
# Top 5 revenue values for January 2014: roll the daily rows up to
# (order_month, order), then dense-rank the monthly totals globally.
display(
    daily_product_revenue_df
    .where("date_format(order_date, 'yyyyMM') = 201401")
    .groupBy(
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_item_order_id',
    )
    .agg(round(sum('revenue'), 2).alias('revenue'))
    .withColumn(
        'drnk',
        dense_rank().over(Window.orderBy(col('revenue').desc())),
    )
    .where("drnk <=5")
    .sort('order_month', col('revenue').desc())
)

order_month,order_item_order_id,revenue,drnk
201401,68778,2629.9,1
201401,30299,1899.9,2
201401,29735,1819.87,3
201401,25948,1699.88,4
201401,62657,1694.93,5


In [0]:
# Dense rank within each order_date for January 2014 (no top-N cut-off here):
# partitioning by date restarts the rank at 1 for every day.
display(
    daily_product_revenue_df
    .where("date_format(order_date, 'yyyyMM') = 201401")
    .withColumn(
        'drnk',
        dense_rank().over(
            Window.partitionBy('order_date').orderBy(col('revenue').desc())
        ),
    )
    .sort('order_date', col('revenue').desc())
)

order_date,order_item_order_id,revenue,drnk
2014-01-01,25948,1699.88,1
2014-01-01,25958,1429.86,2
2014-01-01,61907,1299.91,3
2014-01-01,25939,1259.89,4
2014-01-01,25913,1159.91,5
2014-01-01,25936,1127.89,6
2014-01-01,25984,899.88,7
2014-01-01,25959,899.85,8
2014-01-01,25882,879.93,9
2014-01-01,61913,879.87,10


In [0]:
# Reusable window: rows are grouped per order_date and ordered by revenue,
# highest first.
spec = (
    Window
    .partitionBy('order_date')
    .orderBy(col('revenue').desc())
)

In [0]:
# Echo the window specification so the cell output shows its repr.
spec

Out[41]: <pyspark.sql.window.WindowSpec at 0x7f7f89d733d0>

In [0]:
# Top 3 revenue values per day for January 2014, using the per-date window
# held in `spec`.
display(
    daily_product_revenue_df
    .where("date_format(order_date, 'yyyyMM') = 201401")
    .withColumn('drnk', dense_rank().over(spec))
    .where('drnk <= 3')
    .sort('order_date', col('revenue').desc())
)

order_date,order_item_order_id,revenue,drnk
2014-01-01,25948,1699.88,1
2014-01-01,25958,1429.86,2
2014-01-01,61907,1299.91,3
2014-01-02,25999,1199.93,1
2014-01-02,26066,1124.82,2
2014-01-02,26061,989.89,3
2014-01-03,68778,2629.9,1
2014-01-03,61951,1529.92,2
2014-01-03,67978,1499.92,3
2014-01-04,26363,1399.87,1


In [0]:
# Top 3 revenue values per day for 2014-01-01 through 2014-03-31, using the
# per-date window held in `spec`.
display(
    daily_product_revenue_df
    .where("order_date BETWEEN '2014-01-01' AND '2014-03-31'")
    .withColumn('drnk', dense_rank().over(spec))
    .where('drnk <= 3')
    .sort('order_date', col('revenue').desc())
)

order_date,order_item_order_id,revenue,drnk
2014-01-01,25948,1699.88,1
2014-01-01,25958,1429.86,2
2014-01-01,61907,1299.91,3
2014-01-02,25999,1199.93,1
2014-01-02,26066,1124.82,2
2014-01-02,26061,989.89,3
2014-01-03,68778,2629.9,1
2014-01-03,61951,1529.92,2
2014-01-03,67978,1499.92,3
2014-01-04,26363,1399.87,1


In [0]:
%sql

-- List the tables visible in the current database.
SHOW TABLES

database,tableName,isTemporary
default,orders_single_column,False
default,student_scores,False


In [0]:
%sql

-- Create the two-column scores table; this is a no-op when the table already exists.
-- NOTE(review): student_scores is already listed by the earlier SHOW TABLES
-- output and the query results render scores as 920.0, so the pre-existing
-- table presumably has a non-INT score column - confirm which schema is in effect.
CREATE TABLE IF NOT EXISTS student_scores (
  student_id INT,
  student_score INT
);

In [0]:
%sql

-- Replace the table contents with ten rows; students 3 and 10 have no score (NULL).
INSERT OVERWRITE student_scores VALUES
  (1, 980), (2, 960), (3, NULL), (4, 990), (5, 920),
  (6, 960), (7, 980), (8, 960), (9, 940), (10, NULL)

num_affected_rows,num_inserted_rows
10,10


In [0]:
%sql

-- All rows in ascending score order; the output shows the NULL scores listed first.
SELECT *
FROM student_scores
ORDER BY student_score

student_id,student_score
3,
10,
5,920.0
9,940.0
2,960.0
6,960.0
8,960.0
1,980.0
7,980.0
4,990.0


In [0]:
# Same table in descending score order; the output shows the NULL scores
# listed last.
display(
    spark.read.table('student_scores')
    .sort(col('student_score').desc())
)

student_id,student_score
4,990.0
1,980.0
7,980.0
2,960.0
6,960.0
8,960.0
9,940.0
5,920.0
3,
10,


In [0]:
# Global (unpartitioned) window over student scores, highest score first.
# This rebinds `spec`, which earlier in the notebook held the per-order_date
# revenue window, so the revenue cells must not be re-run after this one.
spec = (
    Window
    .orderBy(col('student_score').desc())
)

In [0]:
# Load the student_scores table as a DataFrame for the ranking examples.
student_scores = spark.table('student_scores')

In [0]:
# Show the table contents as loaded, with no explicit ordering.
display(student_scores)

student_id,student_score
1,980.0
2,960.0
3,
4,990.0
5,920.0
6,960.0
7,980.0
8,960.0
9,940.0
10,


In [0]:
# Compare rank() and dense_rank() over the same score-descending window.
# In the output, rank() skips positions after ties (2, 2, 4, ...) while
# dense_rank() does not (2, 2, 3, ...); the NULL scores tie for last place.
display(
    student_scores
    .withColumn('rnk', rank().over(spec))
    .withColumn('drnk', dense_rank().over(spec))
    .sort(col('student_score').desc())
)

student_id,student_score,rnk,drnk
4,990.0,1,1
1,980.0,2,2
7,980.0,2,2
6,960.0,4,3
8,960.0,4,3
2,960.0,4,3
9,940.0,7,4
5,920.0,8,5
3,,9,6
10,,9,6
