# [SETUP] 
connect to DuckDB

In [1]:
# Load the extension
%load_ext sql

In [2]:
# Connect to DuckDB
%sql duckdb:///tpch.db

In [3]:
%config SqlMagic.displaylimit = 100

In [4]:
%%sql
-- Run a simple show tables
SELECT
  table_name
FROM
  information_schema.tables
WHERE
  table_schema = 'main'

table_name
customer
lineitem
nation
orders
part
partsupp
region
supplier


## [Exercise] 

Write a query to remove duplicates from the clickstream data (created as CTE below)

**Time limit during live workshop: 5 min**

**Hint:**
  1. Think about how you can use `row_number` as shown above to remove duplicates.

In [19]:
%%sql
WITH clickstream AS (
    SELECT
        1 AS user_id, '2024-07-01 10:00:00' AS click_time UNION ALL
    SELECT
        1 AS user_id, '2024-07-01 10:05:00' AS click_time UNION ALL
    SELECT
        1 AS user_id, '2024-07-01 10:10:00' AS click_time UNION ALL
    SELECT
        1 AS user_id, '2024-07-01 10:10:00' AS click_time UNION ALL
    SELECT
        1 AS user_id, '2024-07-01 10:10:00' AS click_time UNION ALL
    SELECT
        1 AS user_id, '2024-07-01 10:10:00' AS click_time UNION ALL
    SELECT
        2 AS user_id, '2024-07-01 10:15:00' AS click_time UNION ALL
    SELECT
        2 AS user_id, '2024-07-01 10:20:00' AS click_time UNION ALL
    SELECT
        2 AS user_id, '2024-07-01 10:20:00' AS click_time UNION ALL
    SELECT
        2 AS user_id, '2024-07-01 10:20:00' AS click_time UNION ALL
    SELECT
        2 AS user_id, '2024-07-01 10:20:00' AS click_time UNION ALL
    SELECT
        2 AS user_id, '2024-07-01 10:25:00' AS click_time
),
-- your code here
ranked_clicks AS (
    SELECT
        user_id,
        click_time,
        ROW_NUMBER()
        OVER (
            PARTITION BY
                user_id,
                click_time
        ) AS click_rank
    FROM
        clickstream
)
SELECT
    *
FROM
    ranked_clicks
WHERE
    click_rank = 1
ORDER BY
    user_id,
    click_time;

user_id,click_time,click_rank
1,2024-07-01 10:00:00,1
1,2024-07-01 10:05:00,1
1,2024-07-01 10:10:00,1
2,2024-07-01 10:15:00,1
2,2024-07-01 10:20:00,1
2,2024-07-01 10:25:00,1


## [Exercise] 

Write a query to pivot orders data by orderpriority and show average total price grouped by year
    
**Time limit during live workshop: 5 min**

**Hint**: 
    1. Use `strftime(o_orderdate, '%Y') AS order_year` to get order_year.

`orders` table schema: ![Orders](../../images/orders.png)

In [27]:
%%sql
-- your query here
SELECT
    strftime(o_orderdate, '%Y') AS order_year,
    ROUND(AVG(CASE
                WHEN o_orderpriority = '1-URGENT' THEN o_totalprice
                ELSE NULL
            END), 2) AS urgent_order_avg_price,
    ROUND(AVG(CASE
                WHEN o_orderpriority = '2-HIGH' THEN o_totalprice
                ELSE NULL
            END), 2) AS high_order_avg_price,
    ROUND(AVG(CASE
                WHEN o_orderpriority = '3-MEDIUM' THEN o_totalprice
                ELSE NULL
            END), 2) AS medium_order_avg_price,
    ROUND(AVG(CASE
                WHEN o_orderpriority = '4-NOT SPECIFIED' THEN o_totalprice
                ELSE NULL
            END), 2) AS not_specified_order_avg_price,
    ROUND(AVG(CASE
                WHEN o_orderpriority = '5-LOW' THEN o_totalprice
                ELSE NULL
            END), 2) AS low_order_avg_price
FROM
    orders
GROUP BY
    strftime(o_orderdate, '%Y');

order_year,urgent_order_avg_price,high_order_avg_price,medium_order_avg_price,not_specified_order_avg_price,low_order_avg_price
1992,150779.6,151588.26,151384.55,150863.81,151271.86
1995,150920.56,151526.86,151343.54,150597.69,151086.98
1994,151426.23,152017.45,150287.01,150731.78,151605.29
1996,151061.18,151774.86,151454.07,151569.38,151042.53
1997,151030.14,150796.07,151257.83,150211.43,151236.81
1993,151896.15,151544.17,150977.08,151375.31,151783.54
1998,151605.58,151681.48,151524.74,149777.83,151734.08
