## 데이터 분석을 위한 SQL 레시피

Data soruce : https://hanbit.co.kr/support/supplement_survey.html?pcode=B8585882565

System : PostgreSQL

In [3]:
import pandas as pd
import psycopg2 as pg2
from sqlalchemy import create_engine

engine = create_engine('postgresql://testuser:testpass@localhost:5432/postgresql_analysis')

con = pg2.connect(host='localhost',
                  user='testuser',
                  password='testpass',
                  database='postgresql_analysis')
con.autocommit = True
cur = con.cursor()

In [4]:
def select(query):
    return pd.read_sql(query, con)

In [5]:
pd.options.display.max_rows = 10

## 9. 시계열 기반으로 데이터 집계하기

### [9-1] 날짜별 매출과 평균 구매액을 집계하는 쿼리

In [6]:
select('SELECT * FROM purchase_log;')

Unnamed: 0,dt,order_id,user_id,purchase_amount
0,2014-01-01,1,rhwpvvitou,13900
1,2014-01-01,2,hqnwoamzic,10616
2,2014-01-02,3,tzlmqryunr,21156
3,2014-01-02,4,wkmqqwbyai,14893
4,2014-01-03,5,ciecbedwbq,13054
...,...,...,...,...
21,2014-01-08,22,bwfbchzgnl,2299
22,2014-01-09,23,zzgauelgrt,16475
23,2014-01-09,24,qrzfcwecge,6469
24,2014-01-10,25,njbpsrvvcq,16584


In [7]:
query_91 = """
        SELECT 
           dt
         , COUNT(*) AS purchase_count
         , SUM(purchase_amount) AS total_amount
         , AVG(purchase_amount) AS avg_amount
        FROM
         purchase_log
        GROUP BY
         dt
        ORDER By
         dt
        ;
        """

select(query_91)

Unnamed: 0,dt,purchase_count,total_amount,avg_amount
0,2014-01-01,2,24516,12258.0
1,2014-01-02,2,36049,18024.5
2,2014-01-03,3,53029,17676.333333
3,2014-01-04,3,29299,9766.333333
4,2014-01-05,3,48256,16085.333333
5,2014-01-06,3,29440,9813.333333
6,2014-01-07,3,47679,15893.0
7,2014-01-08,3,19760,6586.666667
8,2014-01-09,2,22944,11472.0
9,2014-01-10,2,27923,13961.5


### [9-2] 날짜별 매출과 7일 이동평균을 집계하는 쿼리

In [8]:
query_92 = """
        SELECT 
           dt
         , SUM(purchase_amount) AS total_amount
           -- 최근 최대 7일 동안의 평균 계산하기
         , AVG(SUM(purchase_amount))
            OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
            AS seven_day_avg
           -- 최근 7일 동안의 평균을 확실하게 계산하기        
         , CASE
            WHEN
             COUNT(*)
             OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
             = 7
            THEN
             AVG(SUM(purchase_amount))
             OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
           END
           AS seven_day_avg_strict
        FROM
         purchase_log
        GROUP BY
         dt
        ORDER By
         dt
        ;
        """

select(query_92)

Unnamed: 0,dt,total_amount,seven_day_avg,seven_day_avg_strict
0,2014-01-01,24516,24516.0,
1,2014-01-02,36049,30282.5,
2,2014-01-03,53029,37864.666667,
3,2014-01-04,29299,35723.25,
4,2014-01-05,48256,38229.8,
5,2014-01-06,29440,36764.833333,
6,2014-01-07,47679,38324.0,38324.0
7,2014-01-08,19760,37644.571429,37644.571429
8,2014-01-09,22944,35772.428571,35772.428571
9,2014-01-10,27923,32185.857143,32185.857143


### [9-3] 날짜별 매출과 당월 누계 매출을 집계하는 쿼리

In [9]:
query_93 = """
        SELECT
           dt
           -- '연-월' 추출하기
         , substring(dt, 1, 7) AS year_month
         , SUM(purchase_amount) AS total_amount
         , SUM(SUM(purchase_amount))
            OVER(PARTITION BY substring(dt, 1, 7) ORDER BY dt ROWS UNBOUNDED PRECEDING)
            AS agg_amount
        FROM
         purchase_log
        GROUP BY
         dt
        ORDER By
         dt
        ;
        """

select(query_93)

Unnamed: 0,dt,year_month,total_amount,agg_amount
0,2014-01-01,2014-01,24516,24516.0
1,2014-01-02,2014-01,36049,60565.0
2,2014-01-03,2014-01,53029,113594.0
3,2014-01-04,2014-01,29299,142893.0
4,2014-01-05,2014-01,48256,191149.0
5,2014-01-06,2014-01,29440,220589.0
6,2014-01-07,2014-01,47679,268268.0
7,2014-01-08,2014-01,19760,288028.0
8,2014-01-09,2014-01,22944,310972.0
9,2014-01-10,2014-01,27923,338895.0


### [9-4] 날짜별 매출을 일시 테이블로 만드는 쿼리

In [10]:
query_94 = """
        WITH
        daily_purchase AS (
         SELECT
            dt
            -- '연', '월', '일'을 각각 추출하기
          , substring(dt, 1, 4) AS year
          , substring(dt, 6, 2) AS month
          , substring(dt, 9, 2) AS date          
          , SUM(purchase_amount) AS purchase_amount
          , COUNT(order_id) AS orders
         FROM
           purchase_log
         GROUP BY
           dt
        )
        SELECT *
        FROM daily_purchase
        ORDER BY dt
        ;
        """

select(query_94)

Unnamed: 0,dt,year,month,date,purchase_amount,orders
0,2014-01-01,2014,1,1,24516,2
1,2014-01-02,2014,1,2,36049,2
2,2014-01-03,2014,1,3,53029,3
3,2014-01-04,2014,1,4,29299,3
4,2014-01-05,2014,1,5,48256,3
5,2014-01-06,2014,1,6,29440,3
6,2014-01-07,2014,1,7,47679,3
7,2014-01-08,2014,1,8,19760,3
8,2014-01-09,2014,1,9,22944,2
9,2014-01-10,2014,1,10,27923,2


### [9-5] daily_purchase 테이블에 대해 당월 누계 매출을 집계하는 쿼리

In [11]:
query_95 = """
        WITH
        daily_purchase AS (
         SELECT
            dt
            -- '연', '월', '일'을 각각 추출하기
          , substring(dt, 1, 4) AS year
          , substring(dt, 6, 2) AS month
          , substring(dt, 9, 2) AS date          
          , SUM(purchase_amount) AS purchase_amount
          , COUNT(order_id) AS orders
         FROM
           purchase_log
         GROUP BY
           dt
        )
        SELECT
           dt
         , concat(year, '-', month) AS year_month
         , purchase_amount
         , SUM(purchase_amount)
            -- PARTITION BY 뒤에 GROUP BY 처럼 1개 이상 사용가능
            OVER(PARTITION BY year, month ORDER BY dt ROWS UNBOUNDED PRECEDING)
           AS agg_amount
        FROM daily_purchase
        ORDER BY dt
        ;
        """

select(query_95)

Unnamed: 0,dt,year_month,purchase_amount,agg_amount
0,2014-01-01,2014-01,24516,24516.0
1,2014-01-02,2014-01,36049,60565.0
2,2014-01-03,2014-01,53029,113594.0
3,2014-01-04,2014-01,29299,142893.0
4,2014-01-05,2014-01,48256,191149.0
5,2014-01-06,2014-01,29440,220589.0
6,2014-01-07,2014-01,47679,268268.0
7,2014-01-08,2014-01,19760,288028.0
8,2014-01-09,2014-01,22944,310972.0
9,2014-01-10,2014-01,27923,338895.0


### [9-6] 월별 매출과 작대비를 계산하는 쿼리

In [12]:
select('SELECT * FROM purchase_log2;')

Unnamed: 0,dt,order_id,user_id,purchase_amount
0,2014-01-01,1,rhwpvvitou,13900
1,2014-02-08,95,chtanrqtzj,28469
2,2014-03-09,168,bcqgtwxdgq,18899
3,2014-04-11,250,kdjyplrxtk,12394
4,2014-05-11,325,pgnjnnapsc,2282
...,...,...,...,...
19,2015-08-08,1441,lpglkecvsl,12906
20,2015-09-07,1516,mgtlsfgfbj,5696
21,2015-10-07,1591,trgjscaajt,13398
22,2015-11-06,1666,ccfbjyeqrb,6213


In [13]:
query_96 = """
        WITH
        daily_purchase AS (
         SELECT
            dt
            -- '연', '월', '일'을 각각 추출하기
          , substring(dt, 1, 4) AS year
          , substring(dt, 6, 2) AS month
          , substring(dt, 9, 2) AS date          
          , SUM(purchase_amount) AS purchase_amount
          , COUNT(order_id) AS orders
         FROM
           purchase_log2
         GROUP BY
           dt
        )
        SELECT
           month
         , SUM(CASE year WHEN '2014' THEN purchase_amount END) AS amount_2014
         , SUM(CASE year WHEN '2015' THEN purchase_amount END) AS amount_2015         
           -- 2014년 대비 2015년의 월별 매출 성장률
         , 100.0
           * SUM(CASE year WHEN '2014' THEN purchase_amount END)
           / SUM(CASE year WHEN '2014' THEN purchase_amount END)
           AS rate
        FROM 
           daily_purchase
        GROUP BY month
        ORDER BY month
        ;
        """

select(query_96)

Unnamed: 0,month,amount_2014,amount_2015,rate
0,01,13900.0,22111.0,100.0
1,02,28469.0,11965.0,100.0
2,03,18899.0,20215.0,100.0
3,04,12394.0,11792.0,100.0
4,05,2282.0,18087.0,100.0
...,...,...,...,...
7,08,6243.0,12906.0,100.0
8,09,3832.0,5696.0,100.0
9,10,6716.0,13398.0,100.0
10,11,16444.0,6213.0,100.0


### [9-7] 2015년 매출에 대한 Z 차트를 작성하는 쿼리

In [14]:
query_97 = """
        WITH
        daily_purchase AS (
         SELECT
            dt
            -- '연', '월', '일'을 각각 추출하기
          , substring(dt, 1, 4) AS year
          , substring(dt, 6, 2) AS month
          , substring(dt, 9, 2) AS date          
          , SUM(purchase_amount) AS purchase_amount
          , COUNT(order_id) AS orders
         FROM
           purchase_log2
         GROUP BY
           dt
        )
        , monthly_amount AS (
          SELECT
            -- 월별 매출 집계하기
            year
          , month
          , SUM(purchase_amount) AS amount
          FROM
            daily_purchase
          GROUP BY
            year, month
        )
        , calc_index AS (
           SELECT
             year
           , month
           , amount
             -- 2015년의 누계 매출 집계하기
           , SUM(CASE WHEN year = '2015' THEN amount END)
              OVER(ORDER BY year, month ROWS UNBOUNDED PRECEDING)
             AS agg_amount
             -- 당월부터 11개월 이전까지의 매출 합계(이동년계) 집계하기
           , SUM(amount)
              OVER(ORDER BY year, month ROWS BETWEEN 11 PRECEDING AND CURRENT ROW)
             AS year_avg_amount
           FROM
             monthly_amount
           ORDER BY
             year, month
        )
        -- 마지막으로 2015년의 데이터만 압축하기
        SELECT
           concat(year, '-', month) AS year_month
         , amount
         , agg_amount
         , year_avg_amount
        FROM
           calc_index
        WHERE
           year = '2015'
        ORDER BY
           year_month
        ;
        """

select(query_97)

Unnamed: 0,year_month,amount,agg_amount,year_avg_amount
0,2015-01,22111.0,22111.0,160796.0
1,2015-02,11965.0,34076.0,144292.0
2,2015-03,20215.0,54291.0,145608.0
3,2015-04,11792.0,66083.0,145006.0
4,2015-05,18087.0,84170.0,160811.0
...,...,...,...,...
7,2015-08,12906.0,130854.0,187045.0
8,2015-09,5696.0,136550.0,188909.0
9,2015-10,13398.0,149948.0,195591.0
10,2015-11,6213.0,156161.0,185360.0


### [9-8] 매출과 관련된 지표를 집계하는 쿼리

In [15]:
query_98 = """
        WITH
        daily_purchase AS (
         SELECT
            dt
            -- '연', '월', '일'을 각각 추출하기
          , substring(dt, 1, 4) AS year
          , substring(dt, 6, 2) AS month
          , substring(dt, 9, 2) AS date          
          , SUM(purchase_amount) AS purchase_amount
          , COUNT(order_id) AS orders
         FROM
           purchase_log2
         GROUP BY
           dt
        )
        , monthly_purchase AS (
          SELECT
            year
          , month
          , SUM(orders) AS orders
          , AVG(purchase_amount) AS avg_amount
          , SUM(purchase_amount) AS monthly
          FROM
            daily_purchase
          GROUP BY
            year, month
        )
        SELECT
           concat(year, '-', month) AS year_month
         , orders
         , avg_amount
         , monthly
         , SUM(monthly)
            OVER(PARTITION BY year ORDER BY month ROWS UNBOUNDED PRECEDING)
           AS agg_amonut
           -- 12개월 전의 매출 구하기
         , LAG(monthly, 12)
              OVER(ORDER BY year, month)
           AS last_year
         , 100.0
           * monthly
           / LAG(monthly, 12)
              OVER(ORDER BY year, month)
           AS rate
        FROM
           monthly_purchase
        ORDER BY
           year_month
        ;
        """

select(query_98)

Unnamed: 0,year_month,orders,avg_amount,monthly,agg_amonut,last_year,rate
0,2014-01,1.0,13900.0,13900.0,13900.0,,
1,2014-02,1.0,28469.0,28469.0,42369.0,,
2,2014-03,1.0,18899.0,18899.0,61268.0,,
3,2014-04,1.0,12394.0,12394.0,73662.0,,
4,2014-05,1.0,2282.0,2282.0,75944.0,,
...,...,...,...,...,...,...,...
19,2015-08,1.0,12906.0,12906.0,130854.0,6243.0,206.727535
20,2015-09,1.0,5696.0,5696.0,136550.0,3832.0,148.643006
21,2015-10,1.0,13398.0,13398.0,149948.0,6716.0,199.493746
22,2015-11,1.0,6213.0,6213.0,156161.0,16444.0,37.782778


## 10. 다면적인 축을 사용해 데이터 집약하기

### [10-1] 카테고리별 매출과 소계를 동시에 구하는 쿼리

In [17]:
select('SELECT * FROM purchase_detail_log;')

Unnamed: 0,dt,order_id,user_id,item_id,price,category,sub_category
0,2017-01-18,48291,usr33395,lad533,37300,ladys_fashion,bag
1,2017-01-18,48291,usr33395,lad329,97300,ladys_fashion,jacket
2,2017-01-18,48291,usr33395,lad102,114600,ladys_fashion,jacket
3,2017-01-18,48291,usr33395,lad886,33300,ladys_fashion,bag
4,2017-01-18,48292,usr52832,dvd871,32800,dvd,documentary
...,...,...,...,...,...,...,...
11,2017-01-18,48294,usr33604,cd477,25800,cd,classic
12,2017-01-18,48294,usr33604,boo468,31000,book,business
13,2017-01-18,48294,usr33604,foo402,48700,food,meats
14,2017-01-18,48295,usr38013,foo134,32000,food,fish


In [25]:
query_101 = """
        WITH
        sub_category_amount AS (
         -- 소 카테고리의 매출 집계하기
         SELECT
            category AS category
          , sub_category AS sub_category
          , SUM(price) AS amount
         FROM
           purchase_detail_log
         GROUP BY
           category, sub_category
        )
        , category_amount AS (
           -- 대 카테고리의 매출 집계하기
           SELECT
              category
            , 'all' AS sub_category
            , SUM(price) AS amount
           FROM
              purchase_detail_log
           GROUP BY
              category
        )
        , total_amount AS (
            -- 전체 매출 집계하기
            SELECT
               'all' AS category
             , 'all' AS sub_category
             , SUM(price) AS amount
            FROM
               purchase_detail_log
        )
                  SELECT category, sub_category, amount FROM sub_category_amount
        UNION ALL SELECT category, sub_category, amount FROM category_amount
        UNION ALL SELECT category, sub_category, amount FROM total_amount        
        ;
        """

select(query_101)

Unnamed: 0,category,sub_category,amount
0,ladys_fashion,bag,127900
1,food,fish,32000
2,food,meats,48700
3,dvd,documentary,32800
4,mens_fashion,jacket,116300
...,...,...,...
14,mens_fashion,all,116300
15,cd,all,25800
16,dvd,all,32800
17,food,all,80700


### [10-2] ROLLUP을 사용해서 카테고리별 매출과 소계를 동시에 구하는 쿼리

In [28]:
query_102 = """
        SELECT
           -- ROLLUP을 사용하면 소계를 계산할 때
           -- 레코드 집계 키가 NULL이 되므로,
           -- COALESCE 함수로 NULL을 문자열 'all'로 변환
           COALESCE(category, 'all') AS category
         , COALESCE(sub_category, 'all') AS sub_category
         , SUM(price) AS amount
        FROM
           purchase_detail_log
        GROUP BY
           ROLLUP(category, sub_category)
        ;
        """

select(query_102)

Unnamed: 0,category,sub_category,amount
0,all,all,861100
1,ladys_fashion,bag,127900
2,food,fish,32000
3,food,meats,48700
4,dvd,documentary,32800
...,...,...,...
14,outdoor,all,28600
15,mens_fashion,all,116300
16,cd,all,25800
17,dvd,all,32800


### [10-3] 매출 구성비누계와 ABC 등급을 계산하는 쿼리

In [32]:
query_103 = """
        WITH
        monthly_sales AS (
         SELECT
            category
          , SUM(price) AS amount
         FROM
           purchase_detail_log
         WHERE
           dt BETWEEN '2017-01-01' AND '2017-01-31'
         GROUP BY
           category
        )
        , sales_composition_ratio AS (
           SELECT
              category
            , amount
            -- 구성비 : 100.0 * <항목별 매출> / <전체 매출>
            , 100.0 * amount / SUM(amount) OVER() AS composition_ratio
            -- 구성비누계 : 100.0 * <항목별 구계 매출> / <전체 매출>
            , 100.0 * SUM(amount) OVER(ORDER BY amount DESC
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
            / SUM(amount) OVER() AS cumulative_ratio     
           FROM
              monthly_sales
        )
        SELECT
           *
         , CASE
            WHEN cumulative_ratio BETWEEN 0 AND 70 THEN 'A'
            WHEN cumulative_ratio BETWEEN 70 AND 90 THEN 'B'
            WHEN cumulative_ratio BETWEEN 90 AND 100 THEN 'C'
           END AS abc_rank
        FROM
           sales_composition_ratio
        ORDER BY
           amount DESC
        ;
        """

select(query_103)

Unnamed: 0,category,amount,composition_ratio,cumulative_ratio,abc_rank
0,ladys_fashion,497400,57.763326,57.763326,A
1,mens_fashion,116300,13.505981,71.269307,B
2,food,80700,9.371734,80.641041,B
3,book,53500,6.212983,86.854024,B
4,dvd,32800,3.809081,90.663105,C
5,outdoor,28600,3.321333,93.984439,C
6,game,26000,3.019394,97.003832,C
7,cd,25800,2.996168,100.0,C


### [10-4] 팬 차트 작성 때 필요한 데이터를 구하는 쿼리

In [None]:
query_104 = """
        WITH
        monthly_sales AS (
         SELECT
            category
          , SUM(price) AS amount
         FROM
           purchase_detail_log
         WHERE
           dt BETWEEN '2017-01-01' AND '2017-01-31'
         GROUP BY
           category
        )
        , sales_composition_ratio AS (
           SELECT
              category
            , amount
            -- 구성비 : 100.0 * <항목별 매출> / <전체 매출>
            , 100.0 * amount / SUM(amount) OVER() AS composition_ratio
            -- 구성비누계 : 100.0 * <항목별 구계 매출> / <전체 매출>
            , 100.0 * SUM(amount) OVER(ORDER BY amount DESC
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
            / SUM(amount) OVER() AS cumulative_ratio     
           FROM
              monthly_sales
        )
        SELECT
           *
         , CASE
            WHEN cumulative_ratio BETWEEN 0 AND 70 THEN 'A'
            WHEN cumulative_ratio BETWEEN 70 AND 90 THEN 'B'
            WHEN cumulative_ratio BETWEEN 90 AND 100 THEN 'C'
           END AS abc_rank
        FROM
           sales_composition_ratio
        ORDER BY
           amount DESC
        ;
        """

select(query_104)