## 데이터 분석을 위한 SQL 레시피

Data soruce : https://hanbit.co.kr/support/supplement_survey.html?pcode=B8585882565

System : PostgreSQL

In [1]:
import pandas as pd
import psycopg2 as pg2
from sqlalchemy import create_engine

engine = create_engine('postgresql://testuser:testpass@localhost:5432/postgresql_analysis')

con = pg2.connect(host='localhost',
                  user='testuser',
                  password='testpass',
                  database='postgresql_analysis')
con.autocommit = True
cur = con.cursor()

In [2]:
def select(query):
    return pd.read_sql(query, con)

In [3]:
pd.options.display.max_rows = 10

## 12. 시계열에 따른 사용자 전체의 상태 변화 찾기

### [12-1] 날짜별 등록 수의 추이를 집계하는 쿼리

In [4]:
select('SELECT * FROM mst_users;')

Unnamed: 0,user_id,sex,birth_date,register_date,register_device,withdraw_date
0,U001,M,1977-06-17,2016-10-01,pc,
1,U002,F,1953-06-12,2016-10-01,sp,2016-10-10
2,U003,M,1965-01-06,2016-10-01,pc,
3,U004,F,1954-05-21,2016-10-05,pc,
4,U005,M,1987-11-23,2016-10-05,sp,
...,...,...,...,...,...,...
25,U026,M,1969-02-21,2016-11-10,sp,
26,U027,F,2001-07-10,2016-11-10,pc,
27,U028,M,1976-05-26,2016-11-15,app,
28,U029,M,1964-04-06,2016-11-28,pc,


In [5]:
select('SELECT * FROM action_log;')

Unnamed: 0,session,user_id,action,stamp
0,989004ea,U001,view,2016-10-01 18:00:00
1,989004ea,U001,view,2016-10-01 18:01:00
2,989004ea,U001,view,2016-10-01 18:10:00
3,47db0370,U001,follow,2016-10-05 19:00:00
4,47db0370,U001,view,2016-10-05 19:10:00
...,...,...,...,...
9,87b5725f,U002,follow,2016-10-01 12:00:00
10,87b5725f,U002,follow,2016-10-01 12:01:00
11,87b5725f,U002,follow,2016-10-01 12:02:00
12,9afaf87c,U002,view,2016-10-02 13:00:00


In [6]:
query_121 = """
        SELECT
           register_date
         , COUNT(DISTINCT user_id) AS register_count
        FROM
           mst_users
        GROUP BY
           register_date
        ORDER BY
           register_date
        ;
        """

select(query_121)

Unnamed: 0,register_date,register_count
0,2016-10-01,3
1,2016-10-05,2
2,2016-10-10,3
3,2016-10-15,1
4,2016-10-16,1
...,...,...
10,2016-11-04,1
11,2016-11-05,2
12,2016-11-10,2
13,2016-11-15,1


### [12-2] 매달 등록 수와 전월비를 계산하는 쿼리

In [7]:
query_122 = """
        WITH
        mst_users_with_year_month AS (
         SELECT
            *
          , substring(register_date, 1, 7) AS year_month
         FROM
            mst_users
        )
        SELECT
           year_month
         , COUNT(DISTINCT user_id) AS register_count
         , LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY year_month)
           AS last_month_count
         , 1.0
           * COUNT(DISTINCT user_id)
           / LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY year_month)
           AS month_over_month_ratio
        FROM
           mst_users_with_year_month
        GROUP BY
           year_month
        ;
        """

select(query_122)

Unnamed: 0,year_month,register_count,last_month_count,month_over_month_ratio
0,2016-10,14,,
1,2016-11,16,14.0,1.142857


### [12-3] 디바이스들의 등록 수를 집계하는 쿼리 

In [8]:
query_123 = """
        WITH
        mst_users_with_year_month AS (
         SELECT
            *
          , substring(register_date, 1, 7) AS year_month
         FROM
            mst_users
        )
        SELECT
           year_month
         , COUNT(DISTINCT user_id) AS register_count
         , COUNT(DISTINCT CASE WHEN register_device = 'pc' THEN user_id END) AS register_pc
         , COUNT(DISTINCT CASE WHEN register_device = 'sp' THEN user_id END) AS register_sp
         , COUNT(DISTINCT CASE WHEN register_device = 'app' THEN user_id END) AS register_app
        FROM
           mst_users_with_year_month
        GROUP BY
           year_month
        ;
        """

select(query_123)

Unnamed: 0,year_month,register_count,register_pc,register_sp,register_app
0,2016-10,14,7,4,3
1,2016-11,16,4,4,8


### [12-4] '로그 최근 일자'와 '사용자별 등록일의 다음날'을 계산하는 쿼리

- 지속률 : 등록일 기준으로 이후 지정일 동안 사용자가 서비스를 얼마나 이용했는지 나타내는 지표
    - 지속률(Repeat) -> 사용자가 매일 사용했으면 하는 서비스
- 정착률 : 등록일 기준으로 이후 지정한 7일 동안 사용자가 서비스를 사용했는지 나타내는 지표
    - 정착률(Retention) -> 사용자에게 어떤 목적이 생겼을 때 사용했으면 하는 서비스

In [9]:
query_124 = """
        WITH
        action_log_with_mst_users AS (
         SELECT
            u.user_id
          , u.register_date
            -- 액션 날짜와 로그 전체의 최신 날짜를 날짜 자료형으로 변환하기
          , CAST(a.stamp AS date) AS action_date
          , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
            -- 등록일 다음날의 날짜 계산하기
          , CAST(u.register_date::date + '1 day'::interval AS date)
            AS next_day_1
         FROM
            mst_users AS u
         LEFT OUTER JOIN
            action_log AS a
         ON u.user_id = a.user_id
        )
        SELECT *
        FROM
           action_log_with_mst_users
        ORDER BY
           register_date
        ;
        """

select(query_124)

Unnamed: 0,user_id,register_date,action_date,latest_date,next_day_1
0,U003,2016-10-01,,2016-10-20,2016-10-02
1,U001,2016-10-01,2016-10-20,2016-10-20,2016-10-02
2,U001,2016-10-01,2016-10-20,2016-10-20,2016-10-02
3,U001,2016-10-01,2016-10-20,2016-10-20,2016-10-02
4,U002,2016-10-01,2016-10-01,2016-10-20,2016-10-02
...,...,...,...,...,...
37,U026,2016-11-10,,2016-10-20,2016-11-11
38,U027,2016-11-10,,2016-10-20,2016-11-11
39,U028,2016-11-15,,2016-10-20,2016-11-16
40,U030,2016-11-28,,2016-10-20,2016-11-29


### [12-5] 사용자의 액션 플래그를 계산하는 쿼리

In [10]:
query_125 = """
        WITH
        action_log_with_mst_users AS (
         SELECT
            u.user_id
          , u.register_date
            -- 액션 날짜와 로그 전체의 최신 날짜를 날짜 자료형으로 변환하기
          , CAST(a.stamp AS date) AS action_date
          , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
            -- 등록일 다음날의 날짜 계산하기
          , CAST(u.register_date::date + '1 day'::interval AS date)
            AS next_day_1
         FROM
            mst_users AS u
         LEFT OUTER JOIN
            action_log AS a
         ON u.user_id = a.user_id
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
             -- 4. 등록일 다음날에 액션을 했는지 안 했는지를 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 등록일 다음날에 한 액션의 합계 구하기
              SUM(
               -- 2. 등록일 다음날이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN next_day_1 <= latest_date THEN
                -- 1. 등록일 다음날의 날짜에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN next_day_1 = action_date THEN 1 ELSE 0 END
               END 
              )
           ) AS next_1_day_action
          FROM
             action_log_with_mst_users
          GROUP BY
             user_id, register_date
        )
        SELECT *
        FROM
           user_action_flag
        ORDER BY
           register_date, user_id
        ;
        """

select(query_125)

Unnamed: 0,user_id,register_date,next_1_day_action
0,U001,2016-10-01,0.0
1,U002,2016-10-01,1.0
2,U003,2016-10-01,0.0
3,U004,2016-10-05,0.0
4,U005,2016-10-05,0.0
...,...,...,...
25,U026,2016-11-10,
26,U027,2016-11-10,
27,U028,2016-11-15,
28,U029,2016-11-28,


### [12-6] 다음날 지속률을 계산하는 쿼리

In [11]:
query_126 = """
        WITH
        action_log_with_mst_users AS (
         SELECT
            u.user_id
          , u.register_date
            -- 액션 날짜와 로그 전체의 최신 날짜를 날짜 자료형으로 변환하기
          , CAST(a.stamp AS date) AS action_date
          , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
            -- 등록일 다음날의 날짜 계산하기
          , CAST(u.register_date::date + '1 day'::interval AS date)
            AS next_day_1
         FROM
            mst_users AS u
         LEFT OUTER JOIN
            action_log AS a
         ON u.user_id = a.user_id
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
             -- 4. 등록일 다음날에 액션을 했는지 안 했는지를 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 등록일 다음날에 한 액션의 합계 구하기
              SUM(
               -- 2. 등록일 다음날이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN next_day_1 <= latest_date THEN
                -- 1. 등록일 다음날의 날짜에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN next_day_1 = action_date THEN 1 ELSE 0 END
               END 
              )
           ) AS next_1_day_action
          FROM
             action_log_with_mst_users
          GROUP BY
             user_id, register_date
        )
        SELECT
           register_date
         , AVG(100.0 * next_1_day_action) AS repeat_rate_1_day
        FROM
           user_action_flag
        GROUP BY
           register_date
        ORDER BY
           register_date
        ;
        """

select(query_126)

Unnamed: 0,register_date,repeat_rate_1_day
0,2016-10-01,33.333333
1,2016-10-05,0.000000
2,2016-10-10,0.000000
3,2016-10-15,0.000000
4,2016-10-16,0.000000
...,...,...
10,2016-11-04,
11,2016-11-05,
12,2016-11-10,
13,2016-11-15,


### [12-7] 지속률 지표를 관리하는 마스터 테이블을 작성하는 쿼리

In [12]:
query_127 = """
        WITH
        repeat_interval(index_name, interval_date) AS (
         VALUES
            ('01 day repeat', 1)
          , ('02 day repeat', 2)
          , ('03 day repeat', 3)
          , ('04 day repeat', 4)
          , ('05 day repeat', 5)
          , ('06 day repeat', 6)
          , ('07 day repeat', 7)           
        )
        SELECT *
        FROM repeat_interval
        ORDER BY index_name
        ;
        """

select(query_127)

Unnamed: 0,index_name,interval_date
0,01 day repeat,1
1,02 day repeat,2
2,03 day repeat,3
3,04 day repeat,4
4,05 day repeat,5
5,06 day repeat,6
6,07 day repeat,7


### [12-8] 지속률을 세로 기반으로 집계하는 쿼리

In [13]:
query_128 = """
        WITH
        repeat_interval(index_name, interval_date) AS (
          VALUES
             ('01 day repeat', 1)
           , ('02 day repeat', 2)
           , ('03 day repeat', 3)
           , ('04 day repeat', 4)
           , ('05 day repeat', 5)
           , ('06 day repeat', 6)
           , ('07 day repeat', 7)           
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
             -- 등록일로부터 n일 후의 날짜 계산하기
           , r.index_name
           , CAST(CAST(u.register_date AS date)+ interval '1 day' * r.interval_date AS date)
             AS index_date
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 등록일 다음날에 액션을 했는지 안 했는지를 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 등록일 다음날에 한 액션의 합계 구하기
              SUM(
               -- 2. 등록일 다음날이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_date <= latest_date THEN
                -- 1. 등록일 다음날의 날짜에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN index_date = action_date THEN 1 ELSE 0 END
               END 
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_date
        )
        SELECT
           register_date
         , index_name
         , AVG(100.0 * index_date_action) AS repeat_rate
        FROM
           user_action_flag
        GROUP BY
           register_date, index_name
        ORDER BY
           register_date, index_name
        ;
        """

select(query_128)

Unnamed: 0,register_date,index_name,repeat_rate
0,2016-10-01,01 day repeat,33.333333
1,2016-10-01,02 day repeat,0.000000
2,2016-10-01,03 day repeat,0.000000
3,2016-10-01,04 day repeat,33.333333
4,2016-10-01,05 day repeat,0.000000
...,...,...,...
100,2016-11-28,03 day repeat,
101,2016-11-28,04 day repeat,
102,2016-11-28,05 day repeat,
103,2016-11-28,06 day repeat,


### [12-9] 정착률 지표를 관리하는 마스터 테이블을 작성하는 쿼리

In [14]:
query_129 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('07 day retention', 1, 7)
          , ('14 day retention', 8, 14)
          , ('21 day retention', 15, 21)
          , ('28 day retention', 12, 28)
        )
        SELECT *
        FROM repeat_interval
        ORDER BY index_name
        ;
        """

select(query_129)

Unnamed: 0,index_name,interval_begin_date,interval_end_date
0,07 day retention,1,7
1,14 day retention,8,14
2,21 day retention,15,21
3,28 day retention,12,28


### [12-10] 정착률을 계산하는 쿼리

In [15]:
query_1210 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('07 day retention', 1, 7)
          , ('14 day retention', 8, 14)
          , ('21 day retention', 15, 21)
          , ('28 day retention', 12, 28)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        SELECT
           register_date
         , index_name
         , AVG(100.0 * index_date_action) AS repeat_rate
        FROM
           user_action_flag
        GROUP BY
           register_date, index_name
        ORDER BY
           register_date, index_name
        ;
        """

select(query_1210)

Unnamed: 0,register_date,index_name,repeat_rate
0,2016-10-01,07 day retention,66.666667
1,2016-10-01,14 day retention,0.000000
2,2016-10-01,21 day retention,
3,2016-10-01,28 day retention,
4,2016-10-05,07 day retention,0.000000
...,...,...,...
55,2016-11-15,28 day retention,
56,2016-11-28,07 day retention,
57,2016-11-28,14 day retention,
58,2016-11-28,21 day retention,


### [12-11] 지속률 지표를 관리하는 마스터 테이블을 정착률 형식으로 수정한 쿼리

In [16]:
query_1211 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('01 day repeat'   , 1, 1)
          , ('02 day repeat'   , 2, 2)
          , ('03 day repeat'   , 3, 3)
          , ('04 day repeat'   , 4, 4)
          , ('05 day repeat'   , 5, 5)
          , ('06 day repeat'   , 6, 6)
          , ('07 day repeat'   , 7, 7)
          , ('07 day retention', 1, 7)
          , ('14 day retention', 8, 14)
          , ('21 day retention', 15, 21)
          , ('28 day retention', 12, 28)
        )
        SELECT *
        FROM repeat_interval
        ORDER BY index_name
        ;
        """

select(query_1211)

Unnamed: 0,index_name,interval_begin_date,interval_end_date
0,01 day repeat,1,1
1,02 day repeat,2,2
2,03 day repeat,3,3
3,04 day repeat,4,4
4,05 day repeat,5,5
...,...,...,...
6,07 day repeat,7,7
7,07 day retention,1,7
8,14 day retention,8,14
9,21 day retention,15,21


### [12-12] n일 지속률들을 집계하는 쿼리

In [17]:
query_1212 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('01 day repeat'   , 1, 1)
          , ('02 day repeat'   , 2, 2)
          , ('03 day repeat'   , 3, 3)
          , ('04 day repeat'   , 4, 4)
          , ('05 day repeat'   , 5, 5)
          , ('06 day repeat'   , 6, 6)
          , ('07 day repeat'   , 7, 7)
          , ('07 day retention', 1, 7)
          , ('14 day retention', 8, 14)
          , ('21 day retention', 15, 21)
          , ('28 day retention', 12, 28)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        SELECT
           index_name
         , AVG(100.0 * index_date_action) AS repeat_rate
        FROM
           user_action_flag
        GROUP BY
           index_name
        ORDER BY
           index_name
        ;
        """

select(query_1212)

Unnamed: 0,index_name,repeat_rate
0,01 day repeat,8.333333
1,02 day repeat,0.000000
2,03 day repeat,0.000000
3,04 day repeat,10.000000
4,05 day repeat,0.000000
...,...,...
6,07 day repeat,0.000000
7,07 day retention,25.000000
8,14 day retention,0.000000
9,21 day retention,


### [12-13] 모든 사용자와 액션의 조합을 도출하는 쿼리

In [18]:
query_1213 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('01 day repeat', 1, 1)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , mst_actions AS (
                    SELECT 'view'    AS action        
          UNION ALL SELECT 'comment' AS action
          UNION ALL SELECT 'follow'  AS action          
        )
        , mst_user_actions AS (
          SELECT
             u.user_id
           , u.register_date
           , a.action
          FROM
             mst_users AS u
          CROSS JOIN
             mst_actions AS a
        )
        SELECT *
        FROM
           mst_user_actions
        ORDER BY
           user_id, action
        ;
        """

select(query_1213)

Unnamed: 0,user_id,register_date,action
0,U001,2016-10-01,comment
1,U001,2016-10-01,follow
2,U001,2016-10-01,view
3,U002,2016-10-01,comment
4,U002,2016-10-01,follow
...,...,...,...
85,U029,2016-11-28,follow
86,U029,2016-11-28,view
87,U030,2016-11-28,comment
88,U030,2016-11-28,follow


### [12-14] 사용자의 액션 로그를 0, 1의 플래그로 표현하는 쿼리

In [19]:
query_1214 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('01 day repeat', 1, 1)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , mst_actions AS (
                    SELECT 'view'    AS action        
          UNION ALL SELECT 'comment' AS action
          UNION ALL SELECT 'follow'  AS action          
        )
        , mst_user_actions AS (
          SELECT
             u.user_id
           , u.register_date
           , a.action
          FROM
             mst_users AS u
          CROSS JOIN
             mst_actions AS a
        )
        , register_action_flag AS (
          SELECT DISTINCT
             m.user_id
           , m.register_date
           , m.action
           , CASE
              WHEN a.action IS NOT NULL THEN 1
              ELSE 0
             END AS do_action
           , index_name
           , index_date_action
          FROM
             mst_user_actions AS m
          LEFT JOIN
             action_log AS a
           ON m.user_id = a.user_id
           AND CAST(m.register_date AS date) = CAST(a.stamp AS date)
           AND m.action = a.action
          LEFT JOIN
             user_action_flag AS f
           ON m.user_id = f.user_id
          WHERE
             f.index_date_action IS NOT NULL
        )
        SELECT *
        FROM
           register_action_flag
        ORDER BY
           user_id, index_name, action
        ;
        """

select(query_1214)

Unnamed: 0,user_id,register_date,action,do_action,index_name,index_date_action
0,U001,2016-10-01,comment,0,01 day repeat,0.0
1,U001,2016-10-01,follow,0,01 day repeat,0.0
2,U001,2016-10-01,view,1,01 day repeat,0.0
3,U002,2016-10-01,comment,0,01 day repeat,1.0
4,U002,2016-10-01,follow,1,01 day repeat,1.0
...,...,...,...,...,...,...
31,U011,2016-10-18,follow,0,01 day repeat,0.0
32,U011,2016-10-18,view,0,01 day repeat,0.0
33,U012,2016-10-18,comment,0,01 day repeat,0.0
34,U012,2016-10-18,follow,0,01 day repeat,0.0


### [12-15] 액션에 따른 지속률과 정착률을 집계하는 쿼리

In [20]:
query_1215 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('01 day repeat', 1, 1)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , mst_actions AS (
                    SELECT 'view'    AS action        
          UNION ALL SELECT 'comment' AS action
          UNION ALL SELECT 'follow'  AS action          
        )
        , mst_user_actions AS (
          SELECT
             u.user_id
           , u.register_date
           , a.action
          FROM
             mst_users AS u
          CROSS JOIN
             mst_actions AS a
        )
        , register_action_flag AS (
          SELECT DISTINCT
             m.user_id
           , m.register_date
           , m.action
           , CASE
              WHEN a.action IS NOT NULL THEN 1
              ELSE 0
             END AS do_action
           , index_name
           , index_date_action
          FROM
             mst_user_actions AS m
          LEFT JOIN
             action_log AS a
           ON m.user_id = a.user_id
           AND CAST(m.register_date AS date) = CAST(a.stamp AS date)
           AND m.action = a.action
          LEFT JOIN
             user_action_flag AS f
           ON m.user_id = f.user_id
          WHERE
             f.index_date_action IS NOT NULL
        )
        SELECT
           action
         , COUNT(*) AS users
         , AVG(100.0 * do_action) AS usage_rate
         , index_name
         , AVG(CASE do_action WHEN 1 THEN 100.0 * index_date_action END) AS idx_rate
         , AVG(CASE do_action WHEN 0 THEN 100.0 * index_date_action END) AS no_action_idx_rate         
        FROM
           register_action_flag
        GROUP BY
           index_name, action
        ORDER BY
           index_name, action
        ;
        """

select(query_1215)

Unnamed: 0,action,users,usage_rate,index_name,idx_rate,no_action_idx_rate
0,comment,12,0.0,01 day repeat,,8.333333
1,follow,12,8.333333,01 day repeat,100.0,0.0
2,view,12,8.333333,01 day repeat,0.0,9.090909


### [12-16] 액션의 계급 마스터와 사용자 액션 플래그의 조합을 산출하는 쿼리

In [21]:
query_1216 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('14 day retention', 8, 14)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , mst_action_bucket(action, min_count, max_count) AS (
          VALUES
            ('comment',  0,    0)
          , ('comment',  1,    5)
          , ('comment',  6,   10)
          , ('comment', 11, 9999) -- 최대값으로 간단하게 9999 입력
          , ('follow' ,  0,    0)
          , ('follow' ,  1,    5)          
          , ('follow' ,  6,   10)          
          , ('follow' , 11, 9999)
        )
        , mst_user_action_bucket AS (
          SELECT
             u.user_id
           , u.register_date
           , a.action
           , a.min_count
           , a.max_count
          FROM
             mst_users AS u
          CROSS JOIN
             mst_action_bucket AS a
        )
        SELECT *
        FROM
           mst_user_action_bucket
        ORDER BY
           user_id, action, min_count
        ;
        """

select(query_1216)

Unnamed: 0,user_id,register_date,action,min_count,max_count
0,U001,2016-10-01,comment,0,0
1,U001,2016-10-01,comment,1,5
2,U001,2016-10-01,comment,6,10
3,U001,2016-10-01,comment,11,9999
4,U001,2016-10-01,follow,0,0
...,...,...,...,...,...
235,U030,2016-11-28,comment,11,9999
236,U030,2016-11-28,follow,0,0
237,U030,2016-11-28,follow,1,5
238,U030,2016-11-28,follow,6,10


### [12-17] 등록 후 7일 동안의 액션 수를 집계하는 쿼리

In [22]:
query_1217 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('14 day retention', 8, 14)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , mst_action_bucket(action, min_count, max_count) AS (
          VALUES
            ('comment',  0,    0)
          , ('comment',  1,    5)
          , ('comment',  6,   10)
          , ('comment', 11, 9999) -- 최대값으로 간단하게 9999 입력
          , ('follow' ,  0,    0)
          , ('follow' ,  1,    5)          
          , ('follow' ,  6,   10)          
          , ('follow' , 11, 9999)
        )
        , mst_user_action_bucket AS (
          SELECT
             u.user_id
           , u.register_date
           , a.action
           , a.min_count
           , a.max_count
          FROM
             mst_users AS u
          CROSS JOIN
             mst_action_bucket AS a
        )
        , register_action_flag AS (
           -- 등록일에서 7일 후까지의 액션 수를 세고,
           -- 액션 단계와 14일 정착 달성 플래그 계산하기
           SELECT
             m.user_id
           , m.action
           , m.min_count
           , m.max_count
           , COUNT(a.action) AS action_count
           , CASE
              WHEN COUNT(a.action) BETWEEN m.min_count AND m.max_count THEN 1
              ELSE 0
             END AS achieve
           , index_name
           , index_date_action
          FROM
             mst_user_action_bucket AS m
          LEFT JOIN
             action_log AS a
           ON m.user_id = a.user_id
           -- 등록일 당일부터 7일 후까지의 액션 로그 결합하기
           AND CAST(a.stamp AS date)
                BETWEEN CAST(m.register_date AS date)
                    AND CAST(m.register_date AS date) + interval '7 days'
           AND m.action = a.action
          LEFT JOIN
             user_action_flag AS f
           ON m.user_id = f.user_id
          WHERE
             f.index_date_action IS NOT NULL
          GROUP BY
             m.user_id
           , m.action
           , m.min_count
           , m.max_count
           , f.index_name
           , f.index_date_action
        )
        SELECT *
        FROM
           register_action_flag
        ORDER BY
           user_id, action, min_count
        ;
        """

select(query_1217)

Unnamed: 0,user_id,action,min_count,max_count,action_count,achieve,index_name,index_date_action
0,U001,comment,0,0,0,1,14 day retention,0.0
1,U001,comment,1,5,0,0,14 day retention,0.0
2,U001,comment,6,10,0,0,14 day retention,0.0
3,U001,comment,11,9999,0,0,14 day retention,0.0
4,U001,follow,0,0,2,0,14 day retention,0.0
...,...,...,...,...,...,...,...,...
35,U005,comment,11,9999,0,0,14 day retention,0.0
36,U005,follow,0,0,0,1,14 day retention,0.0
37,U005,follow,1,5,0,0,14 day retention,0.0
38,U005,follow,6,10,0,0,14 day retention,0.0


### [12-18] 등록 후 7일 동안의 액션 횟수별로 14일 정착률을 집계하는 쿼리

In [23]:
query_1218 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('14 day retention', 8, 14)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , mst_action_bucket(action, min_count, max_count) AS (
          VALUES
            ('comment',  0,    0)
          , ('comment',  1,    5)
          , ('comment',  6,   10)
          , ('comment', 11, 9999) -- 최대값으로 간단하게 9999 입력
          , ('follow' ,  0,    0)
          , ('follow' ,  1,    5)          
          , ('follow' ,  6,   10)          
          , ('follow' , 11, 9999)
        )
        , mst_user_action_bucket AS (
          SELECT
             u.user_id
           , u.register_date
           , a.action
           , a.min_count
           , a.max_count
          FROM
             mst_users AS u
          CROSS JOIN
             mst_action_bucket AS a
        )
        , register_action_flag AS (
           -- 등록일에서 7일 후까지의 액션 수를 세고,
           -- 액션 단계와 14일 정착 달성 플래그 계산하기
           SELECT
             m.user_id
           , m.action
           , m.min_count
           , m.max_count
           , COUNT(a.action) AS action_count
           , CASE
              WHEN COUNT(a.action) BETWEEN m.min_count AND m.max_count THEN 1
              ELSE 0
             END AS achieve
           , index_name
           , index_date_action
          FROM
             mst_user_action_bucket AS m
          LEFT JOIN
             action_log AS a
           ON m.user_id = a.user_id
           -- 등록일 당일부터 7일 후까지의 액션 로그 결합하기
           AND CAST(a.stamp AS date)
                BETWEEN CAST(m.register_date AS date)
                    AND CAST(m.register_date AS date) + interval '7 days'
           AND m.action = a.action
          LEFT JOIN
             user_action_flag AS f
           ON m.user_id = f.user_id
          WHERE
             f.index_date_action IS NOT NULL
          GROUP BY
             m.user_id
           , m.action
           , m.min_count
           , m.max_count
           , f.index_name
           , f.index_date_action
        )
        SELECT
           action
         , min_count || ' ~ ' || max_count AS count_range
         , SUM(CASE achieve WHEN 1 THEN 1 ELSE 0 END) AS achieve
         , index_name
         , AVG(CASE achieve WHEN 1 THEN 100.0 * index_date_action END) AS achieve_index_rate
        FROM
           register_action_flag
        GROUP BY
           index_name, action, min_count, max_count
        ORDER BY
           index_name, action, min_count
        ;
        """

select(query_1218)

Unnamed: 0,action,count_range,achieve,index_name,achieve_index_rate
0,comment,0 ~ 0,4,14 day retention,0.0
1,comment,1 ~ 5,1,14 day retention,0.0
2,comment,6 ~ 10,0,14 day retention,
3,comment,11 ~ 9999,0,14 day retention,
4,follow,0 ~ 0,3,14 day retention,0.0
5,follow,1 ~ 5,2,14 day retention,0.0
6,follow,6 ~ 10,0,14 day retention,
7,follow,11 ~ 9999,0,14 day retention,


### [12-19] 등록일 다음날부터 7일 동안의 사용 일수와 28일 정착 플래그를 생성하는 쿼리

In [24]:
query_1219 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('28 day retention', 22, 28)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , register_action_flag AS (
          SELECT
             m.user_id
           , COUNT(DISTINCT CAST(a.stamp AS date)) AS dt_count
           , index_name
           , index_date_action
          FROM
             mst_users AS m
          LEFT JOIN
             action_log AS a
           ON m.user_id = a.user_id
           -- 등록 다음날부터 7일 이내의 액션 로그 결합하기
           AND CAST(a.stamp AS date)
               BETWEEN CAST(m.register_date AS date) + interval '1 day'
                   AND CAST(m.register_date AS date) + interval '8 days'
          LEFT JOIN
             user_action_flag AS f
           ON m.user_id = f.user_id
          WHERE
             f.index_date_action IS NOT NULL
          GROUP BY
             m.user_id
           , f.index_name
           , f.index_date_action
        )
        SELECT *
        FROM register_action_flag
        ;
        """

select(query_1219)

Unnamed: 0,user_id,dt_count,index_name,index_date_action


### [12-20] 사용 일수에 따른 정착률을 집계하는 쿼리

In [25]:
query_1220 = """
        WITH
        repeat_interval(index_name, interval_begin_date, interval_end_date) AS (
         VALUES
            ('28 day retention', 22, 28)
        )
        , action_log_with_index_date AS (
          SELECT
             u.user_id
           , u.register_date
           , CAST(a.stamp AS date) AS action_date
           , MAX(CAST(a.stamp AS date)) OVER() AS latest_date
           , r.index_name
             -- 지표의 대상 기간 시작일과 종류일 계산하기
           , CAST(u.register_date::date + '1 day'::interval * r.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.register_date::date + '1 day'::interval * r.interval_end_date AS date)
             AS index_end_date             
          FROM
             mst_users AS u
          LEFT OUTER JOIN
             action_log AS a
          ON u.user_id = a.user_id
          CROSS JOIN
             repeat_interval AS r
        )
        , user_action_flag AS (
          SELECT
             user_id
           , register_date
           , index_name
             -- 4. 지표의 대상 기간에 액션을 했는지 플래그로 나타내기
           , SIGN(
              -- 3. 사용자별로 대상 기간에 한 액션의 합계 구하기
              SUM(
               -- 2. 대상 기간의 종료일이 로그의 최신 날짜 이전인지 확인하기
               CASE WHEN index_end_date <= latest_date THEN
                -- 1. 지표의 대상 기간에 액션을 했다면 1, 안 했다면 0 지정하기
                CASE WHEN action_date BETWEEN index_begin_date AND index_end_date
                 THEN 1 ELSE 0 END
               END
              )
           ) AS index_date_action
          FROM
             action_log_with_index_date
          GROUP BY
             user_id, register_date, index_name, index_begin_date, index_end_date
        )
        , register_action_flag AS (
          SELECT
             m.user_id
           , COUNT(DISTINCT CAST(a.stamp AS date)) AS dt_count
           , index_name
           , index_date_action
          FROM
             mst_users AS m
          LEFT JOIN
             action_log AS a
           ON m.user_id = a.user_id
           -- 등록 다음날부터 7일 이내의 액션 로그 결합하기
           AND CAST(a.stamp AS date)
               BETWEEN CAST(m.register_date AS date) + interval '1 day'
                   AND CAST(m.register_date AS date) + interval '8 days'
          LEFT JOIN
             user_action_flag AS f
           ON m.user_id = f.user_id
          WHERE
             f.index_date_action IS NOT NULL
          GROUP BY
             m.user_id
           , f.index_name
           , f.index_date_action
        )
        SELECT
           dt_count AS dates
         , COUNT(user_id) AS users
         , 100.0 * COUNT(user_id) / SUM(COUNT(user_id)) OVER() AS user_ratio
         , 100.0
           * SUM(COUNT(user_id))
              OVER(ORDER BY index_name, dt_count
               ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
           / SUM(COUNT(user_id)) OVER() AS cum_ratio
         , SUM(index_date_action) AS achieve_users
         , AVG(100.0 * index_date_action) AS achieve_ratio
        FROM 
           register_action_flag
        GROUP BY
           index_name, dt_count
        ORDER BY
           index_name, dt_count
        ;
        """

select(query_1220)

Unnamed: 0,dates,users,user_ratio,cum_ratio,achieve_users,achieve_ratio


### [12-21] 등록 월에서 12개월 후까지의 잔존율을 집계하는 쿼리

In [26]:
query_1221 = """
        WITH
        mst_intervals(interval_month) AS (
          SELECT *
          FROM generate_series(1, 12)
        )
        , mst_users_with_index_month AS (
          -- 사용자 마스터에 등록 월부터 12개월 후까지의 월을 추가하기
          SELECT
             u.user_id
           , u.register_date
             -- n개월 후의 날짜, 등록일, 등록 월 n개월 후의 월 계산하기
           , CAST(u.register_date::date + i.interval_month * '1 month'::interval AS date)
             AS index_date
           , substring(u.register_date, 1, 7) AS register_month
           , substring(CAST(
              u.register_date::date + i.interval_month * '1 month'::interval AS text), 1, 7)
             AS index_month
          FROM
             mst_users AS u
          CROSS JOIN
             mst_intervals AS i
        )
        , action_log_in_month AS (
          -- 액션 로그의 날짜에서 월 부분만 추출하기
          SELECT DISTINCT
             user_id
           , substring(stamp, 1, 7) As action_month
          FROM
             action_log
        )
        SELECT
           -- 사용자 마스터와 액션 로그를 결합한 뒤, 월별로 잔존율 계산하기
           u.register_month
         , u.index_month
           -- action_month가 NULL이 아니라면(액션을 했다면) 사용자 집계
         , SUM(CASE WHEN a.action_month IS NOT NULL THEN 1 ELSE 0 END) AS users
         , AVG(CASE WHEN a.action_month IS NOT NULL THEN 100.0 ELSE 0.0 END) AS retension_rate
        FROM
           mst_users_with_index_month AS u
        LEFT JOIN
           action_log_in_month AS a
         ON u.user_id = a.user_id
         AND u.index_month = a.action_month
        GROUP BY
           u.register_month , u.index_month
        ORDER BY
           u.register_month , u.index_month        
        ;
        """

select(query_1221)

Unnamed: 0,register_month,index_month,users,retension_rate
0,2016-10,2016-11,0,0.0
1,2016-10,2016-12,0,0.0
2,2016-10,2017-01,0,0.0
3,2016-10,2017-02,0,0.0
4,2016-10,2017-03,0,0.0
...,...,...,...,...
19,2016-11,2017-07,0,0.0
20,2016-11,2017-08,0,0.0
21,2016-11,2017-09,0,0.0
22,2016-11,2017-10,0,0.0


### [12-22] 신규 사용자 수 , 리피트 사용자 수, 컴백 사용자 수를 집계하는 쿼리

#### MAU(Monthly Active Usrs) : 특정 월에 서비스를 사용한 사용자 수

MAU를 3개로 나누어 분석하기
- 신규 사용자 : 이번 달에 등록한 신규 사용자
- 리피트 사용자 : 이전 달에도 사용했던 사용자
- 컴백 사용자 : 이번 달의 신규 등록자가 아니고, 이전 달에도 사용하지 않았던 한동안 사용하지 않았다가 돌아온 사용자

In [27]:
query_1222 = """
        WITH
        monthly_user_action AS (
          -- 월별 사용자 액션 집약하기
          SELECT DISTINCT
             u.user_id
           , substring(u.register_date, 1, 7) AS register_month
           , substring(l.stamp, 1, 7) AS action_month
           , substring(CAST(
              l.stamp::date - interval '1 month' AS text
             ), 1, 7) AS action_month_priv
          FROM
             mst_users AS u
          JOIN
             action_log AS l
           ON u.user_id = l.user_id
        )
        , monthly_user_with_type AS (
          -- 월별 사용자 분류 테이블
          SELECT
             action_month
           , user_id
           , CASE
              -- 등록 월과 액션월이 일치하면 신규 사용자
              WHEN register_month = action_month THEN 'new_user'
              -- 이전 월에 액션이 있다면 리피트 사용자
              WHEN action_month_priv
                 = LAG(action_month)
                    OVER(PARTITION BY user_id ORDER BY action_month)
                 THEN 'repeat_user'
              -- 이외의 경우는 컴백 사용자
              ELSE 'come_back_user'
             END AS c
           , action_month_priv
          FROM
             monthly_user_action
        )
        SELECT
           action_month
           -- 특정 달의 MAU
         , COUNT(user_id) AS mau
         , COUNT(CASE WHEN c = 'new_user'       THEN 1 END) AS new_users
         , COUNT(CASE WHEN c = 'repeat_user'    THEN 1 END) AS repeat_users         
         , COUNT(CASE WHEN c = 'come_back_user' THEN 1 END) AS come_back_users         
        FROM
           monthly_user_with_type
        GROUP BY
           action_month
        ORDER BY
           action_month
        ;
        """

select(query_1222)

Unnamed: 0,action_month,mau,new_users,repeat_users,come_back_users
0,2016-10,2,2,0,0


### [12-23] 리피트 사용자를 세분화해서 집계하는 쿼리

리피트 사용자를 3가지로 분류하기

- 신규 리피트 사용자 : 이전 달에는 신규 사용자로 분류되었으며, 이번 달에도 사용한 사용자
- 기존 리피트 사용자 : 이전 달에 리피트 사용자로 분류되었으며, 이번 달에도 사용한 사용자
- 컴백 리피트 사용자 : 이전 달에 컴백 사용자로 분류되었으며, 이번 달에도 사용한 사용자

In [28]:
query_1223 = """
        WITH
        monthly_user_action AS (
          -- 월별 사용자 액션 집약하기
          SELECT DISTINCT
             u.user_id
           , substring(u.register_date, 1, 7) AS register_month
           , substring(l.stamp, 1, 7) AS action_month
           , substring(CAST(
              l.stamp::date - interval '1 month' AS text
             ), 1, 7) AS action_month_priv
          FROM
             mst_users AS u
          JOIN
             action_log AS l
           ON u.user_id = l.user_id
        )
        , monthly_user_with_type AS (
          -- 월별 사용자 분류 테이블
          SELECT
             action_month
           , user_id
           , CASE
              -- 등록 월과 액션월이 일치하면 신규 사용자
              WHEN register_month = action_month THEN 'new_user'
              -- 이전 월에 액션이 있다면 리피트 사용자
              WHEN action_month_priv
                 = LAG(action_month)
                    OVER(PARTITION BY user_id ORDER BY action_month)
                 THEN 'repeat_user'
              -- 이외의 경우는 컴백 사용자
              ELSE 'come_back_user'
             END AS c
           , action_month_priv
          FROM
             monthly_user_action
        )
        , monthly_users AS (
          SELECT
             m1.action_month
           , COUNT(m1.user_id) AS mau
           , COUNT(CASE WHEN m1.c = 'new_user'       THEN 1 END) AS new_users
           , COUNT(CASE WHEN m1.c = 'repeat_user'    THEN 1 END) AS repeat_users         
           , COUNT(CASE WHEN m1.c = 'come_back_user' THEN 1 END) AS come_back_users         
           , COUNT(
               CASE WHEN m1.c = 'repeat_user' AND m0.c = 'new_user' THEN 1 END
             ) AS new_repeat_users
           , COUNT(
               CASE WHEN m1.c = 'repeat_user' AND m0.c = 'repeat_user' THEN 1 END
             ) AS continuous_repeat_users        
           , COUNT(
               CASE WHEN m1.c = 'repeat_user' AND m0.c = 'come_back_user' THEN 1 END
             ) AS come_back_repeat_users             
        FROM
           -- m1 : 해당 월의 사용자 분류 테이블
           monthly_user_with_type AS m1
        LEFT OUTER JOIN
           -- m0 : 이전 달의 사용자 분류 테이블
           monthly_user_with_type AS m0
         ON m1.user_id = m0.user_id
         AND m1.action_month_priv = m0.action_month
        GROUP BY
           m1.action_month
        )
        SELECT *
        FROM
           monthly_users
        ORDER BY
           action_month
        ;
        """

select(query_1223)

Unnamed: 0,action_month,mau,new_users,repeat_users,come_back_users,new_repeat_users,continuous_repeat_users,come_back_repeat_users
0,2016-10,2,2,0,0,0,0,0


### [12-24] MAU 내역과 MAU 속성들의 반복률을 계산하는 쿼리

In [29]:
query_1224 = """
        WITH
        monthly_user_action AS (
          -- 월별 사용자 액션 집약하기
          SELECT DISTINCT
             u.user_id
           , substring(u.register_date, 1, 7) AS register_month
           , substring(l.stamp, 1, 7) AS action_month
           , substring(CAST(
              l.stamp::date - interval '1 month' AS text
             ), 1, 7) AS action_month_priv
          FROM
             mst_users AS u
          JOIN
             action_log AS l
           ON u.user_id = l.user_id
        )
        , monthly_user_with_type AS (
          -- 월별 사용자 분류 테이블
          SELECT
             action_month
           , user_id
           , CASE
              -- 등록 월과 액션월이 일치하면 신규 사용자
              WHEN register_month = action_month THEN 'new_user'
              -- 이전 월에 액션이 있다면 리피트 사용자
              WHEN action_month_priv
                 = LAG(action_month)
                    OVER(PARTITION BY user_id ORDER BY action_month)
                 THEN 'repeat_user'
              -- 이외의 경우는 컴백 사용자
              ELSE 'come_back_user'
             END AS c
           , action_month_priv
          FROM
             monthly_user_action
        )
        , monthly_users AS (
          SELECT
             m1.action_month
           , COUNT(m1.user_id) AS mau
           , COUNT(CASE WHEN m1.c = 'new_user'       THEN 1 END) AS new_users
           , COUNT(CASE WHEN m1.c = 'repeat_user'    THEN 1 END) AS repeat_users         
           , COUNT(CASE WHEN m1.c = 'come_back_user' THEN 1 END) AS come_back_users         
           , COUNT(
               CASE WHEN m1.c = 'repeat_user' AND m0.c = 'new_user' THEN 1 END
             ) AS new_repeat_users
           , COUNT(
               CASE WHEN m1.c = 'repeat_user' AND m0.c = 'repeat_user' THEN 1 END
             ) AS continuous_repeat_users        
           , COUNT(
               CASE WHEN m1.c = 'repeat_user' AND m0.c = 'come_back_user' THEN 1 END
             ) AS come_back_repeat_users             
        FROM
           -- m1 : 해당 월의 사용자 분류 테이블
           monthly_user_with_type AS m1
        LEFT OUTER JOIN
           -- m0 : 이전 달의 사용자 분류 테이블
           monthly_user_with_type AS m0
         ON m1.user_id = m0.user_id
         AND m1.action_month_priv = m0.action_month
        GROUP BY
           m1.action_month
        )
        SELECT
           action_month
         , mau
         , new_users
         , repeat_users
         , come_back_users
         , new_repeat_users
         , continuous_repeat_users
         , come_back_repeat_users
           -- 이전 달에 신규 사용자이면서 해당 월에 신규 리피트 사용자인 사용자의 비율
         , 100.0 * new_repeat_users
           / NULLIF(LAG(new_users) OVER(ORDER BY action_month), 0)
           AS priv_new_repeat_ratio
           -- 이전 달에 리피트 사용자이면서 해당 월에 기존 리피트 사용자인 사용자의 비율
         , 100.0 * continuous_repeat_users
           / NULLIF(LAG(repeat_users) OVER(ORDER BY action_month), 0)
           AS priv_continuous_repeat_ratio           
           -- 이전 달에 컴백 사용자이면서 해당 월에 컴백 리피트 사용자인 사용자의 비율
         , 100.0 * come_back_repeat_users
           / NULLIF(LAG(come_back_users) OVER(ORDER BY action_month), 0)
           AS priv_come_back_repeat_ratio           
        FROM
           monthly_users
        ORDER BY
           action_month
        ;
        """

select(query_1224)

Unnamed: 0,action_month,mau,new_users,repeat_users,come_back_users,new_repeat_users,continuous_repeat_users,come_back_repeat_users,priv_new_repeat_ratio,priv_continuous_repeat_ratio,priv_come_back_repeat_ratio
0,2016-10,2,2,0,0,0,0,0,,,


### [12-25] 성장지수 산출을 위해 사용자 상태를 집계하는 쿼리

성장지수

- Signup : 신규 등록하고 사용을 시작함
- Deactivation : 액티브 유저가 비액티브 유저가 되었음
- Reactivation : 비액티브 유저가 액티브 유저로 돌아옴
- Exit : 서비스를 탈퇴하거나 사용을 중지함

In [30]:
query_1225 = """
        WITH
        unique_action_log AS (
          -- 같은 날짜 로그를 중복해 세지 않도록 중복 배제하기
          SELECT DISTINCT
             user_id
           , substring(stamp, 1, 10) AS action_date
          FROM
             action_log
        )
        , mst_calendar AS (
          -- 집계하고 싶은 기간을 캘린더 테이블로 만들어두기
          -- yyyy-mm--dd 형식으로 변환 후 기존 테이블의 날짜 형식처럼 재변환
          SELECT generate_series(timestamp '2016-10-01', '2016-11-04', '1 day')::date ::varchar
                 AS dt
        )
        , target_date_with_user AS (
          -- 사용자 마스터에 캘린더 테이블의 날짜를 target_date로 추가하기
          SELECT
             c.dt AS target_date
           , u.user_id
           , u.register_date
           , u.withdraw_date
          FROM
             mst_users AS u
          CROSS JOIN
             mst_calendar AS c
        )
        , user_status_log AS (
          SELECT
             u.target_date
           , u.user_id
           , u.register_date
           , u.withdraw_date
           , a.action_date
           , CASE WHEN u.register_date = a.action_date THEN 1 ELSE 0 END AS is_new
           , CASE WHEN u.withdraw_date = a.action_date THEN 1 ELSE 0 END AS is_exit
           , CASE WHEN u.target_date = a.action_date THEN 1 ELSE 0 END AS is_access
           , LAG(CASE WHEN u.target_date = a.action_date THEN 1 ELSE 0 END)
              OVER(PARTITION BY u.user_id ORDER BY u.target_date)
             AS was_access
          FROM
             target_date_with_user AS u
          LEFT JOIN
             unique_action_log AS a
           ON u.user_id = a.user_id
           AND u.target_date = a.action_date
          WHERE
             -- 집계 기간을 등록일 이후로만 필터링하기
             u.register_date <= u.target_date
             -- 탈퇴 날짜가 포함되어 있으면, 집계 기간을 탈퇴 날짜 이전만으로 필터링하기
             AND(u.withdraw_date IS NULL OR u.target_date <= u.withdraw_date)
        )
        SELECT
           target_date
         , user_id
         , is_new
         , is_exit
         , is_access
         , was_access
        FROM
           user_status_log
        ;
        """

select(query_1225)

Unnamed: 0,target_date,user_id,is_new,is_exit,is_access,was_access
0,2016-10-01,U001,1,0,1,
1,2016-10-02,U001,0,0,0,1.0
2,2016-10-03,U001,0,0,0,0.0
3,2016-10-04,U001,0,0,0,0.0
4,2016-10-05,U001,0,0,1,0.0
...,...,...,...,...,...,...
321,2016-11-03,U021,0,0,0,
322,2016-11-04,U021,0,0,0,0.0
323,2016-11-03,U022,0,0,0,
324,2016-11-04,U022,0,0,0,0.0


### [12-26] 매일의 성장지수를 계산하는 쿼리

In [31]:
query_1226 = """
        WITH
        unique_action_log AS (
          -- 같은 날짜 로그를 중복해 세지 않도록 중복 배제하기
          SELECT DISTINCT
             user_id
           , substring(stamp, 1, 10) AS action_date
          FROM
             action_log
        )
        , mst_calendar AS (
          -- 집계하고 싶은 기간을 캘린더 테이블로 만들어두기
          -- yyyy-mm--dd 형식으로 변환 후 기존 테이블의 날짜 형식처럼 재변환
          SELECT generate_series(timestamp '2016-10-01', '2016-11-04', '1 day')::date ::varchar
                 AS dt
        )
        , target_date_with_user AS (
          -- 사용자 마스터에 캘린더 테이블의 날짜를 target_date로 추가하기
          SELECT
             c.dt AS target_date
           , u.user_id
           , u.register_date
           , u.withdraw_date
          FROM
             mst_users AS u
          CROSS JOIN
             mst_calendar AS c
        )
        , user_status_log AS (
          SELECT
             u.target_date
           , u.user_id
           , u.register_date
           , u.withdraw_date
           , a.action_date
           , CASE WHEN u.register_date = a.action_date THEN 1 ELSE 0 END AS is_new
           , CASE WHEN u.withdraw_date = a.action_date THEN 1 ELSE 0 END AS is_exit
           , CASE WHEN u.target_date = a.action_date THEN 1 ELSE 0 END AS is_access
           , LAG(CASE WHEN u.target_date = a.action_date THEN 1 ELSE 0 END)
              OVER(PARTITION BY u.user_id ORDER BY u.target_date)
             AS was_access
          FROM
             target_date_with_user AS u
          LEFT JOIN
             unique_action_log AS a
           ON u.user_id = a.user_id
           AND u.target_date = a.action_date
          WHERE
             -- 집계 기간을 등록일 이후로만 필터링하기
             u.register_date <= u.target_date
             -- 탈퇴 날짜가 포함되어 있으면, 집계 기간을 탈퇴 날짜 이전만으로 필터링하기
             AND(u.withdraw_date IS NULL OR u.target_date <= u.withdraw_date)
        )
        , user_growth_index AS (
          SELECT
             *
           , CASE
              -- 어떤 날짜에 신규 등록 또는 탈퇴한 경우 signup 또는 exit로 판정하기
              WHEN is_new + is_exit = 1 THEN
               CASE
                WHEN is_new  = 1 THEN 'signup'
                WHEN is_exit = 1 THEN 'exit'
               END
              -- 신규 등록과 탈퇴가 아닌 경우 reactivation 또는 deactivation으로 판정하기
              -- 이 때 reactivation, deactivation의 정의에 맞지 않는 경우 NULL로 지정
              WHEN is_new + is_exit = 0 THEN
               CASE
                WHEN was_access = 0 AND is_access = 1 THEN 'reactivation'
                WHEN was_access = 1 AND is_access = 0 THEN 'deactivation'
               END
               -- 어떤 날짜에 신규 등록과 탈퇴를 함께 했다면(is_new + is_exit = 2) NULL로 지정
              END AS growth_index
           FROM
              user_status_log
        )
        SELECT
           target_date
         , SUM(CASE growth_index WHEN 'signup'       THEN  1 ELSE 0 END) AS signup
         , SUM(CASE growth_index WHEN 'reactivation' THEN  1 ELSE 0 END) AS reactivation
         , SUM(CASE growth_index WHEN 'deactivation' THEN -1 ELSE 0 END) AS deactivation
         , SUM(CASE growth_index WHEN 'exit'         THEN -1 ELSE 0 END) AS exit
         -- 성장지수 정의에 계산하기
         , SUM(
            CASE growth_index
             WHEN 'signup'       THEN  1
             WHEN 'reactivation' THEN  1
             WHEN 'deactivation' THEN -1
             WHEN 'exit'         THEN -1
             ELSE 0
            END
          ) AS growth_index
        FROM
           user_growth_index
        GROUP BY
           target_date
        ORDER BY
           target_date
        ;
        """

select(query_1226)

Unnamed: 0,target_date,signup,reactivation,deactivation,exit,growth_index
0,2016-10-01,2,0,0,0,2
1,2016-10-02,0,0,-1,0,-1
2,2016-10-03,0,0,-1,0,-1
3,2016-10-04,0,0,0,0,0
4,2016-10-05,0,1,0,0,1
...,...,...,...,...,...,...
30,2016-10-31,0,0,0,0,0
31,2016-11-01,0,0,0,0,0
32,2016-11-02,0,0,0,0,0
33,2016-11-03,0,0,0,0,0


## 13. 시계열에 따른 사용자의 개별적인 행동 분석하기

### [13-1] 신청일과 숙박일의 리드 타임을 계산하는 쿼리

In [33]:
query_131 = """
        WITH
        reservations(reservation_id, register_date, visit_date, days) AS (
          VALUES
            (1, date '2016-09-01', date '2016-10-01', 3)
          , (2, date '2016-09-20', date '2016-10-01', 2)
          , (3, date '2016-09-30', date '2016-11-20', 2)
          , (4, date '2016-10-01', date '2017-01-03', 2)
          , (5, date '2016-11-01', date '2016-12-28', 3)
        )
        SELECT
           reservation_id
         , register_date
         , visit_date
         , visit_date::date - register_date::date AS lead_time
        FROM
           reservations
        ;
        """

select(query_131)

Unnamed: 0,reservation_id,register_date,visit_date,lead_time
0,1,2016-09-01,2016-10-01,30
1,2,2016-09-20,2016-10-01,11
2,3,2016-09-30,2016-11-20,51
3,4,2016-10-01,2017-01-03,94
4,5,2016-11-01,2016-12-28,57


### [13-2] 각 단계에서의 리드 타임과 토탈 리드 타임을 계산하는 쿼리

In [35]:
query_132 = """
        WITH
        requests(user_id, product_id, request_date) AS (
          VALUES
            ('U001', '1', date '2016-09-01')
          , ('U001', '2', date '2016-09-20')
          , ('U002', '3', date '2016-09-30')
          , ('U003', '4', date '2016-10-01')
          , ('U004', '5', date '2016-11-01')
        )
        , estimates(user_id, product_id, estimate_date) AS (
          VALUES
            ('U001', '2', date '2016-09-21')
          , ('U002', '3', date '2016-10-15')
          , ('U003', '4', date '2016-10-15')
          , ('U004', '5', date '2016-12-01')
        )
        , orders(user_id, product_id, order_date) AS (
          VALUES
            ('U001', '2', date '2016-10-01')
          , ('U004', '5', date '2016-12-05')
        )
        SELECT
           r.user_id
         , r.product_id
         , e.estimate_date::date - r.request_date::date AS estimate_lead_time
         , o.order_date::date - e.estimate_date::date AS order_lead_time         
         , o.order_date::date - r.request_date::date AS total_lead_time
        FROM
           requests AS r
        LEFT OUTER JOIN
           estimates AS e
         ON r.user_id = e.user_id
         AND r.product_id = e.product_id
        LEFT OUTER JOIN
           orders AS o
         ON r.user_id = o.user_id
         AND r.product_id = o.product_id
        ;
        """

select(query_132)

Unnamed: 0,user_id,product_id,estimate_lead_time,order_lead_time,total_lead_time
0,U001,1,,,
1,U001,2,1.0,10.0,11.0
2,U002,3,15.0,,
3,U003,4,14.0,,
4,U004,5,30.0,4.0,34.0


### [12-3] 이전 구매일로부터의 일수를 계산하는 쿼리

In [36]:
query_133 = """
        WITH
        purchase_log(user_id, product_id, purchase_date) AS (
          VALUES
            ('U001', '1', '2016-09-01')
          , ('U001', '2', '2016-09-20')
          , ('U002', '3', '2016-09-30')
          , ('U001', '4', '2016-10-01')
          , ('U002', '5', '2016-11-01')
        )
        SELECT
           user_id
         , purchase_date
         , purchase_date::date
           - LAG(purchase_date::date)
              OVER(PARTITION BY user_id ORDER BY purchase_date) AS lead_time
        FROM
           purchase_log
        ;
        """

select(query_133)

Unnamed: 0,user_id,purchase_date,lead_time
0,U001,2016-09-01,
1,U001,2016-09-20,19.0
2,U001,2016-10-01,11.0
3,U002,2016-09-30,
4,U002,2016-11-01,32.0


### [13-4] 상품들이 카트에 추가된 시각과 구매된 시각을 산출하는 쿼리

In [37]:
select('SELECT * FROM action_log;')

Unnamed: 0,dt,session,user_id,action,products,stamp
0,2016-11-03,A,U001,add_cart,1,2016-11-03 18:00:00
1,2016-11-03,A,U001,add_cart,2,2016-11-03 18:01:00
2,2016-11-03,A,U001,add_cart,3,2016-11-03 18:02:00
3,2016-11-03,A,U001,purchase,123,2016-11-03 18:10:00
4,2016-11-03,B,U002,add_cart,1,2016-11-03 19:00:00
...,...,...,...,...,...,...
7,2016-11-04,C,U001,add_cart,4,2016-11-04 12:00:00
8,2016-11-04,C,U001,add_cart,5,2016-11-04 12:00:00
9,2016-11-04,C,U001,add_cart,6,2016-11-04 12:00:00
10,2016-11-04,D,U002,purchase,2,2016-11-04 13:00:00


In [39]:
query_134 = """
        WITH
        row_action_log AS (
          SELECT
             dt
           , user_id
           , action
           -- 쉼표로 구분된 product_id 리스트 전개하기
           , regexp_split_to_table(products, ',') AS product_id
           , stamp
          FROM
            action_log
        )
        , action_time_stats AS (
          -- 사용자와 상품 조합의 카드 추가 시간과 구매 시간 추출하기
          SELECT
             user_id
           , product_id
           , MIN(CASE action WHEN 'add_cart' THEN dt END) AS dt
           , MIN(CASE action WHEN 'add_cart' THEN stamp END) AS add_cart_time
           , MIN(CASE action WHEN 'purchase' THEN stamp END) AS purchase_time
           , EXTRACT(epoch from
               MIN(CASE action WHEN 'purchase' THEN stamp::timestamp END)
             - MIN(CASE action WHEN 'add_cart' THEN stamp::timestamp END))
             AS lead_time
          FROM
             row_action_log
          GROUP BY
             user_id, product_id
        )
        SELECT
           user_id
         , product_id
         , add_cart_time
         , purchase_time
         , lead_time
        FROM
           action_time_stats
        ORDER BY
           user_id, product_id
        ;
        """

select(query_134)

Unnamed: 0,user_id,product_id,add_cart_time,purchase_time,lead_time
0,U001,1,2016-11-03 18:00:00,2016-11-03 18:10:00,600.0
1,U001,2,2016-11-03 18:01:00,2016-11-03 18:10:00,540.0
2,U001,3,2016-11-03 18:02:00,2016-11-03 18:10:00,480.0
3,U001,4,2016-11-04 12:00:00,,
4,U001,5,2016-11-04 12:00:00,2016-11-04 15:00:00,10800.0
5,U001,6,2016-11-04 12:00:00,2016-11-04 15:00:00,10800.0
6,U002,1,2016-11-03 19:00:00,2016-11-03 20:00:00,3600.0
7,U002,2,2016-11-03 20:30:00,2016-11-04 13:00:00,59400.0


### [13-5] 카트 추가 후 n시간 이내에 구매된 상품 수와 구매율을 집계하는 쿼리

In [41]:
query_135 = """
        WITH
        row_action_log AS (
          SELECT
             dt
           , user_id
           , action
           -- 쉼표로 구분된 product_id 리스트 전개하기
           , regexp_split_to_table(products, ',') AS product_id
           , stamp
          FROM
            action_log
        )
        , action_time_stats AS (
          -- 사용자와 상품 조합의 카드 추가 시간과 구매 시간 추출하기
          SELECT
             user_id
           , product_id
           , MIN(CASE action WHEN 'add_cart' THEN dt END) AS dt
           , MIN(CASE action WHEN 'add_cart' THEN stamp END) AS add_cart_time
           , MIN(CASE action WHEN 'purchase' THEN stamp END) AS purchase_time
           , EXTRACT(epoch from
               MIN(CASE action WHEN 'purchase' THEN stamp::timestamp END)
             - MIN(CASE action WHEN 'add_cart' THEN stamp::timestamp END))
             AS lead_time
          FROM
             row_action_log
          GROUP BY
             user_id, product_id
        )
        , purchase_lead_time_flag AS (
          SELECT
             user_id
           , product_id
           , dt
           , CASE WHEN lead_time <=  1 * 60 * 60 THEN 1 ELSE 0 END AS purchase_1_hour
           , CASE WHEN lead_time <=  6 * 60 * 60 THEN 1 ELSE 0 END AS purchase_6_hours
           , CASE WHEN lead_time <= 24 * 60 * 60 THEN 1 ELSE 0 END AS purchase_24_hours
           , CASE WHEN lead_time <= 48 * 60 * 60 THEN 1 ELSE 0 END AS purchase_48_hours
           , CASE
              WHEN lead_time IS NULL OR NOT (lead_time <= 48 * 60 * 60) THEN 1
              ELSE 0
             END AS not_purchase
          FROM
             action_time_stats
        )
        SELECT
           dt
         , COUNT(*) AS add_cart
         , SUM(purchase_1_hour) AS purchase_1_hour
         , AVG(purchase_1_hour) AS purchase_1_hour_rate
         , SUM(purchase_6_hours) AS purchase_6_hours
         , AVG(purchase_6_hours) AS purchase_6_hours_rate
         , SUM(purchase_24_hours) AS purchase_24_hours
         , AVG(purchase_24_hours) AS purchase_24_hours_rate
         , SUM(purchase_48_hours) AS purchase_48_hours
         , AVG(purchase_48_hours) AS purchase_48_hours_rate
        FROM
           purchase_lead_time_flag
        GROUP BY
           dt
        ;
        """

select(query_135)

Unnamed: 0,dt,add_cart,purchase_1_hour,purchase_1_hour_rate,purchase_6_hours,purchase_6_hours_rate,purchase_24_hours,purchase_24_hours_rate,purchase_48_hours,purchase_48_hours_rate
0,2016-11-03,5,4,0.8,4,0.8,5,1.0,5,1.0
1,2016-11-04,3,0,0.0,2,0.666667,2,0.666667,2,0.666667


### [13-6] 사용자들의 등록일부터 경과한 일수별 매출을 계산하는 쿼리

In [44]:
query_136 = """
        WITH
        index_intervals(index_name, interval_begin_date, interval_end_date) AS (
          VALUES
            ('30 day sales amount', 0, 30)
          , ('45 day sales amount', 0, 45)
          , ('60 day sales amount', 0, 60)
        )
        , mst_users_with_base_date AS (
          SELECT
             user_id
             -- 기준일로 등록일 사용하기
           , register_date AS base_date
          FROM
             mst_users
        )
        , purchase_log_with_index_date AS (
          SELECT
             u.user_id
           , u.base_date
           -- 액션의 날짜와 로그 전체의 최신 날짜를 날짜 자료형으로 변환하기
           , CAST(p.stamp AS date) AS action_date
           , MAX(CAST(p.stamp AS date)) OVER() AS latest_date
           , substring(u.base_date, 1, 7) AS month
           , i.index_name
           -- 지표 대상 기간의 시작일과 종료일 계산하기
           , CAST(u.base_date::date + '1 day'::interval * i.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.base_date::date + '1 day'::interval * i.interval_end_date AS date)
             AS index_end_date
           , p.amount
          FROM
             mst_users_with_base_date AS u
          LEFT OUTER JOIN
             action_log AS p
           ON u.user_id = p.user_id
           AND p.action = 'purchase'
          CROSS JOIN
             index_intervals AS i
        )
        SELECT *
        FROM
           purchase_log_with_index_date
        ;
        """

select(query_136)

Unnamed: 0,user_id,base_date,action_date,latest_date,month,index_name,index_begin_date,index_end_date,amount
0,U001,2016-10-01,2016-10-01,2016-12-01,2016-10,30 day sales amount,2016-10-01,2016-10-31,2000.0
1,U001,2016-10-01,2016-10-01,2016-12-01,2016-10,45 day sales amount,2016-10-01,2016-11-15,2000.0
2,U001,2016-10-01,2016-10-01,2016-12-01,2016-10,60 day sales amount,2016-10-01,2016-11-30,2000.0
3,U001,2016-10-01,2016-11-01,2016-12-01,2016-10,30 day sales amount,2016-10-01,2016-10-31,2000.0
4,U001,2016-10-01,2016-11-01,2016-12-01,2016-10,45 day sales amount,2016-10-01,2016-11-15,2000.0
...,...,...,...,...,...,...,...,...,...
34,U009,2016-10-15,,2016-12-01,2016-10,45 day sales amount,2016-10-15,2016-11-29,
35,U009,2016-10-15,,2016-12-01,2016-10,60 day sales amount,2016-10-15,2016-12-14,
36,U010,2016-10-16,,2016-12-01,2016-10,30 day sales amount,2016-10-16,2016-11-15,
37,U010,2016-10-16,,2016-12-01,2016-10,45 day sales amount,2016-10-16,2016-11-30,


### [13-7] 월별 등록자수와 경과일수별 매출을 집계하는 쿼리

In [46]:
query_137 = """
        WITH
        index_intervals(index_name, interval_begin_date, interval_end_date) AS (
          VALUES
            ('30 day sales amount', 0, 30)
          , ('45 day sales amount', 0, 45)
          , ('60 day sales amount', 0, 60)
        )
        , mst_users_with_base_date AS (
          SELECT
             user_id
             -- 기준일로 등록일 사용하기
           , register_date AS base_date
          FROM
             mst_users
        )
        , purchase_log_with_index_date AS (
          SELECT
             u.user_id
           , u.base_date
           -- 액션의 날짜와 로그 전체의 최신 날짜를 날짜 자료형으로 변환하기
           , CAST(p.stamp AS date) AS action_date
           , MAX(CAST(p.stamp AS date)) OVER() AS latest_date
           , substring(u.base_date, 1, 7) AS month
           , i.index_name
           -- 지표 대상 기간의 시작일과 종료일 계산하기
           , CAST(u.base_date::date + '1 day'::interval * i.interval_begin_date AS date)
             AS index_begin_date
           , CAST(u.base_date::date + '1 day'::interval * i.interval_end_date AS date)
             AS index_end_date
           , p.amount
          FROM
             mst_users_with_base_date AS u
          LEFT OUTER JOIN
             action_log AS p
           ON u.user_id = p.user_id
           AND p.action = 'purchase'
          CROSS JOIN
             index_intervals AS i
        )
        , user_purchase_amount AS (
          SELECT
             user_id
           , month
           , index_name
             -- 3. 지표의 대상 기간에 구매한 금액을 사용자별로 합계 내기
           , SUM(
              -- 1. 지표의 대상 기간의 종료일이 로그의 최신 날짜에 포함되었는지 확인하기
              CASE WHEN index_end_date <= latest_date THEN
               -- 2. 지표의 대상 기간에 구매한 경우에는 구매 금액, 이외의 경우 0 지정하기
               CASE
                WHEN action_date BETWEEN index_begin_date AND index_end_date THEN amount
                ELSE 0
               END
              END
            ) AS index_date_amount
          FROM
             purchase_log_with_index_date
          GROUP BY
             user_id, month, index_name, index_begin_date, index_end_date
        )
        SELECT
           month
           -- 등록자수 세기
           -- 다만 지표의 대상 기간의 종료일이 로그의 최신 날짜 이전에 포함되지 않게 조건 걸기
         , COUNT(index_date_amount) AS users
         , index_name
           -- 지표의 대상 기간 동안 구매한 사용자 수
         , COUNT(CASE WHEN index_date_amount > 0 THEN user_id END) AS purchase_uu
           -- 지표의 대상 기간 동안의 합계 매출
         , SUM(index_date_amount) AS total_amount
           -- 등록자별 평균매출
         , AVG(index_date_amount) AS avg_amount
        FROM
           user_purchase_amount
        GROUP BY
           month, index_name
        ORDER BY
           month, index_name
        ;
        """

select(query_137)

Unnamed: 0,month,users,index_name,purchase_uu,total_amount,avg_amount
0,2016-10,10,30 day sales amount,1,2000.0,200.0
1,2016-10,10,45 day sales amount,2,5000.0,500.0
2,2016-10,3,60 day sales amount,2,5000.0,1666.666667
