## 데이터 분석을 위한 SQL 레시피

Data soruce : https://hanbit.co.kr/support/supplement_survey.html?pcode=B8585882565

System : PostgreSQL

In [1]:
import pandas as pd
import psycopg2 as pg2
from sqlalchemy import create_engine

engine = create_engine('postgresql://testuser:testpass@localhost:5432/postgresql_analysis')

con = pg2.connect(host='localhost',
                  user='testuser',
                  password='testpass',
                  database='postgresql_analysis')
con.autocommit = True
cur = con.cursor()

In [2]:
def select(query):
    return pd.read_sql(query, con)

In [3]:
pd.options.display.max_rows = 10

## 11. 사용자 전체의 특징과 경향 찾기

### [11-1] 액션 수와 비율을 계산하는 쿼리

In [4]:
select('SELECT * FROM mst_users;')

Unnamed: 0,user_id,sex,birth_date,register_date,register_device,withdraw_date
0,U001,M,1977-06-17,2016-10-01,pc,
1,U002,F,1953-06-12,2016-10-01,sp,2016-10-10
2,U003,M,1965-01-06,2016-10-01,pc,
3,U004,F,1954-05-21,2016-10-05,pc,
4,U005,M,1987-11-23,2016-10-05,sp,
5,U006,F,1950-01-21,2016-10-10,pc,2016-10-10
6,U007,F,1950-07-18,2016-10-10,app,
7,U008,F,2006-12-09,2016-10-10,sp,
8,U009,M,2004-10-23,2016-10-15,pc,
9,U010,F,1987-03-18,2016-10-16,pc,


In [5]:
select('SELECT * FROM action_log;')

Unnamed: 0,session,user_id,action,category,products,amount,stamp
0,989004ea,U001,purchase,drama,"D001,D002",2000.0,2016-11-03 18:10:00
1,989004ea,U001,view,,,,2016-11-03 18:00:00
2,989004ea,U001,favorite,drama,D001,,2016-11-03 18:00:00
3,989004ea,,review,drama,D001,,2016-11-03 18:00:00
4,989004ea,U001,add_cart,drama,D001,,2016-11-03 18:00:00
...,...,...,...,...,...,...,...
15,87b5725f,U001,add_cart,action,A004,,2016-11-04 12:00:00
16,87b5725f,,add_cart,action,A005,,2016-11-04 12:00:00
17,87b5725f,U001,add_cart,action,A006,,2016-11-04 12:00:00
18,9afaf87c,U002,purchase,drama,D002,1000.0,2016-11-04 13:00:00


UU : Unique Users를 나타내는 중복 없이 집계된 사용자 수

In [6]:
query_111 = """
        WITH
        stats AS (
          -- 로그 전체의 유니크 사용자 수 구하기
          SELECT COUNT(DISTINCT session) AS total_uu
          FROM action_log
        )
        SELECT
           l.action
           -- 액션 UU
         , COUNT(DISTINCT l.session) AS action_uu
           -- 액션의 수
         , COUNT(*) AS action_count
           -- 전체 UU
         , s.total_uu
           -- 사용률 : <액션 UU> / <전체 UU>
         , 100.0 * COUNT(DISTINCT l.session) / s.total_uu AS usage_rate
           -- 사용률 : <액션 수> / <액션 UU>
         , 1.0 * COUNT(*) / COUNT(DISTINCT l.session) AS count_per_user
        FROM
           action_log AS l
        CROSS JOIN
           stats AS s
        GROUP BY
           l.action, s.total_uu
        ;
        """

select(query_111)

Unnamed: 0,action,action_uu,action_count,total_uu,usage_rate,count_per_user
0,add_cart,3,12,4,75.0,4.0
1,favorite,1,1,4,25.0,1.0
2,purchase,3,5,4,75.0,1.666667
3,review,1,1,4,25.0,1.0
4,view,1,1,4,25.0,1.0


### [11-2] 로그인 상태를 판별하는 쿼리

In [7]:
query_112 = """
        WITH
        action_log_with_status AS (
          SELECT
             session
           , user_id
           , action
             -- user_id가 NULL 또는 빈 문자가 아닌 경우 login이라고 판정하기
           , CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END
             AS login_status
          FROM action_log
        )
        SELECT *
        FROM action_log_with_status
        ;
        """

select(query_112)

Unnamed: 0,session,user_id,action,login_status
0,989004ea,U001,purchase,login
1,989004ea,U001,view,login
2,989004ea,U001,favorite,login
3,989004ea,,review,guest
4,989004ea,U001,add_cart,login
...,...,...,...,...
15,87b5725f,U001,add_cart,login
16,87b5725f,,add_cart,guest
17,87b5725f,U001,add_cart,login
18,9afaf87c,U002,purchase,login


### [11-3] 로그인 상태에 따라 액션 수 등을 따로 집계하는 쿼리

In [8]:
query_113 = """
        WITH
        action_log_with_status AS (
          SELECT
             session
           , user_id
           , action
             -- user_id가 NULL 또는 빈 문자가 아닌 경우 login이라고 판정하기
           , CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END
             AS login_status
          FROM action_log
        )
        SELECT
           COALESCE(action, 'all') AS action
         , COALESCE(login_status, 'all') AS login_status
         , COUNT(DISTINCT session) AS action_uu
         , COUNT(*) AS action_count
        FROM
           action_log_with_status
        GROUP BY
           ROLLUP(action, login_status)
        ;
        """

select(query_113)

Unnamed: 0,action,login_status,action_uu,action_count
0,add_cart,guest,1,1
1,add_cart,login,3,11
2,add_cart,all,3,12
3,favorite,login,1,1
4,favorite,all,1,1
...,...,...,...,...
7,review,guest,1,1
8,review,all,1,1
9,view,login,1,1
10,view,all,1,1


### [11-4] 회원 상태를 판별하는 쿼리

In [9]:
query_114 = """
        WITH
        action_log_with_status AS (
          SELECT
             session
           , user_id
           , action
             -- 로그를 타임스태프 순서로 나열하고, 한 번이라도 로그인한 사용자일 경우
             -- 이후의 모든 로그 상태를 member로 설정
           , CASE
              WHEN
               COALESCE(MAX(user_id)
                OVER(PARTITION BY session ORDER BY stamp
                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
                , '') <> ''
               THEN 'member'
              ELSE 'none'
             END AS member_status
           , stamp
          FROM
             action_log
        )
        SELECT *
        FROM action_log_with_status
        ;
        """

select(query_114)

Unnamed: 0,session,user_id,action,member_status,stamp
0,47db0370,U002,add_cart,member,2016-11-03 19:00:00
1,47db0370,U002,purchase,member,2016-11-03 20:00:00
2,47db0370,U002,add_cart,member,2016-11-03 20:30:00
3,87b5725f,,add_cart,none,2016-11-04 12:00:00
4,87b5725f,U001,add_cart,member,2016-11-04 12:00:00
...,...,...,...,...,...
15,989004ea,U001,add_cart,member,2016-11-03 18:02:00
16,989004ea,U001,purchase,member,2016-11-03 18:10:00
17,989004ea,U001,purchase,member,2016-11-03 18:10:00
18,9afaf87c,U002,purchase,member,2016-11-04 13:00:00


### [11-5] 사용자의 생일을 계산하는 쿼리

In [11]:
query_115 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
           , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
             mst_users_with_int_birth_date
        )  
        SELECT
           user_id, sex, birth_date, age
        FROM
           mst_users_with_age
        ;
        """

select(query_115)

Unnamed: 0,user_id,sex,birth_date,age
0,U001,M,1977-06-17,39.0
1,U002,F,1953-06-12,63.0
2,U003,M,1965-01-06,51.0
3,U004,F,1954-05-21,62.0
4,U005,M,1987-11-23,29.0
5,U006,F,1950-01-21,66.0
6,U007,F,1950-07-18,66.0
7,U008,F,2006-12-09,10.0
8,U009,M,2004-10-23,12.0
9,U010,F,1987-03-18,29.0


### [11-6] 성별과 연령으로 연령별 구분을 계산하는 쿼리

In [12]:
query_116 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
          , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
            mst_users_with_int_birth_date
        )
        , mst_users_with_category AS (
           SELECT
             user_id
           , sex
           , age
           , CONCAT(
               CASE
                WHEN 20 <= age THEN sex
                ELSE ''
               END
             , CASE
                WHEN age BETWEEN 4 AND 12 THEN 'C'
                WHEN age BETWEEN 13 AND 19 THEN 'T'                
                WHEN age BETWEEN 20 AND 34 THEN '1'
                WHEN age BETWEEN 35 AND 49 THEN '2'                
                WHEN age >= 50 THEN '3'
               END
             ) AS category
           FROM
             mst_users_with_age
        )
        SELECT *
        FROM
           mst_users_with_category
        ;
        """

select(query_116)

Unnamed: 0,user_id,sex,age,category
0,U001,M,39.0,M2
1,U002,F,63.0,F3
2,U003,M,51.0,M3
3,U004,F,62.0,F3
4,U005,M,29.0,M1
5,U006,F,66.0,F3
6,U007,F,66.0,F3
7,U008,F,10.0,C
8,U009,M,12.0,C
9,U010,F,29.0,F1


### [11-7] 연령별 구분의 사람 수를 계산하는 쿼리

In [14]:
query_117 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
          , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
            mst_users_with_int_birth_date
        )
        , mst_users_with_category AS (
           SELECT
             user_id
           , sex
           , age
           , CONCAT(
               CASE
                WHEN 20 <= age THEN sex
                ELSE ''
               END
             , CASE
                WHEN age BETWEEN 4 AND 12 THEN 'C'
                WHEN age BETWEEN 13 AND 19 THEN 'T'                
                WHEN age BETWEEN 20 AND 34 THEN '1'
                WHEN age BETWEEN 35 AND 49 THEN '2'                
                WHEN age >= 50 THEN '3'
               END
             ) AS category
           FROM
             mst_users_with_age
        )
        SELECT
           category
         , COUNT(*) AS user_count
        FROM
           mst_users_with_category
        GROUP BY
           category
        ;
        """

select(query_117)

Unnamed: 0,category,user_count
0,M1,1
1,M2,1
2,C,2
3,M3,1
4,F1,1
5,F3,4


### [11-8] 연령별 구분과 카테고리를 집계하는 쿼리

In [16]:
query_118 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
          , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
            mst_users_with_int_birth_date
        )
        , mst_users_with_category AS (
           SELECT
             user_id
           , sex
           , age
           , CONCAT(
               CASE
                WHEN 20 <= age THEN sex
                ELSE ''
               END
             , CASE
                WHEN age BETWEEN 4 AND 12 THEN 'C'
                WHEN age BETWEEN 13 AND 19 THEN 'T'                
                WHEN age BETWEEN 20 AND 34 THEN '1'
                WHEN age BETWEEN 35 AND 49 THEN '2'                
                WHEN age >= 50 THEN '3'
               END
             ) AS category
           FROM
             mst_users_with_age
        )
        SELECT
           p.category AS product_category
         , u.category AS user_category
         , COUNT(*) AS purchase_count
        FROM
           action_log AS p
         JOIN
           mst_users_with_category AS u
         ON p.user_id = u.user_id
        WHERE
           action = 'purchase'
        GROUP BY
           p.category, u.category
        ORDER BY
           p.category, u.category
        ;
        """

select(query_118)

Unnamed: 0,product_category,user_category,purchase_count
0,action,M2,1
1,drama,F3,2
2,drama,M2,2


### [11-9] 한 주에 며칠 사용되었는지를 집계하는 쿼리

In [17]:
query_119 = """
        WITH
        action_log_with_dt AS (
         SELECT
            *
          , substring(stamp, 1, 10) AS dt
         FROM action_log
        )
        , action_day_count_per_user AS(
         SELECT
            user_id
          , COUNT(DISTINCT dt) AS action_day_count
         FROM
            action_log_with_dt
         WHERE
            dt BETWEEN '2016-11-01' AND '2016-11-07'
         GROUP BY
            user_id
        )
        SELECT
           action_day_count
         , COUNT(DISTINCT user_id) AS user_count
        FROM
           action_day_count_per_user
        GROUP BY
           action_day_count
        ORDER BY
           action_day_count
        ;
        """

select(query_119)

Unnamed: 0,action_day_count,user_count
0,2,2


### [11-10] 구성비와 구성비누계를 계산하는 쿼리

In [19]:
query_1110 = """
        WITH
        action_log_with_dt AS (
         SELECT
            *
          , substring(stamp, 1, 10) AS dt
         FROM action_log
        )
        , action_day_count_per_user AS(
         SELECT
            user_id
          , COUNT(DISTINCT dt) AS action_day_count
         FROM
            action_log_with_dt
         WHERE
            dt BETWEEN '2016-11-01' AND '2016-11-07'
         GROUP BY
            user_id
        )
        SELECT
           action_day_count
         , COUNT(DISTINCT user_id) AS user_count
           -- 구성비
         , 100.0
           * COUNT(DISTINCT user_id)
           / SUM(COUNT(DISTINCT user_id)) OVER()
           AS composition_ratio
           -- 구성비 누계
         , 100.0
           * SUM(COUNT(DISTINCT user_id))
              OVER(ORDER BY action_day_count
               ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
           / SUM(COUNT(DISTINCT user_id)) OVER()
           AS cumulative_ratio
        FROM
           action_day_count_per_user
        GROUP BY
           action_day_count
        ORDER BY
           action_day_count
        ;
        """

select(query_1110)

Unnamed: 0,action_day_count,user_count,composition_ratio,cumulative_ratio
0,2,2,100.0,100.0


### [11-11] 사용자들의 액션 플래그를 집계하는 쿼리

In [21]:
query_1111 = """
        WITH
        user_action_flag AS(
         -- 사용자가 액션을 했으면 1, 안 했으면 0으로 플래그 붙이기
         SELECT
            user_id
          , SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase
          , SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review
          , SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite         
         FROM
            action_log
         GROUP BY
            user_id
        )
        SELECT *
        FROM user_action_flag
        ;
        """

select(query_1111)

Unnamed: 0,user_id,has_purchase,has_review,has_favorite
0,,0.0,1.0,0.0
1,U002,1.0,0.0,0.0
2,U001,1.0,0.0,1.0


### [11-12] 모든 액션 조합에 대한 사용자 수 계산하기

In [22]:
query_1112 = """
        WITH
        user_action_flag AS(
         -- 사용자가 액션을 했으면 1, 안 했으면 0으로 플래그 붙이기
         SELECT
            user_id
          , SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase
          , SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review
          , SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite         
         FROM
            action_log
         GROUP BY
            user_id
        )
        , action_venn_diagram AS (
          SELECT
             has_purchase
           , has_review
           , has_favorite
           , COUNT(*) AS users
          FROM
             user_action_flag
          GROUP BY
             -- CUBE를 사용해서 모든 액션 조합 구하기
             CUBE(has_purchase, has_review, has_favorite)
        )
        SELECT *
        FROM action_venn_diagram
        ORDER BY
           has_purchase, has_review, has_favorite
        ;
        """

select(query_1112)

Unnamed: 0,has_purchase,has_review,has_favorite,users
0,0.0,1.0,0.0,1
1,0.0,1.0,,1
2,0.0,,0.0,1
3,0.0,,,1
4,1.0,0.0,0.0,1
...,...,...,...,...
13,,1.0,0.0,1
14,,1.0,,1
15,,,0.0,2
16,,,1.0,1


### [11-13] 벤 다이어그램을 만들기 위해 데이터를 가공하는 쿼리

In [23]:
query_1113 = """
        WITH
        user_action_flag AS(
         -- 사용자가 액션을 했으면 1, 안 했으면 0으로 플래그 붙이기
         SELECT
            user_id
          , SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase
          , SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review
          , SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite         
         FROM
            action_log
         GROUP BY
            user_id
        )
        , action_venn_diagram AS (
          SELECT
             has_purchase
           , has_review
           , has_favorite
           , COUNT(*) AS users
          FROM
             user_action_flag
          GROUP BY
             -- CUBE를 사용해서 모든 액션 조합 구하기
             CUBE(has_purchase, has_review, has_favorite)
        )
        SELECT
           -- 0, 1 플래그를 문자열을 가공하기
           CASE has_purchase
            WHEN 1 THEN 'purchase' WHEN 0 THEN 'not purchase' ELSE 'any'
           END AS has_purchase
         , CASE has_review
            WHEN 1 THEN 'review' WHEN 0 THEN 'not review' ELSE 'any'
           END AS has_review
         , CASE has_favorite
            WHEN 1 THEN 'favorite' WHEN 0 THEN 'not favorite' ELSE 'any'
           END AS has_favorite
         , users
           -- 전체 사용자 수를 기반으로 비율 구하기
         , 100.0 * users
           / NULLIF(
              -- 모든 액션이 NULL인 사용자 수가 전체 사용자 수를 나타내므로
              -- 해당 레코드의 사용자 수를 Window 함수로 구하기
              SUM(CASE WHEN has_purchase IS NULL
                        AND has_review IS NULL
                        AND has_favorite IS NULL
                       THEN users ELSE 0 END) OVER()
             , 0)
           AS ratio
        FROM
           action_venn_diagram
        ORDER BY
           has_purchase, has_review, has_favorite
        ;
        """

select(query_1113)

Unnamed: 0,has_purchase,has_review,has_favorite,users,ratio
0,any,any,any,3,100.000000
1,any,any,favorite,1,33.333333
2,any,any,not favorite,2,66.666667
3,any,not review,any,2,66.666667
4,any,not review,favorite,1,33.333333
...,...,...,...,...,...
13,purchase,any,favorite,1,33.333333
14,purchase,any,not favorite,1,33.333333
15,purchase,not review,any,2,66.666667
16,purchase,not review,favorite,1,33.333333


### [11-14] 구매액이 많은 순서로 사용자 그룹을 10등분하는 쿼리

In [25]:
query_1114 = """
        WITH
        user_purchase_amount AS(
         SELECT
            user_id
          , SUM(amount) AS purchase_amount
         FROM
            action_log
         WHERE
            action = 'purchase'
         GROUP BY
            user_id
        )
        , users_with_decile AS (
          SELECT
             user_id
           , purchase_amount
           , ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
          FROM
             user_purchase_amount
        )
        SELECT *
        FROM users_with_decile
        ;
        """

select(query_1114)

Unnamed: 0,user_id,purchase_amount,decile
0,U001,5000,1
1,U002,2000,2


### [11-15] 10분할한 Decile들을 집계하는 쿼리

In [27]:
query_1115 = """
        WITH
        user_purchase_amount AS(
         SELECT
            user_id
          , SUM(amount) AS purchase_amount
         FROM
            action_log
         WHERE
            action = 'purchase'
         GROUP BY
            user_id
        )
        , users_with_decile AS (
          SELECT
             user_id
           , purchase_amount
           , ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
          FROM
             user_purchase_amount
        )
        , decile_with_purchase_amount AS (
          SELECT
             decile
           , SUM(purchase_amount) AS amount
           , AVG(purchase_amount) AS avg_amount
           , SUM(SUM(purchase_amount)) OVER (ORDER BY decile) AS cumulative_amount
           , SUM(SUM(purchase_amount)) OVER () AS total_amount
          FROM
             users_with_decile
          GROUP BY
             decile
        )
        SELECT *
        FROM decile_with_purchase_amount
        ;
        """

select(query_1115)

Unnamed: 0,decile,amount,avg_amount,cumulative_amount,total_amount
0,1,5000.0,5000.0,5000.0,7000.0
1,2,2000.0,2000.0,7000.0,7000.0


### [11-16] 구매액이 많은 Decile 순서로 구성비와 구성비누계를 계산하는 쿼리

In [28]:
query_1116 = """
        WITH
        user_purchase_amount AS(
         SELECT
            user_id
          , SUM(amount) AS purchase_amount
         FROM
            action_log
         WHERE
            action = 'purchase'
         GROUP BY
            user_id
        )
        , users_with_decile AS (
          SELECT
             user_id
           , purchase_amount
           , ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
          FROM
             user_purchase_amount
        )
        , decile_with_purchase_amount AS (
          SELECT
             decile
           , SUM(purchase_amount) AS amount
           , AVG(purchase_amount) AS avg_amount
           , SUM(SUM(purchase_amount)) OVER (ORDER BY decile) AS cumulative_amount
           , SUM(SUM(purchase_amount)) OVER () AS total_amount
          FROM
             users_with_decile
          GROUP BY
             decile
        )
        SELECT
           decile
         , amount
         , avg_amount
         , 100.0 * amount / total_amount AS total_ratio
         , 100.0 * cumulative_amount / total_amount AS cummulative_ratio
        FROM decile_with_purchase_amount
        ;
        """

select(query_1116)

Unnamed: 0,decile,amount,avg_amount,total_ratio,cummulative_ratio
0,1,5000.0,5000.0,71.428571,71.428571
1,2,2000.0,2000.0,28.571429,100.0


### [11-17] 사용자별로 RFM을 집계하는 쿼리

- Recency : 최근 구매일 / 최근 무언가를 구매한 사용자를 우량 고객으로 취급
- Frequency : 구매 횟수 / 사용자가 구매한 횟수를 세고, 많을수록 우량 고객으로 취급
- Monetary : 구매 금액 합계 / 사용자의 구매 금액 합계를 집계하고, 금액이 높을수록 우량 고객으로 취급

In [30]:
query_1117 = """
        WITH
        purchase_log AS(
         SELECT
            user_id
          , amount
          , substring(stamp, 1, 10) AS dt
         FROM
            action_log
         WHERE
            action = 'purchase'
        )
        , user_rfm AS (
          SELECT
             user_id
           , MAX(dt) AS recent_date
           , CURRENT_DATE - MAX(dt::date) AS recency
           , COUNT(dt) AS frequency
           , SUM(amount) AS monetary
          FROM
             purchase_log
          GROUP BY
             user_id
        )
        SELECT *
        FROM user_rfm
        ;
        """

select(query_1117)

Unnamed: 0,user_id,recent_date,recency,frequency,monetary
0,U001,2016-11-04,2197,3,5000
1,U002,2016-11-04,2197,2,2000


### [11-18] 사용자들의 RFM 랭크를 계산하는 쿼리

In [31]:
query_1118 = """
        WITH
        purchase_log AS(
         SELECT
            user_id
          , amount
          , substring(stamp, 1, 10) AS dt
         FROM
            action_log
         WHERE
            action = 'purchase'
        )
        , user_rfm AS (
          SELECT
             user_id
           , MAX(dt) AS recent_date
           , CURRENT_DATE - MAX(dt::date) AS recency
           , COUNT(dt) AS frequency
           , SUM(amount) AS monetary
          FROM
             purchase_log
          GROUP BY
             user_id
        )
        , user_rfm_rank AS (
          SELECT
             user_id
           , recent_date
           , recency
           , frequency
           , monetary
           , CASE
              WHEN recency < 14 THEN 5
              WHEN recency < 28 THEN 4
              WHEN recency < 60 THEN 3
              WHEN recency < 90 THEN 2
              ELSE 1
             END AS r
           , CASE
              WHEN frequency >= 20 THEN 5
              WHEN frequency >= 10 THEN 4
              WHEN frequency >= 5 THEN 3
              WHEN frequency >= 2 THEN 2
              WHEN frequency = 1 THEN 1
             END AS f
           , CASE
              WHEN monetary >= 300000 THEN 5
              WHEN monetary >= 100000 THEN 4
              WHEN monetary >= 30000 THEN 3
              WHEN monetary >= 5000 THEN 2
              WHEN frequency = 1 THEN 1
              ELSE 1
             END AS m
          FROM
             user_rfm
        )
        SELECT *
        FROM user_rfm_rank
        ;
        """

select(query_1118)

Unnamed: 0,user_id,recent_date,recency,frequency,monetary,r,f,m
0,U001,2016-11-04,2197,3,5000,1,2,2
1,U002,2016-11-04,2197,2,2000,1,2,1


### [11-19] 각 그룹의 사람 수를 확인하는 쿼리

In [35]:
query_1119 = """
        WITH
        purchase_log AS(
         SELECT
            user_id
          , amount
          , substring(stamp, 1, 10) AS dt
         FROM
            action_log
         WHERE
            action = 'purchase'
        )
        , user_rfm AS (
          SELECT
             user_id
           , MAX(dt) AS recent_date
           , CURRENT_DATE - MAX(dt::date) AS recency
           , COUNT(dt) AS frequency
           , SUM(amount) AS monetary
          FROM
             purchase_log
          GROUP BY
             user_id
        )
        , user_rfm_rank AS (
          SELECT
             user_id
           , recent_date
           , recency
           , frequency
           , monetary
           , CASE
              WHEN recency < 14 THEN 5
              WHEN recency < 28 THEN 4
              WHEN recency < 60 THEN 3
              WHEN recency < 90 THEN 2
              ELSE 1
             END AS r
           , CASE
              WHEN frequency >= 20 THEN 5
              WHEN frequency >= 10 THEN 4
              WHEN frequency >= 5 THEN 3
              WHEN frequency >= 2 THEN 2
              WHEN frequency = 1 THEN 1
             END AS f
           , CASE
              WHEN monetary >= 300000 THEN 5
              WHEN monetary >= 100000 THEN 4
              WHEN monetary >= 30000 THEN 3
              WHEN monetary >= 5000 THEN 2
              WHEN frequency = 1 THEN 1
              ELSE 1
             END AS m
          FROM
             user_rfm
        )
        , mst_rfm_index AS (
          -- 1부터 5까지의 숫자를 가지는 테이블 만들기
                    SELECT 1 AS rfm_index        
          UNION ALL SELECT 2 AS rfm_index
          UNION ALL SELECT 3 AS rfm_index
          UNION ALL SELECT 4 AS rfm_index
          UNION ALL SELECT 5 AS rfm_index          
        )
        , rfm_flag AS (
          SELECT
             m.rfm_index
           , CASE WHEN m.rfm_index = r.r THEN 1 ELSE 0 END AS r_flag
           , CASE WHEN m.rfm_index = r.f THEN 1 ELSE 0 END AS f_flag           
           , CASE WHEN m.rfm_index = r.m THEN 1 ELSE 0 END AS m_flag
          FROM
             mst_rfm_index AS m
           CROSS JOIN
             user_rfm_rank AS r
        )
        SELECT
           rfm_index
         , SUM(r_flag) AS r
         , SUM(f_flag) AS f
         , SUM(m_flag) AS m
        FROM
           rfm_flag
        GROUP BY
           rfm_index
        ORDER BY
           rfm_index DESC
        ;
        """

select(query_1119)

Unnamed: 0,rfm_index,r,f,m
0,5,0,0,0
1,4,0,0,0
2,3,0,0,0
3,2,0,2,1
4,1,2,0,1


### [11-20] 통합 랭크를 계산하는 쿼리

In [36]:
query_1120 = """
        WITH
        purchase_log AS(
         SELECT
            user_id
          , amount
          , substring(stamp, 1, 10) AS dt
         FROM
            action_log
         WHERE
            action = 'purchase'
        )
        , user_rfm AS (
          SELECT
             user_id
           , MAX(dt) AS recent_date
           , CURRENT_DATE - MAX(dt::date) AS recency
           , COUNT(dt) AS frequency
           , SUM(amount) AS monetary
          FROM
             purchase_log
          GROUP BY
             user_id
        )
        , user_rfm_rank AS (
          SELECT
             user_id
           , recent_date
           , recency
           , frequency
           , monetary
           , CASE
              WHEN recency < 14 THEN 5
              WHEN recency < 28 THEN 4
              WHEN recency < 60 THEN 3
              WHEN recency < 90 THEN 2
              ELSE 1
             END AS r
           , CASE
              WHEN frequency >= 20 THEN 5
              WHEN frequency >= 10 THEN 4
              WHEN frequency >= 5 THEN 3
              WHEN frequency >= 2 THEN 2
              WHEN frequency = 1 THEN 1
             END AS f
           , CASE
              WHEN monetary >= 300000 THEN 5
              WHEN monetary >= 100000 THEN 4
              WHEN monetary >= 30000 THEN 3
              WHEN monetary >= 5000 THEN 2
              WHEN frequency = 1 THEN 1
              ELSE 1
             END AS m
          FROM
             user_rfm
        )
        SELECT
           r + f + m AS total_rank
         , r
         , f
         , m
         , COUNT(user_id)
        FROM
           user_rfm_rank
        GROUP BY
           r, f, m
        ORDER BY
           total_rank DESC, r DESC, f DESC, m DESC
        ;
        """

select(query_1120)

Unnamed: 0,total_rank,r,f,m,count
0,5,1,2,2,1
1,4,1,2,1,1


### [11-21] 종합 랭크별로 사용자 수를 집계하는 쿼리

In [37]:
query_1121 = """
        WITH
        purchase_log AS(
         SELECT
            user_id
          , amount
          , substring(stamp, 1, 10) AS dt
         FROM
            action_log
         WHERE
            action = 'purchase'
        )
        , user_rfm AS (
          SELECT
             user_id
           , MAX(dt) AS recent_date
           , CURRENT_DATE - MAX(dt::date) AS recency
           , COUNT(dt) AS frequency
           , SUM(amount) AS monetary
          FROM
             purchase_log
          GROUP BY
             user_id
        )
        , user_rfm_rank AS (
          SELECT
             user_id
           , recent_date
           , recency
           , frequency
           , monetary
           , CASE
              WHEN recency < 14 THEN 5
              WHEN recency < 28 THEN 4
              WHEN recency < 60 THEN 3
              WHEN recency < 90 THEN 2
              ELSE 1
             END AS r
           , CASE
              WHEN frequency >= 20 THEN 5
              WHEN frequency >= 10 THEN 4
              WHEN frequency >= 5 THEN 3
              WHEN frequency >= 2 THEN 2
              WHEN frequency = 1 THEN 1
             END AS f
           , CASE
              WHEN monetary >= 300000 THEN 5
              WHEN monetary >= 100000 THEN 4
              WHEN monetary >= 30000 THEN 3
              WHEN monetary >= 5000 THEN 2
              WHEN frequency = 1 THEN 1
              ELSE 1
             END AS m
          FROM
             user_rfm
        )
        SELECT
           r + f + m AS total_rank
         , COUNT(user_id)
        FROM
           user_rfm_rank
        GROUP BY
           -- SELECT 구문에서 정의한 별칭을 GROUP BY 구문에서 지정할 수 있음
           total_rank
        ORDER BY
           total_rank DESC
        ;
        """

select(query_1121)

Unnamed: 0,total_rank,count
0,5,1
1,4,1


### [11-22] R과 F를 사용해 2차원 사용자 층의 사용자 수를 집계하는 쿼리

In [38]:
query_1122 = """
        WITH
        purchase_log AS(
         SELECT
            user_id
          , amount
          , substring(stamp, 1, 10) AS dt
         FROM
            action_log
         WHERE
            action = 'purchase'
        )
        , user_rfm AS (
          SELECT
             user_id
           , MAX(dt) AS recent_date
           , CURRENT_DATE - MAX(dt::date) AS recency
           , COUNT(dt) AS frequency
           , SUM(amount) AS monetary
          FROM
             purchase_log
          GROUP BY
             user_id
        )
        , user_rfm_rank AS (
          SELECT
             user_id
           , recent_date
           , recency
           , frequency
           , monetary
           , CASE
              WHEN recency < 14 THEN 5
              WHEN recency < 28 THEN 4
              WHEN recency < 60 THEN 3
              WHEN recency < 90 THEN 2
              ELSE 1
             END AS r
           , CASE
              WHEN frequency >= 20 THEN 5
              WHEN frequency >= 10 THEN 4
              WHEN frequency >= 5 THEN 3
              WHEN frequency >= 2 THEN 2
              WHEN frequency = 1 THEN 1
             END AS f
           , CASE
              WHEN monetary >= 300000 THEN 5
              WHEN monetary >= 100000 THEN 4
              WHEN monetary >= 30000 THEN 3
              WHEN monetary >= 5000 THEN 2
              WHEN frequency = 1 THEN 1
              ELSE 1
             END AS m
          FROM
             user_rfm
        )
        SELECT
           CONCAT('r_', r) AS r_rank
         , COUNT(CASE WHEN f = 5 THEN 1 END) AS f_5
         , COUNT(CASE WHEN f = 4 THEN 1 END) AS f_4
         , COUNT(CASE WHEN f = 3 THEN 1 END) AS f_3
         , COUNT(CASE WHEN f = 2 THEN 1 END) AS f_2
         , COUNT(CASE WHEN f = 1 THEN 1 END) AS f_1
        FROM
           user_rfm_rank
        GROUP BY
           r
        ORDER BY
           r_rank DESC
        ;
        """

select(query_1122)

Unnamed: 0,r_rank,f_5,f_4,f_3,f_2,f_1
0,r_1,0,0,0,2,0


## 12. 시계열에 따른 사용자 전체의 상태 변화 찾기