## 데이터 분석을 위한 SQL 레시피

Data soruce : https://hanbit.co.kr/support/supplement_survey.html?pcode=B8585882565

System : PostgreSQL

In [1]:
import pandas as pd
import psycopg2 as pg2
from sqlalchemy import create_engine

engine = create_engine('postgresql://testuser:testpass@localhost:5432/postgresql_analysis')

con = pg2.connect(host='localhost',
                  user='testuser',
                  password='testpass',
                  database='postgresql_analysis')
con.autocommit = True
cur = con.cursor()

In [2]:
def select(query):
    return pd.read_sql(query, con)

In [3]:
pd.options.display.max_rows = 10

## 11. 사용자 전체의 특징과 경향 찾기

### [11-1] 액션 수와 비율을 계산하는 쿼리

In [4]:
select('SELECT * FROM mst_users;')

Unnamed: 0,user_id,sex,birth_date,register_date,register_device,withdraw_date
0,U001,M,1977-06-17,2016-10-01,pc,
1,U002,F,1953-06-12,2016-10-01,sp,2016-10-10
2,U003,M,1965-01-06,2016-10-01,pc,
3,U004,F,1954-05-21,2016-10-05,pc,
4,U005,M,1987-11-23,2016-10-05,sp,
5,U006,F,1950-01-21,2016-10-10,pc,2016-10-10
6,U007,F,1950-07-18,2016-10-10,app,
7,U008,F,2006-12-09,2016-10-10,sp,
8,U009,M,2004-10-23,2016-10-15,pc,
9,U010,F,1987-03-18,2016-10-16,pc,


In [5]:
select('SELECT * FROM action_log;')

Unnamed: 0,session,user_id,action,category,products,amount,stamp
0,989004ea,U001,purchase,drama,"D001,D002",2000.0,2016-11-03 18:10:00
1,989004ea,U001,view,,,,2016-11-03 18:00:00
2,989004ea,U001,favorite,drama,D001,,2016-11-03 18:00:00
3,989004ea,,review,drama,D001,,2016-11-03 18:00:00
4,989004ea,U001,add_cart,drama,D001,,2016-11-03 18:00:00
...,...,...,...,...,...,...,...
15,87b5725f,U001,add_cart,action,A004,,2016-11-04 12:00:00
16,87b5725f,,add_cart,action,A005,,2016-11-04 12:00:00
17,87b5725f,U001,add_cart,action,A006,,2016-11-04 12:00:00
18,9afaf87c,U002,purchase,drama,D002,1000.0,2016-11-04 13:00:00


UU : Unique Users를 나타내는 중복 없이 집계된 사용자 수

In [6]:
query_111 = """
        WITH
        stats AS (
          -- 로그 전체의 유니크 사용자 수 구하기
          SELECT COUNT(DISTINCT session) AS total_uu
          FROM action_log
        )
        SELECT
           l.action
           -- 액션 UU
         , COUNT(DISTINCT l.session) AS action_uu
           -- 액션의 수
         , COUNT(*) AS action_count
           -- 전체 UU
         , s.total_uu
           -- 사용률 : <액션 UU> / <전체 UU>
         , 100.0 * COUNT(DISTINCT l.session) / s.total_uu AS usage_rate
           -- 사용률 : <액션 수> / <액션 UU>
         , 1.0 * COUNT(*) / COUNT(DISTINCT l.session) AS count_per_user
        FROM
           action_log AS l
        CROSS JOIN
           stats AS s
        GROUP BY
           l.action, s.total_uu
        ;
        """

select(query_111)

Unnamed: 0,action,action_uu,action_count,total_uu,usage_rate,count_per_user
0,add_cart,3,12,4,75.0,4.0
1,favorite,1,1,4,25.0,1.0
2,purchase,3,5,4,75.0,1.666667
3,review,1,1,4,25.0,1.0
4,view,1,1,4,25.0,1.0


### [11-2] 로그인 상태를 판별하는 쿼리

In [7]:
query_112 = """
        WITH
        action_log_with_status AS (
          SELECT
             session
           , user_id
           , action
             -- user_id가 NULL 또는 빈 문자가 아닌 경우 login이라고 판정하기
           , CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END
             AS login_status
          FROM action_log
        )
        SELECT *
        FROM action_log_with_status
        ;
        """

select(query_112)

Unnamed: 0,session,user_id,action,login_status
0,989004ea,U001,purchase,login
1,989004ea,U001,view,login
2,989004ea,U001,favorite,login
3,989004ea,,review,guest
4,989004ea,U001,add_cart,login
...,...,...,...,...
15,87b5725f,U001,add_cart,login
16,87b5725f,,add_cart,guest
17,87b5725f,U001,add_cart,login
18,9afaf87c,U002,purchase,login


### [11-3] 로그인 상태에 따라 액션 수 등을 따로 집계하는 쿼리

In [8]:
query_113 = """
        WITH
        action_log_with_status AS (
          SELECT
             session
           , user_id
           , action
             -- user_id가 NULL 또는 빈 문자가 아닌 경우 login이라고 판정하기
           , CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END
             AS login_status
          FROM action_log
        )
        SELECT
           COALESCE(action, 'all') AS action
         , COALESCE(login_status, 'all') AS login_status
         , COUNT(DISTINCT session) AS action_uu
         , COUNT(*) AS action_count
        FROM
           action_log_with_status
        GROUP BY
           ROLLUP(action, login_status)
        ;
        """

select(query_113)

Unnamed: 0,action,login_status,action_uu,action_count
0,add_cart,guest,1,1
1,add_cart,login,3,11
2,add_cart,all,3,12
3,favorite,login,1,1
4,favorite,all,1,1
...,...,...,...,...
7,review,guest,1,1
8,review,all,1,1
9,view,login,1,1
10,view,all,1,1


### [11-4] 회원 상태를 판별하는 쿼리

In [9]:
query_114 = """
        WITH
        action_log_with_status AS (
          SELECT
             session
           , user_id
           , action
             -- 로그를 타임스태프 순서로 나열하고, 한 번이라도 로그인한 사용자일 경우
             -- 이후의 모든 로그 상태를 member로 설정
           , CASE
              WHEN
               COALESCE(MAX(user_id)
                OVER(PARTITION BY session ORDER BY stamp
                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
                , '') <> ''
               THEN 'member'
              ELSE 'none'
             END AS member_status
           , stamp
          FROM
             action_log
        )
        SELECT *
        FROM action_log_with_status
        ;
        """

select(query_114)

Unnamed: 0,session,user_id,action,member_status,stamp
0,47db0370,U002,add_cart,member,2016-11-03 19:00:00
1,47db0370,U002,purchase,member,2016-11-03 20:00:00
2,47db0370,U002,add_cart,member,2016-11-03 20:30:00
3,87b5725f,,add_cart,none,2016-11-04 12:00:00
4,87b5725f,U001,add_cart,member,2016-11-04 12:00:00
...,...,...,...,...,...
15,989004ea,U001,add_cart,member,2016-11-03 18:02:00
16,989004ea,U001,purchase,member,2016-11-03 18:10:00
17,989004ea,U001,purchase,member,2016-11-03 18:10:00
18,9afaf87c,U002,purchase,member,2016-11-04 13:00:00


### [11-5] 사용자의 생일을 계산하는 쿼리

In [11]:
query_115 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
           , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
             mst_users_with_int_birth_date
        )  
        SELECT
           user_id, sex, birth_date, age
        FROM
           mst_users_with_age
        ;
        """

select(query_115)

Unnamed: 0,user_id,sex,birth_date,age
0,U001,M,1977-06-17,39.0
1,U002,F,1953-06-12,63.0
2,U003,M,1965-01-06,51.0
3,U004,F,1954-05-21,62.0
4,U005,M,1987-11-23,29.0
5,U006,F,1950-01-21,66.0
6,U007,F,1950-07-18,66.0
7,U008,F,2006-12-09,10.0
8,U009,M,2004-10-23,12.0
9,U010,F,1987-03-18,29.0


### [11-6] 성별과 연령으로 연령별 구분을 계산하는 쿼리

In [12]:
query_116 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
          , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
            mst_users_with_int_birth_date
        )
        , mst_users_with_category AS (
           SELECT
             user_id
           , sex
           , age
           , CONCAT(
               CASE
                WHEN 20 <= age THEN sex
                ELSE ''
               END
             , CASE
                WHEN age BETWEEN 4 AND 12 THEN 'C'
                WHEN age BETWEEN 13 AND 19 THEN 'T'                
                WHEN age BETWEEN 20 AND 34 THEN '1'
                WHEN age BETWEEN 35 AND 49 THEN '2'                
                WHEN age >= 50 THEN '3'
               END
             ) AS category
           FROM
             mst_users_with_age
        )
        SELECT *
        FROM
           mst_users_with_category
        ;
        """

select(query_116)

Unnamed: 0,user_id,sex,age,category
0,U001,M,39.0,M2
1,U002,F,63.0,F3
2,U003,M,51.0,M3
3,U004,F,62.0,F3
4,U005,M,29.0,M1
5,U006,F,66.0,F3
6,U007,F,66.0,F3
7,U008,F,10.0,C
8,U009,M,12.0,C
9,U010,F,29.0,F1


### [11-7] 연령별 구분의 사람 수를 계산하는 쿼리

In [14]:
query_117 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
          , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
            mst_users_with_int_birth_date
        )
        , mst_users_with_category AS (
           SELECT
             user_id
           , sex
           , age
           , CONCAT(
               CASE
                WHEN 20 <= age THEN sex
                ELSE ''
               END
             , CASE
                WHEN age BETWEEN 4 AND 12 THEN 'C'
                WHEN age BETWEEN 13 AND 19 THEN 'T'                
                WHEN age BETWEEN 20 AND 34 THEN '1'
                WHEN age BETWEEN 35 AND 49 THEN '2'                
                WHEN age >= 50 THEN '3'
               END
             ) AS category
           FROM
             mst_users_with_age
        )
        SELECT
           category
         , COUNT(*) AS user_count
        FROM
           mst_users_with_category
        GROUP BY
           category
        ;
        """

select(query_117)

Unnamed: 0,category,user_count
0,M1,1
1,M2,1
2,C,2
3,M3,1
4,F1,1
5,F3,4


### [11-8] 연령별 구분과 카테고리를 집계하는 쿼리

In [16]:
query_118 = """
        WITH
        mst_users_with_int_birth_date AS (
          SELECT
             *
             -- 특정 날짜(2017년 1월 1일)의 정수 표현
           , 20170101 AS int_specific_date
             -- 문자열로 구성된 생년월일을 정수 표현으로 변환하기
           , CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
          FROM
             mst_users
        )
        , mst_users_with_age AS(
          SELECT
            *
            -- 특정 날짜(2017년 1월 1일)의 나이
          , floor((int_specific_date - int_birth_date) / 10000) AS age
          FROM
            mst_users_with_int_birth_date
        )
        , mst_users_with_category AS (
           SELECT
             user_id
           , sex
           , age
           , CONCAT(
               CASE
                WHEN 20 <= age THEN sex
                ELSE ''
               END
             , CASE
                WHEN age BETWEEN 4 AND 12 THEN 'C'
                WHEN age BETWEEN 13 AND 19 THEN 'T'                
                WHEN age BETWEEN 20 AND 34 THEN '1'
                WHEN age BETWEEN 35 AND 49 THEN '2'                
                WHEN age >= 50 THEN '3'
               END
             ) AS category
           FROM
             mst_users_with_age
        )
        SELECT
           p.category AS product_category
         , u.category AS user_category
         , COUNT(*) AS purchase_count
        FROM
           action_log AS p
         JOIN
           mst_users_with_category AS u
         ON p.user_id = u.user_id
        WHERE
           action = 'purchase'
        GROUP BY
           p.category, u.category
        ORDER BY
           p.category, u.category
        ;
        """

select(query_118)

Unnamed: 0,product_category,user_category,purchase_count
0,action,M2,1
1,drama,F3,2
2,drama,M2,2
