## 데이터 분석을 위한 SQL 레시피

Data source : https://hanbit.co.kr/support/supplement_survey.html?pcode=B8585882565

System : PostgreSQL

In [1]:
import pandas as pd
import psycopg2 as pg2
from sqlalchemy import create_engine

# SQLAlchemy engine pointing at the analysis database; all connection details
# are carried in the URL.
engine = create_engine(
    'postgresql://testuser:testpass@localhost:5432/postgresql_analysis'
)

# psycopg2 connection to the same host / user / database as the engine URL.
pg_settings = {
    'host': 'localhost',
    'user': 'testuser',
    'password': 'testpass',
    'database': 'postgresql_analysis',
}
con = pg2.connect(**pg_settings)
# Autocommit: statements take effect without an explicit con.commit().
con.autocommit = True
cur = con.cursor()

In [2]:
def select(query):
    """Run a SQL query on the notebook's connection and return a DataFrame.

    Args:
        query: SQL text, handed to ``pd.read_sql`` unchanged.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_sql``.
    """
    # ``con`` is the module-level psycopg2 connection opened in the setup cell.
    result_frame = pd.read_sql(query, con)
    return result_frame

In [3]:
# Cap DataFrame display at 10 rows so longer results are shown truncated.
pd.set_option('display.max_rows', 10)

## 12. 시계열에 따른 사용자 전체의 상태 변화 찾기

### [12-1] 날짜별 등록 수의 추이를 집계하는 쿼리

In [10]:
# Preview every column and row of the mst_users table (the user master data
# that the registration queries below aggregate).
select('SELECT * FROM mst_users;')

Unnamed: 0,user_id,sex,birth_date,register_date,register_device,withdraw_date
0,U001,M,1977-06-17,2016-10-01,pc,
1,U002,F,1953-06-12,2016-10-01,sp,2016-10-10
2,U003,M,1965-01-06,2016-10-01,pc,
3,U004,F,1954-05-21,2016-10-05,pc,
4,U005,M,1987-11-23,2016-10-05,sp,
...,...,...,...,...,...,...
25,U026,M,1969-02-21,2016-11-10,sp,
26,U027,F,2001-07-10,2016-11-10,pc,
27,U028,M,1976-05-26,2016-11-15,app,
28,U029,M,1964-04-06,2016-11-28,pc,


In [11]:
# Preview every column and row of the action_log table.
select('SELECT * FROM action_log;')

Unnamed: 0,session,user_id,action,stamp
0,989004ea,U001,view,2016-10-01 18:00:00
1,989004ea,U001,view,2016-10-01 18:01:00
2,989004ea,U001,view,2016-10-01 18:10:00
3,47db0370,U001,follow,2016-10-05 19:00:00
4,47db0370,U001,view,2016-10-05 19:10:00
...,...,...,...,...
9,87b5725f,U002,follow,2016-10-01 12:00:00
10,87b5725f,U002,follow,2016-10-01 12:01:00
11,87b5725f,U002,follow,2016-10-01 12:02:00
12,9afaf87c,U002,view,2016-10-02 13:00:00


In [12]:
# [12-1] Daily registration trend: one row per register_date with the number
# of distinct user_id values registered on that date, ordered by date.
query_121 = """
        SELECT
           register_date
         , COUNT(DISTINCT user_id) AS register_count
        FROM
           mst_users
        GROUP BY
           register_date
        ORDER BY
           register_date
        ;
        """

select(query_121)

Unnamed: 0,register_date,register_count
0,2016-10-01,3
1,2016-10-05,2
2,2016-10-10,3
3,2016-10-15,1
4,2016-10-16,1
...,...,...
10,2016-11-04,1
11,2016-11-05,2
12,2016-11-10,2
13,2016-11-15,1


### [12-2] 매달 등록 수와 전월비를 계산하는 쿼리

In [13]:
# [12-2] Monthly registration count and month-over-month ratio.
# - The CTE adds year_month, the first 7 characters of register_date.
# - LAG(...) OVER (ORDER BY year_month) yields the previous month's count;
#   the earliest month has no predecessor, so its LAG and ratio are NULL.
# - The leading "1.0 *" keeps the ratio from being computed with integer
#   division.
# NOTE(review): the outer SELECT has no ORDER BY, so the row order of the
# result is not guaranteed by the query itself.
query_122 = """
        WITH
        mst_users_with_year_month AS (
         SELECT
            *
          , substring(register_date, 1, 7) AS year_month
         FROM
            mst_users
        )
        SELECT
           year_month
         , COUNT(DISTINCT user_id) AS register_count
         , LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY year_month)
           AS last_month_count
         , 1.0
           * COUNT(DISTINCT user_id)
           / LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY year_month)
           AS month_over_month_ratio
        FROM
           mst_users_with_year_month
        GROUP BY
           year_month
        ;
        """

select(query_122)

Unnamed: 0,year_month,register_count,last_month_count,month_over_month_ratio
0,2016-10,14,,
1,2016-11,16,14.0,1.142857


### [12-3] 디바이스들의 등록 수를 집계하는 쿼리 

In [14]:
# [12-3] Monthly registration count, in total and split by register_device.
# Each CASE expression yields user_id only for the matching device ('pc',
# 'sp', 'app') and NULL otherwise, so COUNT(DISTINCT ...) counts only the
# users of that device.
# NOTE(review): the outer SELECT has no ORDER BY, so the row order of the
# result is not guaranteed by the query itself.
query_123 = """
        WITH
        mst_users_with_year_month AS (
         SELECT
            *
          , substring(register_date, 1, 7) AS year_month
         FROM
            mst_users
        )
        SELECT
           year_month
         , COUNT(DISTINCT user_id) AS register_count
         , COUNT(DISTINCT CASE WHEN register_device = 'pc' THEN user_id END) AS register_pc
         , COUNT(DISTINCT CASE WHEN register_device = 'sp' THEN user_id END) AS register_sp
         , COUNT(DISTINCT CASE WHEN register_device = 'app' THEN user_id END) AS register_app
        FROM
           mst_users_with_year_month
        GROUP BY
           year_month
        ;
        """

select(query_123)

Unnamed: 0,year_month,register_count,register_pc,register_sp,register_app
0,2016-10,14,7,4,3
1,2016-11,16,4,4,8


### [12-4] '로그 최근 일자'와 '사용자별 등록일의 다음날'을 계산하는 쿼리

- 지속률 : 등록일 기준으로 이후 지정일 동안 사용자가 서비스를 얼마나 이용했는지 나타내는 지표
    - 지속률(Repeat) -> 사용자가 매일 사용했으면 하는 서비스
- 정착률 : 등록일 기준으로 이후 지정한 7일 동안 사용자가 서비스를 사용했는지 나타내는 지표
    - 정착률(Retention) -> 사용자에게 어떤 목적이 생겼을 때 사용했으면 하는 서비스

In [None]:
# [12-4] Latest log date and the day after each user's registration date.
# The previous cell body was a copy of the 12-3 device-count query and
# computed neither value. This version joins every user to their actions and
# adds:
#   - action_date : the action timestamp truncated to a date
#   - latest_date : the most recent action date in the whole log
#                   (window over all rows)
#   - next_day_1  : register_date + 1 day
# LEFT OUTER JOIN keeps users without any logged action (action_date is NULL
# for them).
# NOTE(review): assumes register_date and stamp are stored as text that casts
# cleanly to date, as the previews of mst_users / action_log suggest — confirm
# against the table definitions.
query_124 = """
        WITH
        action_log_with_mst_users AS (
         SELECT
            u.user_id
          , u.register_date
          , CAST(a.stamp AS date) AS action_date
          , MAX(CAST(a.stamp AS date)) OVER () AS latest_date
          , CAST(CAST(u.register_date AS date) + INTERVAL '1 day' AS date) AS next_day_1
         FROM
            mst_users AS u
            LEFT OUTER JOIN action_log AS a
              ON u.user_id = a.user_id
        )
        SELECT
           *
        FROM
           action_log_with_mst_users
        ORDER BY
           register_date
        ;
        """

select(query_124)