## 데이터 분석을 위한 SQL 레시피

Data source : https://hanbit.co.kr/support/supplement_survey.html?pcode=B8585882565

System : PostgreSQL

In [1]:
import pandas as pd
import psycopg2 as pg2
from sqlalchemy import create_engine

# SQLAlchemy engine for the local analysis database.
# NOTE(review): `engine` is not referenced anywhere else in the visible part of this notebook.
engine = create_engine(
    'postgresql://testuser:testpass@localhost:5432/postgresql_analysis'
)

# Raw psycopg2 connection to the same database; `select` reads through this handle.
con = pg2.connect(
    database='postgresql_analysis',
    password='testpass',
    user='testuser',
    host='localhost',
)
# Autocommit mode: no explicit commit is issued anywhere in the notebook.
con.autocommit = True
cur = con.cursor()

In [2]:
def select(query):
    """Execute `query` on the module-level connection `con` and return the rows as a DataFrame."""
    result_frame = pd.read_sql(query, con)
    return result_frame

In [3]:
# Limit DataFrame display to 10 rows; longer results are rendered truncated
# (the `...` rows in the printed outputs below).
pd.options.display.max_rows = 10

## 14. 사이트 전체의 특징/경향 찾기

### [14-1] 날짜별 접근 데이터를 집계하는 쿼리

In [4]:
# Preview the raw access_log table that the chapter 14 queries aggregate.
select('SELECT * FROM access_log;')

Unnamed: 0,stamp,short_session,long_session,url,referrer
0,2016-10-01 12:00:00,0CVKaz,1CwlSX,http://www.example.com/?utm_source=google&utm_...,http://www.google.co.jp/xxx
1,2016-10-01 13:00:00,0CVKaz,1CwlSX,http://www.example.com/detail?id=1,
2,2016-10-01 13:00:00,1QceiB,3JMO2k,http://www.example.com/list/cd,
3,2016-10-01 14:00:00,1QceiB,3JMO2k,http://www.example.com/detail?id=1,http://search.google.co.jp/xxx
4,2016-10-01 15:00:00,1hI43A,6SN6DD,http://www.example.com/list/newly,
...,...,...,...,...,...
14,2016-10-02 18:00:00,690mvB,FGkTe9,http://www.example.com/list/dvd?utm_source=yah...,http://www.yahoo.co.jp/xxx
15,2016-10-03 12:00:00,6oABhM,3JMO2k,http://www.example.com/detail?id=3,http://search.yahoo.co.jp/xxx
16,2016-10-03 13:00:00,7jjxQX,KKTw9P,http://www.example.com/?utm_source=mynavi&utm_...,http://www.mynavi.jp/xxx
17,2016-10-03 14:00:00,AAuoEU,6SN6DD,http://www.example.com/list/dvd,https://www.facebook.com/xxx


In [5]:
# Preview the purchase_log table (joined to access_log on long_session in [14-6]).
select('SELECT * FROM purchase_log;')

Unnamed: 0,stamp,short_session,long_session,purchase_id,amount
0,2016-10-01 15:00:00,0CVKaz,1CwlSX,1,1000
1,2016-10-01 16:00:00,2is8PX,7Dn99b,2,1000
2,2016-10-01 20:00:00,2is8PX,7Dn99b,3,1000
3,2016-10-02 14:00:00,2is8PX,7Dn99b,4,1000


#### 지표 정의

- 방문자 수 : 브라우저를 꺼도 사라지지 않는 쿠키의 유니크 수
- 방문 횟수 : 브라우저를 껐을 때 사라지는 쿠키의 유니크 수
- 페이지 뷰 : 페이지를 출력한 로그의 줄 수

In [6]:
# [14-1] Daily access summary from access_log.
# Rows are grouped by the first 10 characters of `stamp` (aliased dt). Per day:
#   access_users - distinct long_session values
#   access_count - distinct short_session values
#   page_view    - number of log rows
#   pv_per_user  - page_view / access_users (NULLIF turns a zero divisor into NULL)
# NOTE(review): substring(stamp, 1, 10) presumes stamp is stored as
# 'YYYY-MM-DD hh:mm:ss' text — confirm against the table definition.
query_141 = """
        SELECT
           substring(stamp, 1, 10) AS dt
           -- 쿠키 계산하기
         , COUNT(DISTINCT long_session) AS access_users
           -- 방문 횟수 계산하기
         , COUNT(DISTINCT short_session) AS access_count
           -- 페이지 뷰 계산하기
         , COUNT(*) AS page_view
           -- 1인당 페이지 뷰 수
         , 1.0 * COUNT(*) / NULLIF(COUNT(DISTINCT long_session), 0) AS pv_per_user
        FROM
           access_log
        GROUP BY
           dt
        ORDER BY
           dt
        ;
        """

select(query_141)

Unnamed: 0,dt,access_users,access_count,page_view,pv_per_user
0,2016-10-01,4,5,8,2.0
1,2016-10-02,4,5,7,1.75
2,2016-10-03,3,3,4,1.333333


### [14-2] URL별로 집계하는 쿼리

In [7]:
# [14-2] Per-URL summary from access_log, grouped on the full url string
# (query parameters included, so differently-tagged URLs are separate groups).
#   access_count - distinct short_session values
#   access_users - distinct long_session values
#   page_view    - number of log rows
# The query has no ORDER BY, so row order is whatever the database returns.
query_142 = """
        SELECT
           url
         , COUNT(DISTINCT short_session) AS access_count         
         , COUNT(DISTINCT long_session) AS access_users
         , COUNT(*) AS page_view
        FROM
           access_log
        GROUP BY
           url
        ;
        """

select(query_142)

Unnamed: 0,url,access_count,access_users,page_view
0,http://www.example.com/,3,3,3
1,http://www.example.com/?utm_source=google&utm_...,1,1,1
2,http://www.example.com/?utm_source=mynavi&utm_...,1,1,1
3,http://www.example.com/detail?id=1,2,2,2
4,http://www.example.com/detail?id=2,2,2,2
5,http://www.example.com/detail?id=3,1,1,1
6,http://www.example.com/list/cd,3,3,3
7,http://www.example.com/list/dvd,2,2,2
8,http://www.example.com/list/dvd?utm_source=yah...,1,1,1
9,http://www.example.com/list/newly,3,2,3


### [14-3] 경로별로 집계하는 쿼리

In [8]:
# [14-3] Per-path summary from access_log.
# access_log_with_path adds url_path: the regex skips '//' plus the host part and
# captures what follows up to (not including) the first '?' or '#'.
# The outer query then reports, per url_path, distinct short_session (access_count),
# distinct long_session (access_users) and the row count (page_view).
query_143 = """
        WITH
        access_log_with_path AS (
           SELECT
              *
            , substring(url from '//[^/]+([^?#]+)') AS url_path
           FROM
              access_log
        )
        SELECT
           url_path
         , COUNT(DISTINCT short_session) AS access_count         
         , COUNT(DISTINCT long_session) AS access_users
         , COUNT(*) AS page_view
        FROM
           access_log_with_path
        GROUP BY
           url_path
        ;
        """

select(query_143)

Unnamed: 0,url_path,access_count,access_users,page_view
0,/,5,4,5
1,/detail,5,3,5
2,/list/cd,3,3,3
3,/list/dvd,3,2,3
4,/list/newly,3,2,3


### [14-4] URL에 의미를 부여해서 집계하는 쿼리

In [9]:
# [14-4] Summary per logical page name derived from the URL path.
# CTE pipeline:
#   access_log_with_path       - url_path extracted with the same regex as [14-3]
#   access_log_with_split_path - path1 / path2 = 2nd / 3rd '/'-separated fields of url_path
#   access_log_with_page_name  - page_name is 'newly_list' when path1 = 'list' and
#                                path2 = 'newly', 'category_list' for any other path1 = 'list',
#                                otherwise url_path itself
# The final SELECT reports distinct short_session, distinct long_session and the row
# count per page_name, ordered by page_name.
query_144 = """
        WITH
        access_log_with_path AS (
          SELECT
             *
           , substring(url from '//[^/]+([^?#]+)') AS url_path
          FROM
             access_log
        )
        , access_log_with_split_path AS (
          SELECT
             *
             -- split_part로 n번쨰 요소 추출하기
           , split_part(url_path, '/', 2) AS path1
           , split_part(url_path, '/', 3) AS path2
          FROM
             access_log_with_path
        )
        , access_log_with_page_name AS (
          SELECT
             -- 결로를 슬래시로 분할하고, 조건에 따라 페이지 이름 붙이기
             *
           , CASE
              WHEN path1 = 'list' THEN
               CASE
                WHEN path2 = 'newly' THEN 'newly_list'
                ELSE 'category_list'
               END
              -- 이외에 경우는 경로를 그대로 사용하기
              ELSE url_path
             END AS page_name
          FROM
             access_log_with_split_path
        )
        SELECT
           page_name
         , COUNT(DISTINCT short_session) AS access_count         
         , COUNT(DISTINCT long_session) AS access_users
         , COUNT(*) AS page_view
        FROM
           access_log_with_page_name
        GROUP BY
           page_name
        ORDER BY
           page_name        
        ;
        """

select(query_144)

Unnamed: 0,page_name,access_count,access_users,page_view
0,/,5,4,5
1,/detail,5,3,5
2,category_list,6,4,6
3,newly_list,3,2,3


### [14-5] 유입원별로 방문 횟수 집계하는 쿼리

레퍼러(referer) : 직전 페이지의 URL

유입 경로
- 검색 연동 광고
- 제휴 마케팅 사이트
- AD 네트워크
- 검색 엔진
- 소셜 미디어
- 기타 사이트

In [10]:
# [14-5] Number of access_log rows per traffic source ("via").
# access_log_with_parse_info extracts, by regex, the URL host (url_domain), the
# utm_source / utm_medium query parameters and the referrer host (referrer_domain).
# access_log_with_via_info numbers the rows by stamp (log_id) and classifies each row:
#   '<utm_source>-<utm_medium>' when both parameters are present and non-empty
#   'search'  when referrer_domain is search.yahoo.co.jp or www.google.co.jp
#   'social'  when referrer_domain is twitter.com or www.facebook.com
#   'other'   otherwise
# The final SELECT counts rows (COUNT(*)) per via, largest first.
# url_domain and log_id are computed but not used by this query.
query_145 = """
        WITH
        access_log_with_parse_info AS (
          -- 유입원 정보 추출하기
          SELECT
             *
           , substring(url from 'https?://([^/]*)') AS url_domain
           , substring(url from 'utm_source=([^&]*)') AS url_utm_source
           , substring(url from 'utm_medium=([^&]*)') AS url_utm_medium
           , substring(referrer from 'https?://([^/]*)') AS referrer_domain
          FROM
             access_log
        )
        , access_log_with_via_info AS (
          SELECT
             *
           , ROW_NUMBER() OVER(ORDER BY stamp) AS log_id
           , CASE
              --  <> : !=
              WHEN url_utm_source <> '' AND url_utm_medium <> ''
               THEN concat(url_utm_source, '-', url_utm_medium)
              WHEN referrer_domain IN ('search.yahoo.co.jp', 'www.google.co.jp') THEN 'search'
              WHEN referrer_domain IN ('twitter.com', 'www.facebook.com') THEN 'social'
              ELSE 'other'
             END AS via
          FROM
             access_log_with_parse_info
        )
        SELECT
           via
         , COUNT(*) AS access_count         
        FROM
           access_log_with_via_info
        GROUP BY
           via
        ORDER BY
           access_count DESC        
        ;
        """

select(query_145)

Unnamed: 0,via,access_count
0,other,11
1,social,3
2,search,2
3,mynavi-affiliate,1
4,yahoo-search,1
5,google-search,1


### [14-6] 각 방문에서 구매한 비율(CVR)을 집계하는 쿼리

CVR : 각 방문에서 구매한 비율

In [11]:
# [14-6] Conversion rate (CVR) and purchase amount per traffic source.
# The first two CTEs are the same source classification as [14-5]; log_id (row number
# by stamp) identifies each access_log row.
# access_log_with_purchase_amount left-joins purchase_log on long_session and, per
# (log_id, via), sums the amount of purchases whose date lies between the access date
# and the access date plus one day. Rows with no such purchase get NULL.
# Final columns per via:
#   via_count   - number of access rows
#   conversions - access rows with a non-NULL amount
#   cvr         - average of 100.0 * SIGN(amount, NULL treated as 0)
#   amount      - total amount (NULL treated as 0)
#   avg_amount  - average amount per access row (NULL treated as 0)
# Rows are ordered by cvr, highest first.
query_146 = """
        WITH
        access_log_with_parse_info AS (
          -- 유입원 정보 추출하기
          SELECT
             *
           , substring(url from 'https?://([^/]*)') AS url_domain
           , substring(url from 'utm_source=([^&]*)') AS url_utm_source
           , substring(url from 'utm_medium=([^&]*)') AS url_utm_medium
           , substring(referrer from 'https?://([^/]*)') AS referrer_domain
          FROM
             access_log
        )
        , access_log_with_via_info AS (
          SELECT
             *
           , ROW_NUMBER() OVER(ORDER BY stamp) AS log_id
           , CASE
              --  <> : !=
              WHEN url_utm_source <> '' AND url_utm_medium <> ''
               THEN concat(url_utm_source, '-', url_utm_medium)
              WHEN referrer_domain IN ('search.yahoo.co.jp', 'www.google.co.jp') THEN 'search'
              WHEN referrer_domain IN ('twitter.com', 'www.facebook.com') THEN 'social'
              ELSE 'other'
             END AS via
          FROM
             access_log_with_parse_info
        )
        , access_log_with_purchase_amount AS (
          SELECT
             a.log_id
           , a.via
           , SUM(
              CASE
               WHEN p.stamp::date BETWEEN a.stamp::date AND a.stamp::date + '1 day'::interval
                THEN amount
              END
             ) AS amount
          FROM
             access_log_with_via_info AS a
          LEFT OUTER JOIN
             purchase_log AS p
           ON a.long_session = p.long_session
          GROUP BY
             a.log_id, a.via
        )
        SELECT
           via
         , COUNT(*) AS via_count
           -- 실제 구매를 한 세션의 수
         , COUNT(amount) AS conversions
         , AVG(100.0 * SIGN(COALESCE(amount, 0))) AS cvr
         , SUM(COALESCE(amount, 0)) AS amount
         , AVG(1.0 * COALESCE(amount, 0)) AS avg_amount
        FROM
           access_log_with_purchase_amount
        GROUP BY
           via
        ORDER BY
           cvr DESC        
        ;
        """

select(query_146)

Unnamed: 0,via,via_count,conversions,cvr,amount,avg_amount
0,google-search,1,1,100.0,1000.0,1000.0
1,social,3,1,33.333333,3000.0,1000.0
2,other,11,2,18.181818,2000.0,181.818182
3,search,2,0,0.0,0.0,0.0
4,mynavi-affiliate,1,0,0.0,0.0,0.0
5,yahoo-search,1,0,0.0,0.0,0.0


### [14-7] 요일/시간대별 방문 수를 집계하는 쿼리

#### 00:00:00부터의 경과 시간을 초 단위로 계산하기

CAST(substring(stamp, 12, 2) AS int) * 60 * 60 + 
  
CAST(substring(stamp, 15, 2) AS int) * 60 + 
  
CAST(substring(stamp, 18, 2) AS int)

#### 초를 다시 타임스탬프 형식으로 변환하기

lpad(floor(floor_seconds / (60 * 60))::text      , 2, '0') || ':' || 

lpad(floor(floor_seconds % (60 * 60) / 60)::text , 2, '0') || ':' || 

lpad(floor(floor_seconds % 60)::text             , 2, '0')

In [13]:
# [14-7] Access counts per weekday and 30-minute time slot (one row per slot,
# one column per weekday).
# CTE pipeline:
#   access_log_with_dow           - weekday number (dow), seconds elapsed since midnight
#                                   parsed from the stamp text, and the slot width
#                                   (30 * 60 seconds)
#   access_log_with_floor_seconds - elapsed seconds rounded down to the slot start
#   access_log_with_index         - slot start rendered as an 'hh:mm:ss' label (index_time)
# The final SELECT pivots dow into the sun..sat columns, ordered by index_time.
# Fix: the in-query comment claimed dow runs Monday(0)..Saturday(6). PostgreSQL's
# date_part('dow', ...) numbers days Sunday(0)..Saturday(6), which is what the pivot
# below relies on. The "00::00:00" typo is corrected too. The executed SQL is unchanged.
query_147 = """
        WITH
        access_log_with_dow AS (
          SELECT
             stamp
             -- 일요일(0)부터 토요일(6)까지의 요일 번호 추출하기
           , date_part('dow', stamp::timestamp) AS dow
             -- 00:00:00부터의 경과 시간을 초 단위로 계산하기
           ,  CAST(substring(stamp, 12, 2) AS int) * 60 * 60
            + CAST(substring(stamp, 15, 2) AS int) * 60
            + CAST(substring(stamp, 18, 2) AS int)
            AS whole_seconds
            -- 시간 간격 (30분/1800초)정하기
           , 30 * 60 AS interval_seconds
          FROM
             access_log
        )
        , access_log_with_floor_seconds AS (
          SELECT
             stamp
           , dow
             -- 00:00:00부터의 경과 시간을 interval_seconds로 나누기
           , CAST((floor(whole_seconds / interval_seconds) * interval_seconds) AS int)
             AS floor_seconds
          FROM
             access_log_with_dow
        )
        , access_log_with_index AS (
          SELECT
             stamp
           , dow
             -- 초를 다시 타임스탬프 형식으로 변환하기
           ,    lpad(floor(floor_seconds / (60 * 60))::text      , 2, '0') || ':'
             || lpad(floor(floor_seconds % (60 * 60) / 60)::text , 2, '0') || ':'
             || lpad(floor(floor_seconds % 60)::text             , 2, '0')
             AS index_time
          FROM
             access_log_with_floor_seconds
        )
        SELECT
           index_time
         , COUNT(CASE dow WHEN 0 THEN 1 END) AS sun
         , COUNT(CASE dow WHEN 1 THEN 1 END) AS mon
         , COUNT(CASE dow WHEN 2 THEN 1 END) AS tue
         , COUNT(CASE dow WHEN 3 THEN 1 END) AS wed
         , COUNT(CASE dow WHEN 4 THEN 1 END) AS thu
         , COUNT(CASE dow WHEN 5 THEN 1 END) AS fri
         , COUNT(CASE dow WHEN 6 THEN 1 END) AS sat
        FROM
           access_log_with_index
        GROUP BY
           index_time
        ORDER BY
           index_time
        ;
        """

select(query_147)

Unnamed: 0,index_time,sun,mon,tue,wed,thu,fri,sat
0,12:00:00,1,1,0,0,0,0,1
1,13:00:00,1,1,0,0,0,0,2
2,14:00:00,1,1,0,0,0,0,1
3,15:00:00,1,1,0,0,0,0,1
4,16:00:00,1,0,0,0,0,0,1
5,17:00:00,1,0,0,0,0,0,1
6,18:00:00,1,0,0,0,0,0,1


## 15. 사이트 내의 사용자 행동 파악하기

In [29]:
# Preview the raw activity_log table that the chapter 15 queries analyse.
select('SELECT * FROM activity_log;')

Unnamed: 0,stamp,session,action,option,path,search_type
0,2017-01-09 12:18:43,989004ea,view,search,/search_list,Area-L-with-Job
1,2017-01-09 12:19:27,989004ea,view,page,/search_input,
2,2017-01-09 12:20:03,989004ea,view,search,/search_list,Pref
3,2017-01-09 12:18:43,47db0370,view,search,/search_list,Area-S
4,2017-01-09 12:18:43,1cf7678e,view,detail,/detail,
...,...,...,...,...,...,...
31,2017-01-09 12:18:43,8cc03a54,view,search,/search_list,Area-L
32,2017-01-09 12:18:44,8cc03a54,view,page,/input,Area-L
33,2017-01-09 12:18:45,8cc03a54,view,page,/confirm,Area-L
34,2017-01-09 12:18:46,8cc03a54,view,page,/complete,Area-L


### [15-1] 각 세션의 입구 페이지와 출구 페이지 경로를 추출하는 쿼리

In [30]:
# [15-1] Tag every activity_log row with its session's landing and exit path.
# Both window functions partition by session, order by stamp ascending and use a frame
# covering the whole partition, so:
#   landing - path of the first row of the session in that ordering
#   exit    - path of the last row of the session in that ordering
# The result keeps one row per log row (session, path, stamp, landing, exit).
query_151 = """
        WITH
        activity_log_with_landing_exit AS (
           SELECT
              session
            , path
            , stamp
            , FIRST_VALUE(path)
               OVER(
                PARTITION BY session
                -- 각 세션 내부의 모든 행을 대상으로 지정
                ORDER BY stamp ASC
                 ROWS BETWEEN UNBOUNDED PRECEDING
                          AND UNBOUNDED FOLLOWING
              ) AS landing
            , LAST_VALUE(path)
               OVER(
                PARTITION BY session
                ORDER BY stamp ASC
                 ROWS BETWEEN UNBOUNDED PRECEDING
                          AND UNBOUNDED FOLLOWING              
              ) AS exit
           FROM
              activity_log
        )
        SELECT *
        FROM
           activity_log_with_landing_exit
        ;
        """

select(query_151)

Unnamed: 0,session,path,stamp,landing,exit
0,0fe39581,/search_list,2017-01-09 12:18:43,/search_list,/search_list
1,111f2996,/search_list,2017-01-09 12:18:43,/search_list,/search_input
2,111f2996,/search_input,2017-01-09 12:19:11,/search_list,/search_input
3,111f2996,/,2017-01-09 12:20:10,/search_list,/search_input
4,111f2996,/search_input,2017-01-09 12:21:14,/search_list,/search_input
...,...,...,...,...,...
31,9afaf87c,/complete,2017-01-09 12:23:00,/search_list,/complete
32,cabf98e8,/search_input,2017-01-09 12:18:43,/search_input,/search_input
33,d45ec190,/detail,2017-01-09 12:18:43,/detail,/detail
34,eee2bb21,/detail,2017-01-09 12:18:43,/detail,/detail


### [15-2] 각 세션의 입구 페이지와 출구 페이지를 기반으로 방문 횟수를 추출하는 쿼리

In [31]:
# [15-2] Number of sessions per landing path and per exit path.
# activity_log_with_landing_exit is the same landing/exit tagging as [15-1].
# landing_count / exit_count count distinct sessions per landing / exit path.
# The final statement stacks both results with UNION ALL, labelling each row
# with type = 'landing' or 'exit'.
query_152 = """
        WITH
        activity_log_with_landing_exit AS (
           SELECT
              session
            , path
            , stamp
            , FIRST_VALUE(path)
               OVER(
                PARTITION BY session
                -- 각 세션 내부의 모든 행을 대상으로 지정
                ORDER BY stamp ASC
                 ROWS BETWEEN UNBOUNDED PRECEDING
                          AND UNBOUNDED FOLLOWING
              ) AS landing
            , LAST_VALUE(path)
               OVER(
                PARTITION BY session
                ORDER BY stamp ASC
                 ROWS BETWEEN UNBOUNDED PRECEDING
                          AND UNBOUNDED FOLLOWING              
              ) AS exit
           FROM
              activity_log
        )
        , landing_count AS (
          SELECT
             landing AS path
           , COUNT(DISTINCT session) AS count
          FROM
             activity_log_with_landing_exit
          GROUP BY landing
        )
        , exit_count AS (
          SELECT
             exit AS path
           , COUNT(DISTINCT session) AS count
          FROM
             activity_log_with_landing_exit
          GROUP BY exit
        )
        -- 입구 페이지와 출구 페이지 방문 횟수 결과를 한꺼번에 출력하기
        SELECT 'landing' AS type, * FROM landing_count
        UNION ALL
        SELECT 'exit' AS type, * FROM exit_count
        ;
        """

select(query_152)

Unnamed: 0,type,path,count
0,landing,/detail,8
1,landing,/search_input,1
2,landing,/search_list,7
3,exit,/,2
4,exit,/complete,2
5,exit,/detail,7
6,exit,/search_input,2
7,exit,/search_list,3


### [15-3] 세션별 입구 페이지와 출구 페이지의 조합을 집계하는 쿼리

In [32]:
# [15-3] Number of sessions per (landing path, exit path) combination.
# activity_log_with_landing_exit is the same landing/exit tagging as [15-1];
# the final SELECT counts distinct sessions for each landing/exit pair.
query_153 = """
        WITH
        activity_log_with_landing_exit AS (
           SELECT
              session
            , path
            , stamp
            , FIRST_VALUE(path)
               OVER(
                PARTITION BY session
                -- 각 세션 내부의 모든 행을 대상으로 지정
                ORDER BY stamp ASC
                 ROWS BETWEEN UNBOUNDED PRECEDING
                          AND UNBOUNDED FOLLOWING
              ) AS landing
            , LAST_VALUE(path)
               OVER(
                PARTITION BY session
                ORDER BY stamp ASC
                 ROWS BETWEEN UNBOUNDED PRECEDING
                          AND UNBOUNDED FOLLOWING              
              ) AS exit
           FROM
              activity_log
        )
        SELECT
           landing
         , exit
         , COUNT(DISTINCT session) AS count
        FROM
           activity_log_with_landing_exit
        GROUP BY
           landing, exit
        ;
        """

select(query_153)

Unnamed: 0,landing,exit,count
0,/detail,/,2
1,/detail,/detail,6
2,/search_input,/search_input,1
3,/search_list,/complete,2
4,/search_list,/detail,1
5,/search_list,/search_input,1
6,/search_list,/search_list,3


### [15-4] 경로별 이탈률을 집계하는 쿼리

In [33]:
# [15-4] Exit rate per path.
# activity_log_with_exit_flag sets is_exit = 1 on the row that ranks first when a
# session's rows are ordered by stamp descending (its last view), and 0 on all others.
# Per path the final SELECT returns:
#   exit_count - sum of is_exit
#   page_view  - number of log rows
#   exit_ratio - AVG(100.0 * is_exit), i.e. exit_count / page_view as a percentage
query_154 = """
        WITH
        activity_log_with_exit_flag AS (
           SELECT
              *
              -- 출구 페이지 판정
            , CASE
               WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC) = 1 THEN 1
               ELSE 0
              END AS is_exit
           FROM
              activity_log
        )
        SELECT
           path
         , SUM(is_exit) As exit_count
         , COUNT(*) AS page_view
           -- 이탈률 = 출구수 / 페이지뷰
         , AVG(100.0 * is_exit) AS exit_ratio
        FROM
           activity_log_with_exit_flag
        GROUP BY
           path
        ;
        """

select(query_154)

Unnamed: 0,path,exit_count,page_view,exit_ratio
0,/detail,7,11,63.636364
1,/confirm,0,2,0.0
2,/search_input,2,5,40.0
3,/,2,4,50.0
4,/complete,2,2,100.0
5,/input,0,2,0.0
6,/search_list,3,10,30.0


### [15-5] 경로들의 직귀율을 집계하는 쿼리

직귀율 = 직귀수(한 페이지만을 조회한 방문 횟수) / 입구수

=> 순수하게 랜딩 페이지에서 다른 페이지로 이동하는지를 평가

또는

직귀율 = 직귀수 / 방문 횟수

In [34]:
# [15-5] Bounce rate per path.
# activity_log_with_landing_bounce_flag adds two 0/1 flags to every row:
#   is_landing - 1 on the row that ranks first when the session is ordered by stamp ascending
#   is_bounce  - 1 when the session consists of exactly one row
# Per path the final SELECT returns:
#   bounce_count  - sum of is_bounce
#   landing_count - sum of is_landing
#   bounce_ratio  - average of 100.0 * is_bounce over landing rows only; the CASE yields
#                   NULL for non-landing rows, which AVG ignores, so a path that is never
#                   a landing page gets NULL
query_155 = """
        WITH
        activity_log_with_landing_bounce_flag AS (
           SELECT
              *
              -- 입구 페이지 판정
            , CASE
               WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC) = 1 THEN 1
               ELSE 0
              END AS is_landing
              -- 직귀 판정 / 한 페이지만을 조회한 방문 횟수
            , CASE
               WHEN COUNT(*) OVER(PARTITION BY session) = 1 THEN 1
               ELSE 0
              END AS is_bounce
           FROM
              activity_log
        )
        SELECT
           path
         , SUM(is_bounce) As bounce_count
         , SUM(is_landing) As landing_count
         , AVG(100.0 * CASE WHEN is_landing = 1 THEN is_bounce END) AS bounce_ratio
        FROM
           activity_log_with_landing_bounce_flag
        GROUP BY
           path
        ;
        """

select(query_155)

Unnamed: 0,path,bounce_count,landing_count,bounce_ratio
0,/detail,6,8,75.0
1,/confirm,0,0,
2,/search_input,1,1,100.0
3,/,0,0,
4,/complete,0,0,
5,/input,0,0,
6,/search_list,2,7,28.571429


### [15-6] 컨버전 페이지보다 이전 접근에 플래그를 추가하는 쿼리

컨버전 : 완료 화면(/complete)에 도달하는 것

In [35]:
# [15-6] Flag the rows of each session that lead up to a conversion ('/complete').
# has_conversion is SIGN of a running sum of "path = '/complete'" indicators. The window
# orders each session by stamp descending and sums from the start of that ordering to the
# current row, so the flag is 1 on a '/complete' row and on the rows that sort after it
# in that ordering (the views preceding it in time), and 0 everywhere else.
# The result lists session, stamp, path and has_conversion ordered by session, stamp.
query_156 = """
        WITH
        activity_log_with_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
              -- 성과를 발생시키는 컨버전 페이지의 이전 접근에 플래그 추가하기
              -- 세션별로 /complete포함 이전 타임라인에 1플래그
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_conversion
           FROM
              activity_log
        )
        SELECT *
        FROM
           activity_log_with_conversion_flag
        ORDER BY
           session, stamp
        ;
        """

select(query_156)

Unnamed: 0,session,stamp,path,has_conversion
0,0fe39581,2017-01-09 12:18:43,/search_list,0.0
1,111f2996,2017-01-09 12:18:43,/search_list,0.0
2,111f2996,2017-01-09 12:19:11,/search_input,0.0
3,111f2996,2017-01-09 12:20:10,/,0.0
4,111f2996,2017-01-09 12:21:14,/search_input,0.0
...,...,...,...,...
31,9afaf87c,2017-01-09 12:23:00,/complete,1.0
32,cabf98e8,2017-01-09 12:18:43,/search_input,0.0
33,d45ec190,2017-01-09 12:18:43,/detail,0.0
34,eee2bb21,2017-01-09 12:18:43,/detail,0.0


### [15-7] 경로들의 방문 횟수와 구성 수를 집계하는 쿼리

In [36]:
# [15-7] Sessions, converting sessions and conversion rate per path.
# activity_log_with_conversion_flag marks (has_conversion = 1) every row at or before a
# '/complete' view within its session, using a running sum over stamp-descending order.
# Per path the final SELECT returns:
#   session    - distinct sessions that viewed the path
#   conversion - distinct sessions that viewed the path on the way to a conversion
#   cvr        - conversion / session
# Fix: conversion used to be SUM(has_conversion), which counts flagged page-view rows.
# A session viewing the same path several times before converting was counted several
# times, while the denominator counts it once, so cvr could exceed 1. Counting distinct
# converting sessions keeps cvr within [0, 1]. conversion is now an integer count.
query_157 = """
        WITH
        activity_log_with_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
              -- 성과를 발생시키는 컨버전 페이지의 이전 접근에 플래그 추가하기
              -- 세션별로 /complete포함 이전 타임라인에 1플래그
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_conversion
           FROM
              activity_log
        )
        SELECT
           path
           -- 방문 횟수
         , COUNT(DISTINCT session) AS session
           -- 성과 수 (each converting session is counted once per path)
         , COUNT(DISTINCT CASE WHEN has_conversion = 1 THEN session END) AS conversion
         , 1.0 * COUNT(DISTINCT CASE WHEN has_conversion = 1 THEN session END)
               / COUNT(DISTINCT session) AS cvr
        FROM
           activity_log_with_conversion_flag
        GROUP BY
           path
        ;
        """

select(query_157)

Unnamed: 0,path,session,conversion,cvr
0,/,3,0.0,0.0
1,/complete,2,2.0,1.0
2,/confirm,2,2.0,1.0
3,/detail,10,1.0,0.1
4,/input,2,2.0,1.0
5,/search_input,4,0.0,0.0
6,/search_list,8,2.0,0.25


### [15-8] 페이지 가치 할당을 계산하는 쿼리

In [39]:
# [15-8] Distribute a value of 1000.0 per converting session across its page views.
# activity_log_with_conversion_flag: has_conversion = 1 on rows at or before a
# '/complete' view in the session (same running-sum flag as [15-6]).
# activity_log_with_conversion_assing keeps only rows with has_conversion = 1 whose path
# is not '/input', '/confirm' or '/complete'; its window functions therefore see only
# those remaining rows. With N = remaining rows in the session it computes:
#   asc_order / desc_order - position by stamp ascending / descending
#   page_count             - N
#   fair_assign            - 1000.0 / N on every row
#   first_assign           - 1000.0 on the first row by stamp, 0.0 elsewhere
#   last_assign            - 1000.0 on the last row by stamp, 0.0 elsewhere
#   decrease_assign        - 1000.0 * ascending position / (N * (N + 1) / 2),
#                            so later views receive the larger share
#   increase_assign        - 1000.0 * descending position / (N * (N + 1) / 2),
#                            so earlier views receive the larger share
# The final SELECT lists the assignments per row, ordered by session and asc_order.
query_158 = """
        WITH
        activity_log_with_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
              -- 성과를 발생시키는 컨버전 페이지의 이전 접근에 플래그 추가하기
              -- 세션별로 /complete포함 이전 타임라인에 1플래그
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_conversion
           FROM
              activity_log
        )
        , activity_log_with_conversion_assing AS (
          SELECT
             session
           , stamp
           , path
             -- 성과에 이르기까지의 접근 로그를 오름차순으로 정렬하기
           , ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC) AS asc_order
             -- 성과에 이르기까지의 접근 로그를 내림차순으로 정렬하기
           , ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC) AS desc_order
             -- 성과에 이르기까지의 접근 수 세기
           , COUNT(*) OVER(PARTITION BY session) AS page_count
             -- 1. 성과에 이르기까지의 접근 로그에 균등한 가치 부여하기
           , 1000.0 / COUNT(*) OVER(PARTITION BY session) AS fair_assign
             -- 2. 성과에 이르기까지의 접근 로그의 첫 페이지에 가치 부여하기
           , CASE
              WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC) = 1
               THEN 1000.0
              ELSE 0.0
             END AS first_assign
             -- 3. 성과에 이르기까지의 접근 로그의 마지막 페이지에 가치 부여하기
           , CASE
              WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC) = 1
               THEN 1000.0
              ELSE 0.0
             END AS last_assign
             -- 4. 성과에 이르기까지의 접근 로그의 성과 지점에서 가까운 페이지에 높은 가치 부여하기
           , 1000.0
             * ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC)
               -- 순번 합계로 나누기 (N*(N+1)/2)
             / ( COUNT(*) OVER(PARTITION BY session)
                *(COUNT(*) OVER(PARTITION BY session) + 1)
                / 2)
             AS decrease_assign
             -- 5. 성과에 이르기까지의 접근 로그의 성과 지점에서 먼 페이지에 높은 가치 부여하기
           , 1000.0
             * ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC)
               -- 순번 합계로 나누기 (N*(N+1)/2)
             / ( COUNT(*) OVER(PARTITION BY session)
                *(COUNT(*) OVER(PARTITION BY session) + 1)
                / 2)
             AS increase_assign
          FROM
             activity_log_with_conversion_flag
          WHERE
             -- 컨버전으로 이어지는 세션 로그만 추출하기
             has_conversion = 1
             -- 입력, 확인, 완료 페이지 제외하기
             AND path NOT IN ('/input', '/confirm', '/complete')
        )
        SELECT
           session
         , asc_order
         , path
         , fair_assign AS fair_a
         , first_assign AS first_a
         , last_assign AS last_a
         , decrease_assign AS dec_a
         , increase_assign AS inc_a
        FROM
           activity_log_with_conversion_assing
        ORDER BY
           session, asc_order
        ;
        """

select(query_158)

Unnamed: 0,session,asc_order,path,fair_a,first_a,last_a,dec_a,inc_a
0,8cc03a54,1,/search_list,1000.0,1000.0,1000.0,1000.0,1000.0
1,9afaf87c,1,/search_list,500.0,1000.0,0.0,333.333333,666.666667
2,9afaf87c,2,/detail,500.0,0.0,1000.0,666.666667,333.333333


### [15-9] 경로별 페이지 가치 합계를 구하는 쿼리

In [42]:
# [15-9] Total assigned page value per path.
# The first two CTEs repeat [15-8]: flag rows leading up to '/complete', keep only those
# rows (excluding '/input', '/confirm', '/complete') and compute the five per-row value
# assignments (fair / first / last / decrease / increase) of 1000.0 per session.
# page_total_values sums each assignment per path; the final SELECT returns that CTE
# unchanged (no ORDER BY).
query_159 = """
        WITH
        activity_log_with_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
              -- 성과를 발생시키는 컨버전 페이지의 이전 접근에 플래그 추가하기
              -- 세션별로 /complete포함 이전 타임라인에 1플래그
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_conversion
           FROM
              activity_log
        )
        , activity_log_with_conversion_assing AS (
          SELECT
             session
           , stamp
           , path
             -- 성과에 이르기까지의 접근 로그를 오름차순으로 정렬하기
           , ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC) AS asc_order
             -- 성과에 이르기까지의 접근 로그를 내림차순으로 정렬하기
           , ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC) AS desc_order
             -- 성과에 이르기까지의 접근 수 세기
           , COUNT(*) OVER(PARTITION BY session) AS page_count
             -- 1. 성과에 이르기까지의 접근 로그에 균등한 가치 부여하기
           , 1000.0 / COUNT(*) OVER(PARTITION BY session) AS fair_assign
             -- 2. 성과에 이르기까지의 접근 로그의 첫 페이지에 가치 부여하기
           , CASE
              WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC) = 1
               THEN 1000.0
              ELSE 0.0
             END AS first_assign
             -- 3. 성과에 이르기까지의 접근 로그의 마지막 페이지에 가치 부여하기
           , CASE
              WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC) = 1
               THEN 1000.0
              ELSE 0.0
             END AS last_assign
             -- 4. 성과에 이르기까지의 접근 로그의 성과 지점에서 가까운 페이지에 높은 가치 부여하기
           , 1000.0
             * ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC)
               -- 순번 합계로 나누기 (N*(N+1)/2)
             / ( COUNT(*) OVER(PARTITION BY session)
                *(COUNT(*) OVER(PARTITION BY session) + 1)
                / 2)
             AS decrease_assign
             -- 5. 성과에 이르기까지의 접근 로그의 성과 지점에서 먼 페이지에 높은 가치 부여하기
           , 1000.0
             * ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC)
               -- 순번 합계로 나누기 (N*(N+1)/2)
             / ( COUNT(*) OVER(PARTITION BY session)
                *(COUNT(*) OVER(PARTITION BY session) + 1)
                / 2)
             AS increase_assign
          FROM
             activity_log_with_conversion_flag
          WHERE
             -- 컨버전으로 이어지는 세션 로그만 추출하기
             has_conversion = 1
             -- 입력, 확인, 완료 페이지 제외하기
             AND path NOT IN ('/input', '/confirm', '/complete')
        )
        , page_total_values AS (
          SELECT
             path
           , SUM(fair_assign) AS sum_fair
           , SUM(first_assign) AS sum_first
           , SUM(last_assign) AS sum_last
           , SUM(decrease_assign) AS sum_dec
           , SUM(increase_assign) AS sum_inc
          FROM
             activity_log_with_conversion_assing
          GROUP BY
             path
        )
        SELECT *
        FROM
           page_total_values
        ;
        """

select(query_159)

Unnamed: 0,path,sum_fair,sum_first,sum_last,sum_dec,sum_inc
0,/detail,500.0,0.0,1000.0,666.666667,333.333333
1,/search_list,1500.0,2000.0,1000.0,1333.333333,1666.666667


### [15-10] 경로들의 평균 페이지 가치를 구하는 쿼리

In [46]:
query_1510 = """
        WITH
        activity_log_with_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
              -- 성과를 발생시키는 컨버전 페이지의 이전 접근에 플래그 추가하기
              -- 세션별로 /complete포함 이전 타임라인에 1플래그
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_conversion
           FROM
              activity_log
        )
        , activity_log_with_conversion_assing AS (
          SELECT
             session
           , stamp
           , path
             -- 성과에 이르기까지의 접근 로그를 오름차순으로 정렬하기
           , ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC) AS asc_order
             -- 성과에 이르기까지의 접근 로그를 내림차순으로 정렬하기
           , ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC) AS desc_order
             -- 성과에 이르기까지의 접근 수 세기
           , COUNT(*) OVER(PARTITION BY session) AS page_count
             -- 1. 성과에 이르기까지의 접근 로그에 균등한 가치 부여하기
           , 1000.0 / COUNT(*) OVER(PARTITION BY session) AS fair_assign
             -- 2. 성과에 이르기까지의 접근 로그의 첫 페이지에 가치 부여하기
           , CASE
              WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC) = 1
               THEN 1000.0
              ELSE 0.0
             END AS first_assign
             -- 3. 성과에 이르기까지의 접근 로그의 마지막 페이지에 가치 부여하기
           , CASE
              WHEN ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC) = 1
               THEN 1000.0
              ELSE 0.0
             END AS last_assign
             -- 4. 성과에 이르기까지의 접근 로그의 성과 지점에서 가까운 페이지에 높은 가치 부여하기
           , 1000.0
             * ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp ASC)
               -- 순번 합계로 나누기 (N*(N+1)/2)
             / ( COUNT(*) OVER(PARTITION BY session)
                *(COUNT(*) OVER(PARTITION BY session) + 1)
                / 2)
             AS decrease_assign
             -- 5. 성과에 이르기까지의 접근 로그의 성과 지점에서 먼 페이지에 높은 가치 부여하기
           , 1000.0
             * ROW_NUMBER() OVER(PARTITION BY session ORDER BY stamp DESC)
               -- 순번 합계로 나누기 (N*(N+1)/2)
             / ( COUNT(*) OVER(PARTITION BY session)
                *(COUNT(*) OVER(PARTITION BY session) + 1)
                / 2)
             AS increase_assign
          FROM
             activity_log_with_conversion_flag
          WHERE
             -- 컨버전으로 이어지는 세션 로그만 추출하기
             has_conversion = 1
             -- 입력, 확인, 완료 페이지 제외하기
             AND path NOT IN ('/input', '/confirm', '/complete')
        )
        , page_total_values AS (
          SELECT
             path
           , SUM(fair_assign) AS sum_fair
           , SUM(first_assign) AS sum_first
           , SUM(last_assign) AS sum_last
           , SUM(decrease_assign) AS sum_dec
           , SUM(increase_assign) AS sum_inc
          FROM
             activity_log_with_conversion_assing
          GROUP BY
             path
        )
        , page_total_cnt AS (
          SELECT
             path
             -- 페이지 뷰
           , COUNT(*) AS access_cnt
          FROM
             activity_log
          GROUP BY
             path
        )
        SELECT
           -- 한 번의 방문에 따른 페이지 가치 계산하기
           s.path
         , s.access_cnt
         , v.sum_fair / s.access_cnt AS avg_fair
         , v.sum_first / s.access_cnt AS avg_first
         , v.sum_last / s.access_cnt AS avg_last
         , v.sum_dec / s.access_cnt AS avg_dec
         , v.sum_inc / s.access_cnt AS avg_inc
        FROM
           page_total_cnt AS s
        JOIN
           page_total_values AS v
         ON s.path = v.path
        ORDER BY
           s.access_cnt DESC
        ;
        """

# Run the per-page value query through the select() helper (pd.read_sql on the
# shared connection) so the resulting DataFrame is shown as the cell output.
select(query_1510)

Unnamed: 0,path,access_cnt,avg_fair,avg_first,avg_last,avg_dec,avg_inc
0,/detail,11,45.454545,0.0,90.909091,60.606061,30.30303
1,/search_list,10,150.0,200.0,100.0,133.333333,166.666667


### [15-11] 클릭 플래그와 컨버전 플래그를 계산하는 쿼리

CTR(Click Through Rate) : 검색 결과 페이지(/search_list) 접근 중 이후 상세 페이지(/detail)로 이동한 비율

CVR(Conversion Rate) : 상세 페이지(/detail)로 이동한 접근 중 성과(/complete)로 이어진 비율

In [49]:
# Recipe 15-11: attach a click flag and a conversion flag to every activity_log row.
#
# Both flags are SIGN(SUM(...)) over a window that is partitioned by session,
# ordered by stamp DESC, and framed as UNBOUNDED PRECEDING .. CURRENT ROW. In
# descending order that frame holds the current row plus the rows that sort
# before it, i.e. the accesses with a larger stamp (the query does not define
# an order among rows with equal stamps in the same session).
#   has_session_click      -> 1 when a '/detail' row exists in that frame, else 0
#   has_session_conversion -> 1 when a '/complete' row exists in that frame, else 0
# SIGN collapses any positive running count to 1, which turns the count into a flag.
#
# The outer SELECT exposes the flags as click / cnv and sorts by session, stamp.
query_1511 = """
        WITH
        activity_log_with_session_click_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
            , search_type
              -- 상세 페이지 이전 접근에 플래그 추가하기
            , SIGN(SUM(CASE WHEN path = '/detail' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_session_click
              -- 성과를 발생시키는 페이지 이전 접근에 플래그 추가하기
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_session_conversion
           FROM
              activity_log
        )
        SELECT
           session
         , stamp
         , path
         , search_type
         , has_session_click AS click
         , has_session_conversion AS cnv
        FROM
           activity_log_with_session_click_conversion_flag
        ORDER BY
           session, stamp
        ;
        """

# Execute the query via select() (pd.read_sql) and display the flagged rows.
select(query_1511)

Unnamed: 0,session,stamp,path,search_type,click,cnv
0,0fe39581,2017-01-09 12:18:43,/search_list,Area-S,0.0,0.0
1,111f2996,2017-01-09 12:18:43,/search_list,Pref,0.0,0.0
2,111f2996,2017-01-09 12:19:11,/search_input,,0.0,0.0
3,111f2996,2017-01-09 12:20:10,/,,0.0,0.0
4,111f2996,2017-01-09 12:21:14,/search_input,,0.0,0.0
...,...,...,...,...,...,...
31,9afaf87c,2017-01-09 12:23:00,/complete,,0.0,1.0
32,cabf98e8,2017-01-09 12:18:43,/search_input,,0.0,0.0
33,d45ec190,2017-01-09 12:18:43,/detail,,1.0,0.0
34,eee2bb21,2017-01-09 12:18:43,/detail,,1.0,0.0


### [15-12] 검색 타입별 CTR, CVR을 집계하는 쿼리

In [50]:
# Recipe 15-12: aggregate CTR and CVR per search_type.
#
# The CTE computes two per-row flags as SIGN(SUM(...)) over a window that is
# partitioned by session, ordered by stamp DESC, and framed as
# UNBOUNDED PRECEDING .. CURRENT ROW (the current row plus the rows sorting
# before it in descending-stamp order):
#   has_session_click      -> 1 when a '/detail' row exists in that frame, else 0
#   has_session_conversion -> 1 when a '/complete' row exists in that frame, else 0
#
# The outer query keeps only path = '/search_list' rows and groups by search_type:
#   count      -> number of '/search_list' rows in the group
#   detail     -> SUM of the click flag
#   ctr        -> AVG of the click flag
#   conversion -> SUM of the conversion flag over rows whose click flag is 1
#   cvr        -> AVG of the conversion flag over those same rows
# The CASE expressions have no ELSE, so rows with click = 0 yield NULL and are
# skipped by SUM/AVG; a search_type without any clicked row therefore gets NULL
# for both conversion and cvr.
# Ordering is by count DESC only, so the order among groups with equal counts
# is not fixed by the query.
query_1512 = """
        WITH
        activity_log_with_session_click_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
            , search_type
              -- 상세 페이지 이전 접근에 플래그 추가하기
            , SIGN(SUM(CASE WHEN path = '/detail' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_session_click
              -- 성과를 발생시키는 페이지 이전 접근에 플래그 추가하기
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_session_conversion
           FROM
              activity_log
        )
        SELECT
           search_type
         , COUNT(*) AS count
         , SUM(has_session_click) AS detail
         , AVG(has_session_click) AS ctr
         , SUM(CASE WHEN has_session_click = 1 THEN has_session_conversion END) AS conversion
         , AVG(CASE WHEN has_session_click = 1 THEN has_session_conversion END) AS cvr         
        FROM
           activity_log_with_session_click_conversion_flag
        WHERE
           path = '/search_list'
        GROUP BY
           search_type
        ORDER BY
           count DESC
        ;
        """

# Execute the query via select() (pd.read_sql) and display one row per search_type.
select(query_1512)

Unnamed: 0,search_type,count,detail,ctr,conversion,cvr
0,Area-S,2,0.0,0.0,,
1,Pref,2,0.0,0.0,,
2,Area-L-with-Job,1,0.0,0.0,,
3,Line,1,1.0,1.0,0.0,0.0
4,Pref-with-Job,1,1.0,1.0,0.0,0.0
5,,1,1.0,1.0,1.0,1.0
6,Station-with-Job,1,1.0,1.0,0.0,0.0
7,Area-L,1,0.0,0.0,,


### [15-13] 클릭 플래그를 직전 페이지에 한정하는 쿼리

In [51]:
# Recipe 15-13: limit the click flag to the access immediately before '/detail'.
#
# has_session_click uses LAG(path) over a window partitioned by session and
# ordered by stamp DESC. In descending order the "previous" row is the one with
# the next larger stamp, so LAG returns the path of the access that follows the
# current row chronologically (assuming stamps are distinct within a session).
# The flag is 1 only when that path is '/detail'; otherwise, including when LAG
# returns NULL because no such row exists, the CASE falls through to 0.
#
# has_session_conversion is SIGN(SUM(...)) over the same partition and order,
# framed as UNBOUNDED PRECEDING .. CURRENT ROW: 1 when a '/complete' row exists
# at the current row or among the rows sorting before it in descending-stamp
# order, else 0.
#
# The outer SELECT exposes the flags as click / cnv and sorts by session, stamp.
query_1513 = """
        WITH
        activity_log_with_session_click_conversion_flag AS (
           SELECT
              session
            , stamp
            , path
            , search_type
              -- 상세 페이지의 직전 접근에 플래그 추가하기
            , CASE
               WHEN LAG(path) OVER(PARTITION BY session ORDER BY stamp DESC) = '/detail'
                THEN 1
               ELSE 0
              END AS has_session_click
              -- 성과를 발생시키는 페이지 이전 접근에 플래그 추가하기
            , SIGN(SUM(CASE WHEN path = '/complete' THEN 1 ELSE 0 END)
               OVER(PARTITION BY session ORDER BY stamp DESC
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))
              AS has_session_conversion
           FROM
              activity_log
        )
        SELECT
           session
         , stamp
         , path
         , search_type
         , has_session_click AS click
         , has_session_conversion AS cnv
        FROM
           activity_log_with_session_click_conversion_flag
        ORDER BY
           session, stamp
        ;
        """

# Execute the query via select() (pd.read_sql) and display the flagged rows.
select(query_1513)

Unnamed: 0,session,stamp,path,search_type,click,cnv
0,0fe39581,2017-01-09 12:18:43,/search_list,Area-S,0,0.0
1,111f2996,2017-01-09 12:18:43,/search_list,Pref,0,0.0
2,111f2996,2017-01-09 12:19:11,/search_input,,0,0.0
3,111f2996,2017-01-09 12:20:10,/,,0,0.0
4,111f2996,2017-01-09 12:21:14,/search_input,,0,0.0
...,...,...,...,...,...,...
31,9afaf87c,2017-01-09 12:23:00,/complete,,0,1.0
32,cabf98e8,2017-01-09 12:18:43,/search_input,,0,0.0
33,d45ec190,2017-01-09 12:18:43,/detail,,0,0.0
34,eee2bb21,2017-01-09 12:18:43,/detail,,0,0.0
