## 데이터 분석을 위한 SQL 레시피

Data source : https://hanbit.co.kr/support/supplement_survey.html?pcode=B8585882565

System : PostgreSQL

In [2]:
import pandas as pd
import psycopg2 as pg2
from sqlalchemy import create_engine

# SQLAlchemy engine pointing at the local `postgresql_analysis` database.
engine = create_engine('postgresql://testuser:testpass@localhost:5432/postgresql_analysis')

# Keyword arguments for the separate psycopg2 connection to the same database.
_pg_settings = {
    'host': 'localhost',
    'user': 'testuser',
    'password': 'testpass',
    'database': 'postgresql_analysis',
}

con = pg2.connect(**_pg_settings)
# Autocommit: each statement is committed as soon as it is executed.
con.autocommit = True
cur = con.cursor()

In [3]:
def select(query):
    """Run ``query`` through the module-level connection ``con``.

    Args:
        query: SQL text handed unchanged to ``pandas.read_sql``.

    Returns:
        The object produced by ``pandas.read_sql`` for that query.
    """
    result = pd.read_sql(sql=query, con=con)
    return result

### [5-1] 코드를 레이블로 변경하는 쿼리

In [9]:
# Show every column of every row in mst_users (no ORDER BY, so row order is not guaranteed).
select('SELECT * FROM mst_users;')

Unnamed: 0,user_id,register_date,register_device
0,U001,2016-08-26,1
1,U002,2016-08-26,2
2,U003,2016-08-27,3


In [4]:
# [5-1] Replace the numeric register_device code with a readable label.
#   1 -> '데스크톱' (desktop), 2 -> '스마트폰' (smartphone), 3 -> '애플리케이션' (application).
#   Any other code, including NULL, falls through to the ELSE branch and becomes ''.
query_51 = """
        SELECT 
          user_id
        , CASE
           WHEN register_device  = 1 THEN '데스크톱'
           WHEN register_device  = 2 THEN '스마트폰'
           WHEN register_device  = 3 THEN '애플리케이션'
           ELSE ''
          END AS device_name
        FROM mst_users
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_51)

Unnamed: 0,user_id,device_name
0,U001,데스크톱
1,U002,스마트폰
2,U003,애플리케이션


### [5-2] 레퍼러 도메인을 추출하는 쿼리

In [11]:
# Show every column of every row in access_log (no ORDER BY, so row order is not guaranteed).
select('SELECT * FROM access_log;')

Unnamed: 0,stamp,referrer,url
0,2016-08-26 12:02:00,http://www.other.com/path1/index.php?k1=v1&k2=...,http://www.example.com/video/detail?id=001
1,2016-08-26 12:02:01,http://www.other.net/path1/index.php?k1=v1&k2=...,http://www.example.com/video#ref
2,2016-08-26 12:02:01,https://www.other.com/,http://www.example.com/book/detail?id=002


In [12]:
# [5-2] Extract the host of each referrer URL.
#   substring(... from pattern) returns the text matched by the parenthesised group:
#   everything after "http://" or "https://" up to, but not including, the next "/".
query_52 = """
        SELECT 
          stamp
        , substring(referrer from 'https?://([^/]*)') AS referrer_host
        FROM access_log
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_52)

Unnamed: 0,stamp,referrer_host
0,2016-08-26 12:02:00,www.other.com
1,2016-08-26 12:02:01,www.other.net
2,2016-08-26 12:02:01,www.other.com


### [5-3] URL 경로와 GET 매개변수에 있는 특정 키 값을 추출하는 쿼리

In [15]:
# [5-3] Extract the URL path and the value of the `id` GET parameter from access_log.url.
#   path: the text after the host, up to (not including) the first "?" or "#".
#   id:   the value of the `id` query parameter, or NULL when the URL has none.
#         The pattern requires "?" or "&" immediately before "id=" so that keys which
#         merely end in "id" (for example "uid=" or "video_id=") are not matched, and the
#         captured value stops at "&" or "#" so a trailing URL fragment is not included.
query_53 = """
        SELECT
          stamp
        , url
        , substring(url from '//[^/]+([^?#]+)') AS path
        , substring(url from '[?&]id=([^&#]*)') AS id
        FROM access_log
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_53)

Unnamed: 0,stamp,url,path,id
0,2016-08-26 12:02:00,http://www.example.com/video/detail?id=001,/video/detail,1.0
1,2016-08-26 12:02:01,http://www.example.com/video#ref,/video,
2,2016-08-26 12:02:01,http://www.example.com/book/detail?id=002,/book/detail,2.0


### [5-4] URL 경로를 슬래시로 분할해서 계층을 추출하는 쿼리

In [17]:
# [5-4] Split the URL path on "/" to get its first two levels.
#   The inner substring() yields the path (text after the host, before "?" or "#").
#   That path starts with "/", so split_part field 1 is the empty text before the
#   leading slash; field 2 is the first level and field 3 the second level.
#   split_part returns an empty string when the requested field does not exist.
query_54 = """
        SELECT 
          stamp
        , url
        , split_part(substring(url from '//[^/]+([^?#]+)'), '/', 2) AS path1
        , split_part(substring(url from '//[^/]+([^?#]+)'), '/', 3) AS path2
        FROM access_log
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_54)

Unnamed: 0,stamp,url,path1,path2
0,2016-08-26 12:02:00,http://www.example.com/video/detail?id=001,video,detail
1,2016-08-26 12:02:01,http://www.example.com/video#ref,video,
2,2016-08-26 12:02:01,http://www.example.com/book/detail?id=002,book,detail


### [5-5] 현재 날짜와 타임스탬프를 추출하는 쿼리

In [18]:
# [5-5] Return the current date (dt) and the current timestamp (stamp) as one row.
#   No FROM clause is needed; both values come from the database server.
query_55 = """
        SELECT
          CURRENT_DATE AS dt
        , CURRENT_TIMESTAMP AS stamp
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_55)

Unnamed: 0,dt,stamp
0,2022-10-28,2022-10-28 06:13:37.423744+00:00


### [5-6] 문자열을 날짜 자료형, 타임스탬프 자료형으로 변환하는 쿼리

In [20]:
# [5-6] Convert string literals to the date and timestamp types with CAST.
query_56 = """
        SELECT 
          CAST('2016-01-30' AS date) AS dt
        , CAST('2016-01-30 12:00:00' AS timestamp) AS stamp
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_56)

Unnamed: 0,dt,stamp
0,2016-01-30,2016-01-30 12:00:00


### [5-7] 타임스탬프 자료형의 데이터에서 연, 월, 일 등을 추출하는 쿼리

In [21]:
# [5-7] Pull year, month, day and hour out of a timestamp value with EXTRACT.
#   The derived table `t` supplies a single literal timestamp as the input row.
query_57 = """
        SELECT 
           stamp
         , EXTRACT(YEAR FROM stamp) AS year
         , EXTRACT(MONTH FROM stamp) AS month
         , EXTRACT(DAY FROM stamp) AS day
         , EXTRACT(HOUR FROM stamp) AS hour
        FROM
         (SELECT CAST('2016-01-30 12:00:00' AS timestamp) AS stamp) AS t
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_57)

Unnamed: 0,stamp,year,month,day,hour
0,2016-01-30 12:00:00,2016.0,1.0,30.0,12.0


### [5-8] 타임스탬프를 나타내는 문자열에서 연, 월, 일 등을 추출하는 쿼리

In [23]:
# [5-8] Pull date parts out of a timestamp held as text, using fixed character positions.
#   The positions assume the 'YYYY-MM-DD HH:MM:SS' layout of the literal in the derived table:
#   chars 1-4 year, 6-7 month, 9-10 day, 12-13 hour, 1-7 year-month.
#   The results are text, not numbers.
query_58 = """
        SELECT 
           stamp
         , substring(stamp, 1, 4) AS year
         , substring(stamp, 6, 2) AS month
         , substring(stamp, 9, 2) AS day
         , substring(stamp, 12, 2) AS hour
         , substring(stamp, 1, 7) AS year_month
        FROM
         (SELECT CAST('2016-01-30 12:00:00' AS text) AS stamp) AS t
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_58)

Unnamed: 0,stamp,year,month,day,hour,year_month
0,2016-01-30 12:00:00,2016,1,30,12,2016-01


### [5-9] 구매액에서 할인 쿠폰 값을 제외한 매출 금액을 구하는 쿼리

In [24]:
# Show every column of every row in purchase_log_with_coupon (no ORDER BY, so row order is not guaranteed).
select('SELECT * FROM purchase_log_with_coupon;')

Unnamed: 0,purchase_id,amount,coupon
0,10001,3280,
1,10002,4650,500.0
2,10003,3870,


In [27]:
# [5-9] Compute the sales amount after subtracting the coupon value, in two ways.
#   discount_amount1: plain subtraction; a NULL coupon makes the whole result NULL.
#   discount_amount2: COALESCE replaces a NULL coupon with 0 first, so the amount is kept.
#   (The SQL comment inside the query says: "when NULL, substitute 0 using COALESCE".)
query_59 = """
        SELECT 
           purchase_id
         , amount
         , coupon
         , amount - coupon AS discount_amount1
         , amount - COALESCE(coupon, 0) AS discount_amount2
         -- NULL일 때 COALESCE 함수를 사용해 0으로 대치
        FROM
           purchase_log_with_coupon
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_59)

Unnamed: 0,purchase_id,amount,coupon,discount_amount1,discount_amount2
0,10001,3280,,,3280
1,10002,4650,500.0,4150.0,4150
2,10003,3870,,,3870


### [6-1] 문자열을 연결하는 쿼리

In [28]:
# Show every column of every row in mst_user_location (no ORDER BY, so row order is not guaranteed).
select('SELECT * FROM mst_user_location;')

Unnamed: 0,user_id,pref_name,city_name
0,U001,서울특별시,강서구
1,U002,경기도수원시,장안구
2,U003,제주특별자치도,서귀포시


In [29]:
# [6-1] Join pref_name and city_name into one string per user, with no separator.
#   NOTE(review): CONCAT's handling of NULL arguments differs between engines;
#   PostgreSQL skips NULL arguments instead of returning NULL.
query_61 = """
        SELECT 
           user_id
         , CONCAT(pref_name, city_name) AS pref_city
        FROM
           mst_user_location
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_61)

Unnamed: 0,user_id,pref_city
0,U001,서울특별시강서구
1,U002,경기도수원시장안구
2,U003,제주특별자치도서귀포시


### [6-2] q1, q2 컬럼을 비교하는 쿼리

In [30]:
# Show every column of every row in quarterly_sales (no ORDER BY, so row order is not guaranteed).
select('SELECT * FROM quarterly_sales;')

Unnamed: 0,year,q1,q2,q3,q4
0,2015,82000,83000,78000.0,83000.0
1,2016,85000,85000,80000.0,81000.0
2,2017,92000,81000,,


In [34]:
# [6-2] Compare the q1 and q2 columns of each year in three ways.
#   judge_q1_q2: '+' when q2 is larger, ' ' when equal, '-' otherwise.
#                Because '-' is the ELSE branch, a NULL q1 or q2 would also yield '-'.
#   diff_q2_q1:  the raw difference q2 - q1.
#   sign_q2_q1:  SIGN of that difference, i.e. 1, 0 or -1.
#   (The SQL comment inside the query says: "express the q1-to-q2 sales change as 1, 0, -1".)
query_62 = """
        SELECT 
           year
         , q1
         , q2
         , CASE
            WHEN q1 < q2 THEN '+'
            WHEN q1 = q2 THEN ' '
            ELSE '-'
           END AS judge_q1_q2
         , q2 - q1 AS diff_q2_q1
         -- q1과 q2의 매출 변화를 1, 0, -1로 표현하기
         , SIGN(q2 - q1) AS sign_q2_q1
        FROM
           quarterly_sales
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_62)

Unnamed: 0,year,q1,q2,judge_q1_q2,diff_q2_q1,sign_q2_q1
0,2015,82000,83000,+,1000,1.0
1,2016,85000,85000,,0,0.0
2,2017,92000,81000,-,-11000,-1.0


### [6-3] 연간 최대/최소 4분기 매출을 찾는 쿼리

In [33]:
# [6-3] Find the largest and smallest quarterly sales value within each year.
#   greatest()/least() compare the four columns of the same row (not an aggregate over rows).
#   In PostgreSQL, NULL arguments are ignored; the result is NULL only if every argument is NULL.
query_63 = """
        SELECT 
           year
         , greatest(q1, q2, q3, q4) AS greatest_sales
         , least(q1, q2, q3, q4) AS least_sales
        FROM
           quarterly_sales
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_63)

Unnamed: 0,year,greatest_sales,least_sales
0,2015,83000,78000
1,2016,85000,80000
2,2017,92000,81000


### [6-4] 단순한 연산으로 평균 4분기 매출을 구하는 쿼리

In [35]:
# [6-4] Average of the four quarterly sales columns by simple arithmetic.
#   If any quarter is NULL the sum, and therefore the average, is NULL for that year.
#   The divisor is written as 4.0 rather than 4: with integer columns, "/ 4" is integer
#   division and would silently drop the fractional part of the average.
query_64 = """
        SELECT
           year
         , (q1 + q2 + q3 + q4) / 4.0 AS average
        FROM
           quarterly_sales
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_64)

Unnamed: 0,year,average
0,2015,81500.0
1,2016,82750.0
2,2017,


### [6-5] COALESCE를 사용해 NULL을 0으로 변환하고 평균값을 구하는 쿼리

In [36]:
# [6-5] Average of the four quarterly sales columns with NULL treated as 0.
#   COALESCE replaces a NULL quarter with 0, so the sum is never NULL, but the divisor
#   stays 4 even for years with missing quarters (those years are averaged over 4).
#   The divisor is written as 4.0 rather than 4: with integer columns, "/ 4" is integer
#   division and would silently drop the fractional part of the average.
query_65 = """
        SELECT
           year
         , (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) / 4.0
           AS average
        FROM
           quarterly_sales
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_65)

Unnamed: 0,year,average
0,2015,81500
1,2016,82750
2,2017,43250


### [6-6] NULL이 아닌 컬럼만 사용해서 평균값을 구하는 쿼리

In [38]:
# [6-6] Average of the quarterly sales over only the quarters that are not NULL.
#   Numerator:   sum of the four quarters, with NULL quarters contributing 0 (COALESCE).
#                It is cast to double precision so the division is not integer division.
#   Denominator: the number of non-NULL quarters, counted with IS NOT NULL.
#                Counting with SIGN(COALESCE(q, 0)) instead would skip a reported value
#                of 0 and subtract one for a negative value.
#   NULLIF(..., 0) turns a count of 0 into NULL, so a year with no values at all yields
#   a NULL average instead of a division-by-zero error.
query_66 = """
        SELECT
           year
         , CAST(COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)
                AS double precision)
           / NULLIF(
                 CASE WHEN q1 IS NOT NULL THEN 1 ELSE 0 END
               + CASE WHEN q2 IS NOT NULL THEN 1 ELSE 0 END
               + CASE WHEN q3 IS NOT NULL THEN 1 ELSE 0 END
               + CASE WHEN q4 IS NOT NULL THEN 1 ELSE 0 END
             , 0)
           AS average
        FROM
           quarterly_sales
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_66)

Unnamed: 0,year,average
0,2015,81500.0
1,2016,82750.0
2,2017,86500.0


### [6-7] 정수 자료형의 데이터로 나누는 쿼리

In [41]:
# Show every column of every row in advertising_stats (no ORDER BY, so row order is not guaranteed).
select('SELECT * FROM advertising_stats;')

Unnamed: 0,dt,ad_id,impressions,clicks
0,2017-04-01,1,100000,3000
1,2017-04-01,2,120000,1200
2,2017-04-01,3,500000,10000
3,2017-04-02,1,0,0
4,2017-04-02,2,130000,1400
5,2017-04-02,3,620000,15000


In [46]:
# [6-7] Click-through rate for the ads of 2017-04-01, avoiding integer division.
#   ctr:            clicks is cast to double precision before dividing by impressions.
#   ctr_as_percent: multiplying by the literal 100.0 first makes the division non-integer.
#   (The SQL comments inside the query say: "integer / integer gives 0" and
#   "putting a real-number constant first triggers an implicit type conversion".)
#   NOTE(review): there is no guard against impressions = 0 here; the query relies on the
#   rows selected by the dt filter having non-zero impressions — confirm against the data.
query_67 = """
        SELECT
           dt
         , ad_id
         -- 정수 자료형 / 정수 자료형 -> 결과값 0이 나옴
         , CAST(clicks AS double precision) / impressions AS ctr
         -- 실수를 상수로 앞에 두고 계산하면 암묵적으로 자료형 변환이 일어남   
         , 100.0 * clicks / impressions AS ctr_as_percent
        FROM
           advertising_stats
        WHERE
           dt = '2017-04-01'
        ORDER BY
           ad_id
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_67)

Unnamed: 0,dt,ad_id,ctr,ctr_as_percent
0,2017-04-01,1,0.03,3.0
1,2017-04-01,2,0.01,1.0
2,2017-04-01,3,0.02,2.0


### [6-8] 0으로 나누는 것을 피해 CTR을 계산하는 쿼리

In [50]:
# [6-8] Click-through rate (percent) for every row, without dividing by zero. Two variants:
#   ctr_as_percent_by_case: CASE only divides when impressions > 0; with no ELSE branch,
#                           all other rows get NULL.
#   ctr_as_percent_ny_null: NULLIF turns a 0 denominator into NULL, so the division
#                           returns NULL instead of raising an error.
#   (The SQL comment inside the query says: "convert a zero denominator to NULL so that
#   no division by zero happens".)
#   NOTE(review): the alias "ctr_as_percent_ny_null" looks like a typo for "..._by_null";
#   it is left unchanged here because renaming it changes the result's column name.
query_68 = """
        SELECT
           dt
         , ad_id
         , CASE
            WHEN impressions > 0 THEN 100.0 * clicks / impressions
           END AS ctr_as_percent_by_case
         -- 분모가 0이라면 NULL로 변환해서, 0으로 나누지 않게 만드는 방법
         , 100.0 * clicks / NULLIF(impressions, 0) AS ctr_as_percent_ny_null
        FROM
           advertising_stats
        ORDER BY
           dt, ad_id
        ;
        """

# Last expression of the cell: the notebook displays the returned result.
select(query_68)

Unnamed: 0,dt,ad_id,ctr_as_percent_by_case,ctr_as_percent_ny_null
0,2017-04-01,1,3.0,3.0
1,2017-04-01,2,1.0,1.0
2,2017-04-01,3,2.0,2.0
3,2017-04-02,1,,
4,2017-04-02,2,1.076923,1.076923
5,2017-04-02,3,2.419355,2.419355
