#### Sglplot 라이브러리를 활용한 통계 기반 쿼리 분석 수행

- 인덱스 설계를 위해 자주 사용되는 컬럼을 알아보기 위해서 해당 분석 수행

In [1]:
import pandas as pd 
import json
import sqlglot
from sqlglot import exp
from collections import Counter

In [2]:
df = pd.DataFrame()

In [3]:
# 데이터 출처 : https://github.com/glee4810/ehrsql-2024/tree/master/data/mimic_iv

with open(r"C:\last_project\mimic_query_data\mimic_iv\train\label.json", "r", encoding='utf-8') as f:
    train = json.load(f)

with open(r"C:\last_project\mimic_query_data\mimic_iv\valid\label.json", "r", encoding='utf-8') as f:
    valid = json.load(f)

with open(r"C:\last_project\mimic_query_data\mimic_iv\test\label.json", "r", encoding='utf-8') as f:
    test = json.load(f)

In [4]:
for i in [train, valid, test]:
    l = [[j] for j in list(i.values())]
    x = pd.DataFrame(l, columns=['쿼리'])
    df = pd.concat([df,x])

df.head()

Unnamed: 0,쿼리
0,SELECT DISTINCT prescriptions.route FROM presc...
1,SELECT DISTINCT prescriptions.route FROM presc...
2,SELECT DISTINCT prescriptions.route FROM presc...
3,SELECT DISTINCT prescriptions.route FROM presc...
4,SELECT DISTINCT prescriptions.route FROM presc...


In [5]:
df = df[(~ df['쿼리'].isnull()) & (~ df['쿼리'].str.contains('null'))]
df.shape

(6519, 1)

In [6]:
# where절에 자주 쓰이는 컬럼 추출하기

def extract_oracle_access(sql, table_name):
    tree = sqlglot.parse_one(sql, read='oracle')

    eq_cols = []
    in_cols = []
    orderby_cols = []
    range_cols = []
    where_cols = []

    for node in tree.walk():
        # = 조건
        if isinstance(node, exp.EQ):
            col = node.left
            if isinstance(col, exp.Column) and col.table.lower() == table_name:
                eq_cols.append(col.name.lower())

        # IN 조건
        if isinstance(node, exp.In):
            col = node.this
            if isinstance(col, exp.Column) and col.table.lower() == table_name:
                in_cols.append(col.name.lower())
                

        # ORDER BY
        if isinstance(node, exp.Order):
            for e in node.expressions:
                col = e.this
                if isinstance(col, exp.Column) and col.table.lower() == table_name:
                    orderby_cols.append(col.name.lower())
        
        if (isinstance(node, exp.Between) or isinstance(node, exp.GT) or isinstance(node, exp.LT)):
            col = node.this
            if isinstance(col, exp.Column) and col.table.lower() == table_name:
                range_cols.append(col.name.lower())

    for where in tree.find_all(exp.Where):
        for node in where.walk():
            if isinstance(node, exp.Column) and node.table.lower() == table_name:
                where_cols.append(node.name.lower())

    return {
        "eq_cols": eq_cols,
        "in_cols": in_cols,
        "orderby_cols": orderby_cols,
        "range_cols": range_cols,
        "where_cols" : where_cols
    }

In [7]:
table_name = 'labevents'

clean_q = (
    df[df['쿼리'].str.contains(table_name)]['쿼리']
    .str.replace('\n', '', regex=True)
)

eq_cols = []
in_cols = []
orderby_cols = []
range_cols = []
where_cols = []

fail = 0

for query in clean_q:
    try:
        cond = extract_oracle_access(query, table_name)
        eq_cols += cond['eq_cols']
        in_cols += cond['in_cols']
        orderby_cols += cond['orderby_cols']
        range_cols += cond['range_cols']
        where_cols += cond['where_cols']
    except:
        fail += 1
        continue

print(f'에러난 쿼리 개수 : {fail}')

에러난 쿼리 개수 : 132


In [8]:
eq_cols_cnt = Counter(eq_cols) #  = 조건 쓴 컬럼이름
in_cols_cnt = Counter(in_cols) # in 조건 쓴 컬럼이름
orderby_cols_cnt = Counter(orderby_cols) # order by 쓴 컬럼 이름
range_cols_cnt = Counter(range_cols) # between, >= 등 조건 쓴 컬럼 이름
where_cols_cnt = Counter(where_cols) # where 절 안에 있는 전체 컬럼

In [9]:
statistic = [eq_cols_cnt, in_cols_cnt, orderby_cols_cnt, range_cols_cnt, where_cols_cnt]
name = ['= 조건에 쓰인 컬럼','in 조건 쓴 컬럼','order by 쓴 컬럼','범위 조건 쓴 컬럼', 'where 절 전체 컬럼']

for i in range(5):
    print(name[i], statistic[i])

= 조건에 쓰인 컬럼 Counter({'charttime': 272, 'hadm_id': 53})
in 조건 쓴 컬럼 Counter({'itemid': 981, 'hadm_id': 948})
order by 쓴 컬럼 Counter({'charttime': 453, 'valuenum': 137})
범위 조건 쓴 컬럼 Counter()
where 절 전체 컬럼 Counter({'itemid': 1554, 'hadm_id': 1376, 'charttime': 1357, 'row_id': 7})
