In [1]:
import hashlib
import re

import pandas as pd
import sqlparse

In [2]:
# Path of the PostgreSQL CSV-format log file that is loaded with pd.read_csv below.
pgfile = 'data/extracted/simple/postgresql-2021-12-06_160210.csv'

In [3]:
# Column names of a PostgreSQL 13 csvlog line, as documented at:
# https://www.postgresql.org/docs/13/runtime-config-logging.html#RUNTIME-CONFIG-LOGGING-CSVLOG
# The log file is read with header=None, so these names are supplied to
# pd.read_csv through its `names` argument and must stay in file order.
PG_LOG_COLUMNS = [
    'log_time',
    'user_name',
    'database_name',
    'process_id',
    'connection_from',
    'session_id',
    'session_line_num',
    'command_tag',
    'session_start_time',
    'virtual_transaction_id',
    'transaction_id',
    'error_severity',
    'sql_state_code',
    'message',
    'detail',
    'hint',
    'internal_query',
    'internal_query_pos',
    'context',
    'query',
    'query_pos',
    'location',
    'application_name',
    'backend_type',
]


# The csvlog file carries no header row, so the column names are supplied
# explicitly and the two timestamp columns are parsed into datetimes.
read_options = {
    'names': PG_LOG_COLUMNS,
    'parse_dates': ['log_time', 'session_start_time'],
    'header': None,
    'index_col': False,
}
df = pd.read_csv(pgfile, **read_options)

# Quick sanity summary: dimensions, column labels, distinct command tags.
for summary in (df.shape, df.columns, set(df['command_tag'])):
    print(summary)

(14805, 24)
Index(['log_time', 'user_name', 'database_name', 'process_id',
       'connection_from', 'session_id', 'session_line_num', 'command_tag',
       'session_start_time', 'virtual_transaction_id', 'transaction_id',
       'error_severity', 'sql_state_code', 'message', 'detail', 'hint',
       'internal_query', 'internal_query_pos', 'context', 'query', 'query_pos',
       'location', 'application_name', 'backend_type'],
      dtype='object')
{nan, 'UPDATE', 'SET', 'SELECT', 'ROLLBACK', 'SHOW', 'DELETE', 'INSERT', 'COMMIT', 'BEGIN'}


## Extracting the relevant queries.

In [4]:
commands = ['SELECT', 'INSERT', 'UPDATE', 'DELETE']

def extract_query(message):
    """Return the SQL text embedded in a log message.

    The query is taken to start at the earliest occurrence (by position in
    the message) of any keyword in `commands` and to run to the end of the
    message. Choosing the earliest position, rather than the first keyword in
    list order, keeps statements such as ``INSERT INTO t SELECT ...`` or
    ``UPDATE t ... WHERE x IN (SELECT ...)`` intact instead of truncating
    them to the inner SELECT.

    Args:
        message: The log message. Matching is case-sensitive.

    Returns:
        The substring starting at the first keyword, or '' when the message
        is not a string (e.g. NaN from an empty CSV field) or contains none
        of the keywords.
    """
    # pandas represents empty CSV fields as float NaN, which has no .find().
    if not isinstance(message, str):
        return ''

    # Position of every keyword that actually occurs in the message.
    hits = [pos for pos in (message.find(command) for command in commands)
            if pos != -1]
    if not hits:
        return ''
    return message[min(hits):]

# Reduce every log message to the text returned by extract_query
# ('' when the message contains no SELECT/INSERT/UPDATE/DELETE keyword).
# NOTE: this overwrites the `query` column that was loaded from the CSV log.
df['query'] = df['message'].apply(extract_query)
# Display the resulting column.
df['query']

0        UPDATE order_line   SET OL_DELIVERY_D = '2021-...
1        SELECT SUM(OL_AMOUNT) AS OL_TOTAL   FROM order...
2        UPDATE customer   SET C_BALANCE = C_BALANCE + ...
3        SELECT NO_O_ID FROM new_order WHERE NO_D_ID = ...
4        DELETE FROM new_order WHERE NO_O_ID = 2926    ...
                               ...                        
14800                                                     
14801                                                     
14802                                                     
14803                                                     
14804                                                     
Name: query, Length: 14805, dtype: object

## Anonymizer: salt and hash quoted strings that are neither dates nor numbers.

In [5]:
SALT = 'andycannotsay.com'.encode('utf-8')
DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}.*')
DIGITS_REGEX = re.compile(r'\d+\.?\d*')

def anonymize(sql):
    """Replace quoted literals in `sql` with salted SHA-256 digests.

    The text is tokenized with sqlparse. Tokens that are not wrapped in a
    matching pair of single or double quotes are emitted unchanged. A quoted
    token is also emitted unchanged when its content (the text between the
    quotes) starts with a YYYY-MM-DD date or consists entirely of a number.
    Every other quoted token is replaced by ``'<length>\\<sha256 hex>'``,
    where <length> is the length of the content and the digest is computed
    over SALT plus the token including its quotes.

    The date/number checks are applied to the content only and are anchored,
    so a literal that merely contains a digit somewhere (for example
    ``'agent007'``) is hashed rather than left in clear text.

    Args:
        sql: The SQL text to anonymize; may hold several statements.

    Returns:
        The anonymized SQL text, or '' when sqlparse finds no statement.
    """
    # TODO(WAN): sqlparse.parse is actually quite slow.
    # Do we really need this?
    parsed = sqlparse.parse(sql)

    cleaned_tokens = []
    # A message may hold more than one statement; handle each of them instead
    # of aborting the whole run on the first multi-statement message.
    for statement in parsed:
        for token in statement.flatten():
            token = str(token)

            # A lone quote character is not a quoted literal, hence len >= 2.
            is_quoted = (len(token) >= 2
                         and token[0] in ("'", '"')
                         and token[0] == token[-1])
            if not is_quoted:
                cleaned_tokens.append(token)
                continue

            content = token[1:-1]
            is_date = DATE_REGEX.match(content) is not None
            is_digits = DIGITS_REGEX.fullmatch(content) is not None
            if is_date or is_digits:
                cleaned_tokens.append(token)
                continue

            # The digest covers the token with its quotes, as before.
            sha = hashlib.sha256(SALT + token.encode('utf-8')).hexdigest()
            cleaned_tokens.append("'{}\\{}'".format(len(content), sha))

    return ''.join(cleaned_tokens)

# Run the anonymizer over every extracted query and store the result in a
# new `query_anon` column.
df['query_anon'] = df['query'].apply(anonymize)
# Display the resulting column.
df['query_anon']

0        UPDATE order_line   SET OL_DELIVERY_D = '2021-...
1        SELECT SUM(OL_AMOUNT) AS OL_TOTAL   FROM order...
2        UPDATE customer   SET C_BALANCE = C_BALANCE + ...
3        SELECT NO_O_ID FROM new_order WHERE NO_D_ID = ...
4        DELETE FROM new_order WHERE NO_O_ID = 2926    ...
                               ...                        
14800                                                     
14801                                                     
14802                                                     
14803                                                     
14804                                                     
Name: query_anon, Length: 14805, dtype: object

## Pre-processor: extracting query templates.

In [6]:
# Single-quoted string preceded by any character other than a backslash.
STRING_REGEX = r'([^\\])\'((\')|(.*?([^\\])\'))'
# Same as STRING_REGEX, for double-quoted text.
DOUBLE_QUOTE_STRING_REGEX = r'([^\\])"((")|(.*?([^\\])"))'
# Numeric literal (optional sign and fraction). Group 1 is the start of the
# text or one preceding character that cannot be part of an identifier, so
# digits inside names such as S_DIST_01 or table12 are left alone.
INT_REGEX = r'(^|[^a-zA-Z0-9_])-?\d+(\.\d+)?'
# Hashed literal of the form '<digits>\<anything>'.
HASH_REGEX = r'(\'\d+\\.*?\')'

def extract_template(query):
    """Turn a query into a template by replacing its constants.

    Substitutions are applied in this order:
      1. hashed literals matching HASH_REGEX -> ``@@@``
      2. single-quoted strings               -> ``&&&``
      3. double-quoted strings               -> ``&&&``
      4. numeric literals                    -> ``#``

    Hashed literals go first so they are not consumed by the generic string
    pattern; numbers go last so digits inside quoted text are already gone.

    Args:
        query: The query text.

    Returns:
        The query with its constants replaced by the placeholders above.
    """
    template = query
    template = re.sub(HASH_REGEX, r"@@@", template)
    template = re.sub(STRING_REGEX, r"\1&&&", template)
    template = re.sub(DOUBLE_QUOTE_STRING_REGEX, r"\1&&&", template)
    template = re.sub(INT_REGEX, r"\1#", template)
    return template

# Derive a template from every anonymized query and store it in a new
# `query_template` column.
df['query_template'] = df['query_anon'].apply(extract_template)
# Display the resulting column.
df['query_template']

0        UPDATE order_line   SET OL_DELIVERY_D = &&&  W...
1        SELECT SUM(OL_AMOUNT) AS OL_TOTAL   FROM order...
2        UPDATE customer   SET C_BALANCE = C_BALANCE + ...
3        SELECT NO_O_ID FROM new_order WHERE NO_D_ID = ...
4        DELETE FROM new_order WHERE NO_O_ID = #    AND...
                               ...                        
14800                                                     
14801                                                     
14802                                                     
14803                                                     
14804                                                     
Name: query_template, Length: 14805, dtype: object

In [7]:
# Round every log timestamp to the nearest whole second; the rounded value is
# stored in a new `log_time_s` column. The lowercase 's' alias is used because
# the uppercase 'S' spelling is deprecated since pandas 2.2.
df['log_time_s'] = df['log_time'].round('s')
# Display the resulting column.
df['log_time_s']

0       2021-12-06 16:02:11-05:00
1       2021-12-06 16:02:11-05:00
2       2021-12-06 16:02:11-05:00
3       2021-12-06 16:02:11-05:00
4       2021-12-06 16:02:11-05:00
                   ...           
14800   2021-12-06 16:02:54-05:00
14801   2021-12-06 16:02:54-05:00
14802   2021-12-06 16:02:54-05:00
14803   2021-12-06 16:02:54-05:00
14804   2021-12-06 16:02:54-05:00
Name: log_time_s, Length: 14805, dtype: datetime64[ns, pytz.FixedOffset(-300)]

In [8]:
# Bucket the log lines by (query template, rounded log time).
gb = df.groupby(['query_template', 'log_time_s'])
for group_key, group_rows in gb:
    # group_key is the (query_template, log_time_s) pair; only the template
    # part is printed.
    template_text = group_key[0]
    # Number of log lines in this bucket, followed by the template.
    print(len(group_rows), template_text)
    # The original (unrounded) log timestamps that fell into this bucket.
    print(group_rows['log_time'].values)

508 
['2021-12-06T21:02:10.789000000' '2021-12-06T21:02:10.789000000'
 '2021-12-06T21:02:10.791000000' '2021-12-06T21:02:10.791000000'
 '2021-12-06T21:02:10.794000000' '2021-12-06T21:02:10.794000000'
 '2021-12-06T21:02:10.799000000' '2021-12-06T21:02:10.799000000'
 '2021-12-06T21:02:10.804000000' '2021-12-06T21:02:10.804000000'
 '2021-12-06T21:02:10.805000000' '2021-12-06T21:02:10.806000000'
 '2021-12-06T21:02:10.807000000' '2021-12-06T21:02:10.807000000'
 '2021-12-06T21:02:10.808000000' '2021-12-06T21:02:10.808000000'
 '2021-12-06T21:02:10.810000000' '2021-12-06T21:02:10.810000000'
 '2021-12-06T21:02:10.811000000' '2021-12-06T21:02:10.811000000'
 '2021-12-06T21:02:10.812000000' '2021-12-06T21:02:10.813000000'
 '2021-12-06T21:02:10.814000000' '2021-12-06T21:02:10.814000000'
 '2021-12-06T21:02:10.817000000' '2021-12-06T21:02:10.817000000'
 '2021-12-06T21:02:10.821000000' '2021-12-06T21:02:10.821000000'
 '2021-12-06T21:02:10.822000000' '2021-12-06T21:02:10.822000000'
 '2021-12-06T21:02:1

## Clusterer

In [9]:
# TODO(WAN): Port online_clustering.py.
# TODO(WAN): I would be somewhat surprised if sklearn doesn't have this built in... We'll see

## Forecaster

In [10]:
# TODO(WAN): Port exp_multi_online_continuous.py