In [1]:
from forecast.preprocessor import Preprocessor
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

In [2]:
parquet = r'./preprocessed.parquet.gzip'
preprocessor = Preprocessor(parquet_path=parquet)
df = preprocessor.get_dataframe()
empties = df['query_template'] == ''
print(f"Removing {sum(empties)} empty query template values.")
df = df[:][~empties]
df.shape

Removing 38 empty query template values.


(759272, 5)

In [3]:
le = LabelEncoder()
le.fit(df['query_template'])
df['query_template_enc'] = le.transform(df['query_template'])

In [4]:
data = {}
for vtxid, group in df.groupby('virtual_transaction_id'):
    group = group.sort_values(['log_time', 'session_line_num'])
    data[vtxid] = group['query_template_enc'].values
print(len(data), " unique transactions")
data

26354  unique transactions


{'3/1000': array([ 0, 42, 31, 38, 23, 19, 34,  3,  1]),
 '3/10000': array([ 0, 18, 32, 22, 37,  5,  4, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30,
        24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24,
        30, 24, 30,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
         6, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,  1]),
 '3/10001': array([ 0, 42, 31, 38, 23, 19, 34,  3,  1]),
 '3/10002': array([ 0, 42, 31, 38, 23, 20, 34,  3,  1]),
 '3/10003': array([ 0, 18, 32, 22, 37,  5,  4, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30,
        24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24, 30, 24,
        30,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, 41, 41,
        41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,  1]),
 '3/10004': array([ 0, 42, 31, 38, 23, 19, 34,  3,  1]),
 '3/10005': array([ 0, 42, 31, 38, 23, 19, 34,  3,  1]),
 '3/10006': array([ 0, 42, 31, 38, 23, 19, 34,  3,  1]),
 '3/10007': array([ 0, 18, 32, 22, 37,  5

In [5]:
trajs = set()

for values in data.values():
    trajs.add(','.join(str(v) for v in values.tolist()))

print(len(trajs), " unique trajectories")

41  unique trajectories


In [6]:
for s in sorted(trajs):
    print(s)

0,1
0,18,32,22,37,5,4,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,1
0,18,32,22,37,5,4,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,7
0,18,32,22,37,5,4,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,6,6,6,6,6,6,6,6,6,6,6,6,6,6,41,41,41,41,41,41,41,41,41,41,41,41,41,41,1
0,18,32,22,37,5,4,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,7
0,18,32,22,37,5,4,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,6,6,6,6,6,6,6,6,6,6,6,6,6,41,41,41,41,41,41,41,41,41,41,41,41,41,1
0,18,32,22,37,5,4,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,7
0,18,32,22,37,5,4,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,24,30,6,6,6,6,6,6,6,6,6,6,6,6,41,41,41,41,41,41,41,41,41,41,41,41,1
0,18,32,22,37,5,4,2

In [7]:
le.inverse_transform([0,39,27,25,36,29,2,39,27,2,25,40,36,29,40,39,27,2,25,29,39,27,2,25,36,29,40,40,39,39,27,2,25,36,29,40,27,36,25,25,2,27,36,40,29,39,2,36,36,29,40,39,27,2,25,40,39,27,2,25,36,29,40,29,36,25,2,27,39,40,29,1])

array(['BEGIN',
       'UPDATE oorder SET O_CARRIER_ID = $1 WHERE O_ID = $2 AND O_D_ID = $3 AND O_W_ID = $4',
       'SELECT O_C_ID FROM oorder WHERE O_ID = $1 AND O_D_ID = $2 AND O_W_ID = $3',
       'SELECT NO_O_ID FROM new_order WHERE NO_D_ID = $1 AND NO_W_ID = $2 ORDER BY NO_O_ID ASC LIMIT $3',
       'UPDATE customer SET C_BALANCE = C_BALANCE + $1, C_DELIVERY_CNT = C_DELIVERY_CNT + $2 WHERE C_W_ID = $3 AND C_D_ID = $4 AND C_ID = $5',
       'SELECT SUM(OL_AMOUNT) AS OL_TOTAL FROM order_line WHERE OL_O_ID = $1 AND OL_D_ID = $2 AND OL_W_ID = $3',
       'DELETE FROM new_order WHERE NO_O_ID = $1 AND NO_D_ID = $2 AND NO_W_ID = $3',
       'UPDATE oorder SET O_CARRIER_ID = $1 WHERE O_ID = $2 AND O_D_ID = $3 AND O_W_ID = $4',
       'SELECT O_C_ID FROM oorder WHERE O_ID = $1 AND O_D_ID = $2 AND O_W_ID = $3',
       'DELETE FROM new_order WHERE NO_O_ID = $1 AND NO_D_ID = $2 AND NO_W_ID = $3',
       'SELECT NO_O_ID FROM new_order WHERE NO_D_ID = $1 AND NO_W_ID = $2 ORDER BY NO_O_ID ASC L

In [8]:
from pomegranate import DiscreteDistribution, ConditionalProbabilityTable, MarkovChain

In [9]:
n = len(le.classes_)
startdist = {i : 0 for i in range(n)}
startdist[le.transform(['BEGIN'])[0]] = 1.0
dist = DiscreteDistribution(startdist)

cptt = [[i, j, 1 if i == j else 0] for i in range(n) for j in range(n)]
cpt = ConditionalProbabilityTable(cptt, [dist])

mc = MarkovChain([dist, cpt])
samples = list(data.values())
mc = mc.from_samples(samples)
mc.distributions

[{
     "class" : "Distribution",
     "dtype" : "numpy.int64",
     "name" : "DiscreteDistribution",
     "parameters" : [
         {
             "0" : 0.999620550960006,
             "1" : 0.0,
             "2" : 0.0,
             "3" : 0.0,
             "4" : 3.794490399939288e-05,
             "5" : 0.0,
             "6" : 0.0,
             "7" : 0.0,
             "8" : 3.794490399939288e-05,
             "9" : 3.794490399939288e-05,
             "10" : 3.794490399939288e-05,
             "11" : 3.794490399939288e-05,
             "12" : 3.794490399939288e-05,
             "13" : 3.794490399939288e-05,
             "14" : 3.794490399939288e-05,
             "15" : 3.794490399939288e-05,
             "16" : 0.0,
             "17" : 0.0,
             "18" : 0.0,
             "19" : 0.0,
             "20" : 0.0,
             "21" : 0.0,
             "22" : 0.0,
             "23" : 0.0,
             "24" : 0.0,
             "25" : 0.0,
             "26" : 0.0,
             "27" : 0.0,

In [10]:
cpt = mc.distributions[1]
cptd = cpt.to_dict()
dada = pd.DataFrame(cptd['table'], columns=['src','dst','val'])
accs = []
for i in range(n):
    dasub = dada[dada['src'] == str(i)]
    if dasub['val'].nunique() == 1:
        accs.append(str(i))

dada.loc[dada['src'].isin(accs), 'val'] = 0
for i in accs:
    dada.loc[(dada['src'] == str(i)) & (dada['dst'] == str(i)), 'val'] = 1
dadal = dada.to_dict(orient='list')

sdv = [[s,d,v] for s,d,v in zip(dadal['src'], dadal['dst'], dadal['val'])]
cptd['table'] = sdv
mc.distributions[1] = cpt.from_dict(cptd)

print('Had to fix distribution for ')
print(le.inverse_transform([int(x) for x in accs]))

mc.distributions

Had to fix distribution for 
['COMMIT' 'ROLLBACK' 'SELECT * FROM pg_stat_archiver'
 'SELECT * FROM pg_stat_bgwriter' 'SELECT * FROM pg_stat_database'
 'SELECT * FROM pg_stat_database_conflicts'
 'SELECT * FROM pg_stat_user_indexes' 'SELECT * FROM pg_stat_user_tables'
 'SELECT * FROM pg_statio_user_indexes'
 'SELECT * FROM pg_statio_user_tables' 'SELECT version()']


[{
     "class" : "Distribution",
     "dtype" : "numpy.int64",
     "name" : "DiscreteDistribution",
     "parameters" : [
         {
             "0" : 0.999620550960006,
             "1" : 0.0,
             "2" : 0.0,
             "3" : 0.0,
             "4" : 3.794490399939288e-05,
             "5" : 0.0,
             "6" : 0.0,
             "7" : 0.0,
             "8" : 3.794490399939288e-05,
             "9" : 3.794490399939288e-05,
             "10" : 3.794490399939288e-05,
             "11" : 3.794490399939288e-05,
             "12" : 3.794490399939288e-05,
             "13" : 3.794490399939288e-05,
             "14" : 3.794490399939288e-05,
             "15" : 3.794490399939288e-05,
             "16" : 0.0,
             "17" : 0.0,
             "18" : 0.0,
             "19" : 0.0,
             "20" : 0.0,
             "21" : 0.0,
             "22" : 0.0,
             "23" : 0.0,
             "24" : 0.0,
             "25" : 0.0,
             "26" : 0.0,
             "27" : 0.0,

In [11]:
probs = [mc.log_probability(sample) for sample in samples]
min(probs), max(probs), sum(probs)

(-20.572061079963355, -1.5390154933476388, -164449.41193814046)

In [12]:
sample = mc.sample(dist)
print(sample)
le.inverse_transform(mc.sample(dist))

[0]


array(['BEGIN'], dtype=object)

In [13]:

# def sampler(mc):
#     starts = le.transform(['BEGIN'])
#     stops = le.transform(['COMMIT', 'ROLLBACK'])
    
#     # Get a start token.
#     seq = [mc.distributions[0].sample()]
#     while seq not in starts:
#         seq = [mc.distributions[0].sample()]

    
#     while True:
#         n = len(le.classes_)
#         distdict = {i: 0 for i in range(n)}
#         distdict[seq[-1]] = 1
#         dist = DiscreteDistribution(distdict)
#         new_state = mc.distributions[1].sample({mc.distributions[1] : seq[-1]})
#         seq.append(new_state)
#         if new_state in stops:
#             return seq
    
#     for j, distribution in enumerate(mc.distributions[1:]):
#         parents = {mc.distributions[l] : seq[l] for l in range(j+1)}
#         new_state = distribution.sample(parents)
#         seq.append(new_state)
#         print('moo', le.inverse_transform([new_state]))
#         if new_state in stops:
#             return seq
    
#     l = 0
#     while True:
#         parents = {mc.distributions[k] : seq[l+k+1] for k in range(len(mc.distributions) - 1)}
#         new_state = mc.distributions[-1].sample(parents)
#         seq.append(new_state)
#         print('meow', le.inverse_transform([new_state]))
#         l += 1
#         if new_state in stops:
#             return seq

# print(len(le.inverse_transform(sampler(mc))))


def sampler(mc):
    pdd = pd.DataFrame(mc.distributions[1].to_dict()['table'], columns=['src', 'dst', 'val'])
    pdd['src'] = pdd['src'].astype(int)
    pdd['dst'] = pdd['dst'].astype(int)
    pdd['val'] = pdd['val'].astype(float)

    def sample(current):
        pddd = pdd[pdd['src'] == current]
        return pddd.sample(weights=pddd['val'])['dst'].values[0]
    
    starts = le.transform(['BEGIN'])
    stops = le.transform(['COMMIT', 'ROLLBACK'])
    
    # Get a start token.
    
    seq = [mc.distributions[0].sample()]
    while seq not in starts:
        print('Warning: got start ', seq)
        seq = [mc.distributions[0].sample()]

    while True:
        new_state = sample(seq[-1])
        seq.append(new_state)
        if new_state in stops:
            return seq

sampler(mc)

[0, 42, 31, 38, 23, 19, 34, 3, 1]

In [14]:
fakes = [sampler(mc) for i in range(500)]

In [15]:
import json
table = np.array(json.loads(mc.distributions[1].to_json())['table'])
table
for k,v,x in table:
    print(k,v,x)

0 0 0.0
0 1 3.795930762222899e-05
0 2 0.0
0 3 0.0
0 4 0.0
0 5 0.0
0 6 0.0
0 7 0.0
0 8 0.0
0 9 0.0
0 10 0.0
0 11 0.0
0 12 0.0
0 13 0.0
0 14 0.0
0 15 0.0
0 16 0.0
0 17 0.0
0 18 0.4527027027027027
0 19 0.02334497418767082
0 20 0.015829031278469474
0 21 0.039477679927118134
0 22 0.0
0 23 0.0
0 24 0.0
0 25 0.04004706954145156
0 26 0.0
0 27 0.0
0 28 0.0
0 29 0.0
0 30 0.0
0 31 0.0
0 32 0.0
0 33 0.0
0 34 0.0
0 35 0.0
0 36 0.0
0 37 0.0
0 38 0.0
0 39 0.0
0 40 0.0
0 41 0.0
0 42 0.4285605830549651
1 0 0.0
1 1 1.0
1 2 0.0
1 3 0.0
1 4 0.0
1 5 0.0
1 6 0.0
1 7 0.0
1 8 0.0
1 9 0.0
1 10 0.0
1 11 0.0
1 12 0.0
1 13 0.0
1 14 0.0
1 15 0.0
1 16 0.0
1 17 0.0
1 18 0.0
1 19 0.0
1 20 0.0
1 21 0.0
1 22 0.0
1 23 0.0
1 24 0.0
1 25 0.0
1 26 0.0
1 27 0.0
1 28 0.0
1 29 0.0
1 30 0.0
1 31 0.0
1 32 0.0
1 33 0.0
1 34 0.0
1 35 0.0
1 36 0.0
1 37 0.0
1 38 0.0
1 39 0.0
1 40 0.0
1 41 0.0
1 42 0.0
2 0 0.0
2 1 0.0
2 2 0.0
2 3 0.0
2 4 0.0
2 5 0.0
2 6 0.0
2 7 0.0
2 8 0.0
2 9 0.0
2 10 0.0
2 11 0.0
2 12 0.0
2 13 0.0
2 14 0.0
2 15 0.

37 19 0.0
37 20 0.0
37 21 0.0
37 22 0.0
37 23 0.0
37 24 0.0
37 25 0.0
37 26 0.0
37 27 0.0
37 28 0.0
37 29 0.0
37 30 0.0
37 31 0.0
37 32 0.0
37 33 0.0
37 34 0.0
37 35 0.0
37 36 0.0
37 37 0.0
37 38 0.0
37 39 0.0
37 40 0.0
37 41 0.0
37 42 0.0
38 0 0.0
38 1 0.0
38 2 0.0
38 3 0.0
38 4 0.0
38 5 0.0
38 6 0.0
38 7 0.0
38 8 0.0
38 9 0.0
38 10 0.0
38 11 0.0
38 12 0.0
38 13 0.0
38 14 0.0
38 15 0.0
38 16 0.0
38 17 0.0
38 18 0.0
38 19 0.0
38 20 0.0
38 21 0.0
38 22 0.0
38 23 1.0
38 24 0.0
38 25 0.0
38 26 0.0
38 27 0.0
38 28 0.0
38 29 0.0
38 30 0.0
38 31 0.0
38 32 0.0
38 33 0.0
38 34 0.0
38 35 0.0
38 36 0.0
38 37 0.0
38 38 0.0
38 39 0.0
38 40 0.0
38 41 0.0
38 42 0.0
39 0 0.0
39 1 0.0
39 2 0.0
39 3 0.0
39 4 0.0
39 5 0.0
39 6 0.0
39 7 0.0
39 8 0.0
39 9 0.0
39 10 0.0
39 11 0.0
39 12 0.0
39 13 0.0
39 14 0.0
39 15 0.0
39 16 0.0
39 17 0.0
39 18 0.0
39 19 0.0
39 20 0.0
39 21 0.0
39 22 0.0
39 23 0.0
39 24 0.0
39 25 0.0
39 26 0.0
39 27 0.0
39 28 0.0
39 29 0.0
39 30 0.0
39 31 0.0
39 32 0.0
39 33 0.0
39 34 0.0


In [16]:
%matplotlib inline
from graphviz import Digraph


def printer(string, every=24):
    return '\n'.join(string[i:i+every] for i in range(0, len(string), every))

def draw_graph(table):
    f = Digraph('tpcc', filename='tpcc', format='pdf')
    
    letters = np.unique(table[:, 0])

    def inv(s):
        if False:
            return s
        return printer(le.inverse_transform([int(s)])[0])
    
    for state in letters:
        f.node(inv(state))

    for row in table:
        if float(row[2]) > 0.00001:
            src = inv(row[0])
            dst = inv(row[1])
            if int(row[0]) in le.transform(['COMMIT', 'ROLLBACK']):
                assert row[0] == row[1] or float(row[2]) <= 0.001, row
                continue
            if int(row[1]) in le.transform(['BEGIN']):
                assert row[0] == row[1] or float(row[2]) <= 0.001, row
                continue
            f.edge(src, dst, label=f'{float(row[2]):.2f}', penwidth=f'{2.5*float(row[2])}')

    return f


g = draw_graph(table)

In [17]:
g.render(directory='./')

'tpcc.pdf'

In [18]:
# le.transform(['COMMIT', 'ROLLBACK'])

In [19]:
# for vtxid, v in data.items():
#     if 1 in v[0:-1] or 7 in v[0:-1] or 0 in v[1:]:
#         print(vtxid, v)
#         print(le.inverse_transform(v))
