# Test script for log entropy analysis

### Log parser

In [85]:
def log_parser(l):
    """Split one log line of the form ``[<timestamp>:<line>] <message>``.

    Args:
        l: Raw log line, e.g. ``"[2021-12-08 02:07:56,589:32] top3"``.

    Returns:
        Tuple ``(time_str, line, log)``: the header text before its last
        ``:``, the integer after that last ``:``, and the message with
        surrounding whitespace stripped.

    Raises:
        ValueError: if the text after the last ``:`` of the header is not an
            integer.
        IndexError: if the line contains no ``"] "`` separator.
    """
    # Split on the first "] " only, so a message that itself contains "] "
    # is returned whole instead of being cut at its first occurrence.
    parts = l.split("] ", 1)
    header = parts[0].strip('[')
    # The line number is whatever follows the last ":" in the header; the
    # timestamp itself contains colons, hence the split from the right.
    time_str, _, line_str = header.rpartition(":")
    line = int(line_str)
    log = parts[1].strip()
    return time_str, line, log
log_parser("[2021-12-08 02:07:56,589:32] top3")

('2021-12-08 02:07:56,589', 32, 'top3')

### Probability

In [86]:
from typing import List
from functools import reduce
import logging
import numpy as np


def prob(val_list: List[bool], data: np.array):
    """Sum the last column of the rows of ``data`` that match ``val_list``.

    ``val_list[i]`` is compared with column ``i`` of ``data``; a ``None``
    entry leaves that column unconstrained. At least one entry must be
    non-None (checked with an assertion).
    """
    fixed_cols = []
    fixed_vals = []
    for col, val in enumerate(val_list):
        if val is None:
            continue
        fixed_cols.append(col)
        fixed_vals.append(val)
    assert len(fixed_cols)
    # A row is selected only when every constrained column equals its value.
    row_mask = np.all(data[:, fixed_cols] == fixed_vals, axis=1)
    return np.sum(data[row_mask, -1])

# Toy table for prob(): each row is one combination; the first three columns
# are the values matched against val_list and the last column is the
# probability mass that prob() sums.
data = np.array([
    [0, 1, 1, 0.3],
    [0, 1, 0, 0.5],
    [0, 0, 1, 0.2],
])
# All three columns constrained: only the first row matches.
print(prob([0, 1, 1], data))
# None leaves the third column free: the first two rows match.
print(prob([0, 1, None], data))
# First column free: only the first row has 1 in both remaining columns.
print(prob([None, 1, 1], data))

0.3
0.8
0.3


### Find all combinations

In [87]:
import numpy as np

# Get every observed combination of values for a given set of keys
def get_comb_set(X, data, key_list):
    """Return the distinct value combinations found in ``data`` for keys in ``X``.

    Column ``i`` of ``data`` is labelled by ``key_list[i]``. Each returned row
    has ``len(key_list)`` entries: the observed value in the columns whose key
    is in ``X`` and ``None`` in every other column.
    """
    selected = [pos for pos, key in enumerate(key_list) if key in X]
    distinct_rows = np.unique(data[:, selected], axis=0)
    # All-None row reused as the starting point for every combination.
    blank = np.array([None] * len(key_list))

    def fill(values):
        combo = blank.copy()
        combo[selected] = values
        return combo

    return np.array([fill(values) for values in distinct_rows])

# Toy table: three value columns followed by a fourth column that
# get_comb_set() does not read here (only the first two columns are selected).
data_temp = np.array([
    [0, 1, 1, 0.3],
    [0, 1, 0, 0.5],
    [0, 0, 1, 0.2],
])
# Keys 17 and 19 label the first two columns; key 20 is not requested, so its
# position is None in every returned combination.
print(get_comb_set([17, 19], data_temp, [17, 19, 20]))

[[0.0 0.0 None]
 [0.0 1.0 None]]


### Calc entropy

In [113]:
# Calculate H(X)
def entropy(X, data, key_list):
    """Compute the joint Shannon entropy H(X) in bits.

    Args:
        X: Keys whose joint distribution is measured. Keys that do not occur
            in ``key_list`` are ignored by ``get_comb_set``.
        data: 2-D array whose column ``i`` holds the value of ``key_list[i]``
            and whose last column is the probability summed by ``prob``.
        key_list: Keys labelling the value columns of ``data``.

    Returns:
        ``-sum(p * log2(p))`` over every distinct value combination of ``X``
        found in ``data``. Combinations with probability 0 contribute 0.
    """
    H = 0
    for comb in get_comb_set(X, data, key_list):
        p_X = prob(comb, data)
        # logging expects a format string followed by its arguments; passing
        # the values positionally makes record formatting fail at DEBUG level.
        logging.debug("%s -> %s", comb, p_X)
        if p_X == 0:
            # 0 * log2(0) is taken as 0; evaluating it would give nan.
            continue
        H -= p_X * np.log2(p_X)
    return H

print("H(17, 19) =", entropy([17, 19], data_temp, [17, 19, 20]))


H(17, 19) = 0.7219280948873623


### Calc conditional entropy

In [114]:
# Calculate H(X|Y)
def cond_entropy_for_check(X, Y, data, key_list):
    """Compute H(X|Y) in bits directly from the definition.

    Evaluates ``sum(p(x, y) * log2(p(y) / p(x, y)))`` over every distinct
    combination of the keys in ``X + Y`` found in ``data``.

    Args:
        X: Keys of the variable whose conditional entropy is measured.
        Y: Keys conditioned on. At least one of them must occur in
            ``key_list``; otherwise ``prob`` rejects the all-None combination.
        data: 2-D array whose column ``i`` holds the value of ``key_list[i]``
            and whose last column is the probability summed by ``prob``.
        key_list: Keys labelling the value columns of ``data``.

    Returns:
        The conditional entropy. Combinations with joint probability 0
        contribute 0.
    """
    XY = list(set(X + Y))
    # Set membership keeps the per-combination masking below cheap.
    Y_idx = {i for i, key in enumerate(key_list) if key in Y}
    H = 0
    for comb in get_comb_set(XY, data, key_list):
        p_XY = prob(comb, data)
        # Keep only the Y columns of the joint combination to get p(y).
        comb_Y = [val if i in Y_idx else None for i, val in enumerate(comb)]
        p_Y = prob(comb_Y, data)
        # logging expects a format string followed by its arguments; passing
        # the values positionally makes record formatting fail at DEBUG level.
        logging.debug("%s %s -> %s %s", comb, comb_Y, p_XY, p_Y)
        if p_XY == 0:
            # A zero-probability term contributes nothing; evaluating it
            # would divide by zero and produce nan.
            continue
        H += p_XY * np.log2(p_Y / p_XY)
    return H

print("H(17, 19) =", entropy([17, 19], data_temp, [17, 19, 20]))
print("H(17) =", entropy([17], data_temp, [17, 19, 20]))
print("H(19|17) =", cond_entropy_for_check([19], [17], data_temp, [17, 19, 20]))

# Calculate H(X|Y)
def cond_entropy(X, Y, data, key_list):
    """Compute H(X|Y) in bits as H(X, Y) - H(Y).

    Args:
        X: Keys of the variable whose conditional entropy is measured.
        Y: Keys conditioned on; may be empty, in which case H(X) is returned.
        data: Probability table passed through to ``entropy``.
        key_list: Keys labelling the value columns of ``data``.

    Returns:
        The conditional entropy of ``X`` given ``Y``.
    """
    if len(Y) == 0:
        # Conditioning on nothing leaves H(X). entropy(Y) cannot be evaluated
        # for an empty Y because prob() rejects an all-None combination.
        return entropy(X, data, key_list)
    XY = list(set(X + Y))
    return entropy(XY, data, key_list) - entropy(Y, data, key_list)

print("H(19|17) =", cond_entropy([19], [17], data_temp, [17, 19, 20]))

H(17, 19) = 0.7219280948873623
H(17) = 0.0
H(19|17) = 0.7219280948873623
H(19|17) = 0.7219280948873623


## Setting list

In [115]:
import os

# Switch the working directory to the log root; every later os.listdir() /
# open() call in this notebook is relative to the current working directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
os.chdir("/Users/bohrok/Documents/LCA/example/log")
# Presumably one entry per setting (the next cell changes into the first one).
os.listdir()

['77-31-12',
 '97-42-67',
 '28-11-62',
 '15-82-93',
 '14-60-75',
 '25-67-10',
 '97-27-52',
 '37-3-8',
 '95-32-8',
 '52-59-46']

## Choose one setting

In [116]:
# Pick the first entry of the current directory as the setting to analyse.
# NOTE(review): os.listdir() returns entries in arbitrary order, so which
# setting is chosen is not guaranteed to be the same across runs or machines.
setting = os.listdir()[0]
print(setting)
# Later cells read the log files relative to this directory.
os.chdir(setting)

77-31-12


## Generate data and log key list

In [117]:
# Count how often each distinct sequence of log line numbers occurs across
# the files of the current working directory, and collect every line number.
data_dict = {}
key_set = set()
total_log = 0
for log_path in os.listdir():
    total_log += 1
    with open(log_path) as f:
        key_tup = tuple(log_parser(l)[1] for l in f)
    data_dict[key_tup] = data_dict.get(key_tup, 0) + 1
    key_set.update(key_tup)
key_list = sorted(key_set)

# One row per distinct sequence: a presence flag for each key in key_list,
# followed by the fraction of files that produced this sequence.
data = np.array([
    [key in seq for key in key_list] + [count / total_log]
    for seq, count in data_dict.items()
])
print(key_list)
print(data[0])
print(data[2])
# Probabilities for fixed values of the leading key(s); the rest stay free.
for prefix in ([1], [0, 1], [1, 1]):
    print(prob(prefix + [None] * (len(key_list) - len(prefix)), data))


[17, 19, 21, 23, 25, 27, 30, 32]
[1.     1.     0.     1.     0.     1.     1.     1.     0.0559]
[1.     1.     1.     0.     0.     1.     0.     1.     0.2084]
1.0
0.0
0.7737999999999999


### Entropy analysis

In [118]:
entropy(key_list, data, key_list)

1.9896197363524322

In [159]:
for key in key_list:
    print(key, entropy([key], data, key_list))

17 0.0
19 0.7713280051425913
21 0.79085809055281
23 0.9961946033330253
25 0.7713280051425913
27 0.0
30 0.529935486004238
32 0.0


In [122]:
# Accumulate H(k1) + H(k2|k1) + H(k3|k1,k2) + ... over key_list in order,
# printing each term and the running total.
sum_H = 0
agg_key = []
for key in key_list:
    if agg_key:
        # Condition on every key processed so far.
        H = cond_entropy([key], agg_key, data, key_list)
        print(f"H({key}|{agg_key}) = {H}", end="    ")
    else:
        # First key: nothing to condition on yet.
        H = entropy([key], data, key_list)
        print(f"H({key}) = {H}", end="    ")
    sum_H += H
    print(f"sum_H = {sum_H}")
    agg_key.append(key)

H(17) = 0.0    sum_H = 0.0
H(19|[17]) = 0.7713280051425913    sum_H = 0.7713280051425913
H(21|[17, 19]) = 0.6883679396102069    sum_H = 1.4596959447527982
H(23|[17, 19, 21]) = 0.0    sum_H = 1.4596959447527982
H(25|[17, 19, 21, 23]) = 0.0    sum_H = 1.4596959447527982
H(27|[17, 19, 21, 23, 25]) = 0.0    sum_H = 1.4596959447527982
H(30|[17, 19, 21, 23, 25, 27]) = 0.529923791599634    sum_H = 1.9896197363524322
H(32|[17, 19, 21, 23, 25, 27, 30]) = 0.0    sum_H = 1.9896197363524322


In [125]:
print(cond_entropy([21], [17, 19], data, key_list))
print(cond_entropy([21], [19], data, key_list))

0.6883679396102069
0.6883679396102069


In [128]:
print(cond_entropy([23], [17, 19, 21], data, key_list))
print(cond_entropy([23], [19, 21], data, key_list))
print(cond_entropy([23], [21], data, key_list))
print(cond_entropy([23], [19], data, key_list))

0.0
0.0
0.6688378541999882
0.6883679396102069


In [135]:
print(cond_entropy([25], [17, 19, 21, 23], data, key_list))
print(cond_entropy([25], [19, 21, 23], data, key_list))
print(cond_entropy([25], [21, 23], data, key_list))
print(cond_entropy([25], [21], data, key_list))
print(cond_entropy([25], [23], data, key_list))

print(cond_entropy([25], [19], data, key_list))
print(cond_entropy([25], [17], data, key_list))

0.0
0.0
0.0
0.6688378541999882
0.46350134141977284
0.0
0.7713280051425913


In [140]:
print(cond_entropy([27], [17, 19, 21, 23, 25], data, key_list))
print(cond_entropy([27], [19, 21, 23, 25], data, key_list))
print(cond_entropy([27], [21, 23, 25], data, key_list))
print(cond_entropy([27], [23, 25], data, key_list))
print(cond_entropy([27], [25], data, key_list))
print(entropy([27], data, key_list))

0.0
0.0
0.0
0.0
0.0
0.0


In [139]:
print(cond_entropy([30], [17, 19, 21, 23, 25, 27], data, key_list))
print(cond_entropy([30], [27], data, key_list))

0.529923791599634
0.529935486004238


In [142]:
print(cond_entropy([32], [17, 19, 21, 23, 25, 27, 30], data, key_list))
print(entropy([32], data, key_list))

0.0
0.0


In [148]:
print(entropy([21], data, key_list))
print(entropy([19], data, key_list))
print(cond_entropy([21], [19], data, key_list))
print(entropy([19, 21], data, key_list))

0.79085809055281
0.7713280051425913
0.6883679396102069
1.4596959447527982


In [155]:
print(entropy([23], data, key_list))
print(entropy([30], data, key_list))
print(cond_entropy([30], [23], data, key_list))
print(entropy([23, 30], data, key_list))

0.9961946033330253
0.529935486004238
0.5299296013077401
1.5261242046407655


In [158]:
print(entropy([21], data, key_list))
print(cond_entropy([21], [30], data, key_list))
print(cond_entropy([21], [19], data, key_list))

0.79085809055281
0.7908466362896371
0.6883679396102069


## Setting2 for example2

In [161]:
import os

# Second example: rebuild the table from the log files of the "log2" directory.
os.chdir("/Users/bohrok/Documents/LCA/example/log2")
# Count how often each distinct sequence of log line numbers occurs across
# the files of the current working directory, and collect every line number.
data_dict = {}
key_set = set()
total_log = 0
for log_path in os.listdir():
    total_log += 1
    with open(log_path) as f:
        key_tup = tuple(log_parser(l)[1] for l in f)
    data_dict[key_tup] = data_dict.get(key_tup, 0) + 1
    key_set.update(key_tup)
key_list = sorted(key_set)

# One row per distinct sequence: a presence flag for each key in key_list,
# followed by the fraction of files that produced this sequence.
data = np.array([
    [key in seq for key in key_list] + [count / total_log]
    for seq, count in data_dict.items()
])
print(key_list)
print(data[0])
print(data[2])
# Probabilities for fixed values of the leading key(s); the rest stay free.
for prefix in ([1], [0, 1], [1, 1]):
    print(prob(prefix + [None] * (len(key_list) - len(prefix)), data))


[16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 41, 43, 45, 47, 49, 51, 53, 55, 57, 58]
[1.     0.     0.     0.     0.     0.     0.     0.     0.     0.
 1.     1.     1.     0.     0.     0.     0.     0.     1.     1.
 0.     1.     0.0081]
[1.     1.     0.     0.     0.     0.     0.     0.     0.     1.
 0.     1.     1.     0.     0.     0.     0.     0.     1.     0.
 0.     1.     0.0167]
1.0
0.0
0.49440000000000006


### Entropy analysis

In [162]:
entropy(key_list, data, key_list)

5.147567392676745

In [163]:
for key in key_list:
    print(key, entropy([key], data, key_list))

16 0.0
18 0.9999095122751833
20 0.8082527838031908
22 0.5415960579317732
24 0.34001887854711793
26 0.1968931689468837
28 0.20775654405105218
30 0.3317961201533855
32 0.5401860940599541
34 0.8053609204921137
36 0.9999095122751833
38 0.0
41 0.9999981533495597
43 0.9162490663435757
45 0.7195215929897254
47 0.5017230909122596
49 0.50292491124756
51 0.7183134545725995
53 0.9188614586689446
55 0.9144147093999843
57 0.8076123172572023
58 0.0


In [164]:
# Accumulate H(k1) + H(k2|k1) + H(k3|k1,k2) + ... over key_list in order,
# printing each term and the running total.
sum_H = 0
agg_key = []
for key in key_list:
    if agg_key:
        # Condition on every key processed so far.
        H = cond_entropy([key], agg_key, data, key_list)
        print(f"H({key}|{agg_key}) = {H}", end="    ")
    else:
        # First key: nothing to condition on yet.
        H = entropy([key], data, key_list)
        print(f"H({key}) = {H}", end="    ")
    sum_H += H
    print(f"sum_H = {sum_H}")
    agg_key.append(key)

H(16) = 0.0    sum_H = 0.0
H(18|[16]) = 0.9999095122751833    sum_H = 0.9999095122751833
H(20|[16, 18]) = 0.4943952727121168    sum_H = 1.4943047849873001
H(22|[16, 18, 20]) = 0.24809927312776225    sum_H = 1.7424040581150624
H(24|[16, 18, 20, 22]) = 0.12427440632410636    sum_H = 1.8666784644391687
H(26|[16, 18, 20, 22, 24]) = 0.0631447464012731    sum_H = 1.9298232108404418
H(28|[16, 18, 20, 22, 24, 26]) = 0.0    sum_H = 1.9298232108404418
H(30|[16, 18, 20, 22, 24, 26, 28]) = 0.0    sum_H = 1.9298232108404418
H(32|[16, 18, 20, 22, 24, 26, 28, 30]) = 0.0    sum_H = 1.9298232108404418
H(34|[16, 18, 20, 22, 24, 26, 28, 30, 32]) = 0.0    sum_H = 1.9298232108404418
H(36|[16, 18, 20, 22, 24, 26, 28, 30, 32, 34]) = 0.0    sum_H = 1.9298232108404418
H(38|[16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36]) = 0.0    sum_H = 1.9298232108404418
H(41|[16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38]) = 0.9994608449121598    sum_H = 2.9292840557526016
H(43|[16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,

In [196]:
# Compare the conditional entropy of a selected key set before and after
# adding key 26 to the conditioning set.
# NOTE(review): 46 is not in the key_list printed for this data set
# (it goes ... 43, 45, 47 ...). get_comb_set only selects keys that occur in
# key_list, so 46 would be silently ignored — confirm whether 45 was intended.
E_before = cond_entropy([18, 20, 22, 26, 41, 43, 46, 47, 55, 57], [18, 20, 22], data, key_list)
E_after = cond_entropy([18, 20, 22, 26, 41, 43, 46, 47, 55, 57], [18, 20, 22, 26], data, key_list)
print(E_before)
print(E_after)
# Absolute and relative reduction obtained by also conditioning on key 26.
print(E_before - E_after)
print((E_before - E_after) / E_before)

2.577783180949175
2.477862939168026
0.09992024178114889
0.038762081512362455
