# Log Colorizer
Learn tokenization templates from the statistical properties of a dataset of log traces.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from eliana.preprocessing import LogColorizer
from nb_utils import showAttribs
# Print a summary of LogColorizer: its constructor, properties and methods.
showAttribs(LogColorizer)

LogColorizer        : Model to learn tokenizers from a dataset of traces.

Constructor
__init__            : Initializes the LogColorizer with a tokenizer or a custom tokenization function.

Properties
regexps             : List of regular expressions used by the tokenizer, including post-processing patterns.
special             : Symbol used as a placeholder for numeric or variable values in templates. Default is '§'.
templates           : List of templates used for matching traces.
wildcard            : Placeholder for variable text in templates. Default is "{}".

Methods
fit                 : Learns templates and tokenization rules from a dataset of traces.
fit_on_traces       : None
save                : Saves the tokenizer object to a file.
tokenize            : Tokenizes a word or phrase using the provided tokenizer and post-processing steps.


## Example with ParlogsObservations

In [3]:
from eliana.datasets import ParlogsObservations
from eliana.preprocessing import LogColorizer, VltTokenizer, load_tokenizer

# Dataset selection passed to ParlogsObservations as keyword arguments
# (presumably one week of PIONIER observations — confirm the meaning of '1w').
config = dict(system='PIONIER', period='1w')
logs = ParlogsObservations(**config)
traces = logs.traces()

# LogColorizer needs a pandas dataframe with columns named 'event' and
# 'trace_id'; the raw log text is exposed under the expected 'event' name.
traces['event'] = traces['logtext']

# Report the dataset size and the number of distinct log texts before fitting.
print(
    f"About to colorize {len(traces)} traces over col=logtext, "
    f"which has uniques={len(traces['logtext'].unique())}"
)

About to colorize 89408 traces over col=logtext, which has uniques=19412


In [4]:
%%time
# Build a colorizer around the VLT tokenizer and learn templates and
# tokenization rules from the traces loaded earlier.
model = LogColorizer(tokenizer=VltTokenizer())
# Binding the return value to `x` keeps it from being echoed as the cell result.
x = model.fit(traces, warn=True)




CPU times: user 9.39 s, sys: 164 ms, total: 9.56 s
Wall time: 9.64 s


In [5]:
# See the learnt tokenization in action: tokenize every raw log line.
# For large datasets the third-party `swifter` package can speed this step up
# with `traces['logtext'].swifter.apply(model.tokenize)`.
traces['colorized'] = traces['logtext'].apply(model.tokenize)

# Look up the integer id of each colorized line in the learnt vocabulary.
traces['color_id'] = traces['colorized'].apply(
    lambda template: model.vocab_dict_[template]
)

# Show a small positional window of rows comparing raw text, tokenized form and id.
traces[['logtext', 'colorized', 'color_id']].iloc[50:70]

Unnamed: 0,logtext,colorized,color_id
50,Reply (not last) to 'STATUS' received: 'INS.MO...,reply not last to status received ins mode obs...,35
51,"INS.MODE ""OBSERV-H"" (SpringGreen4)",ins mode observ h springgreen4,36
52,beginning exposure 1 of 1 (2019-04-02T07:40:48...,beginning exposure {} of {} {} underlined,10
53,SETUP -expoId 0 -function INS.MODE OBSERV-H DE...,setup expoid {} function ins mode observ h det...,37
54,Executing SETUP command ...,executing setup command,12
55,Forward(b) SETUP to DCS,forward b {} to dcs,13
56,1 - ic0fbControlSrv.C:695: Changing substate f...,{} ic0fbcontrolsrv c {} changing substate from...,16
57,Motion execution.,motion execution,18
58,1 - ic0fbiDevDrvDiscreteMotor.C:1768: CFOU: Al...,{} ic0fbidevdrvdiscretemotor c {} cfou already...,28
59,1 - ic0fbControlSrv.C:695: Changing substate f...,{} ic0fbcontrolsrv c {} changing substate from...,19


### Save the tokenizer

In [6]:
# Persist the fitted colorizer to disk, then list the target directory to
# confirm that the file was written.
# NOTE(review): assumes data/models/ already exists — confirm whether save()
# creates missing directories.
model.save('data/models/example_logcolorizer.pkl')
!ls -lh data/models/

total 520
-rw-r--r--  1 jgil  5000   229K Jan  2 01:43 example_logcolorizer.pkl


### Load the tokenizer

In [7]:
# Restore the colorizer from the file written by the save step, replacing the
# in-memory model, and look up the id of the '<unk>' vocabulary entry.
# NOTE(review): the .pkl extension suggests a pickle file — if so, only load
# files from trusted sources; confirm against load_tokenizer.
model = load_tokenizer('data/models/example_logcolorizer.pkl')
model.vocab_dict_['<unk>']

0

### Inspect inner workings of the model

In [8]:
# First 20 items of the vocabulary, as (colorized text, integer id) pairs
list(model.vocab_dict_.items())[:20]

[('<unk>', 0),
 ('pionier_gen_tec_setup instrument setup yellow', 1),
 ('started at {} underlined', 2),
 ('seq win single f', 3),
 ('ins mode observ h', 4),
 ('ins disp name free', 5),
 ('status function ins mode blue', 6),
 ('send command status ins mode false to sub system ics', 7),
 ('reply not last to status received ins mode park len {} last reply will be ignored',
  8),
 ('ins mode park springgreen4', 9),
 ('beginning exposure {} of {} {} underlined', 10),
 ('setup expoid {} function ins mode park det scan st f blue', 11),
 ('executing setup command', 12),
 ('forward b {} to dcs', 13),
 ('{} ic0fbinsmodel c {} notice irrelevant setup key received ins shut{} st',
  14),
 ('{} ic0fbinsmodel c {} notice irrelevant setup key s contained in setup buffer',
  15),
 ('{} ic0fbcontrolsrv c {} changing substate from idle to busy', 16),
 ('motor offset done', 17),
 ('motion execution', 18),
 ('{} ic0fbcontrolsrv c {} changing substate from busy to idle', 19)]

In [9]:
# Regular expressions used by the tokenizer, shown as (pattern, replacement)
# pairs; per the property description this includes post-processing patterns.
model.regexps

[('([lw]a{0,1}t)[0-9]([a-z]+)', '\\1{}\\2'),
 ('(\\W)cmd\\d+', '\\1cmd{}'),
 ('(\\s*?[a-z][a-z0-9]{2,})_[0-9]{3,}(\\s*)', '\\1_{}\\2'),
 ('\\.\\.\\.(\\s+\\S+){5}.*', '_setup_parameters_ommited_'),
 ('\\d{4}-\\d{2}-\\d{2}[ tT]\\d{2}:\\d{2}:\\d{2}(\\.\\d{0,3})?', '{}'),
 ('\\b-?\\d+(\\.\\d+)?([eE][-+]?\\d+(\\.\\d+)?)?\\b', '{}'),
 ('-{}', '{}'),
 ('[\\"\'!,;:\\+\\*\\$<>\\.\\-|/\\\\=\\[\\]\\()#]', ' '),
 ('(\\{\\})+', '{}'),
 [('(\\{\\})+', '{}'),
  (' [0-9]+x[0-9]+a[0-9]+ ', ' {}x{}a{} '),
  (' [0-9]+xf[0-9]+ ', ' {}xf{} '),
  (' [0-9]+pho ', ' {}pho '),
  (' [0-9]+x[0-9]+ ', ' {}x{} '),
  (' a[0-9]+ ', ' a{} '),
  (' beam[0-9]+ ', ' beam{} '),
  (' cfg[0-9]+ ', ' cfg{} '),
  (' cs[0-9]+ ', ' cs{} '),
  (' cu[0-9]+ ', ' cu{} '),
  (' dpnics[0-9]+ ', ' dpnics{} '),
  (' g[0-9]+ ', ' g{} '),
  (' ic[0-9]+fbcontrolsrv_standbycb ', ' ic{}fbcontrolsrv_standbycb '),
  (' ic[0-9]+fbdevice ', ' ic{}fbdevice '),
  (' ic[0-9]+fbdevsrv_onlinecb ', ' ic{}fbdevsrv_onlinecb '),
  (' ic[0-9]+fbidiscret

In [10]:
# Templates used for matching traces. Per the property descriptions, '§' is
# the `special` placeholder symbol and '{}' is the `wildcard` for variable text.
model.templates

['tplexectimestats tpl id § estimated {} {} {} real {} {} {} diff {} seconds {} {} {}',
 'send command status det § § false to sub system dcs',
 'send command status ins2 § § false to sub system acs',
 'send a{} no kill mjaxos{}wnc{}w§ reply to {}',
 'obs name § obs id {}',
 'status function det § § blue',
 'status function ins2 § § blue',
 'det § § {} springgreen4',
 'forward a § to dcs',
 'forward b § to dcs',
 'ins cfg{} ttm§ xref {}',
 'ins cfg{} ttm§ yref {}',
 'det § § {}',
 'last reply to status from acs received ins2 § § {} len {}',
 'last reply to status from dcs received det § § {} len {}',
 '{} ic0fbcontrolsrv c {} device § changed state from standby to standby',
 '{} ic0fbdevsrv_standbycb c {} brought device § to standby state']