# Clusterlogs Notebook

In [1]:
import pandas as pd
from clusterlogs import pipeline

### 1. Load data from file and create a pandas DataFrame indexed by `pandaid`

In [2]:
# Read the CSV using its first column as the initial index, then re-index the
# frame by 'pandaid'. set_index() replaces that initial index, so it is not kept.
df = pd.read_csv('test_data.csv', index_col=0).set_index('pandaid')

In [3]:
# Preview the first 10 rows to check the columns and the 'pandaid' index.
df.head(10)

Unnamed: 0_level_0,starttime,exeerrorcode,exeerrordiag
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4498866929,2019-10-01T09:04:39,65,Non-zero return code from EVNTtoHITS (8); Logf...
4498851809,2019-10-01T09:13:17,11,Missing AthenaMP outputs file athenaMP-outputs...
4499004301,2019-10-01T10:18:49,15,No events to process: 16000 (skipEvents) >= 23...
4499011565,2019-10-01T10:25:17,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499004297,2019-10-01T10:15:46,15,No events to process: 14000 (skipEvents) >= 23...
4499011462,2019-10-01T10:22:04,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008082,2019-10-01T10:23:28,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011446,2019-10-01T10:21:21,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011466,2019-10-01T10:23:13,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008070,2019-10-01T10:26:24,65,Non-zero return code from AODtoDAOD (8); Logfi...


### 2. Initialize clusterization pipeline

In [20]:
# Name of the DataFrame column that holds the error messages to be clustered.
target = 'exeerrordiag'

In [21]:
# Create the clustering pipeline object from the DataFrame and the name of the
# column to cluster on. Nothing is executed until process() is called.
# NOTE(review): "nothing is executed" is inferred from the separate process()
# step in this notebook -- confirm against the clusterlogs implementation.
cluster = pipeline.ml_clustering(df, target)

### 3. Execute clusterization pipeline

In [22]:
# Run the whole pipeline. The call returns the ml_clustering object itself,
# which is why its repr is displayed as the cell output.
cluster.process()

<clusterlogs.pipeline.ml_clustering at 0x1c26877048>

### 4. Get clusters statistics

`cluster.statistics()` returns statistics for all clusters; they are loaded into a DataFrame below with the following columns:
- "cluster_name" - name of a cluster
- "cluster_size" - number of log messages in cluster
- "first_entry" - first log message in cluster
- "vocab" - all tokens in the cluster's error messages
- "vocab_length" - the length of cluster's vocabulary
- "mean_length" - average length of log messages in cluster
- "std_length" - standard deviation of length of log messages in cluster
- "mean_similarity" - average similarity of log messages in cluster
(calculated from the Levenshtein distances between the first log message and all other log messages in the cluster)
- "std_similarity" - standard deviation of similarity of log messages in cluster

In [23]:
# Collect the per-cluster statistics described in the list above.
stat = cluster.statistics()

In [24]:
# Convert the statistics into a DataFrame for tabular display and sorting.
stat_df = pd.DataFrame.from_dict(stat)

In [25]:
# Sort ascending (the default), so clusters with the lowest mean similarity
# are listed first.
stat_df.sort_values(by='mean_similarity')

Unnamed: 0,cluster_name,cluster_size,first_entry,vocab,vocab_length,mean_length,mean_similarity,std_length,std_similarity
4,4,2,File data18_13TeV.00363033.physics_Main.merge....,{data18_13TeV.00363033.physics_Main.merge.AOD....,12,75.0,50.0,30.0,50.0
1,1,18,No events to process: 16000 (skipEvents) >= 23...,"{2400, 4000, 14000, 2800, 2500, HITS, process,...",26,68.28,94.44,0.45,3.67
2,2,20,Non-zero return code from AODtoDAOD (8); Logfi...,"{list, data, G4, message, tag, 8, slimming, sm...",63,156.9,95.0,39.9,3.32
3,3,23,"Fatal error in athena logfile: ""G4 exception a...","{G4, 20330, 3325, message, 2374, 2561, 3245, a...",36,98.26,95.65,2.38,2.39
0,0,115,Non-zero return code from EVNTtoHITS (8); Logf...,"{RAWtoESD, COOLOFL_INDET/OFLP200, StatusCode, ...",137,155.97,99.13,43.09,0.15
5,5,48,Missing AthenaMP outputs file athenaMP-outputs...,"{AthenaMP, Missing, crashed, file, outputs, at...",8,85.0,100.0,0.0,0.0
6,6,22,LRMS error: (271) Job was cancelled,"{LRMS, error, Job, 271, cancelled}",5,35.0,100.0,0.0,0.0
7,7,1,No events to process: 2000 (skipEvents) >= 200...,"{EVNT, events, inputEvents, skipEvents, 2000, ...",6,68.0,100.0,0.0,0.0


### Timings for all stages of clusterization pipeline

`process` is the total time of the whole pipeline run; the other keys are the timings of its individual stages.

In [26]:
# Timings recorded for each pipeline stage, keyed by stage name.
# The 'process' entry covers the whole run.
cluster.timings

{'data_preparation': 0.0033,
 'tokenization': 0.0263,
 'tokens_vectorization': 0.0444,
 'sentence_vectorization': 0.033,
 'kneighbors': 0.0023,
 'epsilon_search': 0.001,
 'dbscan': 0.1094,
 'process': 0.2198}

### Get all error messages in single cluster

In [29]:
# List the error messages assigned to cluster 1.
# The cluster id is passed as an integer here.
cluster.in_cluster(1)

['No events to process: 16000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 14000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 8000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 10000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 4000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 12000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 18000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 2100 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2600 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2800 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2500 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2400 (skipEvents) >= 2

### Output clusters - mode == 'ALL'  (for cluster '1')

In [30]:
# mode='ALL': every entry of the selected cluster is a full record, i.e. the
# original columns plus the assigned 'cluster' label.
# The result is indexed with the string key '1' here, unlike in_cluster(1),
# which takes an integer.
cluster.clustered_output(mode='ALL')['1']

[{'starttime': '2019-10-01T10:18:49',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 16000 (skipEvents) >= 2343 (inputEvents of HITS',
  'cluster': 1},
 {'starttime': '2019-10-01T10:15:46',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 14000 (skipEvents) >= 2343 (inputEvents of HITS',
  'cluster': 1},
 {'starttime': '2019-10-01T10:14:58',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 8000 (skipEvents) >= 2343 (inputEvents of HITS',
  'cluster': 1},
 {'starttime': '2019-10-01T10:15:11',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 10000 (skipEvents) >= 2343 (inputEvents of HITS',
  'cluster': 1},
 {'starttime': '2019-10-01T10:14:54',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 4000 (skipEvents) >= 2343 (inputEvents of HITS',
  'cluster': 1},
 {'starttime': '2019-10-01T10:15:26',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 12000 (skipEvents) >= 2343 (inputEvents of HITS',
  'cluster'

### Output clusters - mode == 'INDEX' (for cluster '1')

In [31]:
# mode='INDEX': only the DataFrame index values (the pandaid of each row) that
# belong to the selected cluster.
cluster.clustered_output(mode='INDEX')['1']

[4499004301,
 4499004297,
 4499004287,
 4499004291,
 4499004281,
 4499004294,
 4499004304,
 4498117632,
 4498117637,
 4498117639,
 4498117633,
 4498117636,
 4498117643,
 4498117635,
 4498117638,
 4498117644,
 4498117640,
 4498117645]

### Output clusters - mode == 'TARGET' (for cluster '1')

In [32]:
# mode='TARGET': only the target-column values (the error messages) of the rows
# that belong to the selected cluster.
cluster.clustered_output(mode='TARGET')['1']

['No events to process: 16000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 14000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 8000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 10000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 4000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 12000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 18000 (skipEvents) >= 2343 (inputEvents of HITS',
 'No events to process: 2100 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2600 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2800 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2500 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2400 (skipEvents) >= 2

### Output clusters - cluster labels

In [15]:
# Array of assigned cluster labels.
# NOTE(review): presumably one label per input message, in input order --
# confirm against the clusterlogs implementation.
cluster.cluster_labels

array([0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 0, 2,
       2, 2, 2, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 2, 2, 0, 0,
       0, 3, 0, 0, 2, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0,
       2, 0, 0, 0, 2, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 1, 1, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 6, 6, 1, 1, 1, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
       6, 6, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 7, 1, 5, 5, 5, 5, 5,
       5, 6, 6, 6, 6, 6, 6, 1, 1, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 3,
       3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 5, 0, 3, 0, 3, 3, 0, 0, 5, 5, 0, 0,
       0, 0, 0, 5, 5, 3, 3, 0, 3, 3, 3, 5, 0, 2, 0, 0, 0, 0, 0, 0, 3, 3,
       3, 3, 3, 3, 4, 0, 0, 0, 0, 0, 5, 5, 5, 0, 0, 5, 0, 3, 2, 3, 0, 0,
       5, 0, 0, 5, 0, 0, 5])

### Vocabulary of all tokens with frequencies

In [16]:
# Mapping of each token to its frequency.
# The full dictionary is displayed, so the output is long.
cluster.get_w2v_vocabulary()

{'Non-zero': 109,
 'return': 109,
 'code': 114,
 'from': 109,
 'EVNTtoHITS': 10,
 '(': 269,
 '8': 35,
 ')': 250,
 ';': 107,
 'Logfile': 113,
 'error': 169,
 'in': 204,
 'log.EVNTtoHITS': 8,
 ':': 233,
 '``': 145,
 'ValueError': 1,
 'ATLAS-P2-ITK-17-04-02': 1,
 'is': 4,
 'not': 21,
 'the': 7,
 'expected': 1,
 'type': 1,
 'and/or': 1,
 'value': 1,
 'allowed': 1,
 'for': 34,
 'UID.SimFlags.SimLayout': 1,
 "''": 145,
 'Missing': 54,
 'AthenaMP': 54,
 'outputs': 54,
 'file': 65,
 'athenaMP-outputs-EVNTtoHITS-sim': 2,
 'probably': 59,
 'athena': 86,
 'crashed': 54,
 'No': 19,
 'events': 19,
 'to': 27,
 'process': 19,
 '16000': 1,
 'skipEvents': 19,
 '>': 20,
 '=': 20,
 '2343': 7,
 'inputEvents': 19,
 'of': 19,
 'HITS': 7,
 'AODtoDAOD': 74,
 'log.AODtoDAOD': 72,
 'UID': 101,
 "'module": 24,
 "'": 53,
 'object': 24,
 'has': 24,
 'no': 24,
 'attribute': 24,
 "'UID": 24,
 '14000': 1,
 'Smart': 6,
 'slimming': 12,
 'container': 6,
 'BTagging_UID': 6,
 'does': 12,
 'exist': 6,
 'or': 65,
 'have': 

### Get all tokenized error messages

In [17]:
# Slice out only the first tokenized message to keep the output short.
# The result is a list containing one token list.
cluster.tokenized[0:1]

[['Non-zero',
  'return',
  'code',
  'from',
  'EVNTtoHITS',
  '(',
  '8',
  ')',
  ';',
  'Logfile',
  'error',
  'in',
  'log.EVNTtoHITS',
  ':',
  '``',
  'ValueError',
  ':',
  'ATLAS-P2-ITK-17-04-02',
  'is',
  'not',
  'the',
  'expected',
  'type',
  'and/or',
  'the',
  'value',
  'is',
  'not',
  'allowed',
  'for',
  ':',
  'UID.SimFlags.SimLayout',
  "''"]]

### Get epsilon value (which was used in DBSCAN algorithm)

In [18]:
# Epsilon value that was used for DBSCAN in this run.
cluster.epsilon

0.08106837664192053

### Get word2vec model

In [19]:
# The gensim Word2Vec model object held by the pipeline.
cluster.word2vec

<gensim.models.word2vec.Word2Vec at 0x1c269550f0>