# Clusterlogs Notebook

In [2]:
import pandas as pd
from clusterlogs import pipeline



P.S. fuzzywuzzy library will be removed in the next release.

### 1. Load data from file and create a pandas DataFrame indexed by `pandaid`

In [3]:
# Load the CSV (its first column is read as a temporary index), then
# replace that index with the 'pandaid' column.
df = (
    pd.read_csv('test_data.csv', index_col=0)
    .set_index('pandaid')
)

In [4]:
# Preview the first 10 rows: one row per pandaid with start time, error code and error message.
df.head(10)

Unnamed: 0_level_0,starttime,exeerrorcode,exeerrordiag
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4498866929,2019-10-01T09:04:39,65,Non-zero return code from EVNTtoHITS (8); Logf...
4498851809,2019-10-01T09:13:17,11,Missing AthenaMP outputs file athenaMP-outputs...
4499004301,2019-10-01T10:18:49,15,No events to process: 16000 (skipEvents) >= 23...
4499011565,2019-10-01T10:25:17,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499004297,2019-10-01T10:15:46,15,No events to process: 14000 (skipEvents) >= 23...
4499011462,2019-10-01T10:22:04,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008082,2019-10-01T10:23:28,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011446,2019-10-01T10:21:21,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011466,2019-10-01T10:23:13,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008070,2019-10-01T10:26:24,65,Non-zero return code from AODtoDAOD (8); Logfi...


### 2. Initialize clusterization parameters

    clustering_parameters = {'tokenizer':'nltk|pyonmttok',
                             'w2v_size': <size of vector for each token (e.g. 100-300)>,
                             'w2v_window': <size of slicing window for NN algorithms (e.g. 5-10)>,
                             'min_samples': <minimum size of a cluster; setting it to 1 is recommended>}

In [5]:
# Settings handed to the clustering pipeline; the template above explains each key.
clustering_parameters = dict(
    tokenizer='nltk',
    w2v_size=200,
    w2v_window=5,
    min_samples=1,
)

### 3. Initialize clusterization pipeline

     target = '<target column with error messages>'

In [6]:
# Build the pipeline object from: the input DataFrame, the name of the target
# column holding the error messages ('exeerrordiag'), and the parameter dict.
cluster = pipeline.ml_clustering(df, 'exeerrordiag', clustering_parameters)

### 4. Execute clusterization pipeline

In [7]:
# Run the pipeline. The call returns an ml_clustering object (its repr is the
# cell output); the results are read from `cluster` in the cells below.
cluster.process()

<clusterlogs.pipeline.ml_clustering at 0x1c23d75828>

### 5. Get clusters statistics

Cluster statistics contain the following fields for every cluster (they are converted to a DataFrame below):
- "cluster_name" - name of the cluster
- "cluster_size" - number of log messages in the cluster
- "first_entry" - first log message in the cluster
- "mean_length" - average length of the log messages in the cluster
- "std_length" - standard deviation of the length of the log messages in the cluster
- "mean_similarity" - average similarity of the log messages in the cluster
(calculated from the Levenshtein distances between the first and all other log messages)
- "std_similarity" - standard deviation of the similarity of the log messages in the cluster

In [8]:
# Collect the per-cluster statistics listed above.
stat = cluster.statistics()

In [11]:
# Turn the statistics into a DataFrame so they can be sorted and displayed as a table.
stat_df = pd.DataFrame.from_dict(stat)

In [13]:
# Display the clusters in ascending order of mean_similarity (lowest average similarity first).
stat_df.sort_values(by='mean_similarity')

Unnamed: 0,cluster_name,cluster_size,first_entry,mean_length,mean_similarity,std_length,std_similarity
0,0,235,Non-zero return code from EVNTtoHITS (8); Logf...,121.98,31.37,53.3,15.28
1,1,2,File data18_13TeV.00363033.physics_Main.merge....,75.0,54.5,30.0,45.5
2,2,11,No events to process: 2100 (skipEvents) >= 200...,68.0,98.73,0.0,0.86
3,3,1,No events to process: 2000 (skipEvents) >= 200...,68.0,100.0,0.0,0.0


### Timings for all stages of clusterization pipeline

`process` - total time of the whole pipeline run (all stages together)

In [21]:
# Dict mapping each pipeline stage to its measured duration; the 'process' entry covers the whole run.
cluster.timings

{'data_preparation': 0.0039,
 'tokenization': 0.0238,
 'tokens_vectorization': 0.0616,
 'sentence_vectorization': 0.0354,
 'kneighbors': 0.0145,
 'epsilon_search': 0.0024,
 'dbscan': 0.1139,
 'process': 0.2556}

### Get all error messages in single cluster

In [15]:
# List the error messages assigned to cluster 2 (the label is passed as an int here).
cluster.in_cluster(2)

['No events to process: 2100 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2600 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2800 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2500 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2400 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2700 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3300 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2900 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3400 (skipEvents) >= 2000 (inputEvents of EVNT']

### Output clusters - mode == 'ALL'  (for cluster '2')

In [33]:
# mode='ALL': for each cluster, a list of row dicts with every column plus the assigned 'cluster' label.
# Note: the result is keyed by the cluster label as a string ('2'), whereas in_cluster() takes the int 2.
cluster.clustered_output(mode='ALL')['2']

[{'starttime': '2019-10-01T09:00:05',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 2100 (skipEvents) >= 2000 (inputEvents of EVNT',
  'cluster': 2},
 {'starttime': '2019-10-01T09:02:55',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 2600 (skipEvents) >= 2000 (inputEvents of EVNT',
  'cluster': 2},
 {'starttime': '2019-10-01T09:02:46',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 2800 (skipEvents) >= 2000 (inputEvents of EVNT',
  'cluster': 2},
 {'starttime': '2019-10-01T09:00:06',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 2200 (skipEvents) >= 2000 (inputEvents of EVNT',
  'cluster': 2},
 {'starttime': '2019-10-01T09:00:41',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 2500 (skipEvents) >= 2000 (inputEvents of EVNT',
  'cluster': 2},
 {'starttime': '2019-10-01T09:00:38',
  'exeerrorcode': 15,
  'exeerrordiag': 'No events to process: 3200 (skipEvents) >= 2000 (inputEvents of EVNT',
  'cluster': 2}

### Output clusters - mode == 'INDEX' (for cluster '2')

In [34]:
# mode='INDEX': for each cluster, the index values of its rows (here pandaid values, since df is indexed by pandaid).
# The result is keyed by the cluster label as a string ('2').
cluster.clustered_output(mode='INDEX')['2']

[4498117632,
 4498117637,
 4498117639,
 4498117633,
 4498117636,
 4498117643,
 4498117635,
 4498117638,
 4498117644,
 4498117640,
 4498117645]

### Output clusters - mode == 'TARGET' (for cluster '2')

In [35]:
# mode='TARGET': for each cluster, only the messages from the target column.
# For cluster '2' this is the same list that in_cluster(2) returned earlier.
cluster.clustered_output(mode='TARGET')['2']

['No events to process: 2100 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2600 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2800 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2500 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2400 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2700 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3300 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2900 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3400 (skipEvents) >= 2000 (inputEvents of EVNT']

### Output clusters - cluster labels

In [19]:
# Array with one integer cluster label per input message (presumably in the row order of df - confirm).
cluster.cluster_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

### Vocabulary of all tokens with frequencies

In [20]:
# Dict mapping every token (punctuation tokens included) to its frequency.
cluster.get_vocabulary()

{'Non-zero': 109,
 'return': 109,
 'code': 114,
 'from': 109,
 'EVNTtoHITS': 10,
 '(': 269,
 '8': 35,
 ')': 250,
 ';': 107,
 'Logfile': 113,
 'error': 169,
 'in': 204,
 'log.EVNTtoHITS': 8,
 ':': 233,
 '``': 145,
 'ValueError': 1,
 'ATLAS-P2-ITK-17-04-02': 1,
 'is': 4,
 'not': 21,
 'the': 7,
 'expected': 1,
 'type': 1,
 'and/or': 1,
 'value': 1,
 'allowed': 1,
 'for': 34,
 'UID.SimFlags.SimLayout': 1,
 "''": 145,
 'Missing': 54,
 'AthenaMP': 54,
 'outputs': 54,
 'file': 65,
 'athenaMP-outputs-EVNTtoHITS-sim': 2,
 'probably': 59,
 'athena': 86,
 'crashed': 54,
 'No': 19,
 'events': 19,
 'to': 27,
 'process': 19,
 '16000': 1,
 'skipEvents': 19,
 '>': 20,
 '=': 20,
 '2343': 7,
 'inputEvents': 19,
 'of': 19,
 'HITS': 7,
 'AODtoDAOD': 74,
 'log.AODtoDAOD': 72,
 'UID': 101,
 "'module": 24,
 "'": 53,
 'object': 24,
 'has': 24,
 'no': 24,
 'attribute': 24,
 "'UID": 24,
 '14000': 1,
 'Smart': 6,
 'slimming': 12,
 'container': 6,
 'BTagging_UID': 6,
 'does': 12,
 'exist': 6,
 'or': 65,
 'have': 

### Get all tokenized error messages

In [37]:
# Show the token list of the first message only; the slice keeps the outer list, so the output is a list of lists.
cluster.tokenized[0:1]

[['Non-zero',
  'return',
  'code',
  'from',
  'EVNTtoHITS',
  '(',
  '8',
  ')',
  ';',
  'Logfile',
  'error',
  'in',
  'log.EVNTtoHITS',
  ':',
  '``',
  'ValueError',
  ':',
  'ATLAS-P2-ITK-17-04-02',
  'is',
  'not',
  'the',
  'expected',
  'type',
  'and/or',
  'the',
  'value',
  'is',
  'not',
  'allowed',
  'for',
  ':',
  'UID.SimFlags.SimLayout',
  "''"]]

### Get epsilon value (which was used in DBSCAN algorithm)

In [24]:
# Epsilon value used by the DBSCAN step (cf. the 'epsilon_search' stage in the timings).
cluster.epsilon

0.062307177769663075

### Get word2vec model

In [25]:
# The gensim Word2Vec model object held by the pipeline.
cluster.word2vec

<gensim.models.word2vec.Word2Vec at 0x1c24724320>