# Clusterlogs Notebook

In [1]:
import pandas as pd
from clusterlogs import pipeline

### 1. Load data from file and create a pandas DataFrame indexed by `pandaid`

In [3]:
# The first CSV column is read as the row index and then replaced by
# `pandaid`, so rows are addressed by `pandaid` from here on.
df = (
    pd.read_csv('test/test_data.csv', index_col=0)
    .set_index('pandaid')
)

In [4]:
df.shape

(249, 3)

In [5]:
df.head(10)

Unnamed: 0_level_0,starttime,exeerrorcode,exeerrordiag
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4498866929,2019-10-01T09:04:39,65,Non-zero return code from EVNTtoHITS (8); Logf...
4498851809,2019-10-01T09:13:17,11,Missing AthenaMP outputs file athenaMP-outputs...
4499004301,2019-10-01T10:18:49,15,No events to process: 16000 (skipEvents) >= 23...
4499011565,2019-10-01T10:25:17,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499004297,2019-10-01T10:15:46,15,No events to process: 14000 (skipEvents) >= 23...
4499011462,2019-10-01T10:22:04,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008082,2019-10-01T10:23:28,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011446,2019-10-01T10:21:21,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011466,2019-10-01T10:23:13,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008070,2019-10-01T10:26:24,65,Non-zero return code from AODtoDAOD (8); Logfi...


### 2. Initialize clusterization pipeline

In [6]:
target = 'exeerrordiag'

In [13]:
# Build the clustering pipeline over `df`, using the column named by `target`.
# NOTE(review): `mode` and `model_name` semantics are defined by clusterlogs;
# presumably `model_name` is the path of the word2vec model file - confirm.
cluster = pipeline.ml_clustering(
    df,
    target,
    mode='process',
    model_name='test/word2vec.model',
)

### 3. Execute clusterization pipeline

In [14]:
cluster.process()

<clusterlogs.pipeline.ml_clustering at 0x1c26e7c908>

In [15]:
cluster.w2v_size

15

### 4. Get clusters statistics

`cluster.statistics()` returns statistics for all clusters (converted to a DataFrame below):
- "cluster_name" - name of a cluster
- "cluster_size" - number of log messages in the cluster
- "first_entry" - first log message in the cluster
- "vocab" - all tokens in the error messages
- "vocab_length" - the length of the cluster's vocabulary
- "mean_length" - average length of log messages in the cluster
- "std_length" - standard deviation of the length of log messages in the cluster
- "mean_similarity" - average similarity of log messages in the cluster
(calculated from the Levenshtein distances between the first and all other log messages)
- "std_similarity" - standard deviation of the similarity of log messages in the cluster

In [16]:
stat = cluster.statistics()

In [17]:
stat_df = pd.DataFrame.from_dict(stat)

In [19]:
stat_df.sort_values(by='cluster_size', ascending=False)

Unnamed: 0,cluster_name,cluster_size,first_entry,vocab,vocab_length,mean_length,mean_similarity,std_length,std_similarity
29,29,48,Missing AthenaMP outputs file athenaMP-outputs...,"{Missing, file, athena, athenaMP-outputs-HITto...",8,85.0,100.0,0.0,0.0
7,7,35,Non-zero return code from AODtoDAOD (65); Logf...,"{Logfile, abnormally, finished, waiting, code,...",14,187.0,100.0,0.0,0.0
3,3,24,Non-zero return code from AODtoDAOD (8); Logfi...,"{Logfile, 'module, attribute, code, log.AODtoD...",12,150.0,100.0,0.0,0.0
30,30,22,LRMS error: (271) Job was cancelled,"{error, Job, cancelled, LRMS}",4,35.0,100.0,0.0,0.0
32,32,17,Non-zero return code from HITtoRDO (65); Logfi...,"{Logfile, abnormally, finished, waiting, log.H...",14,153.0,100.0,0.0,0.0
31,31,15,"Fatal error in athena logfile: ""Long ERROR mes...","{athena, logfile, see, line, error, Long, Fata...",11,100.0,97.47,0.0,1.15
28,28,12,No events to process: 2100 (skipEvents) >= 200...,"{EVNT, process, skipEvents, inputEvents, events}",5,68.0,98.28,0.0,0.81
10,10,8,"Fatal error in athena logfile: ""G4 exception a...","{athena, G4, logfile, exception, see, line, er...",10,95.0,96.18,0.0,1.74
2,2,7,No events to process: 16000 (skipEvents) >= 23...,"{process, skipEvents, HITS, inputEvents, events}",5,68.71,98.34,0.45,0.93
4,4,6,Non-zero return code from AODtoDAOD (8); Logfi...,"{slimming, Logfile, Smart, exist, list, code, ...",15,190.0,100.0,0.0,0.0


### Timings for all stages of clusterization pipeline

`process` - total time of the whole process

In [None]:
cluster.timings

### Get all error messages in single cluster

In [21]:
cluster.in_cluster(22)

['Non-zero return code from AODtoDAOD (65); Long FATAL message at line LINE_NUMBER (see jobReport for further details)']

In [20]:
cluster.in_cluster(7)

['Non-zero return code from AODtoDAOD (65); Logfile error in log.AODtoDAOD: "UID ERROR Failure in waiting or sub-process finished abnormally"',
 'Non-zero return code from AODtoDAOD (65); Logfile error in log.AODtoDAOD: "UID ERROR Failure in waiting or sub-process finished abnormally"',
 'Non-zero return code from AODtoDAOD (65); Logfile error in log.AODtoDAOD: "UID ERROR Failure in waiting or sub-process finished abnormally"',
 'Non-zero return code from AODtoDAOD (65); Logfile error in log.AODtoDAOD: "UID ERROR Failure in waiting or sub-process finished abnormally"',
 'Non-zero return code from AODtoDAOD (65); Logfile error in log.AODtoDAOD: "UID ERROR Failure in waiting or sub-process finished abnormally"',
 'Non-zero return code from AODtoDAOD (65); Logfile error in log.AODtoDAOD: "UID ERROR Failure in waiting or sub-process finished abnormally"',
 'Non-zero return code from AODtoDAOD (65); Logfile error in log.AODtoDAOD: "UID ERROR Failure in waiting or sub-process finished abnorm

### Output clusters - mode == 'ALL'  (for cluster '1')

In [None]:
cluster.clustered_output(mode='ALL')['1']

### Output clusters - mode == 'INDEX' (for cluster '1')

In [None]:
cluster.clustered_output(mode='INDEX')['1']

### Output clusters - mode == 'TARGET' (for cluster '1')

In [None]:
cluster.clustered_output(mode='TARGET')['1']

### Output clusters - cluster labels

In [None]:
cluster.cluster_labels

### Vocabulary of all tokens with frequencies

In [None]:
cluster.get_w2v_vocabulary()

### Get tokenized error messages (only the first one is shown)

In [None]:
cluster.tokenized[0:1]

### Get epsilon value (which was used in DBSCAN algorithm)

In [None]:
cluster.epsilon

### Get word2vec model

In [None]:
cluster.word2vec