# Clusterlogs Notebook

In [1]:
import pandas as pd
from clusterlogs import pipeline, cluster_output

### 1. Load data from a CSV file into a pandas DataFrame indexed by `pandaid`

In [2]:
# Read the test CSV. index_col=0 consumes the file's first column as a
# temporary index; set_index then replaces it with the 'pandaid' column.
df = (
    pd.read_csv('test/test_data.csv', index_col=0)
    .set_index('pandaid')
)

In [3]:
# Preview the first ten rows to check the columns and the 'pandaid' index.
df.head(10)

Unnamed: 0_level_0,starttime,exeerrorcode,exeerrordiag
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4498866929,2019-10-01T09:04:39,65,Non-zero return code from EVNTtoHITS (8); Logf...
4498851809,2019-10-01T09:13:17,11,Missing AthenaMP outputs file athenaMP-outputs...
4499004301,2019-10-01T10:18:49,15,No events to process: 16000 (skipEvents) >= 23...
4499011565,2019-10-01T10:25:17,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499004297,2019-10-01T10:15:46,15,No events to process: 14000 (skipEvents) >= 23...
4499011462,2019-10-01T10:22:04,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008082,2019-10-01T10:23:28,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011446,2019-10-01T10:21:21,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011466,2019-10-01T10:23:13,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008070,2019-10-01T10:26:24,65,Non-zero return code from AODtoDAOD (8); Logfi...


### 2. Initialize the clustering pipeline

In [4]:
# Name of the DataFrame column whose text is clustered; it is handed to the
# pipeline constructor as its second argument.
target = 'exeerrordiag'

In [5]:
# Build the pipeline object from the DataFrame and the target column name.
# NOTE(review): what mode='process' selects, and whether the file named by
# model_name is read or written, is not visible in this notebook -- confirm
# against clusterlogs.pipeline.
cluster = pipeline.ml_clustering(df, target, mode='process', model_name='test/word2vec.model')

### 3. Execute the clustering pipeline

In [6]:
# Run the pipeline. The call returns an ml_clustering object, and because it is
# the last expression in the cell its repr is echoed as the cell output.
# NOTE(review): end the line with ';' if that echo is not wanted.
cluster.process()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


<clusterlogs.pipeline.ml_clustering at 0x1c1b5a51d0>

In [7]:
# NOTE(review): presumably the word2vec vector size used by the pipeline --
# confirm against clusterlogs.pipeline.
cluster.w2v_size

39

### 4. Get cluster statistics

In [8]:
# Wrap the pipeline results (frame, target column name, tokenizer, messages
# and cluster labels) in an Output object, which the cells below query.
output = cluster_output.Output(
    cluster.df,
    cluster.target,
    cluster.tokenizer,
    cluster.messages,
    cluster.cluster_labels,
)

In [9]:
# Compute the per-cluster statistics; the result is converted to a DataFrame
# in the following cell.
stat = output.statistics()

In [10]:
# Tabulate the statistics as a DataFrame (one row per cluster in the output
# shown below).
stat_df = pd.DataFrame.from_dict(stat)

In [11]:
# Display the clusters ordered by mean_similarity, lowest first
# (sort_values sorts ascending by default).
stat_df.sort_values(by='mean_similarity')

Unnamed: 0,cluster_name,cluster_size,pattern,vocab,vocab_length,mean_length,mean_similarity,std_length,std_similarity
10,10,8,"Fatal error in athena logfile: ""G4 exception a...","{see, error, jobReport, athena, logfile, detai...",10,95.0,96.84,0.0,1.39
34,34,15,"Fatal error in athena logfile: ""Long ERROR mes...","{message, see, error, jobReport, athena, Long,...",11,100.0,97.8,0.0,0.91
29,29,10,No events to process: 00 (skipEvents) >= 2000 ...,"{inputEvents, events, skipEvents, EVNT, process}",5,68.0,98.38,0.0,0.79
2,2,7,No events to process: 000 (skipEvents) >= 2343...,"{HITS, inputEvents, events, skipEvents, process}",5,68.71,98.55,0.45,0.68
30,30,48,Missing AthenaMP outputs file athenaMP-outputs...,"{probably, athena, crashed, AthenaMP, athenaMP...",8,85.0,100.0,0.0,0.0
43,43,1,"Fatal error in athena logfile: ""Logfile error ...","{log.EVNTtoHITS, error, IOVDbSvc, athena, tag,...",16,190.0,100.0,0.0,0.0
39,39,1,Non-zero return code from AODtoDAOD (8); Logfi...,"{error, Non-zero, return, 'nentries, code, AOD...",9,95.0,100.0,0.0,0.0
0,0,1,Non-zero return code from EVNTtoHITS (8); Logf...,"{log.EVNTtoHITS, JobProperties.SimFlags.SimLay...",15,206.0,100.0,0.0,0.0
26,26,1,Input file argument testRTT.RDO.pool.root glob...,"{input, probably, globbed, Input, files, argum...",9,102.0,100.0,0.0,0.0
25,25,1,Non-zero return code from RDOtoRDOTrigger (33)...,"{matching, error, FATAL, Non-zero, return, TE,...",17,227.0,100.0,0.0,0.0


In [12]:
# Same ordering as the table above (ascending mean_similarity), keeping only
# the 'pattern' column. The double brackets select a one-column DataFrame, so
# .values yields a 2-D array with one single-element row per cluster.
patterns = (
    stat_df.sort_values(by='mean_similarity')[['pattern']]
    .values
)

In [13]:
# Display the full, untruncated pattern strings (the table above abbreviates them).
patterns

array([['Fatal error in athena logfile: "G4 exception at line  (see jobReport for further details)"'],
       ['Fatal error in athena logfile: "Long ERROR message at line  (see jobReport for further details)"'],
       ['No events to process: 00 (skipEvents) >= 2000 (inputEvents of EVNT'],
       ['No events to process: 000 (skipEvents) >= 2343 (inputEvents of HITS'],
       ['Missing AthenaMP outputs file athenaMP-outputs-HITtoRDO-h2r (probably athena crashed)'],
       ['Fatal error in athena logfile: "Logfile error in log.EVNTtoHITS: "IOVDbSvc            ERROR Could not retrieve COOL data for folder /LAR/ElecCalibMC/MphysOverMcal tag  validityKeys [[],[]]""'],
       ['Non-zero return code from AODtoDAOD (8); Logfile error in log.AODtoDAOD: "KeyError: \'nentries\'"'],
       ['Non-zero return code from EVNTtoHITS (8); Logfile error in log.EVNTtoHITS: "ValueError:  ATLAS-P2-ITK-17-04-02 is not the expected type and/or the value is not allowed for: JobProperties.SimFlags.SimLayout"'],

### Timings for all stages of the clustering pipeline

`process` is the total time of the whole pipeline run; the other entries are the timings of its individual stages.

In [14]:
# Timings recorded by the pipeline, keyed by stage name.
# NOTE(review): units are not stated here -- presumably seconds; confirm.
cluster.timings

{'data_preparation': 0.0026,
 'tokenization': 0.0261,
 'tokens_vectorization': 0.0357,
 'sentence_vectorization': 0.0336,
 'kneighbors': 0.0124,
 'epsilon_search': 0.0006,
 'dbscan': 0.1075,
 'process': 0.2187}

### Get all error messages in a single cluster

In [15]:
# List the messages assigned to cluster 10, the cluster with the lowest
# mean_similarity in the statistics table.
output.in_cluster(10)

['Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"']

In [16]:
# List the messages assigned to cluster 34, the cluster with the second-lowest
# mean_similarity in the statistics table.
output.in_cluster(34)

['Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further deta

### Output clusters - mode == 'ALL' (for cluster '1')

In [17]:
# mode='ALL' yields the full records (every column plus the 'cluster' label).
# The result is keyed by the cluster label as a string, hence '1' and not 1.
output.clustered_output(mode='ALL')['1']

[{'starttime': '2019-10-01T09:13:17',
  'exeerrorcode': 11,
  'exeerrordiag': 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
  'cluster': 1},
 {'starttime': '2019-10-01T10:26:22',
  'exeerrorcode': 11,
  'exeerrordiag': 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
  'cluster': 1}]

### Output clusters - mode == 'INDEX' (for cluster '1')

In [18]:
# mode='INDEX' yields only the DataFrame index values (the pandaid of each
# row) of the cluster's members; the key is again the label as a string.
output.clustered_output(mode='INDEX')['1']

[4498851809, 4499017793]

### Output clusters - mode == 'TARGET' (for cluster '1')

In [19]:
# mode='TARGET' yields only the target-column text ('exeerrordiag') of the
# cluster's members; the key is again the label as a string.
output.clustered_output(mode='TARGET')['1']

['Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)']

### Output clusters - cluster labels

In [20]:
# Array of integer cluster labels produced by the pipeline.
# NOTE(review): presumably one label per input row, in row order -- confirm.
cluster.cluster_labels

array([ 0,  1,  2,  3,  2,  4,  3,  3,  3,  3,  3,  5,  6,  7,  8,  9,  7,
       10,  7,  7, 11, 12, 13, 13, 13,  2,  3,  4,  2,  3, 14,  3,  1,  7,
        7,  7, 15,  7,  7,  7, 16, 17,  7, 18, 19, 10,  3,  3,  4,  2,  3,
        3,  4,  3, 20,  9,  7,  7,  7,  7, 10,  7,  9,  7, 21, 22,  4,  3,
        3,  3,  4,  3,  2, 23, 24,  7,  7,  7,  9, 25,  7, 26, 18, 22, 13,
       27, 27, 28,  2,  3,  3,  3,  3,  3,  3,  7,  7,  6, 10,  7,  7,  7,
        7,  7, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 29,
       32, 29, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 29, 29,
       30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 33, 29, 30, 30, 30, 30,
       30, 30, 31, 31, 31, 31, 31, 31, 29, 29, 30, 30, 30, 30, 30, 30, 30,
       31, 31, 31,  7,  6, 34, 34,  7, 35, 35, 35, 36, 10, 37, 35,  7, 30,
        7, 34,  7, 34, 34, 35,  7, 30, 30, 35, 35, 38, 39, 35, 30, 30, 10,
       10, 37, 34, 34, 34, 30, 35, 40, 35, 35, 36, 41, 41, 22, 34, 34, 34,
       34, 10, 34, 42, 35

### Get the epsilon value used by the DBSCAN algorithm

In [21]:
# Epsilon stored on the pipeline object after the run.
# NOTE(review): the timings include an 'epsilon_search' stage, so this value is
# presumably determined automatically rather than supplied -- confirm.
cluster.epsilon

0.05653438724138101