# Clusterlogs Notebook

In [1]:
import pandas as pd
from clusterlogs import pipeline

### 1. Load data from a CSV file into a pandas DataFrame indexed by `pandaid`

In [2]:
# Read the test CSV (its first column is consumed as a throwaway index),
# then re-index the frame by the `pandaid` column.
df = (
    pd.read_csv('test/test_data.csv', index_col=0)
    .set_index('pandaid')
)

In [3]:
# Preview the first 10 rows; `pandaid` is the index.
df.head(10)

Unnamed: 0_level_0,starttime,exeerrorcode,exeerrordiag
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4498866929,2019-10-01T09:04:39,65,Non-zero return code from EVNTtoHITS (8); Logf...
4498851809,2019-10-01T09:13:17,11,Missing AthenaMP outputs file athenaMP-outputs...
4499004301,2019-10-01T10:18:49,15,No events to process: 16000 (skipEvents) >= 23...
4499011565,2019-10-01T10:25:17,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499004297,2019-10-01T10:15:46,15,No events to process: 14000 (skipEvents) >= 23...
4499011462,2019-10-01T10:22:04,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008082,2019-10-01T10:23:28,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011446,2019-10-01T10:21:21,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011466,2019-10-01T10:23:13,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008070,2019-10-01T10:26:24,65,Non-zero return code from AODtoDAOD (8); Logfi...


### 2. Initialize the clustering pipeline

In [4]:
# Name of the DataFrame column whose text (error diagnostics) is passed to
# the pipeline as the clustering target.
target = 'exeerrordiag'

In [5]:
# Build the clustering pipeline over the `target` column of `df`.
# NOTE(review): the exact meaning of mode='process' and whether the word2vec
# model at `model_name` is read or written is defined in clusterlogs — confirm.
cluster = pipeline.ml_clustering(
    df,
    target,
    mode='process',
    model_name='test/word2vec.model',
)

### 3. Execute the clustering pipeline

In [6]:
# Run the pipeline. The displayed repr shows that process() returns an
# ml_clustering object (presumably the same instance — confirm).
cluster.process()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


<clusterlogs.pipeline.ml_clustering at 0x1c2886ccf8>

In [7]:
# NOTE(review): presumably the word2vec vector size chosen by the pipeline —
# confirm against the clusterlogs documentation.
cluster.w2v_size

39

### 4. Get cluster statistics

In [8]:
# Collect per-cluster statistics; the result is converted to a DataFrame in
# the next cell.
stat = cluster.statistics()

In [9]:
# Tabulate the statistics so they can be sorted and sliced with pandas.
stat_df = pd.DataFrame.from_dict(stat)

In [10]:
# Ascending sort: clusters with the lowest mean similarity are listed first.
stat_df.sort_values(by='mean_similarity')

Unnamed: 0,cluster_name,cluster_size,stems,vocab,vocab_length,mean_length,mean_similarity,std_length,std_similarity
10,10,8,"Fatal error in athena logfile: ""G4 exception a...","{see, logfile, athena, line, G4, error, Fatal,...",10,95.0,96.18,0.0,1.74
34,34,15,"Fatal error in athena logfile: ""Long ERROR mes...","{see, ERROR, logfile, athena, line, Long, erro...",11,100.0,97.47,0.0,1.15
29,29,10,00 (skipEvents) >= 2000 (inputEvents of EVNT,"{process, EVNT, inputEvents, events, skipEvents}",5,68.0,98.24,0.0,0.88
2,2,7,000 (skipEvents) >= 2343 (inputEvents of HITS,"{process, inputEvents, events, HITS, skipEvents}",5,68.71,98.34,0.45,0.93
30,30,48,Missing AthenaMP outputs file athenaMP-outputs...,"{AthenaMP, outputs, athena, Missing, athenaMP-...",8,85.0,100.0,0.0,0.0
43,43,1,"Fatal error in athena logfile: ""Logfile error ...","{IOVDbSvc, ERROR, logfile, athena, validityKey...",16,190.0,100.0,0.0,0.0
39,39,1,Non-zero return code from AODtoDAOD (8); Logfi...,"{return, AODtoDAOD, Non-zero, code, 'nentries,...",9,95.0,100.0,0.0,0.0
0,0,1,Non-zero return code from EVNTtoHITS (8); Logf...,"{ValueError, return, and/or, Non-zero, EVNTtoH...",15,206.0,100.0,0.0,0.0
26,26,1,Input file argument testRTT.RDO.pool.root glob...,"{testRTT.RDO.pool.root, Input, files, input, p...",9,102.0,100.0,0.0,0.0
25,25,1,Non-zero return code from RDOtoRDOTrigger (33)...,"{return, log.RDOtoRDOTrigger, Non-zero, Signat...",17,227.0,100.0,0.0,0.0


In [21]:
# Keep only the identifying columns of each cluster, ordered from the lowest
# to the highest mean similarity.
stems = (
    stat_df
    .sort_values(by='mean_similarity')
    .loc[:, ['cluster_name', 'cluster_size', 'stems']]
)

In [23]:
# Export as nested dicts of the form {column: {row_label: value}}.
# The output covers every cluster and is therefore long.
stems.to_dict()

{'cluster_name': {10: '10',
  34: '34',
  29: '29',
  2: '2',
  30: '30',
  43: '43',
  39: '39',
  0: '0',
  26: '26',
  25: '25',
  24: '24',
  23: '23',
  21: '21',
  17: '17',
  19: '19',
  42: '42',
  16: '16',
  15: '15',
  14: '14',
  12: '12',
  11: '11',
  8: '8',
  20: '20',
  40: '40',
  33: '33',
  5: '5',
  7: '7',
  3: '3',
  31: '31',
  35: '35',
  4: '4',
  9: '9',
  13: '13',
  36: '36',
  6: '6',
  41: '41',
  22: '22',
  1: '1',
  18: '18',
  27: '27',
  37: '37',
  38: '38',
  28: '28',
  32: '32',
  44: '44'},
 'cluster_size': {10: 8,
  34: 15,
  29: 10,
  2: 7,
  30: 48,
  43: 1,
  39: 1,
  0: 1,
  26: 1,
  25: 1,
  24: 1,
  23: 1,
  21: 1,
  17: 1,
  19: 1,
  42: 1,
  16: 1,
  15: 1,
  14: 1,
  12: 1,
  11: 1,
  8: 1,
  20: 1,
  40: 1,
  33: 1,
  5: 1,
  7: 35,
  3: 24,
  31: 22,
  35: 17,
  4: 6,
  9: 4,
  13: 4,
  36: 4,
  6: 4,
  41: 3,
  22: 3,
  1: 2,
  18: 2,
  27: 2,
  37: 2,
  38: 2,
  28: 2,
  32: 1,
  44: 1},
 'stems': {10: 'Fatal error in athena logfil

### Timings for all stages of the clustering pipeline

`process` is the total time of the whole pipeline run, i.e. the sum of the individual stage timings.

In [11]:
# Mapping of pipeline stage name to elapsed time; the 'process' entry is the
# overall total.
cluster.timings

{'data_preparation': 0.0038,
 'tokenization': 0.0263,
 'tokens_vectorization': 0.0411,
 'sentence_vectorization': 0.0335,
 'kneighbors': 0.012,
 'epsilon_search': 0.0006,
 'dbscan': 0.1109,
 'process': 0.2282}

### Get all error messages in a single cluster

In [12]:
# List the messages assigned to cluster 10 (the cluster with the lowest
# mean_similarity in the statistics table). The label is passed as an int here.
cluster.in_cluster(10)

['Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "G4 exception at line LINE_NUMBER (see jobReport for further details)"']

In [13]:
# List the messages assigned to cluster 34 (second-lowest mean_similarity in
# the statistics table).
cluster.in_cluster(34)

['Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further details)"',
 'Fatal error in athena logfile: "Long ERROR message at line LINE_NUMBER (see jobReport for further deta

### Output clusters - mode == 'ALL'  (for cluster '1')

In [14]:
# mode='ALL': each cluster maps to a list of full row records (all columns
# plus the assigned 'cluster' label). Note the cluster key is the string '1'.
cluster.clustered_output(mode='ALL')['1']

[{'starttime': '2019-10-01T09:13:17',
  'exeerrorcode': 11,
  'exeerrordiag': 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
  'cluster': 1},
 {'starttime': '2019-10-01T10:26:22',
  'exeerrorcode': 11,
  'exeerrordiag': 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
  'cluster': 1}]

### Output clusters - mode == 'INDEX' (for cluster '1')

In [15]:
# mode='INDEX': each cluster maps to the list of DataFrame index values
# (`pandaid`) of its members. Note the cluster key is the string '1'.
cluster.clustered_output(mode='INDEX')['1']

[4498851809, 4499017793]

### Output clusters - mode == 'TARGET' (for cluster '1')

In [16]:
# mode='TARGET': each cluster maps to the list of target-column messages of
# its members. Note the cluster key is the string '1'.
cluster.clustered_output(mode='TARGET')['1']

['Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)']

### Output clusters - cluster labels

In [17]:
# Array of integer cluster labels.
# NOTE(review): presumably one label per row of `df`, in row order — confirm.
cluster.cluster_labels

array([ 0,  1,  2,  3,  2,  4,  3,  3,  3,  3,  3,  5,  6,  7,  8,  9,  7,
       10,  7,  7, 11, 12, 13, 13, 13,  2,  3,  4,  2,  3, 14,  3,  1,  7,
        7,  7, 15,  7,  7,  7, 16, 17,  7, 18, 19, 10,  3,  3,  4,  2,  3,
        3,  4,  3, 20,  9,  7,  7,  7,  7, 10,  7,  9,  7, 21, 22,  4,  3,
        3,  3,  4,  3,  2, 23, 24,  7,  7,  7,  9, 25,  7, 26, 18, 22, 13,
       27, 27, 28,  2,  3,  3,  3,  3,  3,  3,  7,  7,  6, 10,  7,  7,  7,
        7,  7, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 29,
       32, 29, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 29, 29,
       30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 33, 29, 30, 30, 30, 30,
       30, 30, 31, 31, 31, 31, 31, 31, 29, 29, 30, 30, 30, 30, 30, 30, 30,
       31, 31, 31,  7,  6, 34, 34,  7, 35, 35, 35, 36, 10, 37, 35,  7, 30,
        7, 34,  7, 34, 34, 35,  7, 30, 30, 35, 35, 38, 39, 35, 30, 30, 10,
       10, 37, 34, 34, 34, 30, 35, 40, 35, 35, 36, 41, 41, 22, 34, 34, 34,
       34, 10, 34, 42, 35

### Get the epsilon value used by the DBSCAN algorithm

In [18]:
# Epsilon used for DBSCAN. The timings output lists an 'epsilon_search' stage,
# so this value appears to be determined by the pipeline itself — confirm.
cluster.epsilon

0.05653438724138101