# Clusterlogs Notebook

In [47]:
import pandas as pd
from clusterlogs import pipeline

### 1. Load data from a CSV file and create a pandas DataFrame indexed by `pandaid`

In [48]:
# Read the test CSV: its first column becomes the initial index (index_col=0),
# which is then replaced by the 'pandaid' column so rows are keyed by 'pandaid'.
df = (
    pd.read_csv('test/test_data.csv', index_col=0)
    .set_index('pandaid')
)

In [49]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Unnamed: 0_level_0,starttime,exeerrorcode,exeerrordiag
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4498866929,2019-10-01T09:04:39,65,Non-zero return code from EVNTtoHITS (8); Logf...
4498851809,2019-10-01T09:13:17,11,Missing AthenaMP outputs file athenaMP-outputs...
4499004301,2019-10-01T10:18:49,15,No events to process: 16000 (skipEvents) >= 23...
4499011565,2019-10-01T10:25:17,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499004297,2019-10-01T10:15:46,15,No events to process: 14000 (skipEvents) >= 23...
4499011462,2019-10-01T10:22:04,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008082,2019-10-01T10:23:28,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011446,2019-10-01T10:21:21,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499011466,2019-10-01T10:23:13,65,Non-zero return code from AODtoDAOD (8); Logfi...
4499008070,2019-10-01T10:26:24,65,Non-zero return code from AODtoDAOD (8); Logfi...


### 2. Initialize clusterization pipeline

In [50]:
# Name of the df column that is handed to the pipeline as the clustering target.
target = 'exeerrordiag'

In [58]:
# Build the pipeline object over df and the target column.
# NOTE(review): mode='process' and model_name presumably select how the word2vec
# model at 'test/word2vec.model' is used — confirm against clusterlogs.pipeline.
cluster = pipeline.ml_clustering(df, target, mode='process', model_name='test/word2vec.model')

### 3. Execute clusterization pipeline

In [59]:
# Run the pipeline; the call returns an ml_clustering object (see the repr in the output).
cluster.process()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


<clusterlogs.pipeline.ml_clustering at 0x1c27467cc0>

In [60]:
# Inspect w2v_size after processing (presumably the word2vec vector size — TODO confirm).
cluster.w2v_size

15

### 4. Get cluster statistics

In [61]:
# Collect the per-cluster statistics computed by the pipeline.
stat = cluster.statistics()

In [62]:
# Turn the statistics into a DataFrame so they can be sorted and exported.
stat_df = pd.DataFrame.from_dict(stat)

In [63]:
# Ascending sort (the default): clusters with the lowest mean_similarity come first.
stat_df.sort_values(by='mean_similarity')

Unnamed: 0,cluster_name,cluster_size,first_entry,vocab,vocab_length,mean_length,mean_similarity,std_length,std_similarity
37,37,2,"Fatal error in athena logfile: ""Logfile error ...","{data, Logfile, athena, logfile, retrieve, COO...",18,200.5,91.47,10.5,8.53
10,10,8,"Fatal error in athena logfile: ""G4 exception a...","{athena, G4, logfile, exception, see, line, er...",10,95.0,96.18,0.0,1.74
31,31,15,"Fatal error in athena logfile: ""Long ERROR mes...","{athena, logfile, see, line, error, Long, Fata...",11,100.0,97.47,0.0,1.15
28,28,12,No events to process: 2100 (skipEvents) >= 200...,"{EVNT, process, skipEvents, inputEvents, events}",5,68.0,98.28,0.0,0.81
2,2,7,No events to process: 16000 (skipEvents) >= 23...,"{process, skipEvents, HITS, inputEvents, events}",5,68.71,98.34,0.45,0.93
9,9,5,"Fatal error in athena logfile: ""Logfile error ...","{Logfile, athena, logfile, caught, exception, ...",14,174.0,99.43,0.0,1.15
39,39,1,Error reading user generated output file list,"{generated, file, Error, list, reading, output...",7,45.0,100.0,0.0,0.0
0,0,1,Non-zero return code from EVNTtoHITS (8); Logf...,"{Logfile, ValueError, EVNTtoHITS, expected, lo...",15,206.0,100.0,0.0,0.0
20,20,1,Non-zero return code from RAWtoALL (65); Logfi...,"{Logfile, code, m_trackCollKey, evtStore, FILE...",19,251.0,100.0,0.0,0.0
25,25,1,Input file argument testRTT.RDO.pool.root glob...,"{Input, files, file, globbed, argument, testRT...",9,102.0,100.0,0.0,0.0


In [67]:
# Save the statistics to CSV, largest clusters first.
# NOTE(review): 'preptained' in the file name looks like a typo for 'pretrained' —
# confirm before renaming, since other tooling may read this exact path.
(
    stat_df
    .sort_values(by='cluster_size', ascending=False)
    .to_csv('cluster_stat_small_preptained.csv')
)

### Timings for all stages of clusterization pipeline

`process` is the total time of the whole pipeline run (all stages together).

In [22]:
# Dictionary of per-stage timings plus a 'process' entry for the whole run.
cluster.timings

{'data_preparation': 0.0029,
 'tokenization': 0.0247,
 'tokens_vectorization': 0.0171,
 'sentence_vectorization': 0.0399,
 'kneighbors': 0.0052,
 'epsilon_search': 0.0006,
 'dbscan': 0.1052,
 'process': 0.1957}

### Get all error messages in a single cluster

In [24]:
# List the messages assigned to cluster 28.
cluster.in_cluster(28)

['No events to process: 2100 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2600 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2800 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2500 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3200 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2400 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2700 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2000 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3300 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 2900 (skipEvents) >= 2000 (inputEvents of EVNT',
 'No events to process: 3400 (skipEvents) >= 2000 (inputEvents of EVNT']

In [25]:
# List the messages assigned to cluster 33.
cluster.in_cluster(33)

['Missing AthenaMP outputs file athenaMP-outputs-UID-r2t (probably athena crashed)',
 'Missing AthenaMP outputs file athenaMP-outputs-UID-r2t (probably athena crashed)',
 'Missing AthenaMP outputs file athenaMP-outputs-UID-r2t (probably athena crashed)',
 'Missing AthenaMP outputs file athenaMP-outputs-UID-r2t (probably athena crashed)']

### Output clusters - mode == 'ALL' (for cluster '1')

In [26]:
# mode='ALL' yields the full records of each cluster. Note the lookup key is the
# string '1', while the 'cluster' field inside each record is the int 1.
cluster.clustered_output(mode='ALL')['1']

[{'starttime': '2019-10-01T09:13:17',
  'exeerrorcode': 11,
  'exeerrordiag': 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
  'cluster': 1},
 {'starttime': '2019-10-01T10:26:22',
  'exeerrorcode': 11,
  'exeerrordiag': 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
  'cluster': 1}]

### Output clusters - mode == 'INDEX' (for cluster '1')

In [27]:
# mode='INDEX' yields the index values (pandaid) of the rows in the cluster.
cluster.clustered_output(mode='INDEX')['1']

[4498851809, 4499017793]

### Output clusters - mode == 'TARGET' (for cluster '1')

In [28]:
# mode='TARGET' yields only the target-column text of the rows in the cluster.
cluster.clustered_output(mode='TARGET')['1']

['Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)',
 'Missing AthenaMP outputs file athenaMP-outputs-EVNTtoHITS-sim (probably athena crashed)']

### Output clusters - cluster labels

In [29]:
# Array of integer cluster labels; appears to hold one label per input row, in row order — TODO confirm.
cluster.cluster_labels

array([ 0,  1,  2,  3,  2,  4,  3,  3,  3,  3,  3,  5,  6,  7,  8,  9,  7,
       10,  7,  7, 11, 12, 13, 13, 13,  2,  3,  4,  2,  3, 14,  3,  1,  7,
        7,  7, 15,  7,  7,  7, 16,  9,  7, 17, 18, 10,  3,  3,  4,  2,  3,
        3,  4,  3, 19,  9,  7,  7,  7,  7, 10,  7,  9,  7, 20, 21,  4,  3,
        3,  3,  4,  3,  2, 22, 23,  7,  7,  7,  9, 24,  7, 25, 17, 21, 13,
       26, 26, 27,  2,  3,  3,  3,  3,  3,  3,  7,  7,  6, 10,  7,  7,  7,
        7,  7, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 28,
       28, 28, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 28, 28,
       29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 28, 28, 29, 29, 29, 29,
       29, 29, 30, 30, 30, 30, 30, 30, 28, 28, 29, 29, 29, 29, 29, 29, 29,
       30, 30, 30,  7,  6, 31, 31,  7, 32, 32, 32, 33, 10, 34, 32,  7, 29,
        7, 31,  7, 31, 31, 32,  7, 29, 29, 32, 32, 35, 36, 32, 29, 29, 10,
       10, 34, 31, 31, 31, 29, 32, 37, 32, 32, 33, 38, 38, 21, 31, 31, 31,
       31, 10, 31, 39, 32

### Get the epsilon value used by the DBSCAN algorithm

In [31]:
# Epsilon value stored on the pipeline object after processing.
cluster.epsilon

0.2557533191669861