# Clusterlogs Notebook

In [1]:
import pandas as pd
from clusterlogs import pipeline, cluster_output

### 1. Load data from a file into a pandas DataFrame with an index

In [2]:
df = pd.read_csv('samples/fts_mess_panda.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,message,count,id
0,0,DESTINATION MAKE_PARENT srm-ifce err: Permissi...,27,1
1,1,TRANSFER globus_ftp_client: the server respon...,1467,2
2,2,TRANSFER globus_ftp_client: the server respon...,2,3
3,3,TRANSFER globus_ftp_client: the server respon...,34,4
4,4,TRANSFER globus_ftp_client: the server respon...,9,5
...,...,...,...,...
22787,22787,TRANSFER globus_ftp_client: the server respon...,1,22788
22788,22788,TRANSFER globus_ftp_client: the server respon...,1,22789
22789,22789,TRANSFER CHECKSUM MISMATCH USER_DEFINE and SRC...,3,22790
22790,22790,TRANSFER globus_ftp_client: the server respon...,1,22791


In [17]:
df = pd.read_csv('test/test_data.csv', index_col=0)

In [2]:
df = pd.read_csv('samples/fts_mess_panda.csv', index_col=0).set_index('id')

In [4]:
df.head()

Unnamed: 0,id,message
0,CERN_central_B|111388583,Payload execution error: returned non-zero
1,CERN_central_B|111388549,Payload execution error: returned non-zero
2,CERN_central_B|111388581,Payload execution error: returned non-zero
3,CERN_central_B|111388573,Payload execution error: returned non-zero
4,CERN_central_B|111389035,Payload execution error: returned non-zero


### 2. Initialize the clustering pipeline

In [4]:
target = 'message'

In [5]:
cluster = pipeline.ml_clustering(df, target, mode='create', model_name='test/harvester_test.model')

In [5]:
cluster = pipeline.ml_clustering(df, target, mode='create', model_name='test/fts.model')

### 3. Execute the clustering pipeline

In [6]:
cluster.process()

<clusterlogs.pipeline.ml_clustering at 0x116c80f10>

In [7]:
cluster.timings

{'data_preparation': 0.9532,
 'tokenization': 1.0552,
 'tokens_vectorization': 2.8309,
 'sentence_vectorization': 7.1339,
 'dimensionality_reduction': 0.1193,
 'hdbscan': 2.0598,
 'extract_patterns': 1.59,
 'reprocess': 0.5591,
 'statistics': 1.3373,
 'process': 17.6387}

In [8]:
cluster.df['cluster_2'].unique()

array(['54', '29', '43', '57', '60', '21', '40', '55', '45', '28', '61',
       '58', '59', '56', '50', '53', '39', '49', '52', '33', '1', '38',
       '34', '51', '24', '36', '47', '32', '44', '5', '42', '0', '48',
       '18', '35', '46', '19', '41', '12', '11', '37', '25', '23', '4',
       '22', '20', '30', '26', '27', '17', '7', '3', '10', '31', '6', '2',
       '9', '15', '8', '16', '14', '13'], dtype=object)

In [11]:
cluster.results.to_csv('harvester_results_hdbscan.csv')

In [9]:
cluster.results

Unnamed: 0,cluster_name,cluster_size,pattern,sequence,mean_similarity,std_similarity
38,43,8824,TRANSFER globus_ftp_client: the server respond...,"[TRANSFER, globus_ftp_client:, the, server, re...",0.91,0.11
53,57,3010,globus_ftp_client: the server server responded...,"[globus_ftp_client:, the, server, server, resp...",0.75,0.21
58,61,2368,and CHECKSUM and * are different. * * * *!= *,"[and, CHECKSUM, and, *, are, different., *, *,...",0.65,0.25
14,21,2119,globus_ftp_client: the server responded with w...,"[globus_ftp_client:, the, server, responded, w...",0.97,0.06
50,54,1944,"srm-ifce MAKE_PARENT srm-ifce denied, Permissi...","[srm-ifce, MAKE_PARENT, srm-ifce, denied,, Per...",0.93,0.09
...,...,...,...,...,...,...
5,13,1,SOURCE SRM_GET_TURL The source file is not ONLINE,"[SOURCE, SRM_GET_TURL, The, source, file, is, ...",1.00,0.00
6,14,1,TRANSFER globus_ftp_control: gss_init_sec_con...,"[TRANSFER, globus_ftp_control:, gss_init_sec_c...",1.00,0.00
45,5,1,Checksum value required if mode is not end to end,"[Checksum, value, required, if, mode, is, not,...",1.00,0.00
23,3,1,Destination file exists and overwrite is not e...,"[Destination, file, exists, and, overwrite, is...",1.00,0.00


In [10]:
cluster.results[['pattern']].values

array([['TRANSFER globus_ftp_client: the server responded with an error * Command failed.: open/create: [ERROR] Server responded with an error: [*] Unable to get space physical space /eulake/lcg/test/hammercloud/atlas/testd/rucio/*/ESD/*/*/* No left on on device'],
       ['globus_ftp_client: the server server responded with an * in * failed. error in Unable error System End. failed: connect: End. out timed Connection * Connection * system failed: call * failed: timed out out End. End.'],
       ['and CHECKSUM and * are different. * * * *!= *'],
       ['globus_ftp_client: the server responded with with an * * General problem: to to Connection timed timed out out'],
       ['srm-ifce MAKE_PARENT srm-ifce denied, Permission err: err: [SE][Mkdir][SRM_AUTHORIZATION_FAILURE] httpg://*/srm/* Permission Permission denied'],
       ['globus_ftp_client: the server responded with with an * * request: failed failed failed while attempting to perform request'],
       ['srm-ifce err: Communicatio

In [14]:
cluster.timings

{'data_preparation': 0.1625,
 'tokenization': 0.4147,
 'tokens_vectorization': 0.3608,
 'sentence_vectorization': 0.7098,
 'kneighbors': 1.9516,
 'epsilon_search': 0.0438,
 'dbscan': 0.3717,
 'extract_patterns': 0.3075,
 'reprocess': 0.1604,
 'statistics': 0.2275,
 'process': 4.7104}

In [11]:
x = cluster.in_cluster(4,2)

In [12]:
print(x)

[]


### 4. Get cluster statistics

In [8]:
output = cluster_output.Output(cluster.df, 
                               cluster.target)

In [9]:
mode = 'INDEX'

In [10]:
output.clustered_output(mode)
stats = output.statistics(output_mode='frame')

In [11]:
stats.sort_values(by=['mean_similarity'])

Unnamed: 0,cluster_name,cluster_size,pattern,mean_length,mean_similarity,std_length,std_similarity
35,35,9,Input file argument T globbed to NO input file...,100.78,0.90,3.99,0.03
65,65,9,Failed in data staging: Failed checking source...,181.56,0.94,7.23,0.03
111,111,2,Non-zero return code from OMerge (*); Logfile ...,95.00,0.94,2.00,0.00
41,41,3,File DAOD_D*.*._*.pool.root.* did not pass cor...,62.33,0.95,0.94,0.01
17,17,5,Non-zero return code from AODtoDAOD (*); Logfi...,129.40,0.95,3.20,0.03
...,...,...,...,...,...,...,...
92,92,9,Non-zero return code from EVNTtoHITS (*); Logf...,126.00,1.00,0.00,0.00
58,58,9,Non-zero return code from AODtoDAOD (*); Logfi...,116.00,1.00,0.00,0.00
54,54,8,"Fatal error in athena logfile: ""Long ERROR mes...",97.00,1.00,0.00,0.00
42,42,10,Non-zero return code from generate (*); Logfil...,138.00,1.00,0.00,0.00


In [12]:
patterns = stats.sort_values(by='mean_similarity')[['pattern']].values

In [13]:
patterns

array([['Input file argument T globbed to NO input files - probably the file(s) are missing'],
       ['Failed in data staging: Failed checking source replica * Failed to obtain information about file: No such file or directory: Failed to stat file *_session******..b.'],
       ['Non-zero return code from OMerge (*); Logfile error in log.OMerge: "KeyError: \'nentries\'"'],
       ['File DAOD_D*.*._*.pool.root.* did not pass corruption test'],
       ['Non-zero return code from AODtoDAOD (*); Logfile error in log.AODtoDAOD: "* FATAL Standard std::exception is caught"'],
       ['Non-zero return code from generate (*); Logfile error in log.generate: "*: MadSpin was run but can\'t find output folder Events**_decayed_*"'],
       ['Non-zero return code from AODtoDAOD (*); Logfile error in log.AODtoDAOD: "* fault: Event counter: *; Run: *; Evt: *; Current algorithm: tag; Current Function: unknown"'],
       ['Fatal error in athena logfile: "Logfile error in log.toA: "ToolSvc.HLTJetMon ERROR

In [14]:
stats_1 = output.postprocessing(1)

In [15]:
stats_1.sort_values(by=['mean_similarity'])

Unnamed: 0,cluster_name,cluster_size,pattern,mean_length,mean_similarity,std_length,std_similarity
0,0.0,1811,Non-zero return code from (*),122.31,0.43,29.71,0.14
2,2.0,205,"Fatal error in athena logfile: "" eo in o""",122.61,0.53,28.78,0.11
10,10.0,2,AODtoDAOD got a SIGKILL signal (exit code *); ...,134.5,0.68,6.5,0.0
7,7.0,61,Nonet code *); o at line * (see jobReport for ...,104.56,0.76,1.25,0.06
6,6.0,10,Failed in data staging: ld en source replica *...,180.4,0.77,7.68,0.15
4,4.0,36,File * did not pass corruption test,58.19,0.8,5.07,0.04
12,12.0,218,LRMS error: (*) Job,28.89,0.83,3.64,0.06
5,5.0,7,Failed in data staging: Failed to prepare dest...,198.57,0.84,28.71,0.08
19,19.0,1141,Input file argument globbed to NO input files...,99.07,0.9,0.51,0.0
1,1.0,4,Non-zero return code from (*),36.75,0.91,3.34,0.06


In [19]:
output.in_cluster(20,2)

array(['EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'ESDtoDPD got a SIGKILL signal (exit code 137)',
       'ESDtoDPD got a SIGKILL signal (exit code 137)',
       'ESDtoDPD got a SIGKILL signal (exit code 137)',
       'ESDtoDPD got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'ESDtoDPD got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'ESDtoDPD got a SIGKILL signal (exit code 137)',
       'ESDtoDPD got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIGKILL signal (exit code 137)',
       'EVNTtoHITS got a SIG

In [26]:
patterns1 = stats_1.sort_values(by='mean_similarity')[['pattern']].values

In [27]:
patterns1

array([['Non-zero return code from  (*)'],
       ['Non-zero return code from  (*); Logfile error in log.: "Eor: * u: *tretoOtion un'],
       ['Fatal error in athena logfile: "Logfile error in log.: "  e""'],
       ['AODtoDAOD got a SIGKILL signal (exit code *); Logfile error in log.AODtoDAOD: "* F tnrd oa"'],
       ['Failed in data staging: ld en source replica * ailed: No such file or directory: aid oat f'],
       ['Nonet code *); o at line * (see jobReport for further details)'],
       ['File * did not pass corruption test'],
       ['Failed in data staging: Failed to prepare destination * Failed to prepare destination: File exists: File requests have failed.: '],
       ['LRMS error: (*) Job '],
       ['Input file argument  globbed to NO input files - probably the file(s) are missing'],
       ['Non-zero return code from  (*)'],
       ['Eto got a SIGKILL signal (exit code *)'],
       ['Fatal error in athena logfile: "on at line * (see jobReport for further details)"'],
    

In [15]:
import difflib

In [16]:
sorted_df = output.stat_df.sort_values(by=['cluster_size'])[['cluster_size','cluster_name','pattern']]

In [135]:
sorted_df

Unnamed: 0,cluster_size,cluster_name,pattern
149,1,149,Non-zero return code from EVNTtoHITS (*); Logf...
138,1,138,Non-zero return code from * (*); Logfile error...
136,1,136,Non-zero return code from ESDtoAOD (*); Logfil...
139,1,139,Non-zero return code from * (*); Logfile error...
140,1,140,Non-zero return code from * (*); Logfile error...
...,...,...,...
29,140,29,Non-zero return code from EVNTtoHITS (*)
13,385,13,No events to process: * (skipEvents) >= * (inp...
10,549,10,Missing AthenaMP outputs file **r (probably at...
20,805,20,Non-zero return code from AODtoDAOD (*); Logfi...


In [136]:
def match(df, new_clusters, threshold=0.6):
    """Greedily group clusters whose patterns are textually similar.

    The pattern in the first remaining row of ``df`` is taken as a reference.
    Every remaining pattern is scored against it with
    ``difflib.SequenceMatcher(None, reference, pattern).ratio()`` and the
    ``cluster_name`` values of all rows scoring above ``threshold`` form one
    group. Those rows are dropped and the procedure repeats until ``df`` is
    empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pattern`` and ``cluster_name`` columns. The frame is
        modified in place: a ``ratio`` column is added and every row is
        eventually dropped, so pass a copy if the frame is needed afterwards.
    new_clusters : list
        Output list. One array of ``cluster_name`` values is appended per
        group, in the order the groups are found.
    threshold : float, optional
        Exclusive lower bound on the similarity ratio for a row to join the
        current group. Defaults to 0.6.

    Returns
    -------
    None
        Results are delivered through ``new_clusters``. An empty ``df`` is a
        no-op.
    """
    # Iterative on purpose: the previous version recursed once per group from
    # inside a while-loop, so the stack depth grew with the number of groups,
    # and an empty frame raised IndexError when reading the first pattern.
    while df.shape[0] > 0:
        start = df['pattern'].iloc[0]
        df['ratio'] = [difflib.SequenceMatcher(None, start, x).ratio() for x in df['pattern']]
        filtered = df.loc[df['ratio'] > threshold, 'cluster_name'].values
        if len(filtered) == 0:
            # Only reachable when threshold >= 1.0 (the reference scores 1.0
            # against itself). Put the reference in a group of its own so the
            # loop is guaranteed to make progress.
            filtered = df['cluster_name'].iloc[:1].values
        new_clusters.append(filtered)
        # Remove every row already assigned to this group.
        df.drop(df[df['cluster_name'].isin(filtered)].index, inplace=True)

In [137]:
new_clusters = []
# start = sorted_df.head(1)['pattern'].values[0]
# print(start)
match(sorted_df, new_clusters)

In [140]:
new_clusters

[array(['149', '136', '139', '133', '142', '143', '146', '144', '123',
        '105', '102', '51', '87', '79', '73', '72', '118', '112', '106',
        '48', '122', '109', '110', '108', '59', '90', '17', '3', '58',
        '93', '1', '36', '70', '39', '52', '74', '37', '24', '14', '45',
        '19'], dtype=object),
 array(['138', '97', '25', '86', '85', '9', '57'], dtype=object),
 array(['140'], dtype=object),
 array(['141', '117', '76', '67', '23', '28'], dtype=object),
 array(['135', '148', '30', '29'], dtype=object),
 array(['134', '81', '80'], dtype=object),
 array(['132', '95'], dtype=object),
 array(['131', '128', '91', '22'], dtype=object),
 array(['145', '137', '88'], dtype=object),
 array(['130', '84', '82', '101'], dtype=object),
 array(['129'], dtype=object),
 array(['127', '21', '99'], dtype=object),
 array(['126'], dtype=object),
 array(['124', '41', '8', '32', '26'], dtype=object),
 array(['107'], dtype=object),
 array(['120', '115', '69'], dtype=object),
 array(['121', 

In [145]:
# Collect, for every merged group in `new_clusters`, the index labels of the
# rows of `cluster.df` whose 'cluster' value belongs to that group.
a = []
for i in new_clusters:
    x = cluster.df[cluster.df['cluster'].isin(i)].index
    # The group is labelled with the LAST cluster name it contains (i[-1]).
    a.append({'cluster_name':i[-1],'idx':x})
print(a)

# Write the group label into a new 'cluster_level_2' column for the rows of
# each group; rows not covered by any group are not assigned here.
for i in a:
    cluster.df.loc[i['idx'], 'cluster_level_2'] = i['cluster_name']

[{'cluster_name': '19', 'idx': Int64Index([   2,    6,    7,    8,    9,   46,   59,   62,   63,   64,
            ...
            4861, 4874, 4875, 4897, 4912, 4915, 4937, 4949, 4956, 4967],
           dtype='int64', length=568)}, {'cluster_name': '57', 'idx': Int64Index([  50,   54,   55,  105,  275,  394,  580,  581,  645,  674,  914,
             923,  925,  926,  987,  993,  994, 1012, 1022, 1030, 1031, 1032,
            1037, 1246, 1268, 1343, 1389, 1555, 1843, 1845, 1863, 1944, 1948,
            1952, 1956, 1957, 1961, 2298, 2372, 2811, 2815, 2825, 2843, 2888,
            2913, 2915, 2926, 2932, 2933, 3022, 3251, 3524, 3572, 3763, 3772,
            3850, 3869, 3897, 3902, 3912, 4382, 4490, 4753, 4758, 4845, 4855,
            4883],
           dtype='int64')}, {'cluster_name': '140', 'idx': Int64Index([4037], dtype='int64')}, {'cluster_name': '28', 'idx': Int64Index([  90,  128,  260,  280,  290,  306,  408,  409,  410,  419,
            ...
            4374, 4390, 4403, 4405, 44

In [149]:
cluster.df['cluster_level_2'].describe()

count     5001
unique      43
top         62
freq      1141
Name: cluster_level_2, dtype: object

In [52]:
sorted_list = sorted_df['pattern'].tolist()

In [66]:
# Reference pattern: the first entry of the sorted pattern list.
start = sorted_list[0]
print(start)
# Similarity of every pattern (the reference included) to the reference.
ratio = [
    difflib.SequenceMatcher(None, start, candidate).ratio()
    for candidate in sorted_list
]
print(ratio)
# Positions in `sorted_list` whose similarity to the reference exceeds 0.6.
ids = [position for position, score in enumerate(ratio) if score > 0.6]
print(ids)

Non-zero return code from EVNTtoHITS (*); Logfile error in log.EVNTtoHITS: "StreamHITS FATAL * failed."
[1.0, 0.5638766519823789, 0.6181818181818182, 0.6272727272727273, 0.592057761732852, 0.1917808219178082, 0.44776119402985076, 0.4740740740740741, 0.6698113207547169, 0.6334841628959276, 0.32727272727272727, 0.4397163120567376, 0.6564102564102564, 0.3488372093023256, 0.39069767441860465, 0.2302158273381295, 0.5757575757575758, 0.3333333333333333, 0.5957446808510638, 0.6538461538461539, 0.17582417582417584, 0.15942028985507245, 0.663594470046083, 0.611353711790393, 0.5144927536231884, 0.5989304812834224, 0.455026455026455, 0.48, 0.2054794520547945, 0.5220588235294118, 0.39344262295081966, 0.5194805194805194, 0.273972602739726, 0.5886524822695035, 0.6255506607929515, 0.5841584158415841, 0.8682926829268293, 0.5517241379310345, 0.578397212543554, 0.5092250922509225, 0.784688995215311, 0.7159090909090909, 0.03289473684210526, 0.1793103448275862, 0.6666666666666666, 0.49122807017543857, 0.1

In [17]:
result = output.postprocessing()

In [19]:
result

matrix([[1.        , 0.18644068, 0.31446541, ..., 0.20979021, 0.19402985,
         0.1773399 ],
        [0.18644068, 1.        , 0.22564103, ..., 0.1452514 , 0.51973684,
         0.56903766],
        [0.33962264, 0.21538462, 1.        , ..., 0.23529412, 0.14977974,
         0.17283951],
        ...,
        [0.11188811, 0.16759777, 0.21568627, ..., 1.        , 0.14218009,
         0.20547945],
        [0.21641791, 0.53947368, 0.15859031, ..., 0.12322275, 1.        ,
         0.50922509],
        [0.1773399 , 0.56903766, 0.17283951, ..., 0.17808219, 0.50922509,
         1.        ]])

In [22]:
import numpy as np

In [27]:
threshold = 0.6
rows = np.where(result > threshold)

In [28]:
rows

(array([  0,   0,   0, ..., 149, 149, 149]),
 array([  0,  23,  51, ..., 139, 141, 149]))

In [20]:
similar

matrix([[1.        , 0.8159204 , 0.89010989, ..., 0.86829268, 0.62555066,
         1.        ]])

### Timings for all stages of the clustering pipeline

The `process` entry is the total time of the whole pipeline run; the other entries are the timings of the individual stages.

In [None]:
cluster.timings

### Get all error messages in a single cluster

In [None]:
output.in_cluster(10)

In [None]:
output.in_cluster(34)

### Output clusters - mode == 'ALL' (for cluster '1')

In [None]:
output.clustered_output(mode='ALL')['1']

### Output clusters - mode == 'INDEX' (for cluster '1')

In [None]:
output.clustered_output(mode='INDEX')['1']

### Output clusters - mode == 'TARGET' (for cluster '1')

In [None]:
output.clustered_output(mode='TARGET')['1']

### Output clusters - cluster labels

In [None]:
cluster.cluster_labels

### Get the epsilon value used by the DBSCAN algorithm

In [None]:
cluster.epsilon