In [17]:
import numpy as np
import pandas as pd
from clusterlogs import pipeline

In [18]:
pd.set_option('display.max_colwidth', -1)

In [19]:
df = pd.read_csv('patterns.txt', header=0, names=['error_message'])

In [20]:
df.head()

Unnamed: 0,error_message
0,: the server responded with an error 421 Service busy: Connection limit exceeded. Please try again later. Closing control connection.
1,Error on XrdCl:CopyProcess:Run(): [ERROR] Server responded with an error: []
2,"err: Communication error on send, err: [SE][][] httpg:/:/srm/: CGSI-gSOAP running on reports:"
3,Protocol not supported or path/url invalid: root:/:1094//atlasscratchdisk/rucio/tests///
4,SOURCE SRM_GET_TURL error on the turl request : [SE][][SRM_FILE_UNAVAILABLE] File is.


In [21]:
df.shape

(394, 1)

In [22]:
cluster = pipeline.Chain(df, target='error_message', mode='create')

In [23]:
cluster.process()

Tokenization finished
Found 392 equal groups
Matching Clusterization!
Postprocessed with 358 clusters


In [24]:
cluster.result

Unnamed: 0,pattern,indices,cluster_size
238,TRANSFER globus_ftp_client: the server responded with an error 451 General problem:,"[262, 110, 112, 258, 134, 98]",6
255,TRANSFER globus_ftp_client: the server responded with an error｟*｠,"[67, 240, 179]",3
261,TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : error in: system call failed: 500 End.,"[205, 175, 121]",3
0,(Neon): SSL handshake failed: Connection timed out during SSL handshake,"[300, 220]",2
36,: the server responded with an error 500｟*｠ server error,"[213, 267]",2
...,...,...,...
124,Error on XrdCl:CopyProcess:Run(): [ERROR] Server responded with an error: [3010] Unable to /dpm/home/atlas/atlasscratchdisk/rucio/tests/;,[12],1
123,Error on XrdCl:CopyProcess:Run(): [ERROR] Server responded with an error: [3010] Unable to //; permission denied,[7],1
122,Error on XrdCl:CopyProcess:Run(): [ERROR] Server responded with an error: [3010] Access denied: /pnfs/triumf.ca/data/atlas/atlasscratchdisk/rucio/tests/,[20],1
121,Error on XrdCl:CopyProcess:Run(): [ERROR] Server responded with an error: [3003] Output file /dteam:test1/domatest/ is already opened by 1 writer; open denied.,[44],1


In [25]:
print(cluster.in_cluster(cluster.result, 2),"\n") #some messages from the cluster 2
print(cluster.in_cluster(cluster.result, 38)) #messages from the cluster without a clear pattern

[': System error in send:｟*｠ globus_xio: A system call failed:｟*｠'] 

['CHECKSUM timeout of｟*｠']


In [26]:
cluster.result['pattern'].values

array(['TRANSFER globus_ftp_client: the server responded with an error 451 General problem:',
       'TRANSFER globus_ftp_client: the server responded with an error｟*｠',
       'TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : error in: system call failed: 500 End.',
       '(Neon): SSL handshake failed: Connection timed out during SSL handshake',
       ': the server responded with an error 500｟*｠ server error',
       'DESTINATION CHECKSUM checksum calculation for ADLER32 not supported for davs:/:/atlasscratchdisk/rucio/tests/',
       '[gfalt_copy_file][perform_copy] TRANSFER [gfal_http_copy] ERROR: Copy failed with mode 3rd push, with error: [davix2gliberr]',
       'TRANSFER globus_ftp_client: the server responded with an error 500 Command failed. :',
       'DESTINATION SRM_PUTDONE Error on the surl srm:/dpm/home/ while putdone : [SE][PutDone][]',
       'TRANSFER globus_ftp_client: the server responded with an error 500 Command failed. : E

In [27]:
clusters, outliers = cluster.split_clusters(cluster.result, 'cluster_size', threshold=30)
outliers #clusters containing fewer than 3 messages

In [28]:
top_clusters = cluster.validation(clusters).sort_values(by=["cluster_size"], ascending=False) #clusters sorted by similarity inside the cluster
top_clusters.head(20)

Unnamed: 0,cluster_name,cluster_size,pattern,mean_similarity,std_similarity
0,238,6,TRANSFER globus_ftp_client: the server responded with an error 451 General problem:,0.82,0.11
1,255,3,TRANSFER globus_ftp_client: the server responded with an error｟*｠,0.94,0.05
2,261,3,TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : error in: system call failed: 500 End.,0.88,0.08
24,196,2,"TRANSFER ERROR: Copy failed with mode 3rd push, with error: Transfer failed: failure: Failed to select｟*｠:｟*｠",0.9,0.1
13,109,2,DESTINATION SRM_PUT_TURL error on the turl request : [SE][][],0.89,0.11
12,262,2,TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : 500-globus_xio: System error in: 500-globus_xio: A system call failed: 500 End.,0.94,0.06
11,99,2,DESTINATION SRM_PUT_TURL error on the turl request : [SE][StatusOfPutRequest][SRM_FILE_BUSY] The surl srm:/ is currently busy (ongoing put,0.94,0.06
10,97,2,DESTINATION SRM_PUT_TURL error on the turl request : [SE][StatusOfPutRequest][] error accessing:/:,0.91,0.09
9,276,2,TRANSFER globus_ftp_client: the server responded with an error 500 Command failed. : Error｟*｠,0.91,0.09
7,285,2,TRANSFER globus_ftp_client: the server responded with an error 500 Command failed. :,0.88,0.12


In [29]:
print(top_clusters['cluster_size'].head(20).sum()/(df.shape[0]-1)*100)
print(clusters['cluster_size'].sum()/(df.shape[0]-1)*100)

11.704834605597965
100.25445292620864


In [30]:
cluster.timings

{'tokenization': 0.1474,
 'group_equals': 0.4371,
 'matching_clusterization': 0.7095,
 'process': 1.294,
 'validation': 0.1809}

In [31]:
top_clusters['pattern'].to_csv("patterns.txt",index=False, header=True)