# Clusterlogs Notebook

In [147]:
import pandas as pd
from clusterlogs import pipeline, cluster_output

### 1. Load data from a CSV file and create a pandas DataFrame indexed by `id`

In [140]:
# Read the CSV using its first column as the index, then switch the index to the 'id' column.
df = (
    pd.read_csv('fts_mess_panda.csv', index_col=0)
    .set_index('id')
)

In [141]:
df.head(10)

Unnamed: 0_level_0,message,count
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,DESTINATION MAKE_PARENT srm-ifce err: Permissi...,27
2,TRANSFER globus_ftp_client: the server respon...,1467
3,TRANSFER globus_ftp_client: the server respon...,2
4,TRANSFER globus_ftp_client: the server respon...,34
5,TRANSFER globus_ftp_client: the server respon...,9
6,TRANSFER globus_ftp_client: the server respon...,84
7,TRANSFER globus_ftp_client: the server respon...,84
8,DESTINATION MAKE_PARENT srm-ifce err: Permissi...,30
9,TRANSFER globus_ftp_client: the server respon...,13
10,TRANSFER globus_ftp_client: the server respon...,46


In [142]:
df.shape

(22792, 2)

### 2. Initialize clusterization pipeline

In [143]:
target = 'message'

In [144]:
# NOTE(review): leftover example, currently unused; `clustering_parameters` is not passed to the pipeline call below.
#clustering_parameters = {'w2v_size': 200}

In [145]:
# Set up the clustering pipeline for the `target` column of `df`.
# NOTE(review): mode='create' presumably builds a new word2vec model stored under
# `model_name` — confirm against the clusterlogs documentation.
cluster = pipeline.ml_clustering(
    df,
    target,
    mode='create',
    model_name='fts_word2vec.model',
)

### 3. Execute clusterization pipeline

In [146]:
# Run the pipeline; the results inspected in later cells (timings, cluster_labels,
# epsilon, ...) are read back from attributes of `cluster`.
cluster.process()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


<clusterlogs.pipeline.ml_clustering at 0x1c5bf9acf8>

In [148]:
cluster.w2v_size

400

### 4. Get cluster statistics

In [149]:
# Wrap the pipeline's data, tokenizer, messages and labels in an Output helper,
# which the following cells use for statistics and per-cluster listings.
output = cluster_output.Output(
    cluster.df,
    cluster.target,
    cluster.tokenizer,
    cluster.messages,
    cluster.cluster_labels,
)

In [150]:
stat = output.statistics()

In [151]:
stat_df = pd.DataFrame.from_dict(stat)

In [152]:
stat_df.shape

(703, 9)

In [168]:
pd.options.display.max_colwidth = 200

In [170]:
# Largest clusters first, restricted to the columns needed to read the summary.
(
    stat_df
    .sort_values(by='cluster_size', ascending=False)
    [['cluster_name', 'cluster_size', 'pattern', 'mean_similarity']]
    .head(30)
)

Unnamed: 0,cluster_name,cluster_size,pattern,mean_similarity
2,2,6467,TRANSFER globus_ftp_client: the server responded with an error 500 Command failed. : open/create : [ERROR] Server responded with an error: [3009] Unable to get free physical space /ek; No space l...,77.16
12,12,2357,TRANSFER globus_ftp_client: the server responded with an error 500 Command failed. : open/create : [ERROR] Server responded with an error: [3021] Unable to get quota space - quota not defined or ...,89.71
0,0,1886,"DESTINATION MAKE_PARENT srm-ifce err: Permission denied, err: [SE][Mkdir][SRM_AUTHORIZATION_FAILURE] httpg://recas-se-01.cs.infn.it:8446/srm/managerv2: srm://recas-se-01.cs.infn.it/dpm/cs.infn.it/...",97.4
5,5,1325,TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 1.:: Connection timed out,97.29
17,17,1283,TRANSFER globus_ftp_client: the server responded with an error 451 451-GlobusError: v=1 c=INTERNAL_ERROR 451-GridFTP-Errno: 255 451-GridFTP-Reason: System error in write into HDFS (host=gftp.he...,67.68
20,20,1259,SOURCE CHECKSUM MISMATCH User defined checksum and source checksum do not match !=,87.77
16,16,748,TRANSFER CHECKSUM MISMATCH USER_DEFINE and SRC checksums are different. !=,86.48
56,56,530,TRANSFER globus_ftp_client: the server responded with an error 451 Failed to deliver PoolMgrSelectWritePoolMsg message <15744626> to [SpaceManager@local:>PoolManager@local]: Route for >*@dCacheDo...,96.36
6,6,422,Error reported from srm_ifce : 2 [SE][Ls][SRM_INVALID_PATH] No such file or directory /,81.18
3,3,407,TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : globus_xio: Unable to connect to 500-globus_xio: System error in connect: Connection timed out 500-glob...,94.1


Clusters of size 1 (outliers)

In [177]:
# Select the clusters whose size is exactly 1, showing only their name and pattern.
stat_df.loc[stat_df['cluster_size'] == 1, ['cluster_name', 'pattern']]

Unnamed: 0,cluster_name,pattern
532,532,"srm-ifce err: Communication error on send, err: [SE][StatusOfLsRequest][ETIMEDOUT] httpg://grid-srm.physik.rwth-aachen.de:8443/srm/managerv2: User timeout over"
529,529,DESTINATION SRM_PUT_TURL error on the turl request : [SE][StatusOfPutRequest][SRM_INTERNAL_ERROR] Request to [>PnfsManager@local] timed out.
533,533,DESTINATION OVERWRITE Connection timed out
528,528,TRANSFER globus_ftp_client: the server responded with an error 500 500-System error in mkdir: File exists 500-A system call failed: File exists 500 End.
530,530,"[gfalt_copy_file][perform_copy] TRANSFER [gfal_http_copy] ERROR: Copy failed with mode 3rd push, with error: [davix2gliberr] Transfer failed: failure: Remote copy failed with status code 0: OpenS..."
...,...,...
358,358,Result HTTP 404 : File not found after 1 attempts
357,357,"TRANSFER ERROR: Copy failed with mode 3rd push, with error: copy Could not connect to server"
356,356,User and source checksums do not match
355,355,TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Stream ended before EOD


### Timings for all stages of clusterization pipeline

`process` is the total time for the whole pipeline run; the other entries are its individual stages.

In [159]:
cluster.timings

{'data_preparation': 0.4888,
 'tokenization': 3.388,
 'tokens_vectorization': 4.0073,
 'sentence_vectorization': 6.6132,
 'kneighbors': 52.2003,
 'epsilon_search': 0.1622,
 'dbscan': 5.4896,
 'process': 72.3525}

### Get all error messages in a single cluster

In [160]:
output.in_cluster(0)

['DESTINATION MAKE_PARENT srm-ifce err: Permission denied, err: [SE][Mkdir][SRM_UID_FAILURE] httpg://recas-se-01.cs.infn.it:8446/srm/managerv2: srm://recas-se-01.cs.infn.it/dpm/cs.infn.it/home/atlas/UID/rucio/user/lxu/76/98: Permission denied',
 'DESTINATION MAKE_PARENT srm-ifce err: Permission denied, err: [SE][Mkdir][SRM_UID_FAILURE] httpg://recas-se-01.cs.infn.it:8446/srm/managerv2: srm://recas-se-01.cs.infn.it/dpm/cs.infn.it/home/atlas/UID/rucio/user/lxu/a6/bb: Permission denied',
 'DESTINATION MAKE_PARENT srm-ifce err: Permission denied, err: [SE][Mkdir][SRM_UID_FAILURE] httpg://recas-se-01.cs.infn.it:8446/srm/managerv2: srm://recas-se-01.cs.infn.it/dpm/cs.infn.it/home/atlas/UID/rucio/user/arsahu/8a/1f: Permission denied',
 'DESTINATION MAKE_PARENT srm-ifce err: Permission denied, err: [SE][Mkdir][SRM_UID_FAILURE] httpg://recas-se-01.cs.infn.it:8446/srm/managerv2: srm://recas-se-01.cs.infn.it/dpm/cs.infn.it/home/atlas/UID/rucio/user/lxu/63/54: Permission denied',
 'DESTINATION MAK

In [161]:
output.in_cluster(101)

['TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Problem while connected to 129.107.255.16:59089: Network is unreachable',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Problem while connected to 129.107.255.16:56536: Network is unreachable',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Problem while connected to 129.107.255.17:59271: Network is unreachable',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Problem while connected to 129.107.255.16:59161: Network is unreachable']

In [162]:
output.in_cluster(5)

['TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 158.195.14.22:24905: Connection timed out',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 158.195.14.25:23347: Connection timed out',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 158.195.14.8:20962: Connection timed out',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 158.195.14.19:23485: Connection timed out',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 158.195.14.25:23813: Connection timed out',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 158.195.14.21:21624: Connection timed out',
 'TRANSFER globus_ftp_client: the server responded with an error 451 General problem: Failed to connect 158.195.1

In [163]:
output.in_cluster(3)

['TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : globus_xio: Unable to connect to 128.42.124.218:6129 500-globus_xio: System error in connect: Connection timed out 500-globus_xio: A system call failed: Connection timed out 500 End.',
 'TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : globus_xio: Unable to connect to 10.1.0.74:21464 500-globus_xio: System error in connect: Connection timed out 500-globus_xio: A system call failed: Connection timed out 500 End.',
 'TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : globus_xio: Unable to connect to 152.92.255.246:45591 500-globus_xio: System error in connect: Connection timed out 500-globus_xio: A system call failed: Connection timed out 500 End.',
 'TRANSFER globus_ftp_client: the server responded with an error 500 500-Command failed. : globus_xio: Unable to connect to 10.1.0.74:20531 500-globus_xio: System error in con

### Output clusters - mode == 'ALL'  (for cluster '1')

In [171]:
output.clustered_output(mode='ALL')['1']

[{'message': 'TRANSFER  globus_ftp_client: the server responded with an error 500 Command failed. : IPC failed while attempting to perform request   ',
  'count': 1467,
  'cluster': 1}]

### Output clusters - mode == 'INDEX' (for cluster '1')

In [172]:
output.clustered_output(mode='INDEX')['1']

[2]

### Output clusters - mode == 'TARGET' (for cluster '1')

In [173]:
output.clustered_output(mode='TARGET')['1']

['TRANSFER  globus_ftp_client: the server responded with an error 500 Command failed. : IPC failed while attempting to perform request   ']

### Output clusters - cluster labels

In [87]:
cluster.cluster_labels

array([ 0,  1,  2, ..., 16, 76,  5])

### Get epsilon value (which was used in DBSCAN algorithm)

In [48]:
cluster.epsilon

0.04460315106927138