# Introduction
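This notebook explores how to group Rucio error messages into clusters: the messages are tokenized, converted to TF-IDF features, reduced with LSA and finally clustered with K-means.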

### Imports
Import libraries and write settings here.

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
# (limits on how many columns/rows are rendered, and a fixed 4-decimal float format)
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30
pd.options.display.float_format = '{:,.4f}'.format

# Display all cell outputs
# ('all' shows the value of every expression statement in a cell, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
# (loaded only if it is not already registered, so re-running this cell is harmless)
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: reload all modules before executing the code typed in a cell
%autoreload 2

# Visualizations
import seaborn as sns

## Custom imports

In [2]:
import json

# Analysis/Modeling
Do work here

## Read data

In [3]:
# Show the working directory: the relative data path used below is resolved against it
!pwd

/home/luca/workspace/rucio-log-clustering/notebooks


In [4]:
# Load the raw dump; the loops below iterate it as pages, each page holding records (dicts).
with open("../issues.json", 'r') as f:
    raw_data = json.load(f)

# Reorganise the records in a pandas DataFrame indexed by the record id.
# Rows are collected in plain lists and the frame is built once: growing a
# DataFrame with `append` inside the loop copies it at every iteration, and
# `DataFrame.append` no longer exists in pandas >= 2.0.
record_ids = []
rows = []
for page in raw_data:
    for record in page:
        record_ids.append(record['id'])
        # Copy the record without its id so that `raw_data` is left untouched.
        rows.append({key: value for key, value in record.items() if key != 'id'})

errors = pd.DataFrame(rows, index=record_ids)

In [5]:
# One row per error record
print("Total number of errors:", len(errors))

Total number of errors: 2826


In [6]:
# Preview the first rows to check the columns that were parsed from the JSON records
errors.head()

Unnamed: 0,message,src_site,dst_site,category,amount,type,status,last_modified
1,SOURCE [70] globus_xio: Unable to connect to a...,BU_ATLAS_Tier2,BNL-ATLAS,9,178,transfer-failure,New,2019-08-06T14:07:04.412401+02:00
2,TRANSFER [70] TRANSFER globus_xio: Unable to ...,BNL-ATLAS,BU_ATLAS_Tier2,10,3288,transfer-failure,New,2019-08-06T14:07:04.595574+02:00
3,TRANSFER [70] TRANSFER globus_xio: Unable to ...,INFN-NAPOLI-ATLAS,BU_ATLAS_Tier2,10,3288,transfer-failure,New,2019-08-06T14:07:04.692554+02:00
4,SOURCE [70] globus_xio: Unable to connect to a...,BU_ATLAS_Tier2,CERN-PROD,11,227,transfer-failure,New,2019-08-06T14:07:04.803866+02:00
5,SOURCE [70] globus_xio: Unable to connect to a...,BU_ATLAS_Tier2,TOKYO-LCG2,11,130,transfer-failure,New,2019-08-06T14:07:04.891438+02:00


## Extract information
Once we read the data, we can focus on the variable *message* and try to extract meaningful information from it.

### Clearing and tokenizing messages

In [7]:
# Normalise every message to lowercase, then split it on whitespace
tokens_per_message = [message.lower().split() for message in errors["message"]]

In [8]:
# Retrieve the set of all tokens used in the error messages.
# `update` grows a single set in place, instead of building a brand new
# set with `union` for every message.
word_set = set()
for mess in tokens_per_message:
    word_set.update(mess)

word_set

{'!=',
 '""',
 "'unavailable'))",
 "('service_unavailable',",
 '():',
 '(catalog)=1c942109',
 '(catalog)=null',
 '(connection',
 '(http',
 '(input/output',
 '(local)=c6a1662a',
 '(local)=f0a113ee',
 '(neon):',
 '(none',
 '(permission',
 '(server',
 '-',
 '/dc=ch/dc=cern/ou=organic',
 '/eos/ctapps/castor/grid/atlas/rucio/raw/data18_calib/calibration_larelec-delay-7s-high-emec/0',
 '/pnfs/rzg.mpg.de/data/atlas/dq2/atlasdatadisk/rucio/mc16_13tev/02/4c/evnt.18896520._006543.pool.root.1',
 '/pnfs/rzg.mpg.de/data/atlas/dq2/atlasdatadisk/rucio/mc16_13tev/02/60/evnt.18896520._006362.pool.root.1',
 '/pnfs/rzg.mpg.de/data/atlas/dq2/atlasdatadisk/rucio/mc16_13tev/05/22/evnt.18896520._006278.pool.root.1',
 '/pnfs/rzg.mpg.de/data/atlas/dq2/atlasdatadisk/rucio/mc16_13tev/05/68/evnt.18896520._006383.pool.root.1',
 '/pnfs/rzg.mpg.de/data/atlas/dq2/atlasdatadisk/rucio/mc16_13tev/05/af/evnt.18896520._006364.pool.root.1',
 '/pnfs/rzg.mpg.de/data/atlas/dq2/atlasdatadisk/rucio/mc16_13tev/0a/c5/evnt.1889652

In [9]:
# Corpus size versus vocabulary size
n_messages = len(tokens_per_message)
n_unique_tokens = len(word_set)
print("We have {} error messages, for a total of {} unique tokens adopted.".format(
    n_messages, n_unique_tokens))

We have 2826 error messages, for a total of 852 unique tokens adopted.


<div class="alert alert-block alert-warning">
<b>Warning:</b> Tokenization procedure not optimal, it needs further checks and adaptations for our use case.
</div>

In [10]:
# One zero-initialised counter per message, each keyed by the full vocabulary
word_dict = [dict.fromkeys(word_set, 0) for _ in tokens_per_message]
print("Number of tokens:", len(word_dict[0]))

word_dict[0]

Number of tokens: 852


{'[net=2001:1458:301:33:0:0:100:136,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].': 0,
 'output': 0,
 'contained': 0,
 'source': 0,
 'httpg://se-goegrid.gwdg.de:8443/srm/managerv2:': 0,
 'root://eosctapps.cern.ch:1094//eos/ctapps/castor/grid/atlas/rucio/raw//data18_13tev/physics_main/00349111/data18_13tev.00349111.physics_main.daq.raw/data18_13tev.00349111.physics_main.daq.raw._lb0170': 0,
 'out,': 0,
 'url': 0,
 'davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasscratchdisk/rucio/panda/8c/03/panda.0724164746.896042.lib._18704925.18178566335.log.tgz': 0,
 'httpg://srm01.ncg.ingrid.pt:8444/srm/managerv2:': 0,
 'validated': 0,
 'that': 0,
 'seconds': 0,
 'srm://dcsrm.usatlas.bnl.gov/pnfs/usatlas.bnl.gov/bnlt1d0/data18_13tev/raw/other/data18_13tev.00350848.physics_main.daq.raw/data18_13tev.003': 0,
 'connect:': 0,
 'fts120.cern.ch': 0,
 'mc16_13tev:aod.18195614._000035.pool.root.1': 0,
 'fts308.usatlas.bnl.gov': 0,
 'exist': 0,
 'taiwan-lcg2_datadisk': 0,
 'supported':

In [12]:
# Compute raw frequencies of each token per each message.
# The counters are rebuilt from zero in this cell, so running it again
# yields the same counts instead of adding every occurrence a second time.
word_dict = []
for tokens in tokens_per_message:
    counts = dict.fromkeys(word_set, 0)
    for word in tokens:
        counts[word] += 1
    word_dict.append(counts)

word_dict[0]

{'[net=2001:1458:301:33:0:0:100:136,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].': 0,
 'output': 0,
 'contained': 0,
 'source': 1,
 'httpg://se-goegrid.gwdg.de:8443/srm/managerv2:': 0,
 'root://eosctapps.cern.ch:1094//eos/ctapps/castor/grid/atlas/rucio/raw//data18_13tev/physics_main/00349111/data18_13tev.00349111.physics_main.daq.raw/data18_13tev.00349111.physics_main.daq.raw._lb0170': 0,
 'out,': 0,
 'url': 0,
 'davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasscratchdisk/rucio/panda/8c/03/panda.0724164746.896042.lib._18704925.18178566335.log.tgz': 0,
 'httpg://srm01.ncg.ingrid.pt:8444/srm/managerv2:': 0,
 'validated': 0,
 'that': 0,
 'seconds': 0,
 'srm://dcsrm.usatlas.bnl.gov/pnfs/usatlas.bnl.gov/bnlt1d0/data18_13tev/raw/other/data18_13tev.00350848.physics_main.daq.raw/data18_13tev.003': 0,
 'connect:': 0,
 'fts120.cern.ch': 0,
 'mc16_13tev:aod.18195614._000035.pool.root.1': 0,
 'fts308.usatlas.bnl.gov': 0,
 'exist': 0,
 'taiwan-lcg2_datadisk': 0,
 'supported':

In [13]:
# Build the message-by-token count table once and reuse it for both outputs
# (the original constructed the same DataFrame twice).
word_counts = pd.DataFrame(word_dict)
print("Dictionary dimensions:\n", word_counts.shape)

# Visualization
word_counts

Dictionary dimensions:
 (2826, 852)


Unnamed: 0,"[net=2001:1458:301:33:0:0:100:136,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].",output,contained,source,httpg://se-goegrid.gwdg.de:8443/srm/managerv2:,root://eosctapps.cern.ch:1094//eos/ctapps/castor/grid/atlas/rucio/raw//data18_13tev/physics_main/00349111/data18_13tev.00349111.physics_main.daq.raw/data18_13tev.00349111.physics_main.daq.raw._lb0170,"out,",url,davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasscratchdisk/rucio/panda/8c/03/panda.0724164746.896042.lib._18704925.18178566335.log.tgz,httpg://srm01.ncg.ingrid.pt:8444/srm/managerv2:,validated,that,seconds,srm://dcsrm.usatlas.bnl.gov/pnfs/usatlas.bnl.gov/bnlt1d0/data18_13tev/raw/other/data18_13tev.00350848.physics_main.daq.raw/data18_13tev.003,connect:,fts120.cern.ch,mc16_13tev:aod.18195614._000035.pool.root.1,fts308.usatlas.bnl.gov,exist,taiwan-lcg2_datadisk,supported,"cn=1568043002,cn=2540943468,cn=1987664213,cn=robot:",an,commands,panda:panda.0724164746.896042.lib._18704925.18178566335.log.tgz,...,davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasscratchdisk/rucio/panda/1d/7d/panda.0724134449.65659.lib._18701027.18174441103.log.tgz,"[rc=10025,msg=no",[se][srmrm][etimedout],runnin,httpg://t2cmcondor.mi.infn.it:8444/srm/managerv2:,does,corrupted.,fts813.cern.ch,not,c=no_space_left,accessible,handshake,davs://se01.dur.scotgrid.ac.uk:443/dpm/dur.scotgrid.ac.uk/home/atlas/atlasdatadisk/ru,destination,900,overwrite,updated,this,soap,actions,davs://recas-se-01.cs.infn.it:443/dpm/cs.infn.it/home/atlas/atlaslocalgroupdisk/rucio/group/phys-higgs/a5/c9/group.phys-higgs.16629192._000002.cxaod.root,davs://recas-se-01.cs.infn.it:443/dpm/cs.infn.it/home/atlas/atlasdatadisk/rucio/mc16_,eulakeftp.cern.ch:2811,range,grid05.lal.in2p3.fr:8446
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<div class="alert alert-block alert-info">
<b>Note:</b> The word_dict is basically a very sparse matrix in which each row represents a message and each column a token. The entries are then filled with a weight (just the raw frequency at the moment) which represents the importance of that token in that message, taking into account the whole corpus of tokens/messages.
</div>

<div class="alert alert-block alert-warning">
<b>Warning:</b> Again tokenization has to be reviewed, for example we may want to divide the first column into single parameters as network, protocol, store, cache and linkgroup.
</div>

### Compute TF-IDF scores
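Once the messages are split into tokens and the raw frequency of each token in each message has been computed, we can compute the **tf-idf** score of every token in every message. Here $\mathrm{tf}(t, m)$ is the count of token $t$ in message $m$ divided by the number of tokens in $m$, $\mathrm{idf}(t) = \log(N / \mathrm{df}(t))$ with $N$ the number of messages and $\mathrm{df}(t)$ the number of messages containing $t$, and the score is their product.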

In [18]:
# Report every message that produced no tokens and count them in `c`
c = 0
for position, tokens in enumerate(tokens_per_message):
    if len(tokens) == 0:
        print(position, errors.iloc[position])
        c += 1

540 message                                          
src_site                                BNL-ATLAS
dst_site                                CERN-PROD
category                                      124
amount                                       3408
type                             transfer-failure
status                                        New
last_modified    2019-08-15T01:00:05.755828+02:00
Name: 441, dtype: object
541 message                                          
src_site                                  UNKNOWN
dst_site                                CERN-PROD
category                                      124
amount                                       3408
type                             transfer-failure
status                                        New
last_modified    2019-08-15T13:00:04.948552+02:00
Name: 442, dtype: object
547 message                                          
src_site                                BNL-ATLAS
dst_site                              

38

In [19]:
# NOTE(review): the cells shown in this notebook do not visibly filter these
# messages out before the TF-IDF / clustering steps - confirm where the exclusion happens.
print("Warning: there are {} blank messages which will be excluded from the analysis.".format(c))



In [32]:
def compute_tf(word_dict, l):
    """Return the term frequency of every token for a single message.

    Parameters
    ----------
    word_dict : dict
        Maps each token to its raw count in the message.
    l : list
        Tokens of the message; only its length is used, as the normaliser.

    Returns
    -------
    dict
        Maps each token to ``count / len(l)``. Every value is 0 when ``l``
        is empty.
    """
    n_tokens = len(l)
    if n_tokens == 0:
        # Blank message: there is nothing to normalise by
        return dict.fromkeys(word_dict, 0)
    return {token: occurrences / n_tokens for token, occurrences in word_dict.items()}

# Term frequencies, one dict per message (blank messages are kept as well)
tf = [compute_tf(counts, tokens) for counts, tokens in zip(word_dict, tokens_per_message)]

In [30]:
def compute_idf(strings_list):
    """Compute the inverse document frequency of every token.

    Parameters
    ----------
    strings_list : list of dict
        One dict per message, mapping token -> count. The vocabulary is
        taken from the keys of the first dict.

    Returns
    -------
    dict
        Maps each token to ``log(n / df)``, where ``n`` is the number of
        messages and ``df`` the number of messages in which the token has a
        positive count. Tokens that occur in no message get ``0.0`` instead
        of raising ``ZeroDivisionError``. An empty input returns ``{}``.
    """
    n = len(strings_list)
    if n == 0:
        return {}

    # Document frequency: in how many messages each token occurs at least once
    doc_freq = dict.fromkeys(strings_list[0].keys(), 0)
    for counts in strings_list:
        for word, count in counts.items():
            if count > 0:
                doc_freq[word] += 1

    return {
        word: np.log(n / float(df)) if df > 0 else 0.0
        for word, df in doc_freq.items()
    }

# Inverse document frequencies over the whole corpus of messages
idf = compute_idf(strings_list=word_dict)

In [33]:
# Sizes of the intermediate structures: messages, tf rows, tokens per tf row, idf entries
tuple(map(len, (word_dict, tf, tf[0], idf)))

(2826, 2826, 852, 852)

In [34]:
def compute_tf_idf(tf, idf):
    """Combine term frequencies with inverse document frequencies.

    Parameters
    ----------
    tf : dict
        token -> term frequency for one message.
    idf : dict
        token -> inverse document frequency; must contain every key of ``tf``.

    Returns
    -------
    dict
        token -> ``tf[token] * idf[token]``, with the same keys as ``tf``.
    """
    return {word: frequency * idf[word] for word, frequency in tf.items()}
    
# TF-IDF scores, one dict per message
tf_idf = [compute_tf_idf(message_tf, idf) for message_tf in tf]

In [35]:
# Show the scores as a table built from the list of per-message dicts
pd.DataFrame(tf_idf)

Unnamed: 0,"[net=2001:1458:301:33:0:0:100:136,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].",output,contained,source,httpg://se-goegrid.gwdg.de:8443/srm/managerv2:,root://eosctapps.cern.ch:1094//eos/ctapps/castor/grid/atlas/rucio/raw//data18_13tev/physics_main/00349111/data18_13tev.00349111.physics_main.daq.raw/data18_13tev.00349111.physics_main.daq.raw._lb0170,"out,",url,davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasscratchdisk/rucio/panda/8c/03/panda.0724164746.896042.lib._18704925.18178566335.log.tgz,httpg://srm01.ncg.ingrid.pt:8444/srm/managerv2:,validated,that,seconds,srm://dcsrm.usatlas.bnl.gov/pnfs/usatlas.bnl.gov/bnlt1d0/data18_13tev/raw/other/data18_13tev.00350848.physics_main.daq.raw/data18_13tev.003,connect:,fts120.cern.ch,mc16_13tev:aod.18195614._000035.pool.root.1,fts308.usatlas.bnl.gov,exist,taiwan-lcg2_datadisk,supported,"cn=1568043002,cn=2540943468,cn=1987664213,cn=robot:",an,commands,panda:panda.0724164746.896042.lib._18704925.18178566335.log.tgz,...,davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasscratchdisk/rucio/panda/1d/7d/panda.0724134449.65659.lib._18701027.18174441103.log.tgz,"[rc=10025,msg=no",[se][srmrm][etimedout],runnin,httpg://t2cmcondor.mi.infn.it:8444/srm/managerv2:,does,corrupted.,fts813.cern.ch,not,c=no_space_left,accessible,handshake,davs://se01.dur.scotgrid.ac.uk:443/dpm/dur.scotgrid.ac.uk/home/atlas/atlasdatadisk/ru,destination,900,overwrite,updated,this,soap,actions,davs://recas-se-01.cs.infn.it:443/dpm/cs.infn.it/home/atlas/atlaslocalgroupdisk/rucio/group/phys-higgs/a5/c9/group.phys-higgs.16629192._000002.cxaod.root,davs://recas-se-01.cs.infn.it:443/dpm/cs.infn.it/home/atlas/atlasdatadisk/rucio/mc16_,eulakeftp.cern.ch:2811,range,grid05.lal.in2p3.fr:8446
0,0.0000,0.0000,0.0000,0.0949,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.0000,0.0000,0.0000,0.0949,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0402,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.0000,0.0000,0.0000,0.0949,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0402,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
5,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0715,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0297,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
6,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0587,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0244,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
7,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.1061,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0440,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
8,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0587,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0244,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
9,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0658,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0546,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


### Alternative using Scikit-Learn: TfidfVectorizer  

In [38]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd


print("Extracting features from the training dataset using a sparse vectorizer")
# t0 = time()
# max_df / min_df are floats, i.e. proportions of documents: tokens found in
# more than 80% or in fewer than 2% of the messages are dropped from the vocabulary.
vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.02, stop_words='english',
                             use_idf=True)
# Alternative kept for reference: no document-frequency filtering
# vectorizer = TfidfVectorizer(stop_words='english',
#                              use_idf=True)
# Learn the vocabulary and idf weights from the messages and return the TF-IDF matrix
X = vectorizer.fit_transform(errors.message)


Extracting features from the training dataset using a sparse vectorizer
CPU times: user 55.9 ms, sys: 0 ns, total: 55.9 ms
Wall time: 55.3 ms


In [39]:
# Inspect the full configuration of the vectorizer, defaults included
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 0.8,
 'max_features': None,
 'min_df': 0.02,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

# Clustering

As a first attempt we can try a simple clustering algorithm such as *K-means*. 

<div class="alert alert-block alert-info">
<b>Note:</b> We repeat the information extraction step with the scikit-learn routine TfidfVectorizer just for convenience (so as to avoid problems with variable types and to apply the clustering algorithm easily).
</div>

In [40]:
# import libraries
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

# Seed for the randomized SVD solver, so that the embedding (and everything
# computed from it) is the same on every run.
RANDOM_SEED = 42

# Extract TF-IDF information
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
# max_df / min_df are proportions of documents: tokens found in more than 80%
# or in fewer than 2% of the messages are dropped from the vocabulary.
vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.02, stop_words='english',
                             use_idf=True)
# Alternative kept for reference: no document-frequency filtering
# vectorizer = TfidfVectorizer(stop_words='english',
#                              use_idf=True)
X_tfidf = vectorizer.fit_transform(errors.message)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_tfidf.shape)
print()


# Apply LSA for dimensionality reduction to get a lower-dimensional embedding space
print("Performing dimensionality reduction using LSA")
t0 = time()

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(25, random_state=RANDOM_SEED)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

# `X` is the reduced, normalized embedding used by the following cells
X = lsa.fit_transform(X_tfidf)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Extracting features from the training dataset using a sparse vectorizer
done in 0.053574s
n_samples: 2826, n_features: 135

Performing dimensionality reduction using LSA
done in 0.065861s
Explained variance of the SVD step: 83%



Once we have a *somewhat convenient* subspace for representing the words/tokens, and hence the messages, we can proceed and apply the clustering algorithm.

<div class="alert alert-block alert-info">
    <b>Note:</b> In the following, we treat the variable <i>category</i> as if it were the true label for the clustering results. This is just to show how one could evaluate the results of the unsupervised model when target labels are available.
</div>

In [50]:
%%time

# set number of clusters (hyperparameter)
n_clusters = 10

# run K-Means algorithm: 6 clusters
km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=500, n_init=100,
                n_jobs=10, verbose=1)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=500,
       n_clusters=10, n_init=100, n_jobs=10, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=1)
done in 1.799s

CPU times: user 462 ms, sys: 179 ms, total: 641 ms
Wall time: 1.8 s


# Results
Show graphs and stats here

The first thing we can look at is the set of centroids.

In [60]:
# Each row of `cluster_centers_` is one centroid
n_centroids, n_dimensions = km.cluster_centers_.shape
print("We have {} centroids represented as {}-dimensional points.".format(n_centroids, n_dimensions))
print("Let see the first one for example:")

km.cluster_centers_[0]

We have 10 centroids represented as 25-dimensional points.
Let see the first one for example:


array([ 0.45096581,  0.02192532,  0.00744968, -0.31440977, -0.3595249 ,
        0.46993336, -0.17620879, -0.30822639,  0.24459762,  0.10495095,
       -0.24708052,  0.04814103, -0.12914338, -0.01589765, -0.04109929,
        0.01171121, -0.01193521, -0.01524568, -0.02506501,  0.01545754,
       -0.01033149, -0.07574596, -0.03075846, -0.00461237,  0.08543847])

The labels assigned to the observations, instead, can be accessed through the attribute **labels_** of the clustering object:

In [63]:
from collections import Counter, defaultdict

# Size of each cluster: how many messages were assigned to every label
cluster_sizes = Counter(km.labels_)
print(cluster_sizes)

Counter({3: 652, 2: 462, 1: 384, 4: 335, 7: 273, 6: 211, 9: 160, 5: 124, 0: 121, 8: 104})


Now we can look, for example, at the messages that fall into the same cluster:

In [65]:
# Store the cluster assignment next to each message
errors["kmean_labels"] = km.labels_

# Print every message that was assigned to the inspected cluster
inspected_cluster = 8
in_cluster = errors["kmean_labels"] == inspected_cluster
for msg in errors.loc[in_cluster, "message"]:
    print(msg)

Job has been canceled because it stayed in the queue for too long
Job has been canceled because it stayed in the queue for too long
Job canceled by the user
Job canceled by the user
Job canceled by the user
Job has been canceled because it stayed in the queue for too long
Job has been canceled because it stayed in the queue for too long
Job canceled by the user
Job canceled by the user
Job canceled by the user
Job has been canceled because it stayed in the queue for too long
Reaper 0-1: Deletion NOTFOUND of mc16_13TeV:log.18507477._000006.job.log.tgz.1 as davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasdatadisk/rucio/mc16_13TeV/77/0f/log.18507477._000006.job.log.tgz.1 on DESY-HH_DATADISK
Reaper 6-6: Deletion NOTFOUND of panda:panda.um.user.fcardill.TestTomas2_MUMU.AllYear.grp17_v01_p3730.log.18700913.004795.log.tgz as davs://dcache-atlas-webdav.desy.de:2880/dq2/atlasscratchdisk/rucio/panda/56/c5/panda.um.user.fcardill.TestTomas2_MUMU.AllYe
Replica root://golias100.farm.particle.cz:109

## Check the true label (if available) 

In [51]:
# extract true clusters
labels = errors.category

# evaluate results comparing with labels
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.457
Completeness: 1.000
V-measure: 0.628
Adjusted Rand-Index: 0.292
Silhouette Coefficient: 0.423


<div class="alert alert-block alert-warning">
    <b>Warning:</b> This comparison is admittedly unfair: we chose to represent 340 potential (fake) clusters with just 10 groups.
</div>

In [52]:
# Number of distinct values taken by the reference labels
n_categories = np.unique(labels).size
print("There are {} unique categories for error messages.".format(n_categories))

There are 340 unique categories for error messages.


# Conclusions and Next Steps
Summarize findings here