# Clusterlogs Notebook

In [None]:
import pandas as pd
from clusterlogs import pipeline
import numpy as np

## 1. Load data from a file into a pandas DataFrame

In [None]:
# Load the sample shipped under samples/ (semicolon-separated) into `df`.
df = pd.read_csv('samples/harvester_errors24.csv', sep=';')

In [None]:
# Alternative input for `df`: a tab-separated sample.
# NOTE(review): absolute, user-specific path — this cell only runs where that directory exists.
df = pd.read_csv('/Users/maria/cernbox/LogsClusterization/Harvester/data_sample.csv', sep='\t')

In [2]:
# Alternative input for `df`: a tab-separated sample whose file name carries a timestamp.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
df = pd.read_csv('/Users/maria/cernbox/LogsClusterization/Harvester/data_sample-2020-02-27 15_01_06.750914.csv', sep='\t')

In [None]:
# Alternative input for `df`: the tab-separated "superror" sample.
# NOTE(review): absolute, user-specific path — not reproducible on another machine as written.
df = pd.read_csv('/Users/maria/cernbox/LogsClusterization/Harvester/data_sample_superror-2020-03-03 09_35_47.158324.csv', sep='\t')

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [None]:
df.shape

In [None]:
df.head()

In [3]:
target = 'message'

In [None]:
# Number of distinct values in the column being clustered. Uses `target`
# instead of repeating the literal column name, so changing `target` keeps
# this check in sync. dropna=False counts a missing value as its own value
# rather than silently leaving it out.
df[target].nunique(dropna=False)

## 2. Execute clusterization pipeline

In [None]:
# Pipeline configured with mode='create' against models/harvester_new.model.
cluster = pipeline.Chain(
    df,
    target,
    mode='create',
    model_name='models/harvester_new.model',
    matching_accuracy=0.8,
    output_file='reports/harv_1day.html',
    clustering_type='ML',
)

In [None]:
# Pipeline configured with mode='process' against models/harvester_30days.model
# (no output_file is passed in this variant).
cluster = pipeline.Chain(
    df,
    target,
    mode='process',
    model_name='models/harvester_30days.model',
    matching_accuracy=0.8,
    clustering_type='ML',
)

In [4]:
# Pipeline configured with mode='update' against models/harvester_30days.model.
cluster = pipeline.Chain(
    df,
    target,
    mode='update',
    model_name='models/harvester_30days.model',
    matching_accuracy=0.8,
    clustering_type='ML',
    output_file='reports/harv_1day.html',
)

In [5]:
cluster.process()

Found 54 equal groups
Vectorization of tokens finished
Vectorization of sentences is finished
DBSCAN finished with 37 clusters
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.009969]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.009528]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.010136]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.008883]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.008684]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.009771]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.010045]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.009498]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.009040]
Parsing data:
Processed 100.0% of log lines.
Parsing done. [Time taken: 0:00:00.0

In [None]:
# NOTE(review): no cell in this notebook creates a 'sequence' column — presumably
# it comes from the input file or is added to `df` by cluster.process(); confirm.
df['sequence'].values

In [None]:
cluster.groups.shape

## 3. Result: all clusters (big clusters and outliers) - sorted by cluster size 

In [6]:
cluster.result

Unnamed: 0,pattern,indices,cluster_size,common_phrases_RAKE
9,[Payload execution error: returned non-zero],"[12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2...",1010,[Payload execution error returned non zero]
20,[Condor HoldReason: None ; Condor RemoveReason...,"[4, 5, 6, 44, 102, 103, 104, 106, 107, 108, 10...",792,"[job restarted undesirably, system_periodic_re..."
27,[not submitted due to incomplete data of the w...,"[31, 32, 33, 142, 151, 155, 156, 163, 164, 165...",362,"[not submitted due, incomplete data, worker]"
28,[LRMS error: (271) job killed: <*>],"[168, 220, 221, 222, 282, 429, 487, 488, 489, ...",300,"[lrms error job killed vmem, lrms error job ki..."
24,[Condor HoldReason: Unspecified gridmanager er...,"[0, 1, 2, 3, 91, 92, 93, 94, 95, 152, 153, 154...",235,"[held too long, harvester due, not found]"
19,[Condor HoldReason: CREAM error: reason=(.*?) ...,"[45, 46, 47, 48, 49, 50, 223, 224, 225, 226, 2...",234,"[held too long, harvester due, not found]"
34,[Condor HoldReason: None ; Condor RemoveReason...,"[40, 105, 138, 205, 206, 207, 232, 249, 326, 3...",214,[user atlpan]
5,[Condor HoldReason: CREAM error: Job has been ...,"[263, 264, 265, 295, 466, 718, 885, 886, 887, ...",190,"[held too long, harvester due, not found]"
25,[Condor HoldReason: CREAM error: CREAM_Job_Reg...,"[25, 26, 27, 34, 35, 60, 61, 62, 63, 64, 65, 6...",183,"[cream service cannot accept jobs, moment faul..."
8,[Condor HoldReason: CREAM error: CREAM_Job_Reg...,"[137, 143, 231, 281, 284, 353, 357, 362, 482, ...",148,"[cream service cannot accept jobs, tomcat fd n..."


In [None]:
cluster.in_cluster(cluster.result, 2)

### Print only patterns

In [None]:
cluster.result['pattern'].values

In [None]:
# The result table displayed above names this column 'common_phrases_RAKE';
# looking up 'common_phrases' raises KeyError.
cluster.result['common_phrases_RAKE'].values

In [None]:
cluster.in_cluster(cluster.result, 43)

### Split clusters into big (cluster_size >= 1000) and small (cluster_size < 1000)

In [None]:
# Split the result table on the 'cluster_size' column with 1000 as the threshold;
# the call returns two objects, unpacked here as `big` and `small`.
big, small = cluster.split_clusters(cluster.result, 'cluster_size', 1000)

In [None]:
big['pattern'].values

In [None]:
small

### Print all messages from cluster #40

In [None]:
# `clusters` is not defined anywhere in this notebook; the other in_cluster
# calls pass the result table, so do the same here.
cluster.in_cluster(cluster.result, 40)

### Display the performance of all stages

In [None]:
cluster.timings

In [None]:
# NOTE(review): the name `keywords` is rebound further down by
# `from summa import keywords`, and nothing uses this gensim version in between.
# gensim.summarization is, to my knowledge, not shipped with gensim >= 4.0 — confirm
# the pinned gensim version before relying on this import.
from gensim.summarization import keywords
text = '''Challenges in natural language processing frequently involve
speech recognition, natural language understanding, natural language
generation (frequently from formal, machine-readable logical forms),
connecting language and machine perception, dialog systems, or some
combination thereof.'''

In [None]:
import spacy
import pytextrank

In [None]:
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank()

In [None]:
# `Phrases` was used without ever being imported, so this cell raised NameError
# on a fresh kernel.
from gensim.models.phrases import Phrases

# Phrases consumes an iterable of tokenised sentences (lists of words). Passing
# the raw string makes it iterate over single characters, so split the text into
# sentences on '.' and each sentence into whitespace-separated tokens first.
sentences = [sentence.split() for sentence in text.split('.') if sentence.strip()]
phrases = Phrases(sentences, min_count=1, threshold=1, scoring='npmi', delimiter=b' ')

In [None]:
# Load the en_core_web_sm spaCy model and register pytextrank's component
# under the name "textrank" as the last stage of the pipeline.
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
# Run the pipeline over `text`; the extracted phrases are read from doc._.phrases.
doc = nlp(text)
for p in doc._.phrases:
    # One line per phrase: rank (4 decimals), count (width 5), then the phrase text.
    print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
    # Followed by the phrase's `chunks` attribute.
    print(p.chunks)

In [None]:
spacy.load('en_core_web_sm')

In [None]:
text = """Automatic summarization is the process of reducing a text document with a \
computer program in order to create a summary that retains the most important points \
of the original document. As the problem of information overload has grown, and as \
the quantity of data has increased, so has interest in automatic summarization. \
Technologies that can make a coherent summary take into account variables such as \
length, writing style and syntax. An example of the use of summarization technology \
is search engines such as Google. Document summarization is another."""
text1 = """
Error on the surl while putdone. This SURL does not exist in the original request. 
"""

# Summarise the short error message in `text1` with summa and print the result.
# Note that the longer `text` defined in this cell is not what gets summarised.
from summa import summarizer
print(summarizer.summarize(text1))

In [None]:
from summa import keywords

In [None]:
print(keywords.keywords(text1))

In [None]:
import regroup

In [None]:
def calc_cache_pos(strings, indexes):
    """Map one index per string onto a single offset in a flat cache.

    The offset is a mixed-radix number: each index is weighted by the
    product of the lengths of all strings that precede it.
    """
    position, weight = 0, 1
    for length, index in zip(map(len, strings), indexes):
        position += index * weight
        weight *= length
    return position

def lcs_back(strings, indexes, cache):
    """Recursively compute a longest common subsequence of string prefixes.

    Args:
        strings: sequence of strings being compared.
        indexes: one position per string; for each string the prefix ending
            at that position (inclusive) is considered. A position of -1
            stands for an empty prefix.
        cache: flat list addressed through ``calc_cache_pos``. ``None`` marks
            a state that has not been solved yet. Updated in place.

    Returns:
        A longest subsequence common to all prefixes, or "" as soon as any
        prefix is empty.
    """
    if -1 in indexes:
        return ""

    # Consult the memo for *every* state. Previously only the non-matching
    # branch looked at the cache, so states reached through the matching
    # branch were solved again even when their answer was already stored.
    cache_pos = calc_cache_pos(strings, indexes)
    cached = cache[cache_pos]
    if cached is not None:
        return cached

    last_char = strings[0][indexes[0]]
    if all(s[i] == last_char for s, i in zip(strings, indexes)):
        # Every prefix ends with the same character: it belongs to the
        # answer, so step back in all strings at once.
        result = lcs_back(strings, [i - 1 for i in indexes], cache) + last_char
    else:
        # Otherwise shorten one prefix at a time and keep the longest answer.
        # The strict comparison keeps the first candidate on ties, matching
        # max(..., key=len) over candidates in string order.
        result = ""
        for n in range(len(strings)):
            if indexes[n] > 0:
                new_indexes = list(indexes)
                new_indexes[n] -= 1
                candidate = lcs_back(strings, new_indexes, cache)
                if len(candidate) > len(result):
                    result = candidate

    cache[cache_pos] = result
    return result

def lcs(strings):
    """Return a longest common subsequence shared by all of ``strings``.

    No strings give "", and a single string is returned unchanged. For two
    or more strings a flat memo with one slot per combination of positions
    is allocated and the search starts from the last character of each.

    >>> lcs(['666222054263314443712', '5432127413542377777', '6664664565464057425'])
    '54442'
    >>> lcs(['abacbdab', 'bdcaba', 'cbacaa'])
    'baa'
    """
    count = len(strings)
    if count == 0:
        return ""
    if count == 1:
        return strings[0]

    # One memo slot for every combination of per-string positions.
    total_states = 1
    for text in strings:
        total_states *= len(text)
    memo = [None] * total_states

    last_positions = [len(text) - 1 for text in strings]
    return lcs_back(strings, last_positions, memo)

In [None]:
lcs(['abacbdab', 'bdcaba', 'cbacaa', 'abacbdabaa'])

In [None]:
import nltk

# `groucho_dep_grammar` was never defined in this notebook, so building the
# parser raised NameError. The grammar below is the example of the same name
# from the NLTK book.
# NOTE(review): replace it if a different grammar was intended.
groucho_dep_grammar = nltk.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)