# Clusterlogs Notebook

In [None]:
import pandas as pd
from clusterlogs import pipeline
import numpy as np

## 1. Load data from a file and create a pandas DataFrame with an index

In [19]:
# Read the semicolon-separated CSV at the relative path samples/harvester_errors24.csv into `df`.
df = pd.read_csv('samples/harvester_errors24.csv', sep=';')

In [None]:
# Alternative input (tab-separated). Rebinds `df`, replacing any frame loaded before.
# NOTE(review): absolute path under /Users/maria; it will not resolve on a machine
# that lacks this directory.
df = pd.read_csv('/Users/maria/cernbox/LogsClusterization/Harvester/data_sample.csv', sep='\t')

In [10]:
# Alternative input (semicolon-separated). Rebinds `df`, replacing any frame loaded before.
# NOTE(review): absolute path under /Users/maria; it will not resolve on a machine
# that lacks this directory.
df = pd.read_csv('/Users/maria/cernbox/LogsClusterization/Harvester/data_sample30days.csv', sep=';')

In [None]:
# Alternative input (tab-separated). Rebinds `df`, replacing any frame loaded before.
# NOTE(review): absolute path under /Users/maria; it will not resolve on a machine
# that lacks this directory.
df = pd.read_csv('/Users/maria/cernbox/LogsClusterization/Harvester/data_sample-2020-02-27 15_01_06.750914.csv', sep='\t')

In [None]:
# Alternative input (tab-separated). Rebinds `df`, replacing any frame loaded before.
# NOTE(review): absolute path under /Users/maria; it will not resolve on a machine
# that lacks this directory.
df = pd.read_csv('/Users/maria/cernbox/LogsClusterization/Harvester/data_sample_superror-2020-03-03 09_35_47.158324.csv', sep='\t')

In [None]:
# Drop every row that contains a missing value. The frame is modified in place,
# so the unfiltered data is only recoverable by re-running a load cell.
df.dropna(inplace=True)

In [11]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(1756158, 2)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [20]:
# Name of the DataFrame column passed to pipeline.Chain as its second argument.
target = 'message'

In [13]:
# Count the distinct values in the column selected by `target`. The column name
# is taken from `target` rather than repeated as a literal, so this check always
# describes the same column that is handed to the pipeline.
len(np.unique(df[target].values))

153832

## 2. Run the clustering pipeline

In [14]:
# Configure a Chain over `df[target]` with mode='create', writing its report to
# reports/harvester_bigdata_create.html. The following Chain cells rebind
# `cluster`, so whichever of them ran last is the one cluster.process() uses.
# NOTE(review): the meaning of each keyword is defined by clusterlogs.pipeline.Chain;
# check its documentation before changing values.
cluster = pipeline.Chain(
    df,
    target,
    mode='create',
    model_name='models/harvester_new.model',
    matching_accuracy=0.8,
    output_file='reports/harvester_bigdata_create.html',
    clustering_type='ML',
    algorithm='dbscan',
)

In [21]:
# Configure a Chain over `df[target]` with mode='update' against
# models/harvester_30days.model, writing its report to reports/harvester_update.html.
# Rebinds `cluster`, replacing any Chain configured by a previously executed cell.
cluster = pipeline.Chain(
    df,
    target,
    mode='update',
    model_name='models/harvester_30days.model',
    matching_accuracy=0.8,
    clustering_type='ML',
    algorithm='dbscan',
    output_file='reports/harvester_update.html',
)

In [None]:
# Configure a Chain over `df[target]` with mode='update' against
# models/harvester_30days.model, writing its report to reports/harv_24_update.html.
# No `algorithm` is passed here, so Chain's own default applies.
# Rebinds `cluster`, replacing any Chain configured by a previously executed cell.
cluster = pipeline.Chain(
    df,
    target,
    mode='update',
    model_name='models/harvester_30days.model',
    matching_accuracy=0.8,
    clustering_type='ML',
    output_file='reports/harv_24_update.html',
)

In [22]:
# Run the configured pipeline. The captured output below shows it logging each
# stage (grouping, vectorization, DBSCAN, pattern and key-phrase extraction).
cluster.process()

Found 225 equal groups
Vectorization of tokens finished
Vectorization of sentences is finished
K-neighbours = 15
DBSCAN finished with 163 clusters
Calculating group patterns for 1 values
[list(['Condor', '▁', 'HoldReason:', '▁', 'CREAM', '▁', 'error:', '▁', 'reason=127;', '▁', '(.*?)', '▁', 'line', '▁', '52:', '▁', '(.*?)', '▁', 'No', '▁', 'such', '▁', 'file', '▁', 'or', '▁', 'directory', '▁', ';', '▁', 'Worker', '▁', 'canceled', '▁', 'by', '▁', 'harvester', '▁', 'due', '▁', 'to', '▁', 'held', '▁', 'too', '▁', 'long', '▁', 'or', '▁', 'not', '▁', 'found'])]
['Condor HoldReason: CREAM error: reason=127; (.*?) line 52: (.*?) No such file or directory ; Worker canceled by harvester due to held too long or not found']
Extracting key phrases...
Calculating group patterns for 15 values
[list(['26649918', '▁', 'tmpuotSVB+', '▁', 'regular', '▁', 'm2616', '▁', '6800', '▁', 'PENDING', '▁', '0:0'])
 list(['26641045', '▁', 'tmpLVTFlk+', '▁', 'regular', '▁', 'm2616', '▁', '6800', '▁', 'PENDING', '▁'

In [None]:
# Inspect the 'sequence' column of `df`.
# NOTE(review): the shape printed after loading shows only two columns, so
# 'sequence' is presumably added by cluster.process() — confirm.
df['sequence'].values

In [None]:
# Show the shape of `cluster.groups`.
cluster.groups.shape

## 3. Result: all clusters (big clusters and outliers), sorted by cluster size

In [None]:
# Display the clustering result table held in `cluster.result`.
cluster.result

In [None]:
# Look up cluster 2 in the result table.
# NOTE(review): presumably returns the messages that belong to that cluster — confirm
# against clusterlogs' in_cluster.
cluster.in_cluster(cluster.result, 2)

### Print only patterns

In [None]:
# Show only the 'pattern' column of the result table as an array.
cluster.result['pattern'].values

In [None]:
# Show only the 'common_phrases' column of the result table as an array.
cluster.result['common_phrases'].values

In [None]:
# Look up cluster 43 in the result table.
# NOTE(review): presumably returns the messages that belong to that cluster — confirm
# against clusterlogs' in_cluster.
cluster.in_cluster(cluster.result, 43)

### Split clusters into big (cluster_size >= 1000) and small (cluster_size < 1000)

In [16]:
# Partition the result table on the 'cluster_size' column with a threshold of 1000;
# the call returns two objects, bound here to `big` and `small`.
big, small = cluster.split_clusters(cluster.result, 'cluster_size', 1000)

In [18]:
# Number of entries in the 'pattern' column of `big`, i.e. how many big clusters there are.
big['pattern'].shape

(25,)

In [None]:
# Display the `small` part of the split.
small

### Print all messages from cluster #40

In [None]:
# Look up cluster 40 in the result table. The table is `cluster.result`, as in the
# other in_cluster calls; the bare name `clusters` used here before is not defined
# anywhere in the notebook and raised NameError.
cluster.in_cluster(cluster.result, 40)

### Display the performance of all stages

In [None]:
# Display the timing information recorded on the Chain object.
cluster.timings

In [None]:
# NOTE(review): the `gensim.summarization` package was removed in gensim 4.0, so this
# import requires gensim < 4. The imported `keywords` is not called in this cell, and a
# later cell rebinds the same name with `from summa import keywords`.
from gensim.summarization import keywords
# Sample English paragraph used as input for the keyword/phrase experiments below.
text = '''Challenges in natural language processing frequently involve
speech recognition, natural language understanding, natural language
generation (frequently from formal, machine-readable logical forms),
connecting language and machine perception, dialog systems, or some
combination thereof.'''

In [None]:
import spacy
import pytextrank

In [None]:
# Load the spaCy model "en_core_web_sm" and create a pytextrank TextRank object.
# The same two statements are repeated at the top of the TextRank cell further down.
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank()

In [None]:
# `Phrases` was used without being imported (NameError); it is gensim's phrase
# (collocation) detector.
from gensim.models.phrases import Phrases

# Phrases expects an iterable of tokenised sentences (lists of str). Passing the raw
# string made it iterate character by character, so split `text` into lines and
# whitespace tokens first.
sentences = [line.split() for line in text.splitlines()]
# NOTE(review): a bytes delimiter matches gensim < 4; gensim >= 4 expects a str.
phrases = Phrases(sentences, min_count=1, threshold=1, scoring='npmi', delimiter=b' ')

In [None]:
# Attach pytextrank as the last component of a spaCy pipeline, run it over `text`,
# and print every ranked phrase (rank, count, text) followed by its chunks.
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
doc = nlp(text)
row_format = "{:.4f} {:5d}  {}"
for phrase in doc._.phrases:
    print(row_format.format(phrase.rank, phrase.count, phrase.text))
    print(phrase.chunks)

In [None]:
# Load the spaCy model again; the result is not assigned, only displayed as cell output.
spacy.load('en_core_web_sm')

In [None]:
# Rebinds `text` to a longer sample paragraph (the trailing backslashes join it into a
# single line). It is defined here but not used by the summarizer call below.
text = """Automatic summarization is the process of reducing a text document with a \
computer program in order to create a summary that retains the most important points \
of the original document. As the problem of information overload has grown, and as \
the quantity of data has increased, so has interest in automatic summarization. \
Technologies that can make a coherent summary take into account variables such as \
length, writing style and syntax. An example of the use of summarization technology \
is search engines such as Google. Document summarization is another."""
# Short two-sentence error message used as the actual summarizer input.
text1 = """
Error on the surl while putdone. This SURL does not exist in the original request. 
"""

# Summarize `text1` with summa and print the result.
from summa import summarizer
print(summarizer.summarize(text1))

In [None]:
from summa import keywords

In [None]:
# Print the keywords summa extracts from `text1`. `keywords` here is the summa module
# imported in the previous cell.
print(keywords.keywords(text1))

In [None]:
import regroup

In [None]:
def calc_cache_pos(strings, indexes):
    """Return the flat memo-table position for one index per string.

    The indexes are combined as a mixed-radix number: the index into the n-th
    string is weighted by the product of the lengths of all strings before it.

    Args:
        strings: sequence of the strings being compared.
        indexes: sequence of non-negative positions, one per string.

    Returns:
        Integer offset into a table of size ``len(s0) * len(s1) * ...``.
    """
    factor = 1
    pos = 0
    for s, i in zip(strings, indexes):
        pos += i * factor
        factor *= len(s)
    return pos

def lcs_back(strings, indexes, cache):
    """Return the longest common subsequence of the prefixes ending at ``indexes``.

    Args:
        strings: sequence of the strings being compared.
        indexes: list with the last character position to consider in each
            string; -1 denotes an empty prefix.
        cache: flat list addressed through ``calc_cache_pos``; ``None`` marks a
            state that has not been solved yet. Solved states are written back.

    Returns:
        The common subsequence as a string ("" when any prefix is empty).
    """
    if -1 in indexes:
        return ""
    cache_pos = calc_cache_pos(strings, indexes)
    # Consult the memo table on entry. Previously only the non-matching branch
    # checked it, so the matching branch recomputed states already solved.
    cached = cache[cache_pos]
    if cached is not None:
        return cached
    last_char = strings[0][indexes[0]]
    if all(s[i] == last_char for s, i in zip(strings, indexes)):
        # Every prefix ends with the same character: it is part of the answer.
        result = lcs_back(strings, [i - 1 for i in indexes], cache) + last_char
    else:
        # Otherwise shorten one prefix at a time and keep the longest outcome.
        # A prefix already at position 0 would become empty, which yields "".
        substrings = [""] * len(strings)
        for n in range(len(strings)):
            if indexes[n] > 0:
                new_indexes = indexes[:]
                new_indexes[n] -= 1
                substrings[n] = lcs_back(strings, new_indexes, cache)
        result = max(substrings, key=len)
    cache[cache_pos] = result
    return result

def lcs(strings):
    """Return a longest common subsequence shared by all of ``strings``.

    An empty input gives "" and a single string is returned unchanged. The memo
    table holds one slot per combination of character positions, so its size is
    the product of the string lengths, and the search is recursive; both grow
    quickly with longer or more numerous inputs.

    >>> lcs(['666222054263314443712', '5432127413542377777', '6664664565464057425'])
    '54442'
    >>> lcs(['abacbdab', 'bdcaba', 'cbacaa'])
    'baa'
    """
    if len(strings) == 0:
        return ""
    elif len(strings) == 1:
        return strings[0]
    else:
        cache_size = 1
        for s in strings:
            cache_size *= len(s)
        cache = [None] * cache_size
        indexes = [len(s) - 1 for s in strings]
        return lcs_back(strings, indexes, cache)

In [None]:
# Try lcs() on four strings (the docstring example plus one more input).
lcs(['abacbdab', 'bdcaba', 'cbacaa', 'abacbdabaa'])

In [None]:
import nltk

# `groucho_dep_grammar` was referenced without being defined anywhere in the notebook
# (NameError). Define it as the example dependency grammar from the NLTK book, ch. 8.
groucho_dep_grammar = nltk.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")
# Build a projective dependency parser over that grammar.
pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)