In [93]:
import spacy,textacy
from spacy import displacy
import re

In [91]:
print(spacy.__version__)
print(textacy.__version__)

2.0.12
0.6.2


In [5]:
#!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

## Read the data

In [27]:
# with open('openshift-installation.json') as file:
#     data=json.load(file)
    
# print(data)

text = '''
The kubelet exposes metrics that can be collected and stored in back-ends by Heapster.

As an OpenShift Container Platform administrator, you can view a cluster’s metrics from all containers and components in one user interface. These metrics are also used by horizontal pod autoscalers in order to determine when and how to scale.

This topic describes using Hawkular Metrics as a metrics engine which stores the data persistently in a Cassandra database. When this is configured, CPU, memory and network-based metrics are viewable from the OpenShift Container Platform web console and are available for use by horizontal pod autoscalers.

Heapster retrieves a list of all nodes from the master server, then contacts each node individually through the /stats endpoint. From there, Heapster scrapes the metrics for CPU, memory and network usage, then exports them into Hawkular Metrics.

The storage volume metrics available on the kubelet are not available through the /stats endpoint, but are available through the /metrics endpoint. See OpenShift Container Platform via Prometheus for detailed information.

Browsing individual pods in the web console displays separate sparkline charts for memory and CPU. The time range displayed is selectable, and these charts automatically update every 30 seconds. If there are multiple containers on the pod, then you can select a specific container to display its metrics.

If resource limits are defined for your project, then you can also see a donut chart for each pod. The donut chart displays usage against the resource limit. For example: 145 Available of 200 MiB, with the donut chart showing 55 MiB Used. 
'''
text = re.sub('\n', '', text) #removing new lines

## Preprocessing

In [28]:
doc=nlp(text)

In [42]:
doc_lower = nlp(text.lower())

In [43]:
#Tokenization using spacy
#Tokens that are not punctuation alone but preserve punctuation within a token
tokens = [token for token in doc_lower if not token.is_punct] 

In [44]:
str(tokens)

'[the, kubelet, exposes, metrics, that, can, be, collected, and, stored, in, back, ends, by, heapster.as, an, openshift, container, platform, administrator, you, can, view, a, cluster, ’s, metrics, from, all, containers, and, components, in, one, user, interface, these, metrics, are, also, used, by, horizontal, pod, autoscalers, in, order, to, determine, when, and, how, to, scale.this, topic, describes, using, hawkular, metrics, as, a, metrics, engine, which, stores, the, data, persistently, in, a, cassandra, database, when, this, is, configured, cpu, memory, and, network, based, metrics, are, viewable, from, the, openshift, container, platform, web, console, and, are, available, for, use, by, horizontal, pod, autoscalers.heapster, retrieves, a, list, of, all, nodes, from, the, master, server, then, contacts, each, node, individually, through, the, /stats, endpoint, from, there, heapster, scrapes, the, metrics, for, cpu, memory, and, network, usage, then, exports, them, into, hawkular,

**Observations**
Look how "/stats" and "/metrics" are not tokenized separately

In [55]:
# Sentence segmentation
sentences = [sent for sent in doc.sents]
sentences[:5] #First 5 sentences

[The kubelet exposes metrics that can be collected and stored in back-ends by Heapster.,
 As an OpenShift Container Platform administrator, you can view a cluster’s metrics from all containers and components in one user interface.,
 These metrics are also used by horizontal pod autoscalers in order to determine when and how to scale.,
 This topic describes using Hawkular Metrics as a metrics engine which stores the data persistently in a Cassandra database.,
 When this is configured, CPU, memory and network-based metrics are viewable from the OpenShift Container Platform web console and are available for use by horizontal pod autoscalers.]

## Stemming vs Lemmatization

Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes.

Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw, depending on whether the use of the token was as a verb or a noun. - Christopher Manning

In [58]:
#Spacy only supports lemmatization. stemmers use is generally discouraged
# lemma(accessed using lemma_ attribute) and pos(POS tagging using pos_ attribute) 
# is automatically created for us when we process text with nlp object
print([(token, token.lemma_, token.pos_) for token in tokens][:20])

[(the, 'the', 'DET'), (kubelet, 'kubelet', 'NOUN'), (exposes, 'expose', 'VERB'), (metrics, 'metric', 'NOUN'), (that, 'that', 'ADJ'), (can, 'can', 'VERB'), (be, 'be', 'VERB'), (collected, 'collect', 'VERB'), (and, 'and', 'CCONJ'), (stored, 'store', 'VERB'), (in, 'in', 'ADP'), (back, 'back', 'NOUN'), (ends, 'end', 'NOUN'), (by, 'by', 'ADP'), (heapster.as, 'heapster.a', 'NOUN'), (an, 'an', 'DET'), (openshift, 'openshift', 'ADJ'), (container, 'container', 'NOUN'), (platform, 'platform', 'NOUN'), (administrator, 'administrator', 'NOUN')]


## Named Entity Recognition

In [59]:
# Prints the named entities detected
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

Heapster (PERSON)
OpenShift Container Platform (ORG)
Hawkular Metrics (ORG)
Cassandra (PERSON)
the OpenShift Container Platform (ORG)
Heapster (PERSON)
Heapster (PERSON)
Hawkular Metrics (ORG)
OpenShift Container Platform (ORG)
Prometheus (PRODUCT)
every 30 seconds (TIME)
145 (CARDINAL)
200 (CARDINAL)
MiB (PRODUCT)
55 (CARDINAL)
MiB Used (PRODUCT)


We found all the entities Heapster, Hawkular Metrics, Cassandra, Prometheus (though the label itself does not look right).  Definitely labels with PERSON, ORG, PRODUCT can be used for further manual analysis.**[TODO]**

## Visualizing Entities

In [90]:
content = '''The kubelet exposes metrics that can be collected and stored in back-ends by Heapster. This topic describes using Hawkular Metrics as a metrics engine which stores the data persistently in a Cassandra database. When this is configured, CPU, memory and network-based metrics are viewable from the OpenShift Container Platform web console and are available for use by horizontal pod autoscalers.'''
doc=nlp(content)
displacy.render(doc, style='ent', jupyter=True)

## Part-of-speech tagging

Sometimes we want to create an abstract for the text. We can quickly extract the relevant nouns from the text and show shor. Noun chunks are noun phrases – not single words, but a short phrase which describes the noun.

In [62]:
for idx, sentence in enumerate(sentences[:5]):
    for noun in sentence.noun_chunks: 
        print(f'sentence{idx+1}', noun)

sentence1 The kubelet
sentence1 metrics
sentence1 back-ends
sentence1 Heapster
sentence2 an OpenShift Container Platform administrator
sentence2 you
sentence2 a cluster’s metrics
sentence2 all containers
sentence2 components
sentence2 one user interface
sentence3 These metrics
sentence3 horizontal pod autoscalers
sentence3 order
sentence4 This topic
sentence4 Hawkular Metrics
sentence4 a metrics engine
sentence4 the data
sentence4 a Cassandra database
sentence5 CPU
sentence5 memory
sentence5 network-based metrics
sentence5 the OpenShift Container Platform web console
sentence5 use
sentence5 horizontal pod autoscalers


In [66]:
for token in tokens[:20]:
    print(token, token.pos_, token.tag_) # text , POS tagging & its label

the DET DT
kubelet NOUN NN
exposes VERB VBZ
metrics NOUN NNS
that ADJ WDT
can VERB MD
be VERB VB
collected VERB VBN
and CCONJ CC
stored VERB VBN
in ADP IN
back NOUN NN
ends NOUN NNS
by ADP IN
heapster.as NOUN NNS
an DET DT
openshift ADJ JJ
container NOUN NN
platform NOUN NN
administrator NOUN NN


## Generating Q&A from the above paragraph

In [67]:
from textacy.spacier import utils as spacy_utils

In [77]:
new_sentence = " This topic describes using Hawkular Metrics as a metrics engine which stores the data persistently in a Cassandra database."

In [78]:
#Get the main verbs in the above sentence
doc = nlp(new_sentence)
verbs = spacy_utils.get_main_verbs_of_sent(doc)
print(verbs)

[describes, using, stores]


In [79]:
#Get the nominal subjects of the verbs
for verb in verbs:
    print(verb, spacy_utils.get_subjects_of_verb(verb))

describes [topic]
using []
stores [which]


In [80]:
# Get objects from the sentence
for verb in verbs:
    print(verb, spacy_utils.get_objects_of_verb(verb))

describes [using]
using [Metrics]
stores [data]


In [83]:
def paragraph_to_question(text):
    doc = nlp(text)
    results = []
    for sentence in doc.sents:
        root = sentence.root
        ask_about = spacy_utils.get_subjects_of_verb(root)
        answers = spacy_utils.get_objects_of_verb(root)
        
        if len(ask_about) > 0 and len(answers) > 0:
            if root.lemma_ == 'be':
                question = f'What {root} {ask_about[0]}'
            else:
                question = f'What does {ask_about[0]} {root.lemma_}?'
            results.append({'question': question, 'answers': answers})
        return results

In [84]:
paragraph_to_question('Bansoori is an Indian classical instrument.')

[{'question': 'What is Bansoori', 'answers': [instrument]}]

In [85]:
paragraph_to_question('The kubelet exposes metrics that can be collected and stored in back-ends by Heapster.')

[{'question': 'What does kubelet expose?', 'answers': [metrics]}]