In [None]:
#hide
!pip install -q spacy --upgrade

[K     |████████████████████████████████| 6.4MB 29.2MB/s 
[K     |████████████████████████████████| 460kB 40.8MB/s 
[K     |████████████████████████████████| 624kB 38.7MB/s 
[K     |████████████████████████████████| 10.1MB 35.6MB/s 
[K     |████████████████████████████████| 51kB 7.0MB/s 
[?25h

In [None]:
# default_exp tokens_ke
# default_cls_lvl 3

# Token Knowledge Extraction
> Extract token knowledge from text.

In [None]:
# Download the small English spaCy pipeline; `TokenKnowledgeExtractor` loads
# "en_core_web_sm" by default, so it must be installed before the class is used.
!python -m spacy download en_core_web_sm

2021-07-19 14:40:18.859194: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Collecting en-core-web-sm==3.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6MB)
[K     |████████████████████████████████| 13.6MB 221kB/s 
Installing collected packages: en-core-web-sm
  Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
#export
import spacy

In [None]:
#export
class TokenKnowledgeExtractor:
  """Extract per-sentence token and noun-chunk knowledge from text using spaCy.

  For every sentence found in the input, the extractor records the sentence
  text, one dict per token (text, lemma, POS, tag, is_alpha, is_stop) and one
  dict per noun chunk (chunk text plus the chunk root's text, dependency label
  and head text).

  Args:
    spacy_model: Either something `spacy.load` accepts (e.g. the package name
      "en_core_web_sm", the default) or an already-loaded pipeline, i.e. any
      callable that maps a string to a document exposing `.sents`.

  Raises:
    OSError: If `spacy_model` has to be loaded and spaCy cannot find it. The
      message names the requested model and the command to download it.
  """
  def __init__(self, spacy_model="en_core_web_sm"):
    if callable(spacy_model):
      # An already-loaded pipeline was handed in; reuse it instead of loading
      # a second copy of the model.
      self.nlp = spacy_model
      return
    try:
      self.nlp = spacy.load(spacy_model)
    except OSError as err:
      # Fail at construction time with an actionable message. Swallowing the
      # error here would leave `self.nlp` unset and only surface later as an
      # unrelated AttributeError inside `extract`.
      raise OSError(
          "Could not load spaCy model '{0}'. Download it first with: "
          "python -m spacy download {0}".format(spacy_model)
      ) from err

  def __get_token_knowledge(self, doc):
    """Return one dict of lexical attributes for each token in `doc`.

    `doc` may be any iterable of spaCy tokens (a Doc or a sentence Span).
    """
    return [
        {
            'token': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
        }
        for token in doc
    ]

  def __get_noun_chunks_knowledge(self, doc):
    """Return one dict for each noun chunk exposed by `doc.noun_chunks`.

    Each dict holds the chunk text and, for the chunk's root token, its text,
    its dependency label and the text of its syntactic head.
    """
    return [
        {
            'chunk': chunk.text,
            'root_text': chunk.root.text,
            'root_dep': chunk.root.dep_,
            'root_head': chunk.root.head.text,
        }
        for chunk in doc.noun_chunks
    ]

  def extract(self, text):
    """Run the pipeline on `text` and collect knowledge sentence by sentence.

    Args:
      text: The raw input string.

    Returns:
      A dict with two keys:
        'raw_text': the input string, unchanged.
        'sents': a list with one dict per sentence, each holding 'text',
          'tokens' (list of token dicts) and 'noun_chunks' (list of chunk
          dicts). The list is empty when the pipeline yields no sentences.
    """
    doc = self.nlp(text)
    return {
        'raw_text': text,
        'sents': [
            {
                'text': sent.text,
                'tokens': self.__get_token_knowledge(sent),
                'noun_chunks': self.__get_noun_chunks_knowledge(sent),
            }
            for sent in doc.sents
        ],
    }

`TokenKnowledgeExtractor` extracts token-based information (knowledge) from text, e.g. lemmas, POS tags and noun chunks. It uses the spaCy package to do the extraction.

In [None]:
# Build the extractor with its default model ("en_core_web_sm"). No `nlp`
# variable is defined anywhere in this notebook, so passing one would raise
# NameError on a fresh kernel.
token_ke = TokenKnowledgeExtractor()

In [None]:
# Sample multi-paragraph news text (carries a Reuters dateline) used below as
# the input to `token_ke.extract`. Kept verbatim, including repeated sentences.
input_text = """
Days of riots and looting in South Africa have left more than 70 people dead, hurt thousands of businesses and damaged major infrastructure in some of the worst civil unrest since the end of white minority rule in 1994.
The unrest started after former President Jacob Zuma handed himself over last week to start a 15-month prison sentence for contempt of court.
JOHANNESBURG, July 14 (Reuters) - Days of riots and looting in South Africa have left more than 70 people dead, hurt thousands of businesses and damaged major infrastructure in some of the worst civil unrest since the end of white minority rule in 1994.

What is driving the violence?

ZUMA'S JAILING

The unrest started after former President Jacob Zuma handed himself over last week to start a 15-month prison sentence for contempt of court.

Zuma supporters, who believe he is the victim of a political witch-hunt, burned tyres and blocked roads in his home province of KwaZulu-Natal.

Support for Zuma stems partly from his image as a man of the people during his nine years in power until 2018, and because some see his jailing as an attack on the nation's largest ethnic group, the Zulu.

Although many wealthy and middle-class South Africans were overjoyed when Zuma was ousted after multiple sleaze and graft allegations, he still retains loyal followings in KwaZulu-Natal and some poor, rural areas.
"""

In [None]:
# Run the extractor on the sample text. The result is a dict holding the
# original string under 'raw_text' and one entry per sentence under 'sents'.
extracted_info = token_ke.extract(input_text)

In [None]:
# Inspect the entry at index 2 of the extracted sentences: its text, its
# per-token attributes and its noun chunks.
extracted_info['sents'][2]

{'noun_chunks': [{'chunk': 'The unrest',
   'root_dep': 'nsubj',
   'root_head': 'started',
   'root_text': 'unrest'},
  {'chunk': 'former President Jacob Zuma',
   'root_dep': 'nsubj',
   'root_head': 'handed',
   'root_text': 'Zuma'},
  {'chunk': 'himself',
   'root_dep': 'dobj',
   'root_head': 'handed',
   'root_text': 'himself'},
  {'chunk': 'last week',
   'root_dep': 'pobj',
   'root_head': 'over',
   'root_text': 'week'},
  {'chunk': 'a 15-month prison sentence',
   'root_dep': 'dobj',
   'root_head': 'start',
   'root_text': 'sentence'},
  {'chunk': 'contempt',
   'root_dep': 'pobj',
   'root_head': 'for',
   'root_text': 'contempt'},
  {'chunk': 'court',
   'root_dep': 'pobj',
   'root_head': 'of',
   'root_text': 'court'}],
 'text': 'The unrest started after former President Jacob Zuma handed himself over last week to start a 15-month prison sentence for contempt of court.',
 'tokens': [{'is_alpha': True,
   'is_stop': True,
   'lemma': 'the',
   'pos': 'DET',
   'tag': 'DT'