In [17]:
from natasha import (
    MorphVocab,
    NewsEmbedding,
    NewsNERTagger,
    Segmenter,
    Doc,

    NamesExtractor,
    DatesExtractor,
    MoneyExtractor,
    AddrExtractor
)
from natasha.doc import DocSpan


# Shared natasha components, created once and reused by the functions below.
segmenter = Segmenter()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)  # the NER tagger is built from the embedding
morph_vocab = MorphVocab()

# Rule-based extractors; all of them share the same MorphVocab instance.
# NOTE(review): only money_extractor and dates_extractor are used by ner_text
# in the visible cells; names_extractor and addr_extractor are created but
# not referenced there.
names_extractor = NamesExtractor(morph_vocab)
dates_extractor = DatesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)
addr_extractor = AddrExtractor(morph_vocab)


def get_full_tag(tag):
    """
    Return the full version of a short NER tag.

    Example: LOC -> LOCATION, PER -> PERSON, ORG -> ORGANISATION.

    Abbreviations are matched as whole "-"-separated components of the tag,
    not as substrings. A prefixed tag such as "B-PER" is therefore still
    expanded to "B-PERSON", while a tag that merely contains an abbreviation
    (for example the already expanded "PERSON") is returned unchanged instead
    of being corrupted into "PERSONSON".

    Args:
        tag: Tag string, e.g. the ``type`` attribute of a span.

    Returns:
        The tag with every known abbreviation component expanded; a tag
        without any known component is returned as is.
    """

    replacements = {
        "LOC": "LOCATION",
        "PER": "PERSON",
        "ORG": "ORGANISATION",
    }

    # Exact lookup per component keeps the function idempotent:
    # get_full_tag(get_full_tag(x)) == get_full_tag(x).
    return "-".join(replacements.get(part, part) for part in tag.split("-"))


def get_fact_from_span(span):
    """Convert a single span into the fact dictionary of the service result.

    The fact contains the span text, the expanded tag (see get_full_tag) and,
    for every token of the span, the token text with its start position
    stored under "offset".
    """

    span_text, span_type = span.text, span.type

    token_entries = [
        {"text": item.text, "offset": item.start}
        for item in span.tokens
    ]

    return {
        "text": span_text,
        "tag": get_full_tag(span_type),
        "tokens": token_entries,
    }


def format_spans(spans):
    """Convert spans into a list of facts in the service result format.

    Every span is converted with get_fact_from_span; the input order is kept.
    """

    return [get_fact_from_span(item) for item in spans]


def form_result(facts):
    """Wrap the given facts into the top-level service result dictionary."""

    return {"facts": facts}


def tag_entities(text, extractor):
    """Run an extractor over the text and return its matches as DocSpan objects.

    The matches are wrapped in DocSpan so that they have the same shape as the
    spans produced by the NER tagger (LOC, ORG, PER).

    The tag of the resulting spans is chosen by the extractor's class name:
        NamesExtractor -> NAME,
        DatesExtractor -> DATE,
        MoneyExtractor -> MONEY,
        AddrExtractor -> ADDRESS
    An extractor of any other class raises KeyError once it yields a match.
    """

    tag_by_extractor = {
        "NamesExtractor": "NAME",
        "DatesExtractor": "DATE",
        "MoneyExtractor": "MONEY",
        "AddrExtractor": "ADDRESS"
    }

    found_spans = []

    #  Materialize all matches before processing them.
    for hit in list(extractor(text)):
        fragment = text[hit.start:hit.stop]  #  Raw (not tokenized) text of the match.
        label = tag_by_extractor[extractor.__class__.__name__]

        #  Tokenize the fragment alone to obtain the tokens of the entity.
        fragment_doc = Doc(fragment)
        fragment_doc.segment(segmenter)

        #  Token positions are relative to the fragment; shift them by the
        #  match start so that they refer to positions in the full text.
        shifted_tokens = []
        for piece in fragment_doc.tokens:
            piece.start = hit.start + piece.start
            piece.stop = hit.start + piece.stop
            shifted_tokens.append(piece)

        found_spans.append(
            DocSpan(hit.start, hit.stop, label, fragment, shifted_tokens)
        )

    return found_spans


def ner_text(text):
    """Entry point of the NER module: extract named entities from the text.

    Segments the text, runs the NER tagger, adds the matches of the money and
    dates extractors and returns everything in the service result format.
    The facts are ordered as: money matches, NER tagger spans, date matches.
    """

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)

    money_found = tag_entities(text, money_extractor)
    dates_found = tag_entities(text, dates_extractor)
    all_spans = money_found + doc.spans + dates_found

    return form_result(format_spans(all_spans))


# Quick check of the whole pipeline on a sample sentence; the cell output is
# the dictionary returned by ner_text.
# Alternative (Russian) sample text:
#text = "Россия. 03.01.2005 Максим Шутов"
text = "Bob Ross lived in Florida."
ner_text(text)

{'facts': [{'text': 'Bob Ross',
   'tag': 'PERSON',
   'tokens': [{'text': 'Bob', 'offset': 0}, {'text': 'Ross', 'offset': 4}]},
  {'text': 'Florida',
   'tag': 'LOCATION',
   'tokens': [{'text': 'Florida', 'offset': 18}]}]}

In [15]:
# Step-by-step version of what ner_text does: segment the current `text` and
# inspect the document, its sentences and its tokens.
ndoc = Doc(text)

ndoc.segment(segmenter)

display(ndoc)
display(ndoc.sents[:100])  # at most the first 100 sentences
display(ndoc.tokens[:100])  # at most the first 100 tokens

Doc(text='Россия. 03.01.2005 Максим Шутов', tokens=[...], sents=[...])

[DocSent(stop=7, text='Россия.', tokens=[...]),
 DocSent(start=8, stop=31, text='03.01.2005 Максим Шутов', tokens=[...])]

[DocToken(stop=6, text='Россия'),
 DocToken(start=6, stop=7, text='.'),
 DocToken(start=8, stop=18, text='03.01.2005'),
 DocToken(start=19, stop=25, text='Максим'),
 DocToken(start=26, stop=31, text='Шутов')]

In [16]:
# Run the NER tagger on the segmented document and show the spans it produced.
ndoc.tag_ner(ner_tagger)

display(ndoc.spans)

[DocSpan(stop=6, type='LOC', text='Россия', tokens=[...]),
 DocSpan(start=19, stop=31, type='PER', text='Максим Шутов', tokens=[...])]

In [8]:
# Inspect the first token of the first NER span.
ndoc.spans[0].tokens[0]

DocToken(stop=6, text='Россия')

In [11]:
# Spans built from the money extractor matches for the current `text`.
money_spans = tag_entities(text, money_extractor)

money_spans

[]

In [12]:
# Spans built from the dates extractor matches for the current `text`.
dates_spans = tag_entities(text, dates_extractor)

dates_spans

[]

In [13]:
# Combine all spans in the same order ner_text uses: money, NER tagger, dates.
entities = money_spans + ndoc.spans + dates_spans

In [14]:
# Convert the combined spans into the service fact format.
facts = format_spans(entities)

facts

[{'text': 'Россия',
  'tag': 'LOCATION',
  'tokens': [{'text': 'Россия', 'offset': 0}]}]

In [15]:
# Wrap the facts into the final service result.
result = form_result(facts)

result

{'facts': [{'text': 'Россия',
   'tag': 'LOCATION',
   'tokens': [{'text': 'Россия', 'offset': 0}]}]}

In [20]:
# Generate a random UUID and show it as a string.
# NOTE(review): nothing else in the visible cells uses this value; if the cell
# is kept, the import would normally live with the other imports at the top.
import uuid

str(uuid.uuid4())

'c6d38ee7-7982-40a0-8369-7ac4713f2ab2'