In [1]:
import json
from collections import Counter, defaultdict

In [2]:
from natasha import (
    Segmenter,
    MorphVocab,

    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,

    Doc
)

In [None]:
# Build the Natasha pipeline components once so later cells can reuse them.
segmenter = Segmenter()
morph_vocab = MorphVocab()

# A single embedding instance is shared by the morphology, syntax and NER models.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
# Must be an *instance* built on the shared embedding (like the two taggers
# above); binding the bare class here would hand Doc.tag_ner a class object.
ner_tagger = NewsNERTagger(emb)

In [None]:
# Load the corpus and release the file handle before the slow NLP pass starts.
with open('openmedia_corpus.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build the Natasha pipeline once: constructing the embedding and the models
# is loop-invariant, so it must not be repeated for every article.
segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

# Maps a news category name to a Counter of normalized named-entity strings.
statistic = defaultdict(Counter)

# `data` is iterated as a mapping whose values are sequences of article dicts;
# each article must have a 'text' key and may have a 'badge' key.
for articles in data.values():
    for article in articles:
        doc_article = Doc(article['text'])
        doc_article.segment(segmenter)
        doc_article.tag_morph(morph_tagger)
        doc_article.parse_syntax(syntax_parser)
        doc_article.tag_ner(ner_tagger)

        # Normalize every entity span found by the NER tagger and keep its
        # normalized form for counting.
        spans = []
        for span in doc_article.spans:
            span.normalize(morph_vocab)
            spans.append(span.normal)

        # There are only 2 news categories: exclusive and regular.
        # .get() covers articles that have no 'badge' key at all.
        if article.get('badge') == 'эксклюзив':
            category = 'exclusive_news'
        else:
            category = 'news'
        statistic[category].update(spans)

In [6]:
# Display the 10 most frequent normalized entity strings counted for regular
# (non-exclusive) news, as (entity, count) pairs in descending order of count.
statistic['news'].most_common(10)

[('Россия', 45),
 ('Москва', 38),
 ('YouTube', 24),
 ('Госдума', 23),
 ('СМИ', 20),
 ('Путин', 17),
 ('Владимир Путин', 16),
 ('Роскомнадзор', 14),
 ('Бурятия', 13),
 ('Facebook', 12)]

In [5]:
# Display the 10 most frequent normalized entity strings counted for articles
# badged as exclusive, as (entity, count) pairs in descending order of count.
statistic['exclusive_news'].most_common(10)

[('Москва', 15),
 ('Московский школьник', 12),
 ('Открытые медиа', 11),
 ('ФБК', 11),
 ('Соболь', 6),
 ('Малая Бронная', 6),
 ('Азербайджан', 5),
 ('Островский', 5),
 ('Минюст', 4),
 ('ОМ', 4)]