In [1]:
import numpy as np
import re
import pandas as pd
import datetime
from navec import Navec
from matplotlib import pyplot as pl

# from sklearn.cluster import KMeans
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.metrics.pairwise import euclidean_distances

In [6]:
# Notebook display settings: wider cells and a wider overall table so long
# news texts stay readable in the rendered DataFrames.
for _option_name, _option_value in (('max_colwidth', 120), ('display.width', 500)):
    pd.set_option(_option_name, _option_value)

In [2]:
# Location of the pretrained Navec archive (relative to the notebook's working directory).
path = 'natasha_bd//navec_news_v1_1B_250K_300d_100q.tar'
# Load the embeddings once; `navec` is indexed by word in news2emb to get word vectors.
navec = Navec.load(path)

In [7]:
def get_clean_word(word: str) -> str:
    """Reduce a raw token to its letters and inner hyphens.

    Every character that is not a Latin letter, a Russian Cyrillic letter
    (including "ё") or a hyphen is removed, case-insensitively. Hyphens left
    at either end of the result are then stripped, so compound words keep
    their inner hyphen while dashes and digits disappear.

    Args:
        word: A single whitespace-delimited token.

    Returns:
        The cleaned token; an empty string when nothing but digits,
        punctuation or hyphens was present.
    """
    # The Cyrillic range is spelled with \u escapes (U+0430-U+044F plus U+0451)
    # so the pattern cannot be corrupted when the file is re-encoded; a
    # mis-encoded literal here yields an invalid range and re.error.
    word = re.sub(r'[^a-z\u0430-\u044f\u0451-]', '', word, flags=re.IGNORECASE)
    return word.strip('-')

In [8]:
def news2emb(news: str) -> np.ndarray:
    """Embed a piece of text as the mean of its word vectors.

    The text is split on whitespace, each token is cleaned with
    ``get_clean_word`` and lower-cased, and its vector is looked up in the
    module-level ``navec`` model. Tokens missing from the vocabulary use the
    special ``'<unk>'`` vector instead.

    Args:
        news: Text to embed (the notebook passes news titles).

    Returns:
        The element-wise mean of the token vectors. For text without any
        tokens (empty or whitespace-only) the ``'<unk>'`` vector is returned,
        so the result is always a vector rather than a scalar NaN.
    """
    word_vectors = []
    for token in news.split():
        word = get_clean_word(token).lower()
        try:
            word_vectors.append(navec[word])
        except KeyError:
            # Out-of-vocabulary token: fall back to Navec's special "unknown"
            # embedding. Tokens that clean down to '' (numbers, dashes) also
            # end up here, assuming '' is not a vocabulary entry.
            word_vectors.append(navec['<unk>'])
    if not word_vectors:
        # np.mean([]) would emit a RuntimeWarning and return a scalar NaN,
        # which breaks the clustering steps that expect equal-length vectors.
        return navec['<unk>']
    return np.mean(word_vectors, axis=0)

In [18]:
def date_news(date):
    """Load the news of one calendar day and embed their titles.

    Args:
        date: Day to select, as a ``'YYYY-MM-DD'`` string. It is compared with
            the ``date`` column of the stored table formatted the same way.

    Returns:
        A tuple ``(news_df, list_news, embeddings)``:
        ``news_df`` is an independent copy of the rows for that day,
        ``list_news`` is the list of their titles, and ``embeddings`` holds
        one ``news2emb`` vector per title, in the same order.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load a
    # 'table_news.pkl' that comes from a trusted source.
    all_news = pd.read_pickle('table_news.pkl')
    # Compare on the formatted day without adding a helper column to the table.
    day_mask = pd.to_datetime(all_news['date']).dt.strftime('%Y-%m-%d') == date
    # Copy so callers can add columns without SettingWithCopyWarning and
    # without touching the full table.
    news_df = all_news.loc[day_mask].copy()
    list_news = news_df['title'].to_list()
    embeddings = [news2emb(news) for news in list_news]
    return news_df, list_news, embeddings

In [19]:
# Preview the first returned element (the table of news rows) for 2022-07-20.
date_news('2022-07-20')[0]

Unnamed: 0,date,title,short_news,first_link,raw_news,category,agency
65041,2022-07-20 23:59:18,¬´–ú–µ–¥—É–∑–∞¬ª –Ω–∞—á–Ω–µ—Ç –æ–Ω–ª–∞–π–Ω 148 –¥–Ω–µ–π –≤–æ–π–Ω—ã,¬´–ú–µ–¥—É–∑–∞¬ª –Ω–∞—á–Ω–µ—Ç –æ–Ω–ª–∞–π–Ω 148 –¥–Ω—è –≤–æ–π–Ω—ã –≤ 8 —á–∞—Å–æ–≤ —É—Ç—Ä–∞ –ø–æ –º–æ—Å–∫–æ–≤—Å–∫–æ–º—É –≤—Ä–µ–º–µ–Ω–∏. –ù–∞ —ç—Ç–æ–º –º—ã –ø—Ä–∏–æ—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—é —Å...,,–ù–∞ —ç—Ç–æ–º –º—ã –ø—Ä–∏–æ—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—é —Å–æ–±—ã—Ç–∏–π —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ –≤—Ç–æ—Ä–∂–µ–Ω–∏—è –≤ –£–∫—Ä–∞–∏–Ω—É. ¬´–ú–µ–¥—É–∑–∞¬ª –Ω–∞—á–Ω–µ—Ç –æ–Ω–ª–∞–π–Ω 148 –¥–Ω—è –≤–æ–π–Ω—ã...,society,meduzalive
65040,2022-07-20 23:53:09,–§–æ—Ç–æ –∏ –≤–∏–¥–µ–æ –ø–æ–∂–∞—Ä–∞ –≤ –°–∫–∞–¥–æ–≤—Å–∫–µ,–£–∫—Ä–∞–∏–Ω—Å–∫–∏–µ –°–ú–ò –ø—É–±–ª–∏–∫—É—é—Ç —Ñ–æ—Ç–æ –∏ –≤–∏–¥–µ–æ –ø–æ–∂–∞—Ä–∞ –≤ –°–∫–∞–¥–æ–≤—Å–∫–µ –•–µ—Ä—Å–æ–Ω—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏. –ì–æ—Ä–æ–¥ –∫–æ–Ω—Ç—Ä–æ–ª–∏—Ä—É—é—Ç —Ä–æ—Å—Å–∏–π—Å–∫–∏–µ –≤–æ–µ–Ω–Ω—ã–µ –ú...,https://t.me/UAonlii/33523,–£–∫—Ä–∞–∏–Ω—Å–∫–∏–µ –°–ú–ò –ø—É–±–ª–∏–∫—É—é—Ç —Ñ–æ—Ç–æ –∏ –≤–∏–¥–µ–æ –ø–æ–∂–∞—Ä–∞ –≤ –°–∫–∞–¥–æ–≤—Å–∫–µ –•–µ—Ä—Å–æ–Ω—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏. –ì–æ—Ä–æ–¥ –∫–æ–Ω—Ç—Ä–æ–ª–∏—Ä—É—é—Ç —Ä–æ—Å—Å–∏–π—Å–∫–∏–µ –≤–æ–µ–Ω–Ω—ã–µ ...,society,meduzalive
65039,2022-07-20 23:47:32,–ë–µ–ª–æ—Ä—É—Å—Å–∫–∏–π –∞–≤–∏–∞–¥–∏—Å–ø–µ—Ç—á–µ—Ä —Ç–∞–π–Ω–æ –∑–∞–ø–∏—Å–∞–ª –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –≤–æ –≤—Ä–µ–º—è –ø–æ—Å–∞–¥–∫–∏ —Å–∞–º–æ–ª–µ—Ç–∞ –≤ –ú–∏–Ω—Å–∫–µ,"–ë–µ–ª–æ—Ä—É—Å—Å–∫–∏–π –∞–≤–∏–∞–¥–∏—Å–ø–µ—Ç—á–µ—Ä —Ç–∞–π–Ω–æ –∑–∞–ø–∏—Å–∞–ª –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –≤–æ –≤—Ä–µ–º—è –ø–æ—Å–∞–¥–∫–∏ —Å–∞–º–æ–ª–µ—Ç–∞ Ryanair –≤ –ú–∏–Ω—Å–∫–µ, –≤ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–µ –∫–æ—Ç–æ—Ä—ã—Ö ...",https://meduza.io/feature/2022/07/20/kakoy-kod-ugrozy-nu-puskay-budet-krasnyy?utm_source=telegram&utm_medium=live&ut...,"¬´–ö–∞–∫–æ–π –∫–æ–¥ —É–≥—Ä–æ–∑—ã? –ù—É, –ø—É—Å–∫–∞–π –±—É–¥–µ—Ç –∫—Ä–∞—Å–Ω—ã–π¬ª. –î–∏—Å–ø–µ—Ç—á–µ—Ä –∞—ç—Ä–æ–ø–æ—Ä—Ç–∞ —Ç–∞–π–Ω–æ –∑–∞–ø–∏—Å–∞–ª –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –≤–æ –≤—Ä–µ–º—è –ø–æ—Å–∞–¥–∫–∏ —Ä–µ–π—Å–∞ Ry...",society,meduzalive
65038,2022-07-20 23:38:03,–ú–æ–Ω–∞—Ç–∏–∫ –≤—ã—Å—Ç—É–ø–∏–ª –≤ –ö–∏–µ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ,–õ–µ–≥–µ–Ω–¥–∞—Ä–Ω—ã–π —É–∫—Ä–∞–∏–Ω—Å–∫–∏–π –º—É–∑—ã–∫–∞–Ω—Ç –ú–æ–Ω–∞—Ç–∏–∫ –≤—ã—Å—Ç—É–ø–∏–ª –≤ –ö–∏–µ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ. –í–∏–¥–µ–æ –≤—ã—Å—Ç—É–ø–ª–µ–Ω–∏—è –Ω–∞ —Å—Ç–∞–Ω—Ü–∏–∏ ¬´–ú–∞–π–¥–∞–Ω –ù–µ–∑–∞–ª–µ–∂–Ω–æ—Å—Ç–∏...,https://t.me/c/1210003725/21408,–£–∫—Ä–∞–∏–Ω—Å–∫–∏–π –º—É–∑—ã–∫–∞–Ω—Ç –ú–æ–Ω–∞—Ç–∏–∫ –≤–µ—Ä–Ω—É–ª—Å—è –Ω–∞ —Ä–æ–¥–∏–Ω—É –∏ –≤—ã—Å—Ç—É–ø–∏–ª –≤ –ö–∏–µ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ \n. –í–∏–¥–µ–æ –≤—ã—Å—Ç—É–ø–ª–µ–Ω–∏—è –Ω–∞ —Å—Ç–∞–Ω—Ü–∏–∏ ¬´–ú–∞–π–¥...,society,meduzalive
65037,2022-07-20 23:27:59,"–ù–∞ 72-–ª–µ—Ç–Ω—é—é –ø–µ–Ω—Å–∏–æ–Ω–µ—Ä–∫—É, –ø—Ä–∏–∫—Ä–µ–ø–∏–≤—à—É—é –∞–Ω—Ç–∏–≤–æ–µ–Ω–Ω—É—é –∑–∞–ø–∏—Å–∫—É –∫ —Ñ–ª–∞–≥—É, —Å–æ—Å—Ç–∞–≤–∏–ª–∏ –ø—Ä–æ—Ç–æ–∫–æ–ª –æ ¬´–¥–∏—Å–∫—Ä–µ–¥–∏—Ç–∞—Ü–∏–∏ –∞—Ä–º–∏–∏¬ª","–ù–∞ 72-–ª–µ—Ç–Ω—é—é –ø–µ–Ω—Å–∏–æ–Ω–µ—Ä–∫—É, –∫–æ—Ç–æ—Ä–∞—è –ø—Ä–∏–∫—Ä–µ–ø–∏–ª–∞ –∞–Ω—Ç–∏–≤–æ–µ–Ω–Ω—É—é –∑–∞–ø–∏—Å–∫—É –∫ —Ä–æ—Å—Å–∏–π—Å–∫–æ–º—É —Ñ–ª–∞–≥—É, —Å–æ—Å—Ç–∞–≤–∏–ª–∏ –ø—Ä–æ—Ç–æ–∫–æ–ª –æ ¬´–¥–∏—Å–∫—Ä–µ–¥–∏...",https://t.me/ovdinfolive/11669,–ù–∞ 72-–ª–µ—Ç–Ω—é—é –ø–µ–Ω—Å–∏–æ–Ω–µ—Ä–∫—É —Å–æ—Å—Ç–∞–≤–∏–ª–∏ –ø—Ä–æ—Ç–æ–∫–æ–ª –æ ¬´–¥–∏—Å–∫—Ä–µ–¥–∏—Ç–∞—Ü–∏–∏ –∞—Ä–º–∏–∏¬ª –ø–æ—Å–ª–µ –¥–æ–Ω–æ—Å–∞ –Ω–∞—Å—Ç–æ—è—Ç–µ–ª—è –•—Ä–∞–º–∞ –°–≤—è—Ç–æ–≥–æ –õ—é–¥–æ–≤–∏–∫–∞ ...,society,meduzalive
...,...,...,...,...,...,...,...
15002,2022-07-20 08:10:24,–í–ª–æ–∂–µ–Ω–∏—è –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤ –≤ –∞–∫—Ü–∏–∏ —É–ø–∞–ª–∏ –¥–æ —Ä–µ–∫–æ—Ä–¥–Ω–æ–≥–æ —É—Ä–æ–≤–Ω—è —Å 2008 –≥–æ–¥–∞,"–ü–æ –∏—Ç–æ–≥–∞–º –∏—é–ª—å—Å–∫–æ–≥–æ –æ–ø—Ä–æ—Å–∞ —É–ø—Ä–∞–≤–ª—è—é—â–∏—Ö —Ñ–æ–Ω–¥–∞–º–∏ –≤—ã—è—Å–Ω–∏–ª–æ—Å—å, —á—Ç–æ –≤–ª–æ–∂–µ–Ω–∏—è –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤ –≤ –∞–∫—Ü–∏–∏ —É–ø–∞–ª–∏ –¥–æ —É—Ä–æ–≤–Ω—è, –∫–æ—Ç–æ—Ä—ã–π ...",,"BofA: GLOBAL FUND MANAGERS SURVEY ‚Äì –ú–ê–ö–°–ò–ú–ê–õ–¨–ù–´–ô –†–´–ù–û–ß–ù–´–ô –ü–ï–°–°–ò–ú–ò–ó–ú, –û–ü–ê–°–ï–ù–ò–Ø –†–ï–¶–ï–°–°–ò–ò –¢–ê–ö–ñ–ï –í–ó–õ–ï–¢–ê–Æ–¢ –í–í–ï–†–• –ò—Ç–æ–≥ –∏—é...",economy,russianmacro
15001,2022-07-20 07:41:54,–ù–∞—Ä–æ–¥–Ω—ã–π –ë–∞–Ω–∫ –ö–∏—Ç–∞—è —Å–æ—Ö—Ä–∞–Ω–∏–ª —Å—Ç–∞–≤–∫—É —Å—Ä–µ–¥–Ω–µ—Å—Ä–æ—á–Ω–æ–≥–æ –∫—Ä–µ–¥–∏—Ç–æ–≤–∞–Ω–∏—è –Ω–∞ –æ—Ç–º–µ—Ç–∫–µ 3.70% –≥–æ–¥–æ–≤—ã—Ö,"–ù–∞ —Ñ–æ–Ω–µ —Ä–µ–∑–∫–æ–≥–æ –ø–∞–¥–µ–Ω–∏—è –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫–∏—Ö —Ü–µ–Ω –≤ –ö–∏—Ç–∞–µ –ù–∞—Ä–æ–¥–Ω—ã–π –ë–∞–Ω–∫ –ö–∏—Ç–∞—è (–ù–ë–ö), –∫–∞–∫ –∏ –æ–∂–∏–¥–∞–ª–æ—Å—å, —Å–æ—Ö—Ä–∞–Ω–∏–ª —Å—Ç–∞–≤–∫—É —Å—Ä–µ–¥...",,"–ë–ê–ù–ö –ö–ò–¢–ê–Ø –ü–û –ò–¢–û–ì–ê–ú –°–í–û–ï–ì–û –ó–ê–°–ï–î–ê–ù–ò–Ø –í–ù–û–í–¨ –ù–ï –°–¢–ê–õ –ú–ï–ù–Ø–¢–¨ –ì–û–î–ò–ß–ù–£–Æ –ö–õ–Æ–ß–ï–í–£–Æ –°–¢–ê–í–ö–£ –ù–∞—Ä–æ–¥–Ω—ã–π –ë–∞–Ω–∫ –ö–∏—Ç–∞—è (–ù–ë–ö), –∫–∞–∫...",economy,russianmacro
15000,2022-07-20 07:22:41,–ù–∞ –∑–∞–æ–∫–µ–∞–Ω—Å–∫–∏—Ö —Ñ–æ–Ω–¥–æ–≤—ã—Ö —Ä—ã–Ω–∫–∞—Ö —Å–æ—Ö—Ä–∞–Ω—è–µ—Ç—Å—è –ª—É—á—à–∞—è –¥–∏–Ω–∞–º–∏–∫–∞ –∑–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ —Ç—Ä–∏ –Ω–µ–¥–µ–ª–∏,–ù–∞ –∑–∞–æ–∫–µ–∞–Ω—Å–∫–∏—Ö —Ñ–æ–Ω–¥–æ–≤—ã—Ö —Ä—ã–Ω–∫–∞—Ö —Å–æ—Ö—Ä–∞–Ω—è–µ—Ç—Å—è –ª—É—á—à–∞—è –¥–∏–Ω–∞–º–∏–∫–∞ –∑–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ —Ç—Ä–∏ –Ω–µ–¥–µ–ª–∏. –ù–∞–∏–±–æ–ª—å—à–∏–π —Å–ø—Ä–æ—Å –≤ –≤—ã—Å–æ–∫–æ—Ç–µ—Ö–Ω–æ–ª–æ...,,"üü¢ - –ª—É—á—à–∞—è –¥–∏–Ω–∞–º–∏–∫–∞ –∑–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ —Ç—Ä–∏ –Ω–µ–¥–µ–ª–∏ –Ω–∞ –∑–∞–æ–∫–µ–∞–Ω—Å–∫–∏—Ö —Ä—ã–Ω–∫–∞—Ö, WallStreet –ø–æ–∫–∞–∑–∞–ª –¥–æ–≤–æ–ª—å–Ω–æ —É–≤–µ—Ä–µ–Ω–Ω—ã–π —Ä–æ—Å—Ç –∏–∑-–∑–∞ ...",economy,russianmacro
64939,2022-07-20 00:08:34,–ü–æ—Å—Ç–∞–≤–∫–∏ –≥–∞–∑–∞ –≤ –ï–° –ø–æ ¬´–°–µ–≤–µ—Ä–Ω–æ–º—É –ø–æ—Ç–æ–∫—É ‚Äî 1¬ª –≤–æ–∑–æ–±–Ω–æ–≤—è—Ç—Å—è 21 –∏—é–ª—è,–ü–æ—Å—Ç–∞–≤–∫–∏ —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ –≥–∞–∑–∞ –≤ –ï–° –ø–æ –≥–∞–∑–æ–ø—Ä–æ–≤–æ–¥—É ¬´–°–µ–≤–µ—Ä–Ω—ã–π –ø–æ—Ç–æ–∫ ‚Äî 1¬ª –≤–æ–∑–æ–±–Ω–æ–≤—è—Ç—Å—è 21 –∏—é–ª—è. –ü—Ä–∏ —ç—Ç–æ–º –ï–° –ø–ª–∞–Ω–∏—Ä—É–µ—Ç –æ—Å–ª–∞–±...,,"–ò—Ç–æ–≥–∏ —Å—Ç–æ —Å–æ—Ä–æ–∫ —à–µ—Å—Ç–æ–≥–æ –¥–Ω—è –≤–æ–π–Ω—ã \n ‚û§–ü–æ –¥–∞–Ω–Ω—ã–º Reuters, –ø–æ—Å—Ç–∞–≤–∫–∏ —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ –≥–∞–∑–∞ –≤ –ï–° –ø–æ –≥–∞–∑–æ–ø—Ä–æ–≤–æ–¥—É ¬´–°–µ–≤–µ—Ä–Ω—ã–π –ø...",society,meduzalive


In [20]:
# Keep the day's table, its titles and the title embeddings for the clustering cells below.
date_df, day_news_list, embeddings = date_news('2022-07-20')

In [22]:
# day_news_list

In [24]:
# embeddings

### Кластеризация лучших четырёх моделей

In [None]:
# pip install diameter-clustering

In [34]:
import sklearn.neighbors
from diameter_clustering import MaxDiameterClustering, LeaderClustering, QTClustering
from diameter_clustering.dist_matrix import compute_sparse_dist_matrix
from sklearn.cluster import (DBSCAN, OPTICS, AffinityPropagation,
                             AgglomerativeClustering, Birch, KMeans, MeanShift,
                             MiniBatchKMeans)

In [37]:
# Cosine distance matrix of the title embeddings: the input format expected by
# the diameter-based models, which are configured with precomputed_dist=True.
dist_matrix = compute_sparse_dist_matrix(list(embeddings), metric='cosine')

In [38]:
# Birch with no final global clustering step (n_clusters=None): the subclusters
# found under the given threshold are used directly.
model_Birch = Birch(n_clusters=None, threshold=0.75, branching_factor=50)

# Complete-linkage agglomerative clustering cut at cosine distance 0.3.
# scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed `affinity`,
# so try the current name first and fall back for older installations.
_agglomerative_params = dict(n_clusters=None, linkage='complete', distance_threshold=0.3)
try:
    model_AgglomerativeClustering = AgglomerativeClustering(metric='cosine', **_agglomerative_params)
except TypeError:
    model_AgglomerativeClustering = AgglomerativeClustering(affinity='cosine', **_agglomerative_params)

# Diameter-based models are fed a precomputed cosine distance matrix
# (precomputed_dist=True). `metric` takes a single metric name; the whole list
# of valid names was passed before, so name the metric actually used.
model_MaxDiameterClustering = MaxDiameterClustering(max_distance=0.5, metric='cosine', precomputed_dist=True)
model_LeaderClustering = LeaderClustering(max_radius=0.3, metric='cosine', precomputed_dist=True)
model_QTClustering = QTClustering(max_radius=0.3, metric='cosine',
                             min_cluster_size=2, precomputed_dist=True)

In [40]:
# Birch and agglomerative clustering are fitted on the raw embedding vectors.
labels_Birch = model_Birch.fit_predict(list(embeddings))
labels_AgglomerativeClustering = model_AgglomerativeClustering.fit_predict(list(embeddings))

# The diameter-based models take a precomputed cosine distance matrix. Build it
# once instead of three times; each model gets its own copy so that no fit can
# affect the input of the next one.
dist_matrix = compute_sparse_dist_matrix(list(embeddings), metric='cosine')
labels_MaxDiameterClustering = model_MaxDiameterClustering.fit_predict(dist_matrix.copy())
labels_LeaderClustering = model_LeaderClustering.fit_predict(dist_matrix.copy())
labels_QTClustering = model_QTClustering.fit_predict(dist_matrix.copy())

MaxDiameterClustering fit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [00:00<00:00, 5655.40it/s]
LeaderClustering fit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [00:00<00:00, 6067.44it/s]
QTClustering fit. Current cluster size 2.0, total count 20.0:  12%|‚ñà‚ñè        | 20.0/165 [00:00<00:00, 2222.15it/s]


In [45]:
def show_clusters(labels):
    """Pair the current day's titles with cluster labels for inspection.

    Reads the titles from the notebook-level ``date_df`` and attaches
    ``labels`` to them. Rows are ordered by cluster size (largest first) and
    then by label, so members of the same cluster are listed together.

    Args:
        labels: One cluster label per row of ``date_df``, in row order.

    Returns:
        A tuple ``(num_clusters, df)``: the number of distinct labels and a
        DataFrame with the columns ``text`` and ``label``.
    """
    clustered = date_df.title.to_frame(name='text').assign(label=labels)
    # Size of the cluster each row belongs to; used only as a sort key.
    cluster_sizes = clustered.groupby('label')['label'].transform('count')
    ordered = clustered.assign(count=cluster_sizes).sort_values(
        by=['count', 'label'], ascending=[False, True]
    )
    df = ordered[['text', 'label']]
    return df['label'].nunique(), df

In [44]:
# Number of news rows selected for the day.
len(date_df)

165

In [49]:
# Print how many distinct cluster labels each algorithm produced for the day.
for _model_name, _model_labels in (
    ('Birch', labels_Birch),
    ('Agglomerative', labels_AgglomerativeClustering),
    ('MaxDiameterClustering', labels_MaxDiameterClustering),
    ('LeaderClustering', labels_LeaderClustering),
    ('QTClustering', labels_QTClustering),
):
    print(f'{_model_name}: {show_clusters(_model_labels)[0]}')


Birch: 159
Agglomerative: 133
MaxDiameterClusterin: 155
LeaderClustering: 155
QTClustering: 9


In [50]:
# First 30 rows of the Birch result; show_clusters lists the largest clusters first.
show_clusters(labels_Birch)[1].head(30)

Unnamed: 0,text,label
35701,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,149
35700,–¶–ë —Ö–æ—á–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–∏—Ç—å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,149
19063,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,149
65027,–†–æ—Å—Å–∏–π—Å–∫–∏–µ –≤–ª–∞—Å—Ç–∏ –ø—ã—Ç–∞—é—Ç—Å—è –∑–∞—Å—Ç–∞–≤–∏—Ç—å ¬´–í–∏–∫–∏–ø–µ–¥–∏—é¬ª —É–¥–∞–ª–∏—Ç—å —Å—Ç–∞—Ç—å–∏ –æ –≤—Ç–æ—Ä–∂–µ–Ω–∏–∏ –≤ –£–∫—Ä–∞–∏–Ω—É,57
6556,–†–æ—Å—Å–∏–π—Å–∫–∏–µ –≤–ª–∞—Å—Ç–∏ –ø—ã—Ç–∞—é—Ç—Å—è –∑–∞—Å—Ç–∞–≤–∏—Ç—å ¬´–í–∏–∫–∏–ø–µ–¥–∏—é¬ª —É–¥–∞–ª–∏—Ç—å —Å—Ç–∞—Ç—å–∏ –æ –≤—Ç–æ—Ä–∂–µ–Ω–∏–∏ –≤ –£–∫—Ä–∞–∏–Ω—É. –°–∞–º–æ–µ –≤—Ä–µ–º—è –∏—Ö –ø—Ä–æ—á–µ—Å—Ç—å,57
57643,–ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å —Ç–∏—Ç—É–ª —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,92
64997,–ú–∞–≥–Ω—É—Å –ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å –∑–≤–∞–Ω–∏–µ —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,92
35687,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus,103
64965,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus,103
57637,¬´–ê–≤—Ç–æ–í–ê–ó¬ª —Å–¥–µ–ª–∞–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∑–∞–ø—á–∞—Å—Ç–µ–π,104


In [51]:
# First 30 rows of the agglomerative result; show_clusters lists the largest clusters first.
show_clusters(labels_AgglomerativeClustering)[1].head(30)

Unnamed: 0,text,label
35687,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus,4
57637,¬´–ê–≤—Ç–æ–í–ê–ó¬ª —Å–¥–µ–ª–∞–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∑–∞–ø—á–∞—Å—Ç–µ–π,4
64965,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus,4
23513,–ê–≤—Ç–æ–í–ê–ó –≤—ã–ø—É—Å—Ç–∏—Ç 1800 Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∫–æ–º–ø–æ–Ω–µ–Ω—Ç–æ–≤,4
35701,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,40
35700,–¶–ë —Ö–æ—á–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–∏—Ç—å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,40
19063,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,40
35689,–¶–ë –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º —Å–¥–µ–ª–∫–∏ —Å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–º–∏ –±—É–º–∞–≥–∞–º–∏,40
65032,"–ù–æ–≤–∞–∫: –†–æ—Å—Å–∏—è –Ω–µ –±—É–¥–µ—Ç –ø–æ—Å—Ç–∞–≤–ª—è—Ç—å –Ω–µ—Ñ—Ç—å –Ω–∞ –º–∏—Ä–æ–≤—ã–µ —Ä—ã–Ω–∫–∏, –µ—Å–ª–∏ —Å—Ç—Ä–∞–Ω—ã G7 –æ–≥—Ä–∞–Ω–∏—á–∞—Ç —Ü–µ–Ω—É",0
23541,–ò—Å–ø–∞–Ω–∏—è –Ω–µ –æ–¥–æ–±—Ä—è–µ—Ç –∏–¥–µ—é –ï–≤—Ä–æ–∫–æ–º–∏—Å—Å–∏–∏ –ø–æ –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—é –ø–æ—Ç—Ä–µ–±–ª–µ–Ω–∏—è –≥–∞–∑–∞ –Ω–∞ 15%,0


In [52]:
# First 30 rows of the MaxDiameterClustering result; show_clusters lists the largest clusters first.
show_clusters(labels_MaxDiameterClustering)[1].head(30)

Unnamed: 0,text,label
23533,–ü–æ—Å—Ç–ø—Ä–µ–¥—ã –ï–° —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ –†–æ—Å—Å–∏–∏,16
6555,–ì–µ—Ä–º–∞–Ω–∏—è —É—Ç–≤–µ—Ä–¥–∏–ª–∞ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ –†–æ—Å—Å–∏–∏,16
19066,–í –ï–≤—Ä–æ—Å–æ—é–∑–µ —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç –∞–Ω—Ç–∏—Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö —Å–∞–Ω–∫—Ü–∏–π,16
57643,–ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å —Ç–∏—Ç—É–ª —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,52
64997,–ú–∞–≥–Ω—É—Å –ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å –∑–≤–∞–Ω–∏–µ —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,52
10190,–ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∏–≥—Ä–∞—Ç—å —Å –ù–µ–ø–æ–º–Ω—è—â–∏–º–æ–º –∑–∞ —Ç–∏—Ç—É–ª —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞,52
35701,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,1
35700,–¶–ë —Ö–æ—á–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–∏—Ç—å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,1
57644,–ï–° –≥–æ—Ç–æ–≤–∏—Ç—Å—è –∫ ¬´–∑–∏–º–µ –±–µ–∑ —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ –≥–∞–∑–∞¬ª,8
23524,–í –ï–° –≥–æ—Ç–æ–≤—è—Ç—Å—è –ø—Ä–æ–≤–µ—Å—Ç–∏ –ø–µ—Ä–≤—É—é –∑–∏–º—É –±–µ–∑ —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ –≥–∞–∑–∞,8


In [53]:
# First 30 rows of the LeaderClustering result; show_clusters lists the largest clusters first.
show_clusters(labels_LeaderClustering)[1].head(30)

Unnamed: 0,text,label
57643,–ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å —Ç–∏—Ç—É–ª —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,16
64997,–ú–∞–≥–Ω—É—Å –ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å –∑–≤–∞–Ω–∏–µ —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,16
10190,–ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∏–≥—Ä–∞—Ç—å —Å –ù–µ–ø–æ–º–Ω—è—â–∏–º–æ–º –∑–∞ —Ç–∏—Ç—É–ª —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞,16
23533,–ü–æ—Å—Ç–ø—Ä–µ–¥—ã –ï–° —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ –†–æ—Å—Å–∏–∏,27
6555,–ì–µ—Ä–º–∞–Ω–∏—è —É—Ç–≤–µ—Ä–¥–∏–ª–∞ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ –†–æ—Å—Å–∏–∏,27
19066,–í –ï–≤—Ä–æ—Å–æ—é–∑–µ —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç –∞–Ω—Ç–∏—Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö —Å–∞–Ω–∫—Ü–∏–π,27
35700,–¶–ë —Ö–æ—á–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–∏—Ç—å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,1
19063,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,1
64998,–í –ü–µ—Ç–µ—Ä–±—É—Ä–≥–µ –ø—Ä–µ–∫—Ä–∞—â–µ–Ω–æ —É–≥–æ–ª–æ–≤–Ω–æ–µ –ø—Ä–µ—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ –±–ª–æ–≥–µ—Ä–∞ –Æ—Ä–∏—è –•–æ–≤–∞–Ω—Å–∫–æ–≥–æ,8
6554,–°—É–¥ –ø—Ä–µ–∫—Ä–∞—Ç–∏–ª —É–≥–æ–ª–æ–≤–Ω–æ–µ –¥–µ–ª–æ –Æ—Ä–∏—è –•–æ–≤–∞–Ω—Å–∫–æ–≥–æ,8


In [58]:
# Last 20 rows of the QTClustering table. Rows are ordered by descending group
# size, so this is the end with the smallest label groups.
show_clusters(labels_QTClustering)[1].tail(20)

Unnamed: 0,text,label
35701,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,0
35700,–¶–ë —Ö–æ—á–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–∏—Ç—å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,0
19063,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,0
23533,–ü–æ—Å—Ç–ø—Ä–µ–¥—ã –ï–° —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ –†–æ—Å—Å–∏–∏,1
6555,–ì–µ—Ä–º–∞–Ω–∏—è —É—Ç–≤–µ—Ä–¥–∏–ª–∞ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ –†–æ—Å—Å–∏–∏,1
19066,–í –ï–≤—Ä–æ—Å–æ—é–∑–µ —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Å–µ–¥—å–º–æ–π –ø–∞–∫–µ—Ç –∞–Ω—Ç–∏—Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö —Å–∞–Ω–∫—Ü–∏–π,1
57643,–ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å —Ç–∏—Ç—É–ª —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,2
64997,–ú–∞–≥–Ω—É—Å –ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∑–∞—â–∏—â–∞—Ç—å –∑–≤–∞–Ω–∏–µ —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞ –ø–æ —à–∞—Ö–º–∞—Ç–∞–º,2
10190,–ö–∞—Ä–ª—Å–µ–Ω –æ—Ç–∫–∞–∑–∞–ª—Å—è –∏–≥—Ä–∞—Ç—å —Å –ù–µ–ø–æ–º–Ω—è—â–∏–º–æ–º –∑–∞ —Ç–∏—Ç—É–ª —á–µ–º–ø–∏–æ–Ω–∞ –º–∏—Ä–∞,2
64998,–í –ü–µ—Ç–µ—Ä–±—É—Ä–≥–µ –ø—Ä–µ–∫—Ä–∞—â–µ–Ω–æ —É–≥–æ–ª–æ–≤–Ω–æ–µ –ø—Ä–µ—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ –±–ª–æ–≥–µ—Ä–∞ –Æ—Ä–∏—è –•–æ–≤–∞–Ω—Å–∫–æ–≥–æ,3


### Для бота выбираем агломеративную кластеризацию

In [67]:
def _build_agglomerative(distance_threshold):
    """Create a complete-linkage, cosine-distance AgglomerativeClustering model."""
    common = dict(n_clusters=None, linkage='complete',
                  distance_threshold=distance_threshold)
    try:
        return AgglomerativeClustering(metric='cosine', **common)
    except TypeError:
        # scikit-learn < 1.2 only accepts the older `affinity` keyword
        # (renamed to `metric` in 1.2, removed in 1.4).
        return AgglomerativeClustering(affinity='cosine', **common)


def show_date(date, distance_threshold=0.3):
    """Cluster one day's news by title and return the labelled table.

    Titles are embedded by ``date_news`` and grouped with complete-linkage
    agglomerative clustering on cosine distance.

    Args:
        date: Day to process, as a ``'YYYY-MM-DD'`` string.
        distance_threshold: Cosine distance at which the clustering tree is
            cut. Defaults to 0.3, the value used throughout the notebook.

    Returns:
        The day's news table with an added integer ``label`` column, ordered
        by cluster size (largest first) and then by label.
    """
    date_df, _titles, embeddings = date_news(date)
    # Work on an independent copy so adding columns cannot raise
    # SettingWithCopyWarning or leak into the frame returned by date_news.
    date_df = date_df.copy()
    if len(embeddings) < 2:
        # AgglomerativeClustering needs at least two samples; zero or one
        # news item trivially forms (at most) a single cluster.
        labels = np.zeros(len(embeddings), dtype=int)
    else:
        labels = _build_agglomerative(distance_threshold).fit_predict(list(embeddings))
    date_df['label'] = labels
    # Temporary sort key: size of the cluster each row belongs to.
    date_df['count'] = date_df.groupby('label')['label'].transform('count')
    date_df = date_df.sort_values(by=['count', 'label'], ascending=[False, True])
    return date_df.drop(columns='count')

In [68]:
# Display the labelled table for 2022-07-20; show_date orders it by cluster size, largest first.
show_date('2022-07-20')

Unnamed: 0,date,title,short_news,first_link,raw_news,category,agency,label
35687,2022-07-20 12:41:14,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus,"¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus ‚Äî –Ω–µ –º–µ–Ω–µ–µ 1800 –º–∞—à–∏–Ω, –∏—Ö –Ω–∞—á–Ω—É—Ç —Å–æ–±–∏—Ä–∞—Ç—å –¥–æ –∫–æ–Ω—Ü–∞ –∏—é–ª—è. –ê–≤—Ç–æ–º–æ–±–∏...",vc.ru/transport/466225,"¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus ‚Äî –Ω–µ –º–µ–Ω–µ–µ 1800 –º–∞—à–∏–Ω, –∏—Ö –Ω–∞—á–Ω—É—Ç —Å–æ–±–∏—Ä–∞—Ç—å –¥–æ –∫–æ–Ω—Ü–∞ –∏—é–ª—è. –ê–≤—Ç–æ–º–æ–±...",technology,vcnews,4
57637,2022-07-20 12:29:08,¬´–ê–≤—Ç–æ–í–ê–ó¬ª —Å–¥–µ–ª–∞–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∑–∞–ø—á–∞—Å—Ç–µ–π,¬´–ê–≤—Ç–æ–í–ê–ó¬ª —Å–¥–µ–ª–∞–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∑–∞–ø—á–∞—Å—Ç–µ–π. –ö–æ–º–ø–∞–Ω–∏—è –∑–∞–ø–ª–∞–Ω–∏—Ä–æ–≤–∞–ª–∞ –≤—ã–ø—É—Å—Ç...,https://tjournal.ru/news/684880,¬´–ê–≤—Ç–æ–í–ê–ó¬ª —Å–¥–µ–ª–∞–µ—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∑–∞–ø—á–∞—Å—Ç–µ–π. –ö–æ–º–ø–∞–Ω–∏—è –∑–∞–ø–ª–∞–Ω–∏—Ä–æ–≤–∞–ª–∞ –≤—ã–ø—É—Å...,technology,TJournal,4
64965,2022-07-20 11:10:48,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus ‚Äî –∏–∑ —Å–∫–æ–ø–∏–≤—à–∏—Ö—Å—è –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∑–∞–ø—á–∞—Å—Ç–µ–π. –†–∞–Ω–µ–µ –∫–æ–º–ø–∞–Ω–∏—è –≤–æ–∑–æ–±–Ω–æ–≤–∏–ª–∞...,https://meduza.io/news/2022/07/20/avtovaz-vypustit-ogranichennuyu-partiyu-lada-largus-iz-skopivshihsya-na-skladah-za...,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–Ω—É—é –ø–∞—Ä—Ç–∏—é Lada Largus ‚Äî –∏–∑ —Å–∫–æ–ø–∏–≤—à–∏—Ö—Å—è –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∑–∞–ø—á–∞—Å—Ç–µ–π \n –í —Ç–µ—á–µ–Ω–∏–µ –ø–æ—á—Ç–∏ –≤—Å–µ–≥–æ ...,technology,meduzalive,4
23513,2022-07-20 11:05:00,–ê–≤—Ç–æ–í–ê–ó –≤—ã–ø—É—Å—Ç–∏—Ç 1800 Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∫–æ–º–ø–æ–Ω–µ–Ω—Ç–æ–≤,¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç 1800 Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∫–æ–º–ø–æ–Ω–µ–Ω—Ç–æ–≤ –∏–∑-–∑–∞ –Ω–µ—Ö–≤–∞—Ç–∫–∏ –∫–æ–º–ø–ª–µ–∫—Ç—É—é—â–∏—Ö. –ö –∫–æ–Ω—Ü—É –ª–µ—Ç–∞...,http://vdmsti.ru/gEOn,"¬´–ê–≤—Ç–æ–í–ê–ó¬ª –≤—ã–ø—É—Å—Ç–∏—Ç 1800 Lada Largus –∏–∑ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—ã—Ö –Ω–∞ —Å–∫–ª–∞–¥–∞—Ö –∫–æ–º–ø–æ–Ω–µ–Ω—Ç–æ–≤, —Å–æ–æ–±—â–∏–ª–∞ –ø—Ä–µ—Å—Å-—Å–ª—É–∂–±–∞ –∫–æ–º–ø–∞–Ω–∏–∏. –í –º–∞–µ –ø...",technology,vedomosti,4
35701,2022-07-20 19:45:23,–¶–ë –ø—Ä–µ–¥–ª–æ–∂–∏–ª –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏,–í –ø—Ä–µ–¥–¥–≤–µ—Ä–∏–∏ –≤—ã–±–æ—Ä–æ–≤ –≤ –ì–æ—Å–¥—É–º—É –¶–ë –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏ ¬´—É—Å...,vc.ru/finance/466661,–ü—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–µ –¶–ë –∑–∞–ø—Ä–µ—Ç–∏—Ç—å –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã–º –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º –ø–æ–∫—É–ø–∞—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –∞–∫—Ü–∏–∏ ¬´—É—Å–∏–ª–∏—Ç –Ω–µ–¥–æ–≤–µ—Ä–∏–µ —á–∞—Å—Ç–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–æ...,economy,vcnews,40
...,...,...,...,...,...,...,...,...
4983,2022-07-20 20:13:29,Google –ø—Ä–∏–æ—Å—Ç–∞–Ω–æ–≤–∏—Ç –ø—Ä–∏–µ–º –Ω–∞ —Ä–∞–±–æ—Ç—É –Ω–∞ –¥–≤–µ –Ω–µ–¥–µ–ª–∏,"Google –ø—Ä–∏–æ—Å—Ç–∞–Ω–æ–≤–∏—Ç –ø—Ä–∏–µ–º –Ω–∞ —Ä–∞–±–æ—Ç—É –Ω–∞ –¥–≤–µ –Ω–µ–¥–µ–ª–∏, –Ω–µ –∑–∞—Ç—Ä–∞–≥–∏–≤–∞—è —É–∂–µ —Å–¥–µ–ª–∞–Ω–Ω—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è. –ò —ç—Ç–æ –ø–æ—Å–ª–µ —Ç–æ–≥–æ –∫–∞–∫ –Ω–∞ ...",https://www.theinformation.com/articles/google-announces-hiring-pause,"Google –ø—Ä–∏–æ—Å—Ç–∞–Ω–æ–≤–∏—Ç –ø—Ä–∏–µ–º –Ω–∞ —Ä–∞–±–æ—Ç—É –Ω–∞ –¥–≤–µ –Ω–µ–¥–µ–ª–∏, –Ω–µ –∑–∞—Ç—Ä–∞–≥–∏–≤–∞—è —É–∂–µ —Å–¥–µ–ª–∞–Ω–Ω—ã–µ –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è. –ò —ç—Ç–æ –ø–æ—Å–ª–µ —Ç–æ–≥–æ –∫–∞–∫ –Ω–∞ ...",society,addmeto,128
19067,2022-07-20 18:00:21,–¶–ë –∑–∞–ø—Ä–µ—Ç–∏–ª —Ç–æ—Ä–≥–æ–≤–ª—é –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–º–∏ –∞–∫—Ü–∏—è–º–∏ –¥–ª—è –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤,–¶–ë –∑–∞–ø—Ä–µ—Ç–∏–ª —Ç–æ—Ä–≥–æ–≤–ª—é –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–º–∏ –∞–∫—Ü–∏—è–º–∏ –¥–ª—è –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤. –ü—Ä–æ–¥–∞–≤–∞—Ç—å –∞–∫—Ü–∏–∏ –∏ –∑–∞–∫—Ä—ã–≤–∞—Ç—å –∫–æ—Ä–æ—Ç–∫–∏–µ –ø...,https://cbr.ru/Crosscut/LawActs/File/5955,"–ó–∞–ø—Ä–µ—Ç –Ω–∞ —Ç–æ—Ä–≥–æ–≤–ª—é –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–º–∏ –∞–∫—Ü–∏—è–º–∏ –¥–ª—è –Ω–µ–∫–≤–∞–ª–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤ –µ—â–µ –Ω–µ –∑–∞—Ä–∞–±–æ—Ç–∞–ª, –Ω–æ –¶–ë —É–∂–µ –¥–µ–π—Å—Ç–≤—É–µ—Ç: –≤ ...",society,thebell_io,129
15006,2022-07-20 19:02:19,"–†–æ—Å—Å–∏—è–Ω–µ –Ω–µ –æ—Å–æ–∑–Ω–∞—é—Ç —Ç–æ–≥–æ, —á—Ç–æ –Ω–∞–º –≥—Ä–æ–∑–∏—Ç –≤ —É—Å–ª–æ–≤–∏—è—Ö —É—Å–∏–ª–∏–≤–∞—é—â–µ–π—Å—è –∏–∑–æ–ª—è—Ü–∏–∏",–ù–∞ —Ñ–æ–Ω–µ –Ω–∞—Ä–∞—Å—Ç–∞—é—â–∏—Ö –ø—Ä–æ–±–ª–µ–º –∏ —É–≥–ª—É–±–ª—è—é—â–µ–≥–æ—Å—è —Å–ø–∞–¥–∞ —ç–∫–æ–Ω–æ–º–∏–∫–∏ –±–ª–∏–∑–∫–∏–π –∫ —Ä–µ–∫–æ—Ä–¥–Ω–æ–º—É –æ–ø—Ç–∏–º–∏–∑–º—É —Ä–æ—Å—Å–∏—è–Ω –≤—ã–≥–ª—è–¥–∏—Ç –∏—Ä—Ä–∞—Ü–∏–æ...,https://cbr.ru/Collection/Collection/File/42183/inFOM_22-07.pdf,"–ü–û–¢–†–ï–ë–ò–¢–ï–õ–¨–°–ö–ò–ï –ù–ê–°–¢–†–û–ï–ù–ò–Ø: –û–ü–¢–ò–ú–ò–ó–ú –£–°–ò–õ–ò–í–ê–ï–¢–°–Ø –û–ø—Ä–æ—Å –Ω–∞—Å–µ–ª–µ–Ω–∏—è –∏–Ω–§–û–ú, –ø—Ä–æ–≤–æ–¥–∏–≤—à–∏–π—Å—è —Å 29 –∏—é–Ω—è –ø–æ 8 –∏—é–ª—è, –∑–∞—Ñ–∏–∫—Å–∏—Ä...",economy,russianmacro,130
35699,2022-07-20 18:35:29,–ú–¢–° –∫—É–ø–∏–ª–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–∞ —Å–µ—Ä–≤–∏—Å–æ–≤ –¥–ª—è –≤–∏–¥–µ–æ–∑–≤–æ–Ω–∫–æ–≤ Webinar,–ú–¢–° –∫—É–ø–∏–ª–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–∞ —Å–µ—Ä–≤–∏—Å–æ–≤ –¥–ª—è –≤–∏–¥–µ–æ–∑–≤–æ–Ω–∫–æ–≤ Webinar. –°—É–º–º–∞ —Å–¥–µ–ª–∫–∏ –º–æ–≥–ª–∞ —Å–æ—Å—Ç–∞–≤–∏—Ç—å 2 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π. –ù–∞ –µ—ë –±–∞–∑–µ –ú–¢...,vc.ru/services/466573,–ú–¢–° –∫—É–ø–∏–ª —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–∞ —Å–µ—Ä–≤–∏—Å–æ–≤ –¥–ª—è –≤–∏–¥–µ–æ–∑–≤–æ–Ω–∫–æ–≤ Webinar. –°—É–º–º–∞ —Å–¥–µ–ª–∫–∏ –º–æ–≥–ª–∞ —Å–æ—Å—Ç–∞–≤–∏—Ç—å 2 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π. –ü–æ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω...,technology,vcnews,131


In [69]:
# Keep the labelled table for the export and the summaries below.
final_df = show_date('2022-07-20')

In [70]:
# Write the labelled table to an Excel workbook named after the processed date.
final_df.to_excel('2022-07-20.xlsx')

In [71]:
# Number of news rows per category for the day.
final_df.category.value_counts()

society          83
economy          44
sports           16
technology       12
entertainment     7
science           3
Name: category, dtype: int64

In [72]:
# Number of news rows per source (agency) for the day.
final_df.agency.value_counts()

meduzalive      37
vedomosti       31
vcnews          18
TJournal        17
thebell_io      17
rbc_sport       15
russianmacro     8
meduzapro        7
rozetked         5
VwordMedia       5
nplusone         3
addmeto          1
ohmypain         1
Name: agency, dtype: int64