In [354]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.1.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: schedule
Successfully installed schedule-1.1.0


In [380]:
import pandas as pd
# import timeit
import schedule    
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

import fasttext
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration

In [58]:
# fastText model whose predict() output is used further down to fill the 'category' column.
model_class = fasttext.load_model("classification/ru_cat_v6.ftz")



In [2]:
# Tokenizer/model pair that article2summary uses to generate the short summary.
tokenizer_resume = AutoTokenizer.from_pretrained("IlyaGusev/mbart_ru_sum_gazeta")
model_resume = AutoModelForSeq2SeqLM.from_pretrained("IlyaGusev/mbart_ru_sum_gazeta")

# Tokenizer/model pair that summary2title uses to generate the headline.
tokenizer_title = AutoTokenizer.from_pretrained("IlyaGusev/rut5_base_headline_gen_telegram")
model_title = T5ForConditionalGeneration.from_pretrained("IlyaGusev/rut5_base_headline_gen_telegram")

In [7]:
# Boilerplate disclaimer text; make_clean_text replaces every occurrence of it with a
# newline after non-breaking spaces and line breaks have been turned into plain spaces.
black_label = '–î–ê–ù–ù–û–ï –°–û–û–ë–©–ï–ù–ò–ï (–ú–ê–¢–ï–†–ò–ê–õ) –°–û–ó–î–ê–ù–û –ò (–ò–õ–ò) –†–ê–°–ü–†–û–°–¢–†–ê–ù–ï–ù–û –ò–ù–û–°–¢–†–ê–ù–ù–´–ú –°–†–ï–î–°–¢–í–û–ú –ú–ê–°–°–û–í–û–ô –ò–ù–§–û–†–ú–ê–¶–ò–ò, –í–´–ü–û–õ–ù–Ø–Æ–©–ò–ú –§–£–ù–ö–¶–ò–ò –ò–ù–û–°–¢–†–ê–ù–ù–û–ì–û –ê–ì–ï–ù–¢–ê, –ò (–ò–õ–ò) –†–û–°–°–ò–ô–°–ö–ò–ú –Æ–†–ò–î–ò–ß–ï–°–ö–ò–ú –õ–ò–¶–û–ú, –í–´–ü–û–õ–ù–Ø–Æ–©–ò–ú –§–£–ù–ö–¶–ò–ò –ò–ù–û–°–¢–†–ê–ù–ù–û–ì–û –ê–ì–ï–ù–¢–ê'

In [11]:
def article2summary(article_text):
    """Condense the text of a news item into a short summary.

    Args:
        article_text: Plain text of the article.

    Returns:
        The summary decoded by ``tokenizer_resume`` with special tokens
        skipped.
    """
    # Input longer than 600 tokens is truncated by the tokenizer.
    encode_options = {
        "max_length": 600,
        "truncation": True,
        "return_tensors": "pt",
    }
    encoded = tokenizer_resume([article_text], **encode_options)

    generated = model_resume.generate(
        input_ids=encoded["input_ids"],
        no_repeat_ngram_size=4,
    )

    # A single article was passed in, so only the first generated sequence is decoded.
    return tokenizer_resume.decode(generated[0], skip_special_tokens=True)

In [4]:
def summary2title(summary):
    """Generate a headline from a short news summary.

    Args:
        summary: Summary text, e.g. the value returned by article2summary.

    Returns:
        The headline decoded by ``tokenizer_title`` with special tokens
        skipped.
    """
    # The input is padded/truncated to exactly 600 tokens before generation.
    encoding = tokenizer_title(
        [summary],
        max_length=600,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Only one summary is encoded, so the first generated sequence is the headline.
    first_sequence = model_title.generate(input_ids=encoding["input_ids"])[0]

    return tokenizer_title.decode(first_sequence, skip_special_tokens=True)

In [384]:
def make_clean_text(article, date):
    """Turn one raw HTML message into a record with summary and headline.

    Args:
        article: HTML markup of the message.
        date: Value stored unchanged under the ``'date'`` key.

    Returns:
        dict with keys ``'date'``, ``'title'``, ``'short_news'``,
        ``'first_link'`` and ``'raw_news'``. ``'first_link'`` is the href of
        the first ``<a>`` element, or the string ``'NaN'`` when the markup
        contains no ``<a>`` element.
    """
    # NOTE(review): no parser is named here, so which parser bs4 uses depends on
    # what is installed -- consider pinning one explicitly.
    soup = BeautifulSoup(article)

    anchor = soup.find('a')
    first_link = 'NaN' if anchor is None else anchor.get('href')

    # Normalise whitespace first, then swap the disclaimer boilerplate for a newline.
    text = soup.get_text()
    for old, new in (("\xa0", ' '), ("\n", ' '), (black_label, '\n')):
        text = text.replace(old, new)

    short_news = article2summary(text)

    return {
        'date': date,
        'title': summary2title(short_news),
        'short_news': short_news,
        'first_link': first_link,
        'raw_news': text,
    }

In [604]:
def make_articles_dict(channel_name):
    """Fetch a channel's messages and process the ones that are not stored yet.

    The message list is downloaded from ``https://tg.i-c-a.su/json/<channel>``.
    Messages whose id is not greater than the highest id already stored for
    this channel in ``table_news.pkl`` are skipped; the rest go through
    make_clean_text. A channel with no stored rows, or a missing table file,
    is treated as "nothing stored yet" and every fetched message is processed.

    Args:
        channel_name: Channel identifier appended to the API URL and matched
            against the ``'agency'`` column of the stored table.

    Returns:
        dict mapping message id to the record returned by make_clean_text.
        Records whose ``'raw_news'`` text is empty are left out.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Without a timeout a stalled connection would block the caller indefinitely.
    answer = requests.get('https://tg.i-c-a.su/json/' + channel_name, timeout=30)
    answer.raise_for_status()
    messages = answer.json()['messages']

    # Highest message id already stored for this channel; 0 means nothing is stored.
    try:
        temp_df = pd.read_pickle('table_news.pkl')
    except FileNotFoundError:
        start_id = 0
    else:
        known_ids = temp_df.index[temp_df['agency'] == channel_name]
        start_id = known_ids.max() if len(known_ids) else 0
        del temp_df  # release the table before the slow summarisation loop

    articles_dict = {}
    for message in messages:
        if message['id'] <= start_id:
            continue
        # Each record is keyed by the id of the message it was built from, so the
        # mapping stays correct wherever the new messages sit in the list.
        article = make_clean_text(message['message'], message['date'])
        if article['raw_news']:  # skip messages that contained no text
            articles_dict[message['id']] = article

    return articles_dict

In [586]:
answer = requests.get('https://tg.i-c-a.su/json/' + 'meduzalive')
data = answer.json()
data

{'_': 'messages.channelMessages',
 'flags': 0,
 'inexact': False,
 'pts': 157510,
 'count': 62771,
 'messages': [{'_': 'message',
   'flags': 18048,
   'out': False,
   'mentioned': False,
   'media_unread': False,
   'silent': False,
   'post': True,
   'from_scheduled': False,
   'legacy': False,
   'edit_hide': False,
   'pinned': False,
   'noforwards': False,
   'id': 63485,
   'peer_id': {'_': 'peerChannel', 'channel_id': 1036240821},
   'date': 1656669267,
   'message': '<strong>–£—Ç—Ä–µ–Ω–Ω–∏–π –±—Ä–∏—Ñ–∏–Ω–≥ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –†–§<br />\n</strong><br />\n–î–ê–ù–ù–û–ï –°<a href="https://t.me/tass_agency/144718" target="_blank" rel="nofollow">–û–û–ë–©–ï–ù–ò</a>–ï (–ú–ê–¢–ï–†–ò–ê–õ) –°–û–ó–î–ê–ù–û\xa0–ò (–ò–õ–ò) –†–ê–°–ü–†–û–°–¢–†–ê–ù–ï–ù–û –ò–ù–û–°–¢–†–ê–ù–ù–´–ú –°–†–ï–î–°–¢–í–û–ú –ú–ê–°–°–û–í–û–ô –ò–ù–§–û–†–ú–ê–¶–ò–ò, –í–´–ü–û–õ–ù–Ø–Æ–©–ò–ú –§–£–ù–ö–¶–ò–ò –ò–ù–û–°–¢–†–ê–ù–ù–û–ì–û –ê–ì–ï–ù–¢–ê, –ò\xa0(–ò–õ–ò) –†–û–°–°–ò–ô–°–ö–ò–ú –Æ–†–ò–î–ò–ß–ï–°–ö–ò–ú –õ–ò–¶–û–ú, –í–´–ü–û–õ–

In [590]:
type(data['messages'][0]['message'])

str

In [593]:
data['messages'][0]['date']

1656669267

In [592]:
type(data['messages'][0]['date'])

int

In [460]:
def agency2db(channel_name):
    """Append a channel's new, classified news items to ``db.pkl``.

    Loads ``db.pkl``, asks make_articles_dict for the channel's new records,
    labels each one with ``model_class``, drops records labelled
    ``'not_news'``, converts the numeric timestamps to datetimes, tags the
    rows with the channel name and writes the merged table, newest first,
    back to ``db.pkl``. When there are no new records the file is not
    rewritten.

    NOTE(review): this function stores into 'db.pkl' while make_articles_dict
    reads 'table_news.pkl' -- confirm which file is the intended store.

    Args:
        channel_name: Channel to collect; also written to the ``'agency'`` column.

    Returns:
        None.
    """
    stored = pd.read_pickle('db.pkl')
    fresh = make_articles_dict(channel_name)
    if not fresh:
        return

    def category_of(text):
        # presumably labels look like '__label__<name>'; keep what follows the last '__'
        return model_class.predict(text)[0][0].split('__')[-1]

    def to_datetime(stamp):
        return datetime.fromtimestamp(stamp)

    # One row per message id, one column per record field.
    news = pd.DataFrame(fresh).T
    news['category'] = news['short_news'].apply(category_of)
    news = news.loc[news['category'] != 'not_news']
    news['date'] = news['date'].apply(to_datetime)
    news['agency'] = channel_name

    merged = pd.concat([stored, news]).sort_values('date', ascending=False)
    merged.to_pickle('db.pkl')

In [445]:
def join_all(agency_list):
    """Collect every listed channel into the store and return the stored table.

    Runs agency2db for each channel in order, printing a progress line before
    and after each one.

    Args:
        agency_list: Iterable of channel names.

    Returns:
        The object loaded from ``db.pkl`` after all channels were processed.
    """
    for name in agency_list:
        print(f'–°–æ–±–∏—Ä–∞—é {name}...')
        agency2db(name)
        print('....... complited')

    # Re-read from disk so the result reflects what every agency2db call wrote.
    return pd.read_pickle('db.pkl')

### Разбираемся с датами и периодическим парсингом

In [48]:
timeit.timeit('nplusone = record_newsbatch2db(agency_name[1])', globals=globals(), number=1)

228.33496230000037

In [51]:
228/60*len(agency_name)

45.599999999999994

In [147]:
nplusone.date.loc[17990] + pd.Timedelta("1 hour")

Timestamp('2022-06-26 09:14:01')

In [163]:
today =  pd.to_datetime('today').normalize()
start_time = today + pd.Timedelta("7 hour")
start_time

Timestamp('2022-06-29 07:00:00')

In [166]:
idx = pd.date_range(start_time, periods=4, freq="5H")
ts = pd.Series(range(len(idx)), index=idx)
ts

2022-06-29 07:00:00    0
2022-06-29 12:00:00    1
2022-06-29 17:00:00    2
2022-06-29 22:00:00    3
Freq: 5H, dtype: int64

In [173]:
len(idx), idx[0]

(4, Timestamp('2022-06-29 07:00:00', freq='5H'))

In [171]:
pd.Timestamp.now()

Timestamp('2022-06-29 08:37:33.273123')

In [353]:
df_pickle.date.iloc[0]

Timestamp('2022-06-29 15:05:30')

In [566]:
first = pd.to_datetime('today')

In [567]:
second = pd.to_datetime('today')

In [568]:
print(first, second)

2022-07-01 12:37:28.951656 2022-07-01 12:37:42.411020


In [584]:
str(pd.to_timedelta(second - first))

'0 days 00:00:13.459364'

### Запускаем парсинг

In [395]:
db.head(10)

Unnamed: 0,date,title,short_news,first_link,raw_news,category,agency
35212,2022-06-29 19:23:40,¬´–Ø–Ω–¥–µ–∫—Å¬ª –ø–æ–¥–∞–ª –∑–∞—è–≤–ª–µ–Ω–∏–µ –Ω–∞ –ª–∏–∫–≤–∏–¥–∞—Ü–∏—é —Å—Ç—Ä—É–∫—Ç—É...,¬´–Ø–Ω–¥–µ–∫—Å¬ª –ø–æ–¥–∞–ª –∑–∞—è–≤–ª–µ–Ω–∏–µ –Ω–∞ –ª–∏–∫–≤–∏–¥–∞—Ü–∏—é —Å—Ç—Ä—É–∫—Ç—É...,vc.ru/finance/452888,¬´–Ø–Ω–¥–µ–∫—Å¬ª –ø–æ–¥–∞–ª –∑–∞—è–≤–ª–µ–Ω–∏–µ –Ω–∞ –ª–∏–∫–≤–∏–¥–∞—Ü–∏—é —Å—Ç—Ä—É–∫—Ç—É...,economy,vcnews
35209,2022-06-29 18:30:11,–ì–ª–∞–≤–∞ –¥–µ–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç–∞ –ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å—Å—Ç–≤–∞ –ú–æ—Å–∫–≤—ã ...,–ì–ª–∞–≤–∞ –¥–µ–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç–∞ –ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å—Å—Ç–≤–∞ –ú–æ—Å–∫–≤—ã ...,https://go.the.tj/HPF6CE,–°–ª–µ–¥–∏—Ç—å –∑–∞ –Ω–æ–≤–æ—Å—Ç—è–º–∏ –æ –Ω–æ–≤—ã—Ö –º–µ—Ä–∞—Ö –ø–æ–¥–¥–µ—Ä–∂–∫–∏ –±...,economy,vcnews
35208,2022-06-29 18:15:52,–®–≤–µ–¥—Å–∫–∞—è –∫–æ–º–ø–∞–Ω–∏—è Sandvik —Ä–µ—à–∏–ª–∞ —É–π—Ç–∏ –∏–∑ –†–æ—Å—Å–∏–∏,–ú–∏–Ω—Ç—Ä–∞–Ω—Å –∑–∞–∫–ª—é—á–∏–ª –¥–æ–≥–æ–≤–æ—Ä—ã —Å 20 –∞–≤–∏–∞–∫–æ–º–ø–∞–Ω–∏—è–º–∏...,vc.ru/finance/452252,–ü—Ä–æ–¥–æ–ª–∂–∞–µ–º —Å–ª–µ–¥–∏—Ç—å –∑–∞ –Ω–æ–≤–æ—Å—Ç—è–º–∏: ‚Äî –ú–∏–Ω—Ç—Ä–∞–Ω—Å –∑...,economy,vcnews
35207,2022-06-29 18:05:25,–ó–∞–∫–æ–Ω –æ–± –æ—Ç–∑—ã–≤–µ –ª–∏—Ü–µ–Ω–∑–∏–∏ —É –°–ú–ò –∑–∞ ¬´—Ñ–µ–π–∫–∏¬ª –ø—Ä–∏–Ω...,"–≠–∫—Å–ø–µ—Ä—Ç—ã –æ—Ç–º–µ—á–∞—é—Ç, —á—Ç–æ –∑–∞–∫–æ–Ω –æ–± –æ—Ç–∑—ã–≤–µ –ª–∏—Ü–µ–Ω–∑–∏...",vc.ru/legal/452811,–ì–æ—Å–¥—É–º–∞ –≤–æ –≤—Ç–æ—Ä–æ–º —á—Ç–µ–Ω–∏–∏ –æ–¥–æ–±—Ä–∏–ª–∞ –∑–∞–∫–æ–Ω–æ–ø—Ä–æ–µ–∫—Ç...,society,vcnews
18002,2022-06-29 17:43:56,–í –ü–µ—Ä—É–∞–Ω—Å–∫–æ–º –ª–µ—Å—É –æ–±–Ω–∞—Ä—É–∂–µ–Ω –Ω–æ–≤—ã–π –≤–∏–¥ –∞–º—Ñ–∏–±–∏–π,–í –ø–µ—Ä—É–∞–Ω—Å–∫–æ–º –ª–µ—Å—É –æ–±–Ω–∞—Ä—É–∂–µ–Ω –Ω–æ–≤—ã–π –≤–∏–¥ –∞–º—Ñ–∏–±–∏–π ...,https://nplus1.ru/news/2022/06/29/rhinella-una...,"–ò–Ω–æ–≥–¥–∞ —á—Ç–æ–±—ã –æ—Ç–∫—Ä—ã—Ç—å –Ω–æ–≤—ã–π –≤–∏–¥ –∂–∏–≤–æ—Ç–Ω—ã—Ö, –∑–æ–æ–ª–æ...",science,nplusone
35206,2022-06-29 16:37:31,¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ç–µ—Å—Ç–∏—Ä—É–µ—Ç –≤–µ—Ä–∏—Ñ–∏–∫–∞—Ü–∏—é —Å—Ç—Ä–∞–Ω–∏—Ü –≥–æ—Å–æ...,¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ç–µ—Å—Ç–∏—Ä—É–µ—Ç –≤–µ—Ä–∏—Ñ–∏–∫–∞—Ü–∏—é —Å—Ç—Ä–∞–Ω–∏—Ü –≥–æ—Å–æ...,vc.ru/social/452700,–ú–∏–Ω—Ü–∏—Ñ—Ä—ã –∏ ¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ç–µ—Å—Ç–∏—Ä—É—é—Ç –≤–µ—Ä–∏—Ñ–∏–∫–∞—Ü–∏—é —Å...,society,vcnews
35205,2022-06-29 16:27:11,–í ¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω—è—é—Ç –∞–Ω–æ–Ω–∏–º–Ω—ã–µ –ø–æ—Å—Ç—ã —Å...,–í –≥–æ—Ä–æ–¥—Å–∫–∏—Ö –ø–∞–±–ª–∏–∫–∞—Ö –≤–æ ¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞...,vc.ru/marketing/452730,–í –≥–æ—Ä–æ–¥—Å–∫–∏—Ö –ø–∞–±–ª–∏–∫–∞—Ö –≤–æ ¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞...,society,vcnews
35204,2022-06-29 15:56:08,¬´–ù–æ–≤—ã–µ —Ä–µ—à–µ–Ω–∏—è¬ª –ø–æ–ª—É—á–∏–ª–∏ –∫–æ–Ω—Ç—Ä–æ–ª—å –Ω–∞–¥ —Å–µ—Ä–≤–∏—Å–∞–º...,¬´–ù–æ–≤—ã–µ —Ä–µ—à–µ–Ω–∏—è¬ª –ø–æ–ª—É—á–∏–ª–∏ –∫–æ–Ω—Ç—Ä–æ–ª—å –Ω–∞–¥ —Å–µ—Ä–≤–∏—Å–∞–º...,vc.ru/services/452662,"–ù–æ–≤—ã–π –≤–ª–∞–¥–µ–ª–µ—Ü ¬´–°–±–µ—Ä–ó–≤—É–∫–∞¬ª, Okko, SberCloud –∏ ...",society,vcnews
63325,2022-06-29 15:05:30,–ì—É–±–µ—Ä–Ω–∞—Ç–æ—Ä –ë–µ–ª–≥–æ—Ä–æ–¥—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ –ø–µ—Ä–µ—Å—Ç–∞–ª –∏–Ω—Ñ–æ—Ä...,–ì—É–±–µ—Ä–Ω–∞—Ç–æ—Ä –ë–µ–ª–≥–æ—Ä–æ–¥—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ –í—è—á–µ—Å–ª–∞–≤ –ì–ª–∞–¥–∫...,https://t.me/vvgladkov/500,–ì—É–±–µ—Ä–Ω–∞—Ç–æ—Ä –ë–µ–ª–≥–æ—Ä–æ–¥—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ –í—è—á–µ—Å–ª–∞–≤ –ì–ª–∞–¥–∫...,society,meduzalive
63324,2022-06-29 15:02:51,–ì—É–±–µ—Ä–Ω–∞—Ç–æ—Ä –ë—Ä—è–Ω—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ —Å–æ–æ–±—â–∏–ª –æ —Å–±–∏—Ç–æ–º —É...,–ì—É–±–µ—Ä–Ω–∞—Ç–æ—Ä –ë—Ä—è–Ω—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ë–æ–≥–æ–º–∞–∑ ...,https://t.me/avbogomaz/570,–ì—É–±–µ—Ä–Ω–∞—Ç–æ—Ä –ë—Ä—è–Ω—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏ –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ë–æ–≥–æ–º–∞–∑ ...,society,meduzalive


In [398]:
db.agency.value_counts()

meduzalive    19
vcnews        16
TJournal      10
nplusone      10
rbc_sport     10
vedomosti     10
rozetked       8
VwordMedia     8
now_ka         8
addmeto        7
ohmypain       3
Name: agency, dtype: int64

In [399]:
db.category.value_counts()

society       55
science       16
economy       14
technology    13
sports        10
other          1
Name: category, dtype: int64

In [408]:
for el in db['title']:
    print(el)

¬´–Ø–Ω–¥–µ–∫—Å¬ª –ø–æ–¥–∞–ª –∑–∞—è–≤–ª–µ–Ω–∏–µ –Ω–∞ –ª–∏–∫–≤–∏–¥–∞—Ü–∏—é —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –≤ –ò—Ä–ª–∞–Ω–¥–∏–∏
–ì–ª–∞–≤–∞ –¥–µ–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç–∞ –ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å—Å—Ç–≤–∞ –ú–æ—Å–∫–≤—ã –ê–ª–µ–∫—Å–µ–π –§—É—Ä—Å–∏–Ω –≤ –ø—Ä—è–º–æ–º —ç—Ñ–∏—Ä–µ –æ—Ç–≤–µ—á–∞–µ—Ç –Ω–∞ –≤–æ–ø—Ä–æ—Å—ã –ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª–µ–π
–®–≤–µ–¥—Å–∫–∞—è –∫–æ–º–ø–∞–Ω–∏—è Sandvik —Ä–µ—à–∏–ª–∞ —É–π—Ç–∏ –∏–∑ –†–æ—Å—Å–∏–∏
–ó–∞–∫–æ–Ω –æ–± –æ—Ç–∑—ã–≤–µ –ª–∏—Ü–µ–Ω–∑–∏–∏ —É –°–ú–ò –∑–∞ ¬´—Ñ–µ–π–∫–∏¬ª –ø—Ä–∏–Ω—è—Ç –≤–æ –≤—Ç–æ—Ä–æ–º —á—Ç–µ–Ω–∏–∏
–í –ü–µ—Ä—É–∞–Ω—Å–∫–æ–º –ª–µ—Å—É –æ–±–Ω–∞—Ä—É–∂–µ–Ω –Ω–æ–≤—ã–π –≤–∏–¥ –∞–º—Ñ–∏–±–∏–π
¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ç–µ—Å—Ç–∏—Ä—É–µ—Ç –≤–µ—Ä–∏—Ñ–∏–∫–∞—Ü–∏—é —Å—Ç—Ä–∞–Ω–∏—Ü –≥–æ—Å–æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–π —á–µ—Ä–µ–∑ –ì–æ—Å—É—Å–ª—É–≥–∏
–í ¬´–í–ö–æ–Ω—Ç–∞–∫—Ç–µ¬ª —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω—è—é—Ç –∞–Ω–æ–Ω–∏–º–Ω—ã–µ –ø–æ—Å—Ç—ã —Å —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–µ–π –µ—Å—Ç—å –∫–æ—à–∞—á–∏–π –∫–æ—Ä–º ¬´365 –¥–Ω–µ–π¬ª –Ω–∞ –≥–∞—Ä–Ω–∏—Ä
¬´–ù–æ–≤—ã–µ —Ä–µ—à–µ–Ω–∏—è¬ª –ø–æ–ª—É—á–∏–ª–∏ –∫–æ–Ω—Ç—Ä–æ–ª—å –Ω

In [409]:
for el in db['short_news']:
    print(el)

¬´–Ø–Ω–¥–µ–∫—Å¬ª –ø–æ–¥–∞–ª –∑–∞—è–≤–ª–µ–Ω–∏–µ –Ω–∞ –ª–∏–∫–≤–∏–¥–∞—Ü–∏—é —Å—Ç—Ä—É–∫—Ç—É—Ä—ã –≤ –ò—Ä–ª–∞–Ω–¥–∏–∏, –∫–æ—Ç–æ—Ä—É—é –∑–∞—Ä–µ–≥–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–ª –≤ —è–Ω–≤–∞—Ä–µ 2022 –≥–æ–¥–∞. –¢–∞–∫ –∫–æ–º–ø–∞–Ω–∏—è —Ö–æ—Ç–µ–ª–∞ –ø–æ–ª—É—á–∏—Ç—å –Ω–∞–ª–æ–≥–æ–≤—ã–µ –ª—å–≥–æ—Ç—ã, —Ä–∞—Å—Å–∫–∞–∑–∞–ª –∏—Å—Ç–æ—á–Ω–∏–∫ ¬´–ö–æ–º–º–µ—Ä—Å–∞–Ω—Ç–∞¬ª. –ù–æ –æ—Ç –∏–¥–µ–∏ –æ—Ç–∫–∞–∑–∞–ª–∏—Å—å ‚Äî —á—Ç–æ–±—ã ¬´–Ω–∏–∫–æ–≥–æ –Ω–µ –ø—Ä–æ–≤–æ—Ü–∏—Ä–æ–≤–∞—Ç—å¬ª.
–ì–ª–∞–≤–∞ –¥–µ–ø–∞—Ä—Ç–∞–º–µ–Ω—Ç–∞ –ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å—Å—Ç–≤–∞ –ú–æ—Å–∫–≤—ã –ê–ª–µ–∫—Å–µ–π –§—É—Ä—Å–∏–Ω –≤ –ø—Ä—è–º–æ–º —ç—Ñ–∏—Ä–µ –æ—Ç–≤–µ—á–∞–µ—Ç –Ω–∞ –≤–æ–ø—Ä–æ—Å—ã –ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª–µ–π –≤ —Ç–µ–ª–µ–≥—Ä–∞–º-–∫–∞–Ω–∞–ª–µ –ê–ª–µ–∫—Å–µ—è –§—É—Ä—Å–∏–Ω–∞. –ù–∞–ø–æ–º–∏–Ω–∞–µ—Ç –æ –¥–µ–¥–ª–∞–π–Ω–∞—Ö –ø–æ–¥–∞—á–∏ –∑–∞—è–≤–æ–∫ –Ω–∞ –≥—Ä–∞–Ω—Ç—ã –∏ –æ—Ç—á–∏—Ç—ã–≤–∞–µ—Ç—Å—è –æ –ø—Ä–æ–¥–µ–ª–∞–Ω–Ω–æ–π —Ä–∞–±–æ—Ç–µ. –ù–∞–ø—Ä–∏–º–µ—Ä, –≤ –∏—é–Ω–µ –∞–∫—Ç–∏–≤–Ω–æ —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–ª –æ –¥–µ–ª–æ–≤–æ–π —Å–æ—Å—Ç–∞–≤–ª—è—é—â–µ–π Moscow Fash

In [248]:
df.category.value_counts()

society       42
science       14
technology    12
economy       11
sports        10
other          1
Name: category, dtype: int64

In [252]:
df.agency.value_counts()

TJournal      10
rbc_sport     10
vedomosti     10
vcnews         9
meduzalive     9
rozetked       8
nplusone       8
VwordMedia     8
now_ka         8
addmeto        7
ohmypain       3
Name: agency, dtype: int64

In [345]:
db = pd.read_pickle('db.pkl')

In [413]:
del(db)

In [443]:
join_all(agency_list)

–°–æ–±–∏—Ä–∞—é rbc_sport...
....... complited


–°–æ–±–∏—Ä–∞—é ohmypain...
....... complited


–°–æ–±–∏—Ä–∞—é addmeto...
....... complited




Unnamed: 0,date,title,short_news,first_link,raw_news,category,agency
63382,2022-06-30 10:25:46,–í—è—á–µ—Å–ª–∞–≤ –í–æ–ª–æ–¥–∏–Ω –ø—Ä–æ–∫–æ–º–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–ª –≤—Å—Ç—É–ø–ª–µ–Ω–∏–µ –§...,–í—è—á–µ—Å–ª–∞–≤ –í–æ–ª–æ–¥–∏–Ω –ø—Ä–æ–∫–æ–º–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–ª –≤–æ–∑–º–æ–∂–Ω–æ–µ –≤—Å...,https://t.me/vv_volodin/504,–í—è—á–µ—Å–ª–∞–≤ –í–æ–ª–æ–¥–∏–Ω –ø—Ä–æ–∫–æ–º–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–ª –≤–æ–∑–º–æ–∂–Ω–æ–µ –≤—Å...,society,meduzalive
63381,2022-06-30 10:23:57,–í –ë–µ–ª–∞—Ä—É—Å–∏ —Å–æ—Ä–≤–∞–ª–∏ –ø—Ä–∞–∑–¥–Ω–∏–∫ –¥–ª—è –¥–µ—Ç–µ–π –±–µ–∂–µ–Ω—Ü–µ–≤,–í –ë–µ–ª–∞—Ä—É—Å–∏ —Å–æ—Ä–≤–∞–ª–∏ –ø—Ä–∞–∑–¥–Ω–∏–∫ –¥–ª—è –¥–µ—Ç–µ–π —É–∫—Ä–∞–∏–Ω—Å–∫...,https://news.zerkalo.io/life/16948.html?tg,–í –ë–µ–ª–∞—Ä—É—Å–∏ —Å–æ—Ä–≤–∞–ª–∏ –ø—Ä–∞–∑–¥–Ω–∏–∫ –¥–ª—è –¥–µ—Ç–µ–π —É–∫—Ä–∞–∏–Ω—Å–∫...,society,meduzalive
22885,2022-06-30 10:22:09,–ú–æ—Å–±–∏—Ä–∂–∞ –æ—Ç–∫—Ä—ã–ª–∞—Å—å —Å –ø–æ–≤—ã—à–µ–Ω–∏–µ–º –∫—É—Ä—Å–∞ –¥–æ–ª–ª–∞—Ä–∞ ...,–í –ø—è—Ç–Ω–∏—Ü—É –ú–æ—Å–±–∏—Ä–∂–∞ –æ—Ç–∫—Ä—ã–ª–∞—Å—å —Å –ø–æ–≤—ã—à–µ–Ω–∏–µ–º –∫—É—Ä—Å...,tg://resolve?domain=@vedomosti,–ö–∞–∫ –æ—Ç–∫—Ä—ã–ª–∞—Å—å –ú–æ—Å–±–∏—Ä–∂–∞ üîª –ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ 10:1...,economy,vedomosti
12985,2022-06-30 10:04:52,Xiaomi –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç –Ω–æ–≤—ã–π —Ñ–∏—Ç–Ω–µ—Å-–±—Ä–∞—Å–ª–µ—Ç —Å —É–≤–µ–ª–∏...,–í –∏—é–ª–µ Xiaomi –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç –Ω–æ–≤—ã–π —Ñ–∏—Ç–Ω–µ—Å-–±—Ä–∞—Å–ª–µ—Ç ...,https://rozetked.me/news/24218-xiaomi-ob-yavil...,–≠—Ç–æ Xiaomi Smart Band 7 Pro! –ü—Ä–µ–∑–µ–Ω—Ç–∞—Ü–∏—è —Ñ–∏—Ç–Ω...,technology,rozetked
63379,2022-06-30 10:04:07,"–í –ú–æ—Å–∫–≤–µ –ø–æ—è–≤–∏–ª–∏—Å—å –±–∏–ª–±–æ—Ä–¥—ã, –ø—Ä–æ—Å–ª–∞–≤–ª—è—é—â–∏–µ —É—á–∞...",–í –ú–æ—Å–∫–≤–µ –∏ –¥—Ä—É–≥–∏—Ö –≥–æ—Ä–æ–¥–∞—Ö –†–æ—Å—Å–∏–∏ –ø–æ—è–≤–∏–ª–∏—Å—å –±–∏–ª...,https://tvzvezda.ru/news/2022317134-wtzsO.html,ü§¶‚Äç‚ôÇÔ∏èüîç –í –ú–æ—Å–∫–≤–µ –∏ –¥—Ä—É–≥–∏—Ö –≥–æ—Ä–æ–¥–∞—Ö –ø–æ—è–≤–∏–ª–∏—Å—å –±–∏–ª–±...,society,meduzalive
...,...,...,...,...,...,...,...
5713,2022-06-20 13:31:02,–ö–æ–ª–æ–Ω–∫–∞ –°–µ—Ä–≥–µ—è –ß–∞–ø–Ω–∏–Ω–∞ –æ–± –æ—Ç—Å—Ç–∞–≤–∫–µ –º–∏—Ç—Ä–æ–ø–æ–ª–∏—Ç–∞...,ging.ru ‚Äì –ö–æ–ª–æ–Ω–∫–∞ –°–µ—Ä–≥–µ—è –ß–∞–ø–Ω–∏–Ω–∞ –æ–± –æ—Ç—Å—Ç–∞–≤–∫–µ –º...,https://vpost-media.ru/opinions/za-chto-patria...,–ö–æ–ª–æ–Ω–∫–∞ –°–µ—Ä–≥–µ—è –ß–∞–ø–Ω–∏–Ω–∞ –æ–± –æ—Ç—Å—Ç–∞–≤–∫–µ –º–∏—Ç—Ä–æ–ø–æ–ª–∏—Ç–∞...,society,VwordMedia
242,2022-06-17 23:33:42,–¢—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ –≤ –≤–∏—Ä—Ç—É–∞–ª—å–Ω–æ–π —Ä–µ–∞–ª—å–Ω–æ—Å—Ç–∏ –º–æ–∂–µ—Ç —Å–Ω–∏–∑...,–¢—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ –≤ –≤–∏—Ä—Ç—É–∞–ª—å–Ω–æ–π —Ä–µ–∞–ª—å–Ω–æ—Å—Ç–∏ –º–æ–∂–µ—Ç —Å–Ω–∏–∑...,https://www.tohoku.ac.jp/en/press/training_vir...,üîãüß†üíª –¢—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ –≤ –≤–∏—Ä—Ç—É–∞–ª—å–Ω–æ–π —Ä–µ–∞–ª—å–Ω–æ—Å—Ç–∏ –º–æ–∂–µ—Ç ...,science,now_ka
241,2022-06-15 23:05:59,–£—á–µ–Ω—ã–µ –æ–±–Ω–∞—Ä—É–∂–∏–ª–∏ –º–æ–ª–æ–¥—É—é –ø–ª–∞–Ω–µ—Ç—É —Å –º–∞—Å—Å–æ–π –Æ–ø–∏...,"–°–ø—É—Ç–Ω–∏–∫–∏ –¥–∞–ª–µ–∫–∏—Ö –∑–≤–µ–∑–¥ –ú–æ–∑–∞–∏–∫–∞ –ø—ã–ª—å–Ω—ã—Ö, –≤—Ä–∞—â–∞—é...",https://noirlab.edu/public/news/noirlab2212/,üõ∏ –ò–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è —Å —Ç–µ–ª–µ—Å–∫–æ–ø–∞ Gemini South –≤ –ß–∏–ª–∏ ...,science,now_ka
239,2022-06-13 23:50:58,–£—á–µ–Ω—ã–µ –Ω–∞—à–ª–∏ –Ω–æ–≤—ã–π —Å–ø–æ—Å–æ–± –ª–µ—á–µ–Ω–∏—è —Ö—Ä–æ–Ω–∏—á–µ—Å–∫–æ–π ...,–º–æ–ª–µ–∫—É–ª–∞ –≤ –Ω–µ—Ä–≤–Ω–æ–π —Å–∏—Å—Ç–µ–º–µ –º–æ–∂–µ—Ç –±—ã—Ç—å –∫–ª—é—á–æ–º –∫...,https://ucalgary.ca/news/ucalgary-researchers-...,üíäüß† –ú–æ–ª–µ–∫—É–ª–∞ –≤ –Ω–µ—Ä–≤–Ω–æ–π —Å–∏—Å—Ç–µ–º–µ –º–æ–∂–µ—Ç –±—ã—Ç—å –∫–ª—é—á–æ...,science,now_ka


In [463]:
db.category.value_counts()

society          78
economy          22
technology       21
science          16
sports           11
entertainment     6
other             1
Name: category, dtype: int64

In [464]:
db.agency.value_counts()

meduzalive    27
vcnews        23
vedomosti     19
TJournal      18
rbc_sport     14
rozetked      12
VwordMedia    11
nplusone      10
addmeto        9
now_ka         8
ohmypain       4
Name: agency, dtype: int64

In [465]:
# Channels to collect. The commented-out schedule further down passes this list to
# join_all as agency_list; each name ends up appended to the tg.i-c-a.su JSON URL.
agencies = [
    'vcnews',
    'nplusone',
    'TJournal',
    'vedomosti',
    'VwordMedia',
    'meduzalive',
    'rozetked',
    'now_ka',
    'rbc_sport',
    'ohmypain', 
    'addmeto']

In [468]:
# schedule.every().day.at("07:00").do(join_all, agency_list = agencies)
# schedule.every().day.at("13:05").do(join_all, agency_list = agencies)
# schedule.every().day.at("17:12").do(join_all, agency_list = agencies)
# schedule.every().day.at("21:00").do(join_all, agency_list = agencies)

# while True:
#     schedule.run_pending()
#     time.sleep(1)

In [477]:
db.to_csv('db.csv', encoding='utf-8')

In [509]:
db = pd.read_pickle('db.pkl')

In [508]:
schedule.clear()

In [560]:
db.index.names = ['id_news']

In [511]:
db.category.value_counts()

society          147
economy           44
technology        36
sports            20
science           17
entertainment     13
other              1
Name: category, dtype: int64

In [512]:
db.short_news[db.category != 'society']

12993    ¬´–û–±–∏–¥–Ω–æ –≤–∏–¥–µ—Ç—å —Ç–æ, —á—Ç–æ –Ω–µ –º–æ–∂–µ—à—å –∫—É–ø–∏—Ç—å, –ø—Ä–∞–≤–¥...
63455    ¬´–ë–µ—Å–ø–æ—Ä–Ω–æ¬ª ‚Äî —Ñ–∏–ª—å–º-–∏—Å–ø–æ–≤–µ–¥—å –ø–æ–¥—Ä–æ—Å—Ç–∫–∞, –∫–æ—Ç–æ—Ä—ã–π...
63454    –í –ì–æ–≥–æ–ª—å-—Ü–µ–Ω—Ç—Ä–µ –∏–∑ –ê–≤–∏–Ω—å–æ–Ω–∞ –ø—Ä–æ—à–ª–∞ –ø—Ä–µ–º—å–µ—Ä–∞ —Å–ø...
63449    Volkswagen –∑–∞—Ä–∞–±–∞—Ç—ã–≤–∞–µ—Ç ¬´—Ç–∞–∫ –º–Ω–æ–≥–æ, –∫–∞–∫ –Ω–∏–∫–æ–≥–¥...
22918    –°–±–µ—Ä–±–∞–Ω–∫ –ø–æ–≤—ã—Å–∏—Ç –∑–∞—Ä–ø–ª–∞—Ç—ã —Å–æ—Ç—Ä—É–¥–Ω–∏–∫–∞–º –Ω–∞ 8,5%....
                               ...                        
5714     –ü–æ–ª–∏—Ç–æ–ª–æ–≥ –ë–æ—Ä–∏—Å –ú–∏–Ω–∞–µ–≤ ‚Äì –æ –ì–µ–Ω–Ω–∞–¥–∏–∏ –ë—É—Ä–±—É–ª–∏—Å–µ,...
242      –¢—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ –≤ –≤–∏—Ä—Ç—É–∞–ª—å–Ω–æ–π —Ä–µ–∞–ª—å–Ω–æ—Å—Ç–∏ –º–æ–∂–µ—Ç —Å–Ω–∏–∑...
241      –°–ø—É—Ç–Ω–∏–∫–∏ –¥–∞–ª–µ–∫–∏—Ö –∑–≤–µ–∑–¥ –ú–æ–∑–∞–∏–∫–∞ –ø—ã–ª—å–Ω—ã—Ö, –≤—Ä–∞—â–∞—é...
239      –º–æ–ª–µ–∫—É–ª–∞ –≤ –Ω–µ—Ä–≤–Ω–æ–π —Å–∏—Å—Ç–µ–º–µ –º–æ–∂–µ—Ç –±—ã—Ç—å –∫–ª—é—á–æ–º –∫...
238      –ù–∞ –æ—Å—Ç—Ä–æ–≤–µ –£–∞–π—Ç –æ–±–Ω–∞—Ä—É–∂–µ–Ω –∫—Ä—É–ø–Ω

In [553]:
length = db.category.to_list()

In [556]:
result = max(len(x) for x in length)
result

13

In [621]:
schedule.clear()

In [599]:
db.agency.value_counts()

meduzalive      55
vedomosti       46
vcnews          44
TJournal        31
rbc_sport       22
rozetked        19
VwordMedia      14
nplusone        13
addmeto         11
russianmacro    10
now_ka           8
ohmypain         5
Name: agency, dtype: int64

In [601]:
db[db.agency == 'russianmacro']

Unnamed: 0_level_0,date,title,short_news,first_link,raw_news,category,agency
id_news,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14884,2022-06-30 18:30:10,–¶–ë —É–≤–µ–ª–∏—á–∏–ª –ª–∏–º–∏—Ç –ø–µ—Ä–µ–≤–æ–¥–∞ —Å—Ä–µ–¥—Å—Ç–≤ —Ñ–∏–∑–ª–∏—Ü –Ω–∞ —Å...,–¶–ë —É–≤–µ–ª–∏—á–∏–ª –ª–∏–º–∏—Ç –ø–µ—Ä–µ–≤–æ–¥–∞ —Å—Ä–µ–¥—Å—Ç–≤ —Ñ–∏–∑–ª–∏—Ü –Ω–∞ —Å...,http://cbr.ru/press/event/?id=13976,–¶–ë –ü–†–û–î–û–õ–ñ–ê–ï–¢ –ü–û–ü–´–¢–ö–ò –í–´–¢–û–õ–ö–ê–¢–¨ –í–ê–õ–Æ–¢–£ –ò–ó –°–¢–†–ê...,economy,russianmacro
14883,2022-06-30 15:08:22,–í –º–∞–µ –≥—Ä—É–∑–æ–æ–±–æ—Ä–æ—Ç —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–∞ –≤ –†–§ —Å–Ω–∏–∑–∏–ª—Å—è –Ω–∞ ...,–í –º–∞–µ –≥—Ä—É–∑–æ–æ–±–æ—Ä–æ—Ç —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–∞ –≤ –†–§ —Å–Ω–∏–∑–∏–ª—Å—è –Ω–∞ ...,,–ì–†–£–ó–û–û–ë–û–†–û–¢ –í –ú–ê–ï: –¢–†–£–ë–û–ü–†–û–í–û–î–ù–´–ô –¢–†–ê–ù–°–ü–û–†–¢ –ù–ï...,economy,russianmacro
14882,2022-06-30 14:30:05,–†–æ—Å—Å–∏–π—Å–∫–∏–π –∏–ø–æ—Ç–µ—á–Ω—ã–π —Ä—ã–Ω–æ–∫ –ø–µ—Ä–µ–∂–∏–ª –∫–æ–ª–ª–∞–ø—Å,–†–æ—Å—Å–∏–π—Å–∫–∏–π –∏–ø–æ—Ç–µ—á–Ω—ã–π —Ä—ã–Ω–æ–∫ –ø–µ—Ä–µ–∂–∏–ª –∫–æ–ª–ª–∞–ø—Å. –í ...,,"–ö–û–õ–õ–ê–ü–° –ò–ü–û–¢–ï–ß–ù–û–ì–û –†–´–ù–ö–ê, –ù–ê–ß–ê–í–®–ò–ô–°–Ø –í –ê–ü–†–ï–õ–ï,...",economy,russianmacro
14881,2022-06-30 12:31:58,"–†–æ—Å—Å–∏–π—Å–∫–∏–µ —Å–∞–Ω–∫—Ü–∏–∏, –æ—Ç–∫–ª—é—á–µ–Ω–∏–µ –†–æ—Å—Å–∏–∏ –æ—Ç SWIFT...","–†–æ—Å—Å–∏–π—Å–∫–∏–µ —Å–∞–Ω–∫—Ü–∏–∏, –æ—Ç–∫–ª—é—á–µ–Ω–∏–µ –†–æ—Å—Å–∏–∏ –æ—Ç SWIFT...",tg://resolve?domain=@e_magic,‚ö°Ô∏è–í —Å–≤—è–∑–∏ —Å–æ —Å–ª–æ–∂–∏–≤—à–µ–π—Å—è –≥–µ–æ–ø–æ–ª–∏—Ç–∏—á–µ—Å–∫–æ–π —Å–∏—Ç—É–∞...,economy,russianmacro
14880,2022-06-30 12:03:42,–†–æ—Å—Å–∏–π—Å–∫–∏–π —Ñ–æ–Ω–¥–æ–≤—ã–π —Ä—ã–Ω–æ–∫ ‚Äì —ç—Ç–æ –≥–∏–±–ª–æ–µ –º–µ—Å—Ç–æ –¥...,–†–æ—Å—Å–∏–π—Å–∫–∏–π —Ñ–æ–Ω–¥–æ–≤—ã–π —Ä—ã–Ω–æ–∫ ‚Äì —ç—Ç–æ —Å–µ–π—á–∞—Å –≥–∏–±–ª–æ–µ ...,https://www.gazprom.ru/press/news/2022/june/ar...,–ï–°–¢–¨ –í–ï–©–ò –ü–û–í–ê–ñ–ù–ï–ï –§–û–ù–î–û–í–û–ì–û –†–´–ù–ö–ê –ü–æ—Å–ª–µ —Ç–æ–≥–æ...,economy,russianmacro
14879,2022-06-30 11:30:49,–†–æ—Å—Å—Ç–∞—Ç –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–ª –∑–∞–º–µ–¥–ª–µ–Ω–∏–µ –ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω–æ–≥–æ ...,–†–æ—Å—Å—Ç–∞—Ç –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–ª –∑–∞–º–µ–¥–ª–µ–Ω–∏–µ –ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω–æ–≥–æ ...,https://t.me/russianmacro/14865,–ü–ê–î–ï–ù–ò–ï –ü–†–û–ú–´–®–õ–ï–ù–ù–û–°–¢–ò –í –ú–ê–ï –ó–ê–ú–ï–î–õ–ò–õ–û–°–¨. –í–´–ü–£...,economy,russianmacro
14878,2022-06-30 10:15:24,–≠–∫—Å–ø–µ—Ä—Ç—ã ¬´–ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫–∞.–ú–Ω–µ–Ω–∏–µ¬ª –æ–±–Ω–æ–≤–ª—è—é—Ç –ø—Ä–æ–≥–Ω...,–≠–∫—Å–ø–µ—Ä—Ç—ã ¬´–ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫–∞.–ú–Ω–µ–Ω–∏–µ¬ª –æ–±–Ω–æ–≤–ª—è—é—Ç –ø—Ä–æ–≥–Ω...,https://telegra.ph/Svezhie-makroprognozy-06-30,üí°–°–µ–≥–æ–¥–Ω—è –≤ –µ–∂–µ–Ω–µ–¥–µ–ª—å–Ω–æ–π —Ä—É–±—Ä–∏–∫–µ ¬´–ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫.–ú...,economy,russianmacro
14877,2022-06-30 10:01:52,–ò–Ω—Ñ–ª—è—Ü–∏—è –≤–æ –§—Ä–∞–Ω—Ü–∏–∏ –ø—Ä–æ–¥–æ–ª–∂–∏—Ç —Ä–∞—Å—Ç–∏,–ù–∞ —Ñ–æ–Ω–µ —É—Å–∫–æ—Ä–µ–Ω–∏—è —Ü–µ–Ω –Ω–∞ —ç–Ω–µ—Ä–≥–æ–Ω–æ—Å–∏—Ç–µ–ª–∏ –∏–Ω—Ñ–ª—è—Ü...,https://t.me/russianmacro/14870,–ò–ù–§–õ–Ø–¶–ò–Ø –í–û –§–†–ê–ù–¶–ò–ò: –¢–ï–ú–ü–´ –ü–†–û–î–û–õ–ñ–ê–Æ–¢ –£–í–ï–†–ï–ù–ù–û...,economy,russianmacro
14876,2022-06-30 09:12:03,–ö–∏—Ç–∞–π—Å–∫–∏–µ –∏–Ω–¥–µ–∫—Å—ã —Ç–æ—Ä–≥—É—é—Ç—Å—è –≤ –ø–ª—é—Å–µ –Ω–∞ —Ö–æ—Ä–æ—à–∏—Ö...,"–°–Ω–∏–∂–µ–Ω–∏–µ –í–í–ü –°–®–ê –≤ 1–∫–≤22, —Å -1.5% –¥–æ -1.6% –∞–Ω–Ω...",https://t.me/russianmacro/14875,üî¥üü¢ - –º–∏–Ω–∏–º–∞–ª—å–Ω—ã–µ —Ü–µ–Ω–æ–≤—ã–µ –∏–∑–º–µ–Ω–µ–Ω–∏—è –Ω–∞ WallStre...,economy,russianmacro
14875,2022-06-30 08:10:05,–î–ï–õ–û–í–ê–Ø –ê–ö–¢–ò–í–ù–û–°–¢—å –≤ –ö–ò–¢–ê–µ –≤ –ò–Æ–ù–ï ‚Äì –õ–û–ö–î–ê–£–ù–´ –°...,–î–ï–õ–û–í–ê–Ø –ê–ö–¢–ò–í–ù–û–°–¢—å –≤ –ö–ò–¢–ê–µ –≤ –ò–Æ–ù–ï ‚Äì –õ–û–ö–î–ê–£–ù–´ –°...,,–î–ï–õ–û–í–ê–Ø –ê–ö–¢–ò–í–ù–û–°–¢–¨ –í –ö–ò–¢–ê–ï –í –ò–Æ–ù–ï ‚Äì –õ–û–ö–î–ê–£–ù–´ –°...,society,russianmacro


In [602]:
def ag(channel_name: str) -> None:
    """Fetch a channel's new items, process them and store them in ``table_news.pkl``.

    Loads ``table_news.pkl``, asks make_articles_dict for the channel's new
    records, labels each one with the fastText classifier, drops records
    labelled ``'not_news'``, converts the numeric timestamps to datetimes,
    tags the rows with the channel name and writes the merged table, sorted
    newest first, back to ``table_news.pkl``. When there are no new records
    the file is not rewritten.

    Args:
        channel_name: Channel to collect; also written to the ``'agency'`` column.
    """
    stored = pd.read_pickle('table_news.pkl')
    fresh = make_articles_dict(channel_name)
    if not fresh:
        return

    def category_of(text):
        # presumably labels look like '__label__<name>'; keep what follows the last '__'
        return model_class.predict(text)[0][0].split('__')[-1]

    def to_datetime(stamp):
        return datetime.fromtimestamp(stamp)

    # One row per message id, one column per record field.
    news = pd.DataFrame(fresh).T
    news = news.assign(category=news['short_news'].apply(category_of))
    news = news.loc[news['category'] != 'not_news']
    news = news.assign(
        date=news['date'].apply(to_datetime),
        agency=channel_name,
    )

    merged = pd.concat([stored, news]).sort_values('date', ascending=False)
    merged.to_pickle('table_news.pkl')

In [609]:
def make_articles_dict(channel_name):
    """Fetch a channel's messages and process the ones that are not stored yet.

    The message list is downloaded from ``https://tg.i-c-a.su/json/<channel>``.
    Messages whose id is not greater than the highest id already stored for
    this channel in ``table_news.pkl`` are skipped; the rest go through
    make_clean_text. A channel with no stored rows, or a missing table file,
    is treated as "nothing stored yet" and every fetched message is processed,
    so a brand-new channel can be imported without a hard-coded start id.

    Args:
        channel_name: Channel identifier appended to the API URL and matched
            against the ``'agency'`` column of the stored table.

    Returns:
        dict mapping message id to the record returned by make_clean_text.
        Records whose ``'raw_news'`` text is empty are left out.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Without a timeout a stalled connection would block the caller indefinitely.
    answer = requests.get('https://tg.i-c-a.su/json/' + channel_name, timeout=30)
    answer.raise_for_status()
    messages = answer.json()['messages']

    # Highest message id already stored for this channel; 0 means nothing is stored.
    try:
        temp_df = pd.read_pickle('table_news.pkl')
    except FileNotFoundError:
        start_id = 0
    else:
        known_ids = temp_df.index[temp_df['agency'] == channel_name]
        start_id = known_ids.max() if len(known_ids) else 0
        del temp_df  # release the table before the slow summarisation loop

    articles_dict = {}
    for message in messages:
        if message['id'] <= start_id:
            continue
        # Each record is keyed by the id of the message it was built from, so the
        # mapping stays correct wherever the new messages sit in the list.
        article = make_clean_text(message['message'], message['date'])
        if article['raw_news']:  # skip messages that contained no text
            articles_dict[message['id']] = article

    return articles_dict

In [616]:
meduzapro = make_articles_dict('meduzapro')

In [617]:
channel_dict = meduzapro

In [618]:
# One-off import of the 'meduzapro' channel: the same steps as ag(), applied to the
# channel_dict assigned in the previous cell, without the empty-dict guard.
db = pd.read_pickle('table_news.pkl')
df = pd.DataFrame(channel_dict).T
df['category'] = df['short_news'].apply(
    lambda x: model_class.predict(x)[0][0].split('__')[-1])  # classify with fastText and keep the class name
df = df.loc[df['category'] != 'not_news']  # drop items the classifier did not recognise as news
df['date'] = df['date'].apply(lambda x: datetime.fromtimestamp(x))  # convert the numeric timestamp to a datetime
df['agency'] = 'meduzapro'
ch_news = pd.concat([db, df]).sort_values('date', ascending=False)
ch_news.to_pickle('table_news.pkl')

In [3]:
import pandas as pd

In [4]:
db = pd.read_pickle('table_news.pkl')

In [5]:
db

Unnamed: 0,date,title,short_news,first_link,raw_news,category,agency
22966,2022-07-03 12:00:54,–ì–¥–µ –≤ –ü–æ–¥–º–æ—Å–∫–æ–≤—å–µ –æ—Ç–¥–æ—Ö–Ω—É—Ç—å —Å –ø–∞–ª–∞—Ç–∫–æ–π: –≤ –ü—É—à–∫...,‚õ∫Ô∏è –ì–¥–µ –≤ –ü–æ–¥–º–æ—Å–∫–æ–≤—å–µ –æ—Ç–¥–æ—Ö–Ω—É—Ç—å —Å –ø–∞–ª–∞—Ç–∫–æ–π: –≤ –ü...,http://vdmsti.ru/gurb,‚õ∫Ô∏è –ì–¥–µ –≤ –ü–æ–¥–º–æ—Å–∫–æ–≤—å–µ –æ—Ç–¥–æ—Ö–Ω—É—Ç—å —Å –ø–∞–ª–∞—Ç–∫–æ–π –ù–æ–≤—ã...,society,vedomosti
18843,2022-07-03 11:57:40,–†–æ—Å—Å–∏–π—Å–∫–∞—è —Å—Ç–æ—Ä–æ–Ω–∞ –Ω–∞–Ω–µ—Å–ª–∞ —É–¥–∞—Ä –ø–æ –ë–µ–ª–≥–æ—Ä–æ–¥—É —Ä...,–†–æ—Å—Å–∏–π—Å–∫–∞—è —Å—Ç–æ—Ä–æ–Ω–∞ –Ω–∞–Ω–µ—Å–ª–∞ —É–¥–∞—Ä –ø–æ –ë–µ–ª–≥–æ—Ä–æ–¥—É —Ä...,https://www.interfax.ru/russia/850182,"–ü–æ —Å–æ–æ–±—â–µ–Ω–∏—é —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã, —É–∫—Ä–∞–∏–Ω—Å–∫–∞...",society,thebell_io
57262,2022-07-03 11:56:53,–ú–∏—Ä–∑–∏—ë–µ–≤ –≤—Ç–æ—Ä–æ–π —Ä–∞–∑ –∑–∞ —Å—É—Ç–∫–∏ –≤—ã–ª–µ—Ç–µ–ª –≤ –ö–∞—Ä–∞–∫–∞–ª...,–ü—Ä–µ–∑–∏–¥–µ–Ω—Ç –£–∑–±–µ–∫–∏—Å—Ç–∞–Ω–∞ –®–∞–≤–∫–∞—Ç –ú–∏—Ä–∑–∏—ë–µ–≤ –ø–æ–æ–±–µ—â–∞–ª...,https://tjournal.ru/news/668194,–ü—Ä–µ–∑–∏–¥–µ–Ω—Ç –£–∑–±–µ–∫–∏—Å—Ç–∞–Ω–∞ –ø—Ä–∏–±—ã–ª –≤ –ö–∞—Ä–∞–∫–∞–ª–ø–∞–∫—Å—Ç–∞–Ω ...,society,TJournal
6474,2022-07-03 11:55:09,–ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –†–§ –æ–±–≤–∏–Ω–∏–ª–æ –£–∫—Ä–∞–∏–Ω—É –≤ —É–¥–∞—Ä–µ –ø–æ –ë–µ–ª–≥...,–ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã –†–§ –æ–±–≤–∏–Ω–∏–ª–æ –£–∫—Ä–∞–∏–Ω—É –≤ —É–¥–∞—Ä–µ –ø–æ –ë–µ–ª–≥...,https://meduza.io/news/2022/07/03/minoborony-r...,–î–ê–ù–ù–û–ï –°–û–û–ë–©–ï–ù–ò–ï (–ú–ê–¢–ï–†–ò–ê–õ) –°–û–ó–î–ê–ù–û –ò (–ò–õ–ò) –†–ê...,society,meduzapro
63625,2022-07-03 11:54:19,–†–æ—Å—Å–∏–π—Å–∫–æ–µ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã: –ë–µ–ª–≥–æ—Ä–æ–¥ –±—ã–ª –æ–±—Å—Ç—Ä–µ–ª—è–Ω ...,"–†–æ—Å—Å–∏–π—Å–∫–æ–µ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã —É—Ç–≤–µ—Ä–∂–¥–∞–µ—Ç, —á—Ç–æ –ë–µ–ª–≥–æ—Ä–æ–¥...",,"–†–æ—Å—Å–∏–π—Å–∫–æ–µ –ú–∏–Ω–æ–±–æ—Ä–æ–Ω—ã —É—Ç–≤–µ—Ä–∂–¥–∞–µ—Ç, —á—Ç–æ –ë–µ–ª–≥–æ—Ä–æ–¥...",society,meduzalive
...,...,...,...,...,...,...,...
5713,2022-06-20 13:31:02,–ö–æ–ª–æ–Ω–∫–∞ –°–µ—Ä–≥–µ—è –ß–∞–ø–Ω–∏–Ω–∞ –æ–± –æ—Ç—Å—Ç–∞–≤–∫–µ –º–∏—Ç—Ä–æ–ø–æ–ª–∏—Ç–∞...,ging.ru ‚Äì –ö–æ–ª–æ–Ω–∫–∞ –°–µ—Ä–≥–µ—è –ß–∞–ø–Ω–∏–Ω–∞ –æ–± –æ—Ç—Å—Ç–∞–≤–∫–µ –º...,https://vpost-media.ru/opinions/za-chto-patria...,–ö–æ–ª–æ–Ω–∫–∞ –°–µ—Ä–≥–µ—è –ß–∞–ø–Ω–∏–Ω–∞ –æ–± –æ—Ç—Å—Ç–∞–≤–∫–µ –º–∏—Ç—Ä–æ–ø–æ–ª–∏—Ç–∞...,society,VwordMedia
242,2022-06-17 23:33:42,–¢—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ –≤ –≤–∏—Ä—Ç—É–∞–ª—å–Ω–æ–π —Ä–µ–∞–ª—å–Ω–æ—Å—Ç–∏ –º–æ–∂–µ—Ç —Å–Ω–∏–∑...,–¢—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ –≤ –≤–∏—Ä—Ç—É–∞–ª—å–Ω–æ–π —Ä–µ–∞–ª—å–Ω–æ—Å—Ç–∏ –º–æ–∂–µ—Ç —Å–Ω–∏–∑...,https://www.tohoku.ac.jp/en/press/training_vir...,üîãüß†üíª –¢—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ –≤ –≤–∏—Ä—Ç—É–∞–ª—å–Ω–æ–π —Ä–µ–∞–ª—å–Ω–æ—Å—Ç–∏ –º–æ–∂–µ—Ç ...,science,now_ka
241,2022-06-15 23:05:59,–£—á–µ–Ω—ã–µ –æ–±–Ω–∞—Ä—É–∂–∏–ª–∏ –º–æ–ª–æ–¥—É—é –ø–ª–∞–Ω–µ—Ç—É —Å –º–∞—Å—Å–æ–π –Æ–ø–∏...,"–°–ø—É—Ç–Ω–∏–∫–∏ –¥–∞–ª–µ–∫–∏—Ö –∑–≤–µ–∑–¥ –ú–æ–∑–∞–∏–∫–∞ –ø—ã–ª—å–Ω—ã—Ö, –≤—Ä–∞—â–∞—é...",https://noirlab.edu/public/news/noirlab2212/,üõ∏ –ò–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è —Å —Ç–µ–ª–µ—Å–∫–æ–ø–∞ Gemini South –≤ –ß–∏–ª–∏ ...,science,now_ka
239,2022-06-13 23:50:58,–£—á–µ–Ω—ã–µ –Ω–∞—à–ª–∏ –Ω–æ–≤—ã–π —Å–ø–æ—Å–æ–± –ª–µ—á–µ–Ω–∏—è —Ö—Ä–æ–Ω–∏—á–µ—Å–∫–æ–π ...,–º–æ–ª–µ–∫—É–ª–∞ –≤ –Ω–µ—Ä–≤–Ω–æ–π —Å–∏—Å—Ç–µ–º–µ –º–æ–∂–µ—Ç –±—ã—Ç—å –∫–ª—é—á–æ–º –∫...,https://ucalgary.ca/news/ucalgary-researchers-...,üíäüß† –ú–æ–ª–µ–∫—É–ª–∞ –≤ –Ω–µ—Ä–≤–Ω–æ–π —Å–∏—Å—Ç–µ–º–µ –º–æ–∂–µ—Ç –±—ã—Ç—å –∫–ª—é—á–æ...,science,now_ka


In [6]:
start_id = db.index[db['agency'] == 'nltk'][0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [622]:
# schedule.clear()