In [8]:
import re
import spacy
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Advertising disclosure: the word "Реклама" followed, later on the same line
# ('.' does not cross newlines), by an ERID token or an ОГРН registration number.
_AD_DISCLOSURE_RE = re.compile(r'Реклама.*(ERID|Erid|erid|ОГРН)')

# Hashtags that mark a post as sponsored content (matched case-sensitively).
_AD_HASHTAGS = ('#нативнаяинтеграция', '#интеграция', '#реклама', '#спонсор')
_AD_HASHTAG_RE = re.compile('|'.join(re.escape(tag) for tag in _AD_HASHTAGS))

_URL_RE = re.compile(r'http\S+')
_DOT_RUN_RE = re.compile(r'(\. )+')
# Anything that is not a Cyrillic or Latin letter (digits, punctuation, emoji, ...).
_NON_LETTER_RE = re.compile(r'[^а-яА-Яa-zA-Z]')
_WHITESPACE_RE = re.compile(r'\s+')

# Sentinel returned for advertising posts. It is kept byte-for-byte as it was,
# because downstream code filters rows by comparing against this exact value.
_AD_MARKER = "–†–ï–ö–õ–ê–ú–ê"


def clean_text(input_text):
    """Normalise one post to lowercase letters separated by single spaces.

    Args:
        input_text: The raw post. Any object is accepted and converted with
            ``str()``; ``None`` and float NaN are treated as "no text".

    Returns:
        * ``''`` when ``input_text`` is ``None`` or NaN (instead of the
          literal words "none"/"nan" that ``str()`` would produce);
        * the advertising sentinel string when the text contains an ad
          disclosure ("Реклама ... ERID/ОГРН") or one of the ad hashtags;
        * otherwise the text with URLs removed, lowercased, 'ё' folded to 'е',
          every non-letter replaced by a space and whitespace runs collapsed.
          A leading/trailing single space may remain.
    """
    # Missing values must not turn into fake tokens such as "nan".
    if input_text is None or (isinstance(input_text, float) and input_text != input_text):
        return ''

    raw = str(input_text)

    # Ad detection runs on the untouched text: the patterns are case-sensitive.
    if _AD_DISCLOSURE_RE.search(raw) or _AD_HASHTAG_RE.search(raw):
        return _AD_MARKER

    # Strip URLs first, while '/' and ':' are still present to delimit them.
    text = _URL_RE.sub('', raw)

    # Lowercase, then fold 'ё' into 'е' (only the lowercase form can remain).
    text = text.lower().replace('ё', 'е')

    # Turn line breaks into sentence breaks and collapse repeated ". " runs.
    # The letter filter below currently discards these periods again.
    text = text.replace('\n', '. ')
    text = _DOT_RUN_RE.sub('. ', text)

    # Keep letters only, then squeeze the resulting whitespace.
    text = _NON_LETTER_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text)

In [3]:
def get_lemma(doc):
    """Return the lemmas of the content tokens of ``doc`` joined by single spaces.

    ``doc`` is any iterable of token-like objects exposing ``lemma_`` and the
    boolean flags ``is_stop``, ``is_punct``, ``is_space`` and ``is_digit``.
    Tokens with any of those flags set are left out; an empty iterable
    yields an empty string.
    """
    def _is_noise(tok):
        # Stop words, punctuation, whitespace and pure-digit tokens carry no content.
        return tok.is_stop or tok.is_punct or tok.is_space or tok.is_digit

    return ' '.join(tok.lemma_ for tok in doc if not _is_noise(tok))

In [3]:
def sw(clean_text):
    """Remove Russian stop words from ``clean_text``.

    The text is tokenised with NLTK's ``word_tokenize``; every token found in
    NLTK's Russian stop-word list is dropped and the remaining tokens are
    joined back with single spaces. The comparison is an exact string match.
    """
    # Dropping stop words is a standard text-cleaning step.
    russian_stops = set(stopwords.words('russian'))
    kept = filter(lambda word: word not in russian_stops, word_tokenize(clean_text))
    return ' '.join(kept)

In [4]:
# Load the large Russian spaCy model without the components listed in `exclude`.
# NOTE(review): spaCy's Russian lemmatizer appears to consult token.pos_/morph,
# which the excluded morphologizer would normally set - confirm that lemma
# quality is acceptable with these components left out.
nlp = spacy.load(
    'ru_core_news_lg',
    exclude=[
        'tok2vec',
        'morphologizer',
        'parser',
        'attribute_ruler',
        'ner',
    ],
)

In [20]:
# Load the raw posts; the path is relative to the notebook's working directory.
df = pd.read_csv('../../raw_data/mads_news_27cat.csv')

In [5]:
# Clean every post's text into a new column; posts recognised as advertising
# come back from clean_text as a sentinel string instead of cleaned text.
df['clear_text'] = df['text'].apply(clean_text)

In [6]:
# Drop the rows clean_text marked as advertising: the literal below must stay
# identical to the sentinel string that clean_text returns.
# NOTE: this rebinds `df`, so the unfiltered frame is no longer reachable.
df = df.query('clear_text != "–†–ï–ö–õ–ê–ú–ê"')

In [8]:
# Run the spaCy pipeline over all cleaned texts and keep the resulting Doc
# objects in a column. Per spaCy's documentation, n_process=-1 requests one
# worker process per available CPU.
df['clear_text_doc'] = list(nlp.pipe(df['clear_text'], n_process=-1))

In [9]:
# Replace the cleaned text with the space-joined lemmas of its content tokens.
# NOTE: this overwrites 'clear_text' in place, so re-running the nlp.pipe cell
# afterwards would process lemma strings rather than the cleaned text.
df['clear_text'] = df['clear_text_doc'].apply(get_lemma)

In [11]:
# Spot-check five random rows; no random_state is passed, so the rows shown
# differ from run to run.
df.sample(5)

Unnamed: 0.1,Unnamed: 0,date,text,category,name,channel_id,clean_text,clear_text
0,262,2021-09-09 13:50:22,‚Äã–ß—Ç–æ —Ç—Ä–µ–±—É–µ—Ç—Å—è –¥–ª—è —É—Å–ø–µ—Ö–∞ –≤ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä–∞—Ö.\n\n–õ—É...,–ë–∏–∑–Ω–µ—Å –∏ —Å—Ç–∞—Ä—Ç–∞–ø—ã,–ë–∏–∑–Ω–µ—Å –°—Ç–∏–ª—å,1482665221,—á—Ç–æ —Ç—Ä–µ–±—É–µ—Ç—Å—è —É—Å–ø–µ—Ö–∞ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä–∞—Ö . –ª—É—á—à–∏–µ –ø–µ—Ä–µ...,—Ç—Ä–µ–±—É–µ—Ç—Å—è —É—Å–ø–µ—Ö–∞ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä–∞—Ö –ª—É—á—à–∏–µ –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—â...
1,248,2021-08-31 10:04:44,‚Äã¬´–°–æ—Å—Ä–µ–¥–æ—Ç–æ—á—å –≤—Å–µ —Å–∏–ª—ã –Ω–∞ –¥–æ—Å—Ç–∏–∂–µ–Ω–∏–µ –∂–µ–ª–∞–µ–º–æ–≥–æ...,–ë–∏–∑–Ω–µ—Å –∏ —Å—Ç–∞—Ä—Ç–∞–ø—ã,–ë–∏–∑–Ω–µ—Å –õ–∏—á–Ω–æ—Å—Ç—å,1169845777,"—Å–æ—Å—Ä–µ–¥–æ—Ç–æ—á—å —Å–∏–ª—ã –¥–æ—Å—Ç–∏–∂–µ–Ω–∏–µ –∂–µ–ª–∞–µ–º–æ–≥–æ , –≤—Ä–µ–º...",—Å–æ—Å—Ä–µ–¥–æ—Ç–æ—á—å —Å–∏–ª—ã –¥–æ—Å—Ç–∏–∂–µ–Ω–∏–µ –∂–µ–ª–∞–µ–º–æ–≥–æ –≤—Ä–µ–º–µ–Ω–∏ ...
2,493,2022-06-22 18:14:44,‚Äã–°–∞–º–æ–º–æ—Ç–∏–≤–∞—Ü–∏—è –∏–ª–∏ –∫–∞–∫ –º–æ—Ç–∏–≤–∏—Ä–æ–≤–∞—Ç—å —Å–µ–±—è. 6 –∫–æ...,–ë–∏–∑–Ω–µ—Å –∏ —Å—Ç–∞—Ä—Ç–∞–ø—ã,–ë–∏–∑–Ω–µ—Å –°—Ç–∏–ª—å,1482665221,—Å–∞–º–æ–º–æ—Ç–∏–≤–∞—Ü–∏—è –º–æ—Ç–∏–≤–∏—Ä–æ–≤–∞—Ç—å . 6 –∫–æ–Ω–∫—Ä–µ—Ç–Ω—ã—Ö –ø—Ä–∏–µ...,—Å–∞–º–æ–º–æ—Ç–∏–≤–∞—Ü–∏—è –º–æ—Ç–∏–≤–∏—Ä–æ–≤–∞—Ç—å –∫–æ–Ω–∫—Ä–µ—Ç–Ω—ã—Ö –ø—Ä–∏–µ–º–æ–≤ ...
3,864,2022-02-25 10:09:00,"–ë–∏–∑–Ω–µ—Å-–∫–µ–π—Å: –°–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–∂–Ω—ã—Ö —Å–∞–π—Ç–æ–≤, CRM\n\nüí∏...",–ë–∏–∑–Ω–µ—Å –∏ —Å—Ç–∞—Ä—Ç–∞–ø—ã,Business Advisor,1284946539,"–±–∏–∑–Ω–µ—Å-–∫–µ–π—Å —Å–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–∂–Ω—ã—Ö —Å–∞–π—Ç–æ–≤ , crm –æ–¥–Ω...",–±–∏–∑–Ω–µ—Å –∫–µ–π—Å —Å–æ–∑–¥–∞–Ω–∏–µ —Å–ª–æ–∂–Ω—ã—Ö —Å–∞–π—Ç–æ–≤ crm –æ–¥–Ω–æ–π ...
4,570,2022-10-17 20:43:18,"‚Äã‚Äã3 —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞, –ø—Ä–∏–º–µ–Ω–∏–º—ã—Ö –≤ –±–∏–∑–Ω–µ—Å–µ.\n\n–ü—Å–∏...",–ë–∏–∑–Ω–µ—Å –∏ —Å—Ç–∞—Ä—Ç–∞–ø—ã,–ë–∏–∑–Ω–µ—Å –°—Ç–∏–ª—å,1482665221,"3 —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞ , –ø—Ä–∏–º–µ–Ω–∏–º—ã—Ö –±–∏–∑–Ω–µ—Å–µ . –ø—Å–∏—Ö–æ–ª–æ–≥...",—ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞ –ø—Ä–∏–º–µ–Ω–∏–º—ã—Ö –±–∏–∑–Ω–µ—Å–µ –ø—Å–∏—Ö–æ–ª–æ–≥–∏—á–µ—Å–∫–∏...
...,...,...,...,...,...,...,...,...
81022,132,2022-12-26 14:25:07,"–¢–∞–π. –ß–∞—Å—Ç—å 3.\n\n–ë–µ–∑ –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏–π, –∫–∞–∫ –≤—ã –ø–æ–Ω–∏–º...",–ë–ª–æ–≥,–°–µ–º—å—è –∏ –Ø,1693400962,"—Ç–∞–π . —á–∞—Å—Ç—å 3. –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏–π , –ø–æ–Ω–∏–º–∞–µ—Ç–µ , –æ–±–æ—à–ª...",—Ç–∞–π —á–∞—Å—Ç—å –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏–π –ø–æ–Ω–∏–º–∞–µ—Ç–µ –æ–±–æ—à–ª–æ—Å—å –ø—Ä—è–º–æ...
81023,133,2022-12-31 15:06:10,"2Ô∏è‚É£ 0Ô∏è‚É£ 2Ô∏è‚É£ 3Ô∏è‚É£\n\n–î–æ—Ä–æ–≥–∏–µ –º–æ–∏, —Å–µ–≥–æ–¥–Ω—è –ø–æ—Å–ª–µ–¥...",–ë–ª–æ–≥,–°–µ–º—å—è –∏ –Ø,1693400962,"2 0 2 3 –¥–æ—Ä–æ–≥–∏–µ –º–æ–∏ , —Å–µ–≥–æ–¥–Ω—è –ø–æ—Å–ª–µ–¥–Ω–∏–π –¥–µ–Ω—å —É...",–¥–æ—Ä–æ–≥–∏–µ –º–æ–∏ —Å–µ–≥–æ–¥–Ω—è –ø–æ—Å–ª–µ–¥–Ω–∏–π –¥–µ–Ω—å —É—Ö–æ–¥—è—â–µ–≥–æ –≥...
81024,134,2023-01-06 17:52:58,–ù–æ–≤–æ–≥–æ–¥–Ω–∏–µ –ø—Ä–∞–∑–¥–Ω–∏–∫–∏ –ø—Ä–æ—à–ª–∏ –¥–æ–≤–æ–ª—å–Ω–æ –Ω–∞—Å—ã—â–µ–Ω–æ....,–ë–ª–æ–≥,–°–µ–º—å—è –∏ –Ø,1693400962,–Ω–æ–≤–æ–≥–æ–¥–Ω–∏–µ –ø—Ä–∞–∑–¥–Ω–∏–∫–∏ –ø—Ä–æ—à–ª–∏ –¥–æ–≤–æ–ª—å–Ω–æ –Ω–∞—Å—ã—â–µ–Ω–æ ...,–Ω–æ–≤–æ–≥–æ–¥–Ω–∏–µ –ø—Ä–∞–∑–¥–Ω–∏–∫–∏ –ø—Ä–æ—à–ª–∏ –¥–æ–≤–æ–ª—å–Ω–æ –Ω–∞—Å—ã—â–µ–Ω–æ ...
81025,135,2023-01-10 13:32:33,üí∞–Å–º–∫–æ—Å—Ç—å —Ñ–∏–Ω–∞–Ω—Å–æ–≤–æ–π —á–∞—à–∏.\n\n\n–ù–∞—à—ë–ª –≤ –ø—Ä–æ—Å—Ç–æ—Ä...,–ë–ª–æ–≥,–°–µ–º—å—è –∏ –Ø,1693400962,–µ–º–∫–æ—Å—Ç—å —Ñ–∏–Ω–∞–Ω—Å–æ–≤–æ–π —á–∞—à–∏ . –Ω–∞—à–µ–ª –ø—Ä–æ—Å—Ç–æ—Ä–∞—Ö —Å–µ—Ç–∏...,–µ–º–∫–æ—Å—Ç—å —Ñ–∏–Ω–∞–Ω—Å–æ–≤–æ–π —á–∞—à–∏ –Ω–∞—à–µ–ª –ø—Ä–æ—Å—Ç–æ—Ä–∞—Ö —Å–µ—Ç–∏ —Ç...
