In [None]:
import re
from html.parser import HTMLParser
import pandas as pd
import emoji
import string
import fasttext
from nltk.corpus import stopwords
import spacy
from spacy.lang.ru import Russian
import concurrent.futures

# Abbreviation -> expansion table. preprocess_text() passes it to
# expand_contractions(), which applies it by plain substring replacement.
contractions = {
    "–∏ —Ç.–¥.": " –∏ —Ç–∞–∫ –¥–∞–ª–µ–µ ",
    "–∏ —Ç.–ø.": " –∏ —Ç–æ–º—É –ø–æ–¥–æ–±–Ω–æ–µ ",
    "—É–ª.": " —É–ª–∏—Ü–∞ "
}

# The 'ru_core_news_lg' spaCy pipeline, loaded once at module level and used
# by lemmatize_texts() and lemmatize_text().
nlp = spacy.load('ru_core_news_lg')

In [None]:
class MyHTMLParser(HTMLParser):
    """HTML parser that keeps only the character data it is fed.

    Each chunk reported through ``handle_data`` is stored in
    ``extracted_data``; ``get_data`` returns the chunks concatenated in the
    order they arrived. Tags themselves are not recorded.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Chunks of text seen so far, in arrival order.
        self.extracted_data = []

    def handle_data(self, data):
        """Store one chunk of text reported by the parser."""
        self.extracted_data += [data]

    def get_data(self):
        """Return every stored chunk joined into a single string."""
        separator = ''
        return separator.join(self.extracted_data)

In [None]:
def remove_non_cyrillic(text):
    """Replace every character that is not a Russian letter with a space.

    Kept characters are the lowercase and uppercase Russian alphabet
    (U+0430-U+044F, U+0410-U+042F) plus the letter yo in both cases
    (U+0451, U+0401). Each other character, including digits, punctuation
    and whitespace, becomes exactly one space, so the length of the text
    is preserved.

    Args:
        text: The string to filter.

    Returns:
        A string of the same length containing only Russian letters and
        spaces.
    """
    # The class is spelled with \u escapes so that it cannot be damaged by a
    # re-encoding of this file; the previously stored literal contained a
    # reversed range and failed to compile.
    return re.sub(r'[^\u0430-\u044f\u0410-\u042f\u0451\u0401]', ' ', text)

In [None]:
def replace_r_with_rubles(text):
    """Expand the short ruble marker that is attached to a number.

    Two substitutions run in order: first the marker directly after a digit
    (and followed by a word boundary), then the marker directly before a
    digit (and preceded by a word boundary). In both cases the digit is kept
    and the marker is replaced by the full word surrounded by spaces.
    """
    rules = (
        # marker after the number
        (r'(\d)—Ä\b', r'\1 —Ä—É–±–ª–µ–π '),
        # marker before the number
        (r'\b—Ä(\d)', r' —Ä—É–±–ª–µ–π \1'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
def replace_currency_symbols(text):
    """Replace currency signs in ``text`` with their full names.

    Every occurrence of each sign is substituted by the corresponding word
    padded with one space on each side. The signs are processed in the order
    listed below.
    """
    # (regex for the sign, replacement word)
    sign_names = (
        (r'\$', ' –¥–æ–ª–ª–∞—Ä–æ–≤ '),
        (r'‚Ç¨', ' –µ–≤—Ä–æ '),
        (r'¬£', ' —Ñ—É–Ω—Ç–æ–≤ '),
        (r'¬•', ' –π–µ–Ω '),
        (r'‚ÇΩ', ' —Ä—É–±–ª–µ–π '),
    )

    result = text
    for sign_pattern, full_name in sign_names:
        result = re.sub(sign_pattern, full_name, result)
    return result

In [None]:
def remove_punctuation_list_comp(text):
    """Replace runs of ASCII punctuation and whitespace with a single space.

    Any run made of characters from ``string.punctuation`` and whitespace
    becomes one space; remaining whitespace runs are collapsed and the
    result is stripped at both ends.
    """
    separator_run = r'[\s{}]+'.format(re.escape(string.punctuation))
    spaced = re.sub(separator_run, ' ', text)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()

In [None]:
def expand_contractions(text, contractions_dict):
    """Apply every ``short form -> expansion`` pair to ``text``.

    The pairs are applied one after another, in the mapping's iteration
    order, using plain substring replacement; a later pair therefore sees
    the output of the earlier ones.
    """
    expanded = text
    for short_form in contractions_dict:
        expanded = expanded.replace(short_form, contractions_dict[short_form])
    return expanded

In [None]:
def replace_units_with_full_names(text):
    """Replace stand-alone unit abbreviations with their full names.

    An abbreviation is replaced only when it forms a whole
    whitespace-delimited token. Unlike a plain ``' unit '`` substring
    search, this also covers a unit at the very start or end of the text
    and two units that follow each other (they share a single space).

    Args:
        text: The string to process.

    Returns:
        The text with every stand-alone abbreviation from the table
        replaced by its full name.
    """
    # Abbreviation -> full verbal form.
    units = {
        '–∫–≥': '–∫–∏–ª–æ–≥—Ä–∞–º–º',
        '–≥': '–≥—Ä–∞–º–º',
        '–º': '–º–µ—Ç—Ä',
        '—Å–º': '—Å–∞–Ω—Ç–∏–º–µ—Ç—Ä',
        '–º–º': '–º–∏–ª–ª–∏–º–µ—Ç—Ä',
        '–ª': '–ª–∏—Ç—Ä',
        '–º–ª': '–º–∏–ª–ª–∏–ª–∏—Ç—Ä',
        '—á': '—á–∞—Å',
        '–º–∏–Ω': '–º–∏–Ω—É—Ç–∞',
        '—Å–µ–∫': '—Å–µ–∫—É–Ω–¥–∞',
        '–∫–º': '–∫–∏–ª–æ–º–µ—Ç—Ä',
        '—à—Ç': '—à—Ç—É–∫'
    }

    # Longest abbreviations first so the alternation prefers the most
    # specific unit; the lookarounds require whitespace or a string edge on
    # both sides without consuming it.
    alternatives = '|'.join(
        re.escape(unit) for unit in sorted(units, key=len, reverse=True)
    )
    unit_token = re.compile(r'(?<!\S)(?:' + alternatives + r')(?!\S)')

    return unit_token.sub(lambda match: units[match.group(0)], text)

In [None]:
def replace_hyphens(text):
    """Spell out hyphens in ``text`` as words, depending on their context.

    The substitutions run in a fixed order and each one sees the output of
    the previous ones. The call to this function inside preprocess_text()
    is commented out, so it is not part of that pipeline.
    """
    # A hyphen in a numeric context is replaced by the "minus" word.
    text = re.sub(r'(?<=\d)\-(?=\d)', ' –º–∏–Ω—É—Å ', text)  # between two digits
    text = re.sub(r'(?<=\s)\-(?=\d)', ' –º–∏–Ω—É—Å ', text)  # after whitespace, before a digit
    
    # Hyphenated particles (the second alternation group) are meant to keep
    # their hyphen.
    # NOTE(review): the replacement r'\1-\2' rebuilds exactly what was
    # matched, so this call leaves the text unchanged and does nothing to
    # stop the general rule below from matching the same spans.
    text = re.sub(r'\b(\w+)-(—Ç–æ|–ª–∏–±–æ|–Ω–∏–±—É–¥—å|—Ç–∞–∫–∏|–∫–∞)\b', r'\1-\2', text)

    # General rule: a hyphen between two runs of word characters is replaced
    # by the "hyphen" word surrounded by spaces.
    text = re.sub(r'(?<!\w)(\w+)-(\w+)', r'\1 –¥–µ—Ñ–∏—Å \2', text)

    # Remaining cases.
    text = re.sub(r'(?<=\s)-(?=\s)', ' –¥–µ—Ñ–∏—Å ', text)  # surrounded by whitespace
    text = re.sub(r'(?<=\w)-(?=\s)', ' –¥–µ—Ñ–∏—Å ', text)  # after a word character, before whitespace

    return text

In [None]:
def remove_extra_spaces_regex(text):
    """Collapse every whitespace run to one space and trim both ends."""
    # Splitting on whitespace discards leading/trailing runs and yields the
    # words between the inner runs; joining puts exactly one space back.
    return ' '.join(text.split())

In [None]:
def replace_identifiers(text):
    """Replace identifier-like tokens with a fixed placeholder word.

    A token is matched, case-insensitively and between word boundaries, by
    any of the alternatives below: hexadecimal runs of 32, 40 or 64
    characters, a 36-character run of hex digits and hyphens, or any run of
    seven or more ASCII letters, digits and hyphens.
    """
    token_shapes = (
        r'[a-f0-9]{32}',
        r'[a-f0-9]{40}',
        r'[a-f0-9]{64}',
        r'[a-f0-9-]{36}',
        r'[a-zA-Z0-9-]{7,}',
    )
    identifier_re = re.compile(
        r'\b(' + '|'.join(token_shapes) + r')\b',
        re.IGNORECASE,
    )

    # The placeholder is padded with spaces so it never fuses with
    # neighbouring text.
    return identifier_re.sub(' –∏–¥–µ–Ω—Ç–∏—Ñ–∏–∫–∞—Ç–æ—Ä ', text)

In [None]:
def replace_phone_numbers_and_digits(text):
    """Replace phone numbers, identifiers and bare numbers with placeholders.

    The three steps run in this order, each on the output of the previous
    one: phone-number-like digit sequences, then identifier-like tokens
    (via replace_identifiers), then any remaining stand-alone run of digits.
    """
    # A phone number: optional '+', a digit, at least eight digits /
    # separators (hyphen, parentheses, dot, whitespace), and a final digit.
    without_phones = re.sub(
        r'\+?\d[\d\-\(\)\.\s]{8,}\d', ' –Ω–æ–º–µ—Ä —Ç–µ–ª–µ—Ñ–æ–Ω–∞ ', text
    )

    without_identifiers = replace_identifiers(without_phones)

    # Whatever whole-token numbers are left become the "number" placeholder.
    return re.sub(r'\b\d+\b', ' —á–∏—Å–ª–æ ', without_identifiers)

In [None]:
def replace_custom_text_emojis(text):
    """Replace text emoticons with short verbal descriptions.

    Each emoticon pattern is matched case-insensitively and every occurrence
    is replaced by its description padded with spaces. The patterns are
    applied in the order listed.
    """
    # (regex for the emoticon, description)
    emoticons = (
        (r':\)', ' —É–ª—ã–±–∫–∞ '),
        (r':\(', ' –≥—Ä—É—Å—Ç–Ω–æ–µ –ª–∏—Ü–æ '),
        (r':D', ' —Å–º–µ—Ö '),
        (r';\)', ' –ø–æ–¥–º–∏–≥–∏–≤–∞–Ω–∏–µ '),
    )

    for emoticon_pattern, description in emoticons:
        text = re.sub(emoticon_pattern, description, text, flags=re.IGNORECASE)
    return text

In [None]:
def replace_math_symbols_with_words(text):
    """Replace mathematical symbols in ``text`` with their verbal form.

    Every occurrence of each symbol is replaced, by plain substring
    replacement, with the matching word(s) padded with spaces. Symbols are
    processed in the order listed.
    """
    symbols = ('+', '*', '/', '=', '<', '>', '‚â§', '‚â•')
    words = (
        ' –ø–ª—é—Å ',
        ' —É–º–Ω–æ–∂–∏—Ç—å –Ω–∞ ',
        ' —Ä–∞–∑–¥–µ–ª–∏—Ç—å –Ω–∞ ',
        ' —Ä–∞–≤–Ω–æ ',
        ' –º–µ–Ω—å—à–µ ',
        ' –±–æ–ª—å—à–µ ',
        ' –º–µ–Ω—å—à–µ –∏–ª–∏ —Ä–∞–≤–Ω–æ ',
        ' –±–æ–ª—å—à–µ –∏–ª–∏ —Ä–∞–≤–Ω–æ ',
    )

    result = text
    for symbol, spoken in zip(symbols, words):
        result = result.replace(symbol, spoken)
    return result

In [None]:
def replace_slang(text):
    """Replace slang words with their standard equivalents.

    Each slang word is searched for between word boundaries,
    case-insensitively, and every match is replaced by the standard word
    padded with spaces. The words are processed in the order listed.
    """
    # (slang word, standard replacement)
    slang_words = (
        ('—Ç–≥', ' —Ç–µ–ª–µ—Ñ–æ–Ω '),
        ('—Å–ø—Å', ' —Å–ø–∞—Å–∏–±–æ '),
        ('–ø–ª–∏–∑', ' –ø–æ–∂–∞–ª—É–π—Å—Ç–∞ '),
        ('—á–µ–ª', ' —á–µ–ª–æ–≤–µ–∫ '),
        ('–∫–µ–∫', ' —Å–º–µ—à–Ω–æ '),
        ('–ª–æ–ª', ' —Å–º–µ—à–Ω–æ '),
    )

    for slang, normal in slang_words:
        bounded = r'\b' + slang + r'\b'
        text = re.sub(bounded, normal, text, flags=re.IGNORECASE)
    return text

In [None]:


# Matches http(s):// and bare www. links up to the next whitespace.
_URL_PATTERN = re.compile(
    r'https?://(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9]+\.[^\s]{2,}'
)


def preprocess_text(text):
    """Normalise one raw comment into a cleaned, space-separated string.

    Steps, in order: lowercase; slang, text emoticons, ruble markers and
    currency signs spelled out; phone numbers, identifiers and numbers
    replaced by placeholders; abbreviations from ``contractions`` expanded;
    links removed; emoji converted to words; HTML markup stripped; math
    symbols spelled out and punctuation removed; unit abbreviations
    expanded; everything except Russian letters dropped and whitespace
    collapsed.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text; an empty string for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # NOTE: replace_hyphens() is currently not applied in this pipeline.
    text = replace_slang(text)
    text = replace_custom_text_emojis(text)
    text = replace_r_with_rubles(text)
    text = replace_currency_symbols(text)
    text = replace_phone_numbers_and_digits(text)

    # Expand abbreviations.
    text = expand_contractions(text, contractions)

    # Remove links.
    # NOTE(review): identifiers and digits are replaced before this step, so
    # a link containing a token of seven or more characters has already been
    # partially rewritten and no longer matches as a whole — confirm whether
    # links should be stripped earlier.
    text = _URL_PATTERN.sub('', text)

    # Turn emoji into text of the form :name_of_emoji:, then replace the
    # ':' and '_' delimiters with spaces.
    text = emoji.demojize(text, language='ru').replace(':', ' ').replace('_', ' ')

    # Strip HTML markup.
    parser = MyHTMLParser()
    parser.feed(text)
    # close() flushes data the parser is still buffering (an unfinished tag
    # or a trailing '&...'); without it that tail of the text was lost.
    parser.close()
    extracted_text = parser.get_data()
    # If the parser changed anything, run the whole pipeline again on the
    # extracted text.
    if extracted_text != text:
        text = preprocess_text(extracted_text)

    # Spell out math symbols, then remove punctuation.
    text = remove_punctuation_list_comp(replace_math_symbols_with_words(text))

    text = replace_units_with_full_names(text)

    text = remove_extra_spaces_regex(remove_non_cyrillic(text))

    return text

In [None]:
def lemmatize_texts(texts):
    """Lemmatize a batch of texts with the module-level ``nlp`` pipeline.

    The texts are streamed through ``nlp.pipe``; for each resulting document
    the lemmas of its tokens are joined with single spaces.

    Returns:
        A list with one lemmatized string per input text, in input order.
    """
    return [
        " ".join(token.lemma_ for token in doc)
        for doc in nlp.pipe(texts)
    ]


In [None]:
def process_csv_and_lemmatize(input_file_path, output_file_path):
    """Add a ``lemmatized_comment`` column to a CSV file and save the result.

    Args:
        input_file_path: CSV file that contains a ``processed_comment``
            column.
        output_file_path: Where the extended CSV is written (without the
            index).

    Raises:
        ValueError: If the input has no ``processed_comment`` column.
    """
    # Load the data from the CSV file.
    df = pd.read_csv(input_file_path)

    # Make sure the required column is present.
    if 'processed_comment' not in df.columns:
        raise ValueError("–í —Ñ–∞–π–ª–µ –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç —Å—Ç–æ–ª–±–µ—Ü 'processed_comment'.")

    # read_csv turns empty cells into NaN (a float), which the spaCy pipeline
    # does not accept; treat them as empty text. The stored
    # 'processed_comment' column itself is left untouched.
    comments = df['processed_comment'].fillna('').astype(str)

    # Lemmatize all comments in one batch.
    df['lemmatized_comment'] = lemmatize_texts(comments)

    # Save the extended DataFrame to a new CSV file.
    df.to_csv(output_file_path, index=False)
    print("–§–∞–π–ª —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω:", output_file_path)

In [None]:
# Smoke test: run the preprocessing pipeline on one hand-written sample that
# mixes emoticons, currency signs, identifiers, phone numbers, an HTML tag,
# a link and emoji, and print the result.
sample_text = " –ü—Ä–∏–≤–µ—Ç :D –ö–∞–∫ –¥–µ–ª–∞? :) –ù–∞–¥–µ—é—Å—å,  ) –≤—Å—ë?—Ö–æ—Ä–æ—à–æ:($100, ‚Ç¨75, ¬£50, f wa awfx —Å–ø—Å ¬• 5000 –∏ ‚ÇΩ3000 –í–∞—à –∞–ø–∏.–∫–ª—é—á: abcd1234 ef5678gh, –≤–∞—à UUID: 123e4567-e89b-12d3-a456-426614174000. +7 123 456-78-90 –∏–ª–∏ (123) 456 7890. –í–æ–∑—Ä–∞—Å—Ç 30 –ª–µ—Ç –≠—Ç–æ –ø—Ä–∏–º–µ—Ä —Ç–µ–∫—Å—Ç–∞ —Å —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è–º–∏ —Ç–∏–ø–∞ –∫–≥ –∏ —Ç.–¥. –∏ HTML + <b>—Ç–µ–≥–∞–º–∏</b>. –ü–æ–¥—Ä–æ–±–Ω–µ–µ –Ω–∞ www.foufos.gr  –ü—Ä–∏–≤–µ—Ç üëã! –Ø —Ä–∞–¥ —ç—Ç–æ–º—É –¥–Ω—é üòä"
processed_text = preprocess_text(sample_text)
print(processed_text)

In [None]:
def find_potential_contractions(file_path, column_name='comments'):
    """Return the rows of a CSV file whose text contains abbreviation candidates.

    A candidate is a run of word characters followed by a dot that is itself
    directly followed by another word character (pattern ``\\b\\w+\\.\\b``).

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
        column_name: Name of the text column to scan.

    Returns:
        The rows with at least one candidate, with an extra
        ``potential_contractions`` column holding the list of matches.

    Raises:
        ValueError: If ``column_name`` is not a column of the file.
    """
    # Regular expression for potential abbreviations.
    contractions_pattern = re.compile(r'\b\w+\.\b')

    # Load the data.
    df = pd.read_csv(file_path)

    # Make sure the text column exists.
    if column_name not in df.columns:
        raise ValueError(f"–ö–æ–ª–æ–Ω–∫–∞ {column_name} –Ω–µ –Ω–∞–π–¥–µ–Ω–∞ –≤ —Ñ–∞–π–ª–µ.")

    # Missing cells are read as NaN, which the regex functions reject with a
    # TypeError; scan them as empty text instead. The original column is not
    # modified.
    comments = df[column_name].fillna('').astype(str)

    # Collect the candidates found in each comment.
    df['potential_contractions'] = comments.apply(contractions_pattern.findall)

    # Keep only the rows where something was found.
    return df[df['potential_contractions'].map(bool)]

In [None]:
# Scan the 'comment' column of dataset-ru2.csv for abbreviation candidates
# and save the matching rows to dataset/look2.csv.
file_path = 'dataset/dataset-ru2.csv'
result_df = find_potential_contractions(file_path, 'comment')
result_df.to_csv('dataset/look2.csv', index=False)

In [None]:
def process_csv_add_column(input_file_path, output_file_path):
    """Add a ``processed_comment`` column to a CSV file and save the result.

    The input must contain a ``comment`` column; each of its values is run
    through preprocess_text(). The extended table is written to
    ``output_file_path`` without the index.

    Raises:
        ValueError: If the input has no ``comment`` column.
    """
    frame = pd.read_csv(input_file_path)

    has_comment_column = 'comment' in frame.columns
    if not has_comment_column:
        raise ValueError("–í —Ñ–∞–π–ª–µ –æ—Ç—Å—É—Ç—Å—Ç–≤—É–µ—Ç —Å—Ç–æ–ª–±–µ—Ü 'comment'.")

    raw_comments = frame['comment']
    frame['processed_comment'] = raw_comments.apply(preprocess_text)

    frame.to_csv(output_file_path, index=False)
    print("–§–∞–π–ª —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω:", output_file_path)

In [None]:
# Preprocess the combined dataset and store it with the added
# 'processed_comment' column.
input_file = 'dataset/dataset_lg.csv'  # path to the input file
output_file = 'dataset/dataset_lg_train.csv'  # path where the new file is saved

process_csv_add_column(input_file, output_file)

In [None]:
# Lemmatize the preprocessed comments and write the result to
# dataset/dataset_lg_train_final.csv.
# NOTE(review): this input path has no 'dataset/' prefix, while the
# preprocessing cell writes 'dataset/dataset_lg_train.csv' — confirm which
# location is intended.
input_file_path = 'dataset_lg_train.csv'
output_file_path = 'dataset/dataset_lg_train_final.csv'
process_csv_and_lemmatize(input_file_path, output_file_path)

In [None]:
# Load the lemmatized dataset, print its number of rows and show the first rows.
df = pd.read_csv('dataset/train-ru2-lemmatized.csv')
print(df.shape[0])
df.head()

In [None]:
# Load the data.
df = pd.read_csv('train-ru-lemmatized.csv')

# Column to analyse.
column = df['lemmatized_comment']

# Tokenise each row once. str(x) keeps a missing value (NaN) from raising;
# such a cell is read as the literal text 'nan'.
tokenized = column.apply(lambda x: str(x).lower().split())

# Number of words in each row. (Previously this column was filled with None,
# because it stored the return value of set.update().)
df['word_count'] = tokenized.apply(len)

# Distinct words over the whole column.
unique_words = set()
for words in tokenized:
    unique_words.update(words)

# Longest and average row length, measured in characters.
print(column.apply(lambda x: len(str(x))).max())

print(column.apply(lambda x: len(str(x))).mean())

# Prints the number of distinct words collected above.
print("–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–ª–æ–≤ –≤ —Å—Ç–æ–ª–±—Ü–µ:", len(unique_words))

In [None]:
# Reload the lemmatized dataset and find the row with the most words.
df = pd.read_csv('train-ru-lemmatized.csv')

# .str.len() tolerates missing values (NaN), which apply(len) rejects with a
# TypeError; rows without text count as 0 words.
df['word_count'] = (
    df['lemmatized_comment'].str.split().str.len().fillna(0).astype(int)
)

# Index label of the row with the largest word count.
max_words_index = df['word_count'].idxmax()

# Print the cell with the largest word count.
print("–Ø—á–µ–π–∫–∞ —Å –Ω–∞–∏–±–æ–ª—å—à–∏–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º —Å–ª–æ–≤:")
print(df.loc[max_words_index, 'lemmatized_comment'])

# Print the whole row for more context.
print("\n–°—Ç—Ä–æ–∫–∞ —Å –Ω–∞–∏–±–æ–ª—å—à–∏–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º —Å–ª–æ–≤:")
print(df.loc[max_words_index])
# NOTE(review): hard-coded row label; presumably the longest row found in an
# earlier run — confirm, or use max_words_index instead.
print(df['comment'][250236])

In [None]:
# Reload the lemmatized dataset and compute the mean number of words per row.
df = pd.read_csv('train-ru-lemmatized.csv')
# .str.len() tolerates missing values (NaN), which apply(len) rejects with a
# TypeError; rows without text count as 0 words.
df['word_count'] = (
    df['lemmatized_comment'].str.split().str.len().fillna(0).astype(int)
)

# Average word count over all rows.
average_word_count = df['word_count'].mean()

print("–°—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–ª–æ–≤ –≤ —è—á–µ–π–∫–µ:", average_word_count)

In [None]:
# Load the first CSV file.
df1 = pd.read_csv('dataset/dataset-ru1.csv')

# Load the second CSV file.
df2 = pd.read_csv('dataset/dataset-ru2.csv')

# Stack the two frames row-wise with a fresh 0..n-1 index.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined frame as dataset/dataset_lg.csv.
combined_df.to_csv('dataset/dataset_lg.csv', index=False)

In [None]:
def lemmatize_text(text):
    """Lemmatize a single text with the module-level ``nlp`` pipeline.

    Returns:
        The lemmas of all tokens of ``text`` joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
# Quick manual check of lemmatize_text() on one sample sentence.
print(lemmatize_text('–≤–æ—Ç –ø—Ä—è–º –±–µ—Å—è—Ç –∏—Ö –ø–µ—Å–Ω–∏ –∞–∂ –±–ª–µ–≤–∞—Ç—å –≤—Å–µ–≥–¥–∞ —Ö–æ—á–µ—Ç—Å—è –∫–ª–æ—É–Ω'))

In [None]:
# Load dataset-ru2.csv and show its first rows.
df1 = pd.read_csv('dataset/dataset-ru2.csv')
df1.head()

In [None]:
# Load both source files.
df1 = pd.read_csv('dataset/dataset-ru1.csv')
df2 = pd.read_csv('dataset/dataset-ru2.csv')

# Stack the two frames row-wise with a fresh 0..n-1 index.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined frame as dataset/dataset-ru.csv.
combined_df.to_csv('dataset/dataset-ru.csv', index=False)

In [None]:
# Load dataset-ru3.csv and show its (rows, columns) shape.
df2 = pd.read_csv('dataset/dataset-ru3.csv')
df2.shape

In [None]:
# Inspect the raw comment stored under index label 239000.
df2['comment'][239000]

In [None]:
# Run every comment through preprocess_text() and keep the result in a new column.
df2['preprocessed_comment'] = df2['comment'].apply(preprocess_text)

# Save the updated DataFrame to a new CSV file.
df2.to_csv('updated_file-april.csv', index=False)

In [None]:
# Boolean mask marking every row whose preprocessed text occurs more than
# once (keep=False flags all members of a duplicate group).
duplicates = df2['preprocessed_comment'].duplicated(keep=False)

# Print the rows with duplicated values.
duplicate_rows = df2[duplicates]
print(duplicate_rows)

In [None]:

# Show all rows that share a preprocessed text with another row, sorted by
# that text so that identical values end up next to each other.
ids = df2["preprocessed_comment"]
df2[ids.isin(ids[ids.duplicated()])].sort_values("preprocessed_comment")

In [None]:
# Count the distinct whitespace-separated words in the preprocessed comments.
df2 = pd.read_csv('dataset/dataset_lg_train.csv')
# Drop incomplete rows from the frame that is iterated below. (Previously
# dropna() was called on the unrelated `df`, so NaN comments stayed in df2
# and broke the .split() call.)
df2.dropna(inplace=True)
unique_words = set()

# Walk over the column and add the words of every row to the set.
for comment in df2['processed_comment']:
    # Split on whitespace and merge the result into the set.
    unique_words.update(comment.split())

# Print the number of distinct words.
print("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤:", len(unique_words))

In [None]:
# Load the preprocessed dataset without the rows that have missing values.
df2 = pd.read_csv('dataset/dataset_lg_train.csv').dropna()

# Distinct words seen so far.
unique_words = set()

# Merge the whitespace-separated words of every comment into the set.
for comment in df2['processed_comment']:
    unique_words |= set(comment.split())

# Print the number of distinct words.
print("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤:", len(unique_words))

In [None]:
# Load dataset-ru2.csv and show summary statistics of its columns.
df = pd.read_csv('dataset/dataset-ru2.csv')
df.describe()

In [None]:
# Scratch arithmetic; presumably the row counts of two datasets being added — TODO confirm.
248283 + 14412