In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **IMPORTS**

In [2]:
from pathlib import Path

root_to_data = Path('/content/drive/MyDrive/gagarin_dataset')

In [45]:
%%time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statistics
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

!pip install --upgrade transformers
!pip install transformers[torch]
!pip install accelerate -U

import transformers

print(f'transformer varsion: {transformers.__version__}')

plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m297.4/297.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-

# **DATA**

In [4]:
sentiment_labels = pd.read_csv(f'{root_to_data}/sentiment.csv')
sentiment_labels.drop('Unnamed: 0',axis = 1,inplace = True)
print(sentiment_labels.shape, '\n', sentiment_labels.head(3))

(10931, 4) 
    MessageID   ChannelID  issuerid  SentimentScore
0     241407  1203560567       153               2
1      33684  1136626166       230               4
2      10090  1063908560       118               4


In [5]:
sentiment_text = pd.read_pickle(f'{root_to_data}/sentiment_texts.pickle')
print(sentiment_text.shape, '\n', sentiment_text.head(3))

(9289, 8) 
    MessageID   ChannelID  issuerid  SentimentScore           DateAdded  \
0     241407  1203560567       153               2 2023-05-12 19:03:20   
1      33684  1136626166       230               4 2023-02-03 20:56:29   
2      10090  1063908560       118               4 2023-06-02 19:18:37   

           DatePosted                                        MessageText  \
0 2023-05-12 19:02:42  ‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥  —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 20...   
1 2023-02-03 16:46:34  Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...   
2 2023-06-02 18:50:00  ‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏  üìà–í–¢–ë ...   

   IsForward  
0      False  
1      False  
2      False  


In [6]:
# –°–¥–µ–ª–∞–µ–º —Å–ª–∏—è–Ω–∏–µ sentiment_labels –∏ sentiment_text
summarization_df = sentiment_labels.merge(sentiment_text, how='right', on='MessageID')

summarization_df.drop(['ChannelID_y','issuerid_y','SentimentScore_y', 'MessageID', 'ChannelID_x', 'issuerid_x', 'SentimentScore_x', 'DateAdded', 'DatePosted', 'IsForward'], axis=1, inplace=True)

summarization_df = summarization_df.drop_duplicates(subset=['MessageText'], keep='first')

print(summarization_df.shape, '\n', summarization_df.head(5))

(7188, 1) 
                                           MessageText
0   ‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥  —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 20...
1   Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...
2   ‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏  üìà–í–¢–ë ...
6   ‚Äã‚ÄãWindfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ ...
22  –£ –Ω–∞—Å –±—ã–ª–æ 2 –ø–∞–∫–µ—Ç–∞ –ø—Ä–µ—Ñ–æ–≤ –°—É—Ä–≥—É—Ç–Ω–µ—Ñ—Ç–µ–≥–∞–∑–∞, 75...


**–î–ª—è –Ω–∞–∏–ª—É—á—à–µ–≥–æ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞ —Å—É–º–º–∞—Ä–∏–∑–∞—Ü–∏–∏ —Å—Ç–æ–∏—Ç —É–±—Ä–∞—Ç—å –≤—Å–µ —Å–º–∞–π–ª–∏–∫–∏** üî•

In [7]:
sentence = summarization_df['MessageText'][2]
print(sentence)
sentence_clean = re.sub('[(\U0001F600-\U0001F92F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F190-\U0001F1FF|\U00002702-\U000027B0|\U0001F926-\U0001FA9F|\u200d|\u2640-\u2642|\u2600-\u2B55|\u23cf|\u23e9|\u231a|\ufe0f)]+', '', sentence)
print(sentence_clean)

‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏  üìà–í–¢–ë +5.1% –í–¢–ë —Å–µ–≥–æ–¥–Ω—è –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç —á—É–¥–µ—Å–∞ –Ω–∞ –≤–∏—Ä–∞–∂–∞—Ö! –í –º–æ–º–µ–Ω—Ç–µ —Ç–µ—Ä—è–ª –±–æ–ª–µ–µ 5.5% –Ω–∞ –Ω–æ–≤–æ—Å—Ç—è—Ö –æ —Ü–µ–Ω–µ —Ä–∞–∑–º–µ—â–µ–Ω–∏—è –¥–æ–ø—ç–º–∏—Å—Å–∏–∏, –∫–æ—Ç–æ—Ä–∞—è –æ–∫–∞–∑–∞–ª–∞—Å—å –ø–æ—á—Ç–∏ –Ω–∞ 15% –Ω–∏–∂–µ –∫–æ—Ç–∏—Ä–æ–≤–æ–∫ –∑–∞–∫—Ä—ã—Ç–∏—è —á–µ—Ç–≤–µ—Ä–≥–∞(—Ü–µ–Ω–∞ —Ä–∞–∑–º–µ—â–µ–Ω–∏—è 0,018225 —Ä—É–±–ª–µ–π). –¢–µ–º –Ω–µ –º–µ–Ω–µ–µ, –ø–æ—Å–ª–µ –ø–∞–¥–µ–Ω–∏—è –∫–æ—Ç–∏—Ä–æ–≤–∫–∏ –Ω–∞—á–∞–ª–∏ —É–≤–µ—Ä–µ–Ω–Ω–æ –æ—Ç—Å–∫–∞–∫–∏–≤–∞—Ç—å –∏ –∞–∫—Ü–∏–∏ –≤—ã—à–ª–∏ –≤ –ª–∏–¥–µ—Ä—ã —Ä–æ—Å—Ç–∞üò≥  üìâ–ú–µ—á–µ–ª –∞–ø -0.5% –ï—â–µ –æ–¥–∏–Ω —Ñ–æ–∫—É—Å–Ω–∏–∫ –¥–Ω—è ‚Äî —ç—Ç–æ –ú–µ—á–µ–ª. –í –º–æ–º–µ–Ω—Ç–µ —Ç–µ—Ä—è–ª –±–æ–ª–µ–µ 5.5%, –°–î —Ä–µ–∫–æ–º–µ–Ω–¥–æ–≤–∞–ª –Ω–µ –≤—ã–ø–ª–∞—á–∏–≤–∞—Ç—å –¥–∏–≤–∏–¥–µ–Ω–¥—ã –ø–æ –∏—Ç–æ–≥–∞–º 2022 –≥–æ–¥–∞. –ö–æ–º–ø–∞–Ω–∏—è –ø–æ–ª—É—á–∏–ª–∞ —É–±—ã—Ç–æ–∫ –ø–æ –∏—Ç–æ–≥–∞–º 2022 –≥–æ–¥–∞ –≤ —Ä–∞–∑–º–µ—Ä–µ 33 258 899 —Ä

In [8]:
list_messages = summarization_df['MessageText'].tolist()
clean_messages_df = pd.DataFrame(columns=['CleanSentences'])

for sent in list_messages:
  sent_clean = re.sub('[(\U0001F600-\U0001F92F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F190-\U0001F1FF|\U00002702-\U000027B0|\U0001F926-\U0001FA9F|\u200d|\u2640-\u2642|\u2600-\u2B55|\u23cf|\u23e9|\u231a|\ufe0f)]+', '', sent)
  clean_messages_df = pd.concat([clean_messages_df, pd.DataFrame({'CleanSentences': [sent_clean]})], ignore_index=True)

clean_messages_df.head(3)

Unnamed: 0,CleanSentences
0,#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 2022–≥ ...
1,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...
2,‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—è–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ –í–¢–ë +5...


In [9]:
clean_messages_df.shape

(7188, 1)

# **–î–ª—è –∑–∞–¥–∞—á–∏ —Å—É–º–º–∞—Ä–∏–∑–∞—Ü–∏–∏ –±—É–¥–µ–º –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å sberbank-ai/rugpt3large_based_on_gpt2**

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# –ó–∞–≥—Ä—É–∑–∫–∞ –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏ T5 –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
model_name = 't5-base' # –ò—Å–ø–æ–ª—å–∑—É–π—Ç–µ 't5-base' –¥–ª—è –∞–Ω–≥–ª–∏–π—Å–∫–æ–≥–æ —è–∑—ã–∫–∞
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è —Å—É–º–º–∞—Ä–∏–∑–∞—Ü–∏–∏ —Ç–µ–∫—Å—Ç–∞
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(outputs[0])
    return summary

summary = summarize(sentence)
print(summary)

# def tokenize_function(examples):
#     inputs = tokenizer(examples["documents"], truncation=True, padding="max_length", max_length=1024)
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(examples["summaries"], truncation=True, padding="max_length", max_length=128)
#     inputs["labels"] = labels["input_ids"]
#     return inputs
# tokenized_dataset = your_dataset.map(tokenize_function, batched=True)

# model = BartForConditionalGeneration.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')

# training_args = Seq2SeqTrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     push_to_hub=False,
# )

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     tokenizer=tokenizer,
# )

# trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<pad> <unk> –æ–∫—É—Å<unk> <unk> —Ä–æ–¥–æ–ª<unk> –∞<unk> —Ç—Å<unk> –∞–∫<unk> –∏–∏ –∏–Ω–≤–µ—Å—Ç–∏<unk> –∏–∏ –≤—Ç<unk> 5 1 –≤—Ç<unk> —Å–µ<unk> –æ–¥–Ω<unk> –æ–∫–∞<unk> –∞–ª–∞—Å<unk> 15 –Ω–∏<unk> –µ –∫–æ—Ç–∏—Ä–æ–≤–æ–∫ <unk> –∞–∫—Ä<unk> —Ç–∏<unk> <unk> –µ—Ç–≤–µ—Ä<unk> –∞ <unk> –µ–Ω–∞ —Ä–∞<unk> –º–µ<unk> –µ–Ω–∏<unk> 0 018225 —Ä—É<unk> –ª–µ<unk> –º–µ–Ω–µ–µ <unk> –∞–¥–µ–Ω–∏<unk> –∫–æ—Ç–∏</s>


In [None]:
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(outputs[0])
    return summary

summary = summarize(sentence)
print(summary)

AttributeError: 'WordPunctTokenizer' object has no attribute 'encode'

# **–ü–æ–¥–±–∏—Ä–∞–µ–º –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω—ã–µ –º–æ–¥–µ–ª–∏ —Å [*hugging face*](https://huggingface.co/) –Ω–∞ –ø—Ä–∏–º–µ—Ä–µ –∏–∑ –æ–¥–Ω–æ–≥–æ —Ç–µ–∫—Å—Ç–∞**

In [48]:
example = clean_messages_df['CleanSentences'][3]
example

'\u200b\u200bWindfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏ –∑–∞–ø–ª–∞—Ç—è—Ç –ú–∏–Ω—Ñ–∏–Ω—É.  –ú–Ω–æ–≥–∏–µ –ø–æ–Ω–∏–º–∞—é—Ç, —á—Ç–æ –Ω–∞—à –±—é–¥–∂–µ—Ç –∏—Å–ø—ã—Ç—ã–≤–∞–µ—Ç —Ç—Ä—É–¥–Ω–æ—Å—Ç–∏ –∏ –µ–≥–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –Ω–∞–ø–æ–ª–Ω—è—Ç—å –≤—Å–µ–≤–æ–∑–º–æ–∂–Ω—ã–º–∏ —Å–ø–æ—Å–æ–±–∞–º–∏. –ú–∏–Ω—Ñ–∏–Ω –∏—Å–ø–æ–ª—å–∑—É–µ—Ç —Ä–∞–∑–ª–∏—á–Ω—ã–µ —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–µ —Ä—ã—á–∞–≥–∏: –≤—ã—Ö–æ–¥ –Ω–∞ –¥–æ–ª–≥–æ–≤–æ–π —Ä—ã–Ω–æ–∫ ‚Äî –û–§–ó, –ø—Ä–æ–¥–∞–∂–∞ –∑–æ–ª–æ—Ç–∞ –∏ —é–∞–Ω–µ–π –∏–∑ –§–ù–ë, –Ω–æ –æ–¥–Ω–∏–º –∏–∑ –≥–ª–∞–≤–Ω—ã—Ö —è–≤–ª—è–µ—Ç—Å—è –Ω–∞–ª–æ–≥–æ–≤—ã–π.   –ö–∞–∫ –≤—ã –º–æ–≥–ª–∏ –∑–∞–º–µ—Ç–∏—Ç—å, –ø–æ–≤—ã—à–µ–Ω–∏–µ –ù–î–ü–ò –Ω–∞–ª–æ–≥ –Ω–∞ –¥–æ–±—ã—á—É –ø–æ–ª–µ–∑–Ω—ã—Ö –∏—Å–∫–æ–ø–∞–µ–º—ã—Ö –¥–ª—è —Ä–∞–∑–ª–∏—á–Ω—ã—Ö –æ—Ç—Ä–∞—Å–ª–µ–π –≤ —ç—Ç–æ–º –≥–æ–¥—É —É–∂–µ —Å–æ—Å—Ç–æ—è–ª–æ—Å—å. –ü—Ä–∏—á—ë–º –Ω–µ–∫–æ—Ç–æ—Ä—ã–µ –∑–∞–ø–ª–∞—Ç–∏–ª–∏ —Ä–∞–∑–æ–≤—ã–π —É–∂–µ –≤ –ø—Ä–æ—à–ª–æ–º –∏ –±—É–¥—É—Ç –ø–ª–∞—Ç–∏—Ç—å –≤ —ç—Ç–æ–º: –ì–∞–∑–ø—Ä–æ–º, –ê–ª—Ä–æ—Å–∞. –≠—Ç–

In [46]:
%%time
from transformers import T5ForConditionalGeneration, T5TokenizerFast, Trainer, TrainingArguments

# –ó–∞–¥–∞–¥–∏–º –Ω–∞–∑–≤–∞–Ω–∏–µ –≤—ã–±—Ä–æ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏ –∏–∑ —Ö–∞–±–∞
MODEL_NAME = 'UrukHan/t5-russian-summarization'

# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –∏ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

CPU times: user 1.49 s, sys: 1.3 s, total: 2.8 s
Wall time: 11.1 s


In [49]:
inputs = tokenizer(example, return_tensors="pt", padding=True, truncation=True, max_length=1024)

# –ì–µ–Ω–µ—Ä–∞—Ü–∏—è —Ä–µ–∑—é–º–µ
summary_ids = model.generate(**inputs)

# –î–µ–∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏–µ –≤—ã—Ö–æ–¥–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary)

¬´–°–±–µ—Ä–±–∞–Ω–∫ –∏ –í–¢–ë –Ω–µ –º–æ–≥—É—Ç –¥–æ–≥–æ–≤–æ—Ä–∏—Ç—å—Å—è¬ª: –ú–∏–Ω—Ñ–∏–Ω –†–§ –æ –ø–æ–≤—ã—à–µ–Ω–∏–∏ –ù–î–ü–ò


# **–ü–æ–¥–≥–æ—Ç–∞–≤–ª–∏–≤–∞–µ–º –¥–∞–Ω–Ω—ã–µ –¥–ª—è —Ñ–∞–π–Ω—Ç—é–Ω–∏–Ω–≥–∞ –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏**

In [40]:
clean_messages_df['CleanSentences'][19]

'\u200b\u200b–ì–æ–ª—É–±—ã–µ —Ñ–∏—à–∫–∏ —Å –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª–æ–º +50%  –†—ã–Ω–∫–∏ –∞–∫—Ü–∏–π —Ä–∞–¥—É—é—Ç –≤ —ç—Ç–æ–º –≥–æ–¥—É: —Å —è–Ω–≤–∞—Ä—è –≤ —Ö–æ—Ä–æ—à–µ–º –ø–ª—é—Å–µ –≤—Å–µ —ç—à–µ–ª–æ–Ω—ã —Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö —Ü–µ–Ω–Ω—ã—Ö –±—É–º–∞–≥, –∞ —Ç–∞–∫–∂–µ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–µ –±—É–º–∞–≥–∏, –¥–æ—Å—Ç—É–ø–Ω—ã–µ —Ä–æ—Å—Å–∏—è–Ω–∞–º. –ü—Ä–æ–π–¥–µ–º—Å—è –ø–æ —Å–∞–º—ã–º –ª–∏–∫–≤–∏–¥–Ω—ã–º –∞–∫—Ü–∏—è–º —Å –≤—ã—Å–æ–∫–∏–º –Ω–µ —Ä–µ–∞–ª–∏–∑–æ–≤–∞–Ω–Ω—ã–º –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª–æ–º.  –ì–æ–ª—É–±—ã–µ —Ñ–∏—à–∫–∏ –†–§  –î–∞–∂–µ —Å–µ–π—á–∞—Å, –ø–æ—Å–ª–µ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –º–µ—Å—è—Ü–µ–≤ —Ä–æ—Å—Ç–∞ —Ä–æ—Å—Å–∏–π—Å–∫–∏–π —Ä—ã–Ω–æ–∫ –æ—Å—Ç–∞–µ—Ç—Å—è –Ω–µ–¥–æ–æ—Ü–µ–Ω–µ–Ω–Ω—ã–º –≤ —Ä—É–±–ª—è—Ö –∏ –º–æ–∂–µ—Ç –ø–æ—Ö–≤–∞—Å—Ç–∞—Ç—å—Å—è –≤—ã—Å–æ–∫–æ–π –æ–∂–∏–¥–∞–µ–º–æ–π –¥–æ—Ö–æ–¥–Ω–æ—Å—Ç—å—é –¥–∞–∂–µ —Å—Ä–µ–¥–∏ –∞–∫—Ü–∏–π —Ç—è–∂–µ–ª–æ–≤–µ—Å–æ–≤.  –û—Ç–±–µ—Ä–µ–º –±—É–º–∞–≥–∏, –∫–æ—Ç–æ—Ä—ã–µ –≤—Ö–æ–¥—è—Ç –≤ —Å–æ—Å—Ç–∞–≤ –ò–Ω–¥–µ–∫—Å–∞ –ú–æ—Å–ë–∏—Ä–∂–∏ –≥–æ–ª—É–±—ã—Ö —Ñ–∏—à–µ–∫ —Ç–æ–ø-15 —Å–∞–º—ã—Ö –∫—Ä—É–ø–Ω—ã—Ö –∏ 

–ü—É—Å—Ç—å –¥–∞—Ç–∞—Å–µ—Ç –¥–ª—è –æ–±—É—á–µ–Ω–∏—è –±—É–¥–µ—Ç —Å–æ—Å—Ç–æ—è—Ç—å –∏–∑ 20 —ç–ª–µ–º–µ–Ω—Ç–æ–≤

In [17]:
clean_sentence = clean_messages_df['CleanSentences'].iloc[0:20]
DataSet = pd.DataFrame(clean_sentence, columns=['CleanSentences'])
DataSet.head(3)

Unnamed: 0,CleanSentences
0,#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 2022–≥ ...
1,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...
2,‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—è–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ –í–¢–ë +5...


–Ø –Ω–µ —Å–º–æ–≥ –Ω–∞–π—Ç–∏ –¥–∞—Ç–∞—Å–µ—Ç—ã –≤ –æ—Ç–∫—Ä—ã—Ç–æ–º –¥–æ—Å—Ç—É–ø–µ, –∫–æ—Ç–æ—Ä—ã–µ –ø–æ–¥–æ—à–ª–∏ –±—ã –¥–ª—è —ç—Ç–æ–π –∑–∞–¥–∞—á–∏ –∏ –¥–ª—è —Ç–∞–∫–∏—Ö –¥–∞–Ω–Ω—ã—Ö, –ø–æ—ç—Ç–æ–º—É —Å–¥–µ–ª–∞–ª –¥–∞—Ç–∞—Å–µ—Ç –≤—Ä—É—á–Ω—É—é, –ø—É—Å—Ç—å –∏ –Ω–µ –æ—á–µ–Ω—å –±–æ–ª—å—à–æ–π

In [41]:
DataSet.loc[0, 'summarize'] = "–°–µ–ª–∏–≥–¥–∞—Ä –Ω–µ –≤—ã–ø–ª–∞—á–∏–≤–∞–µ—Ç –¥–∏–≤–∏–¥–µ–Ω–¥—ã –∑–∞ 2022 –≥–æ–¥."
DataSet.loc[1, 'summarize'] = "Ozon –∑–∞–ø—É—Å–∫–∞–µ—Ç –Ω–æ–≤—ã–π —Ñ–∏–Ω—Ç–µ—Ö-–ø—Ä–æ–¥—É–∫—Ç ¬´–î–µ–Ω—å–≥–∏ –Ω–∞ –∑–∞–∫—É–ø–∫–∏¬ª, –ø–æ–∑–≤–æ–ª—è—é—â–∏–π –ø—Ä–æ–¥–∞–≤—Ü–∞–º –Ω–∞ –º–∞—Ä–∫–µ—Ç–ø–ª–µ–π—Å–µ –ø–æ–ª—É—á–∞—Ç—å —Ñ–∏–Ω–∞–Ω—Å–∏—Ä–æ–≤–∞–Ω–∏–µ —Å –æ—Ç—Å—Ä–æ—á–∫–æ–π –ø–ª–∞—Ç–µ–∂–∞. Ozon –æ–ø–ª–∞—á–∏–≤–∞–µ—Ç –∑–∞–∫—É–ø–∫—É —Ç–æ–≤–∞—Ä–∞, –∞ –ø—Ä–æ–¥–∞–≤—Ü—ã –≤–Ω–æ—Å—è—Ç –ø–ª–∞—Ç—É –ø–æ –º–µ—Ä–µ –ø–æ—è–≤–ª–µ–Ω–∏—è —Å—Ä–µ–¥—Å—Ç–≤ –Ω–∞ –±–∞–ª–∞–Ω—Å–µ –≤ —Å—Ä–æ–∫ –æ—Ç 30 –¥–æ 90 –¥–Ω–µ–π. –≠—Ç–æ—Ç —Å–µ—Ä–≤–∏—Å —É–ø—Ä–æ—â–∞–µ—Ç –ø—Ä–æ—Ü–µ—Å—Å –ø–æ–ª—É—á–µ–Ω–∏—è –∫—Ä–µ–¥–∏—Ç–∞, –∏—Å–∫–ª—é—á–∞—è –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ—Å—Ç—å —Å–æ–±–∏—Ä–∞—Ç—å –¥–æ–∫—É–º–µ–Ω—Ç—ã –∏ –∏—Å–∫–∞—Ç—å –ø–æ—Ä—É—á–∏—Ç–µ–ª—è."
DataSet.loc[2, 'summarize'] = "–í–¢–ë –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç —Ä–æ—Å—Ç –∞–∫—Ü–∏–π –Ω–∞ 5.1% –ø–æ—Å–ª–µ –ø–∞–¥–µ–Ω–∏—è –∏–∑-–∑–∞ –Ω–æ–≤–æ—Å—Ç–µ–π –æ —Ü–µ–Ω–µ —Ä–∞–∑–º–µ—â–µ–Ω–∏—è –¥–æ–ø—ç–º–∏—Å—Å–∏–∏, –∫–æ—Ç–æ—Ä–∞—è –æ–∫–∞–∑–∞–ª–∞—Å—å –Ω–∏–∂–µ –æ–∂–∏–¥–∞–Ω–∏–π. –ù–µ—Å–º–æ—Ç—Ä—è –Ω–∞ —ç—Ç–æ, –∞–∫—Ü–∏–∏ –≤–æ—Å—Å—Ç–∞–Ω–æ–≤–∏–ª–∏—Å—å –∏ –≤—ã—à–ª–∏ –≤ –ª–∏–¥–µ—Ä—ã —Ä–æ—Å—Ç–∞. –ú–µ—á–µ–ª, –Ω–µ—Å–º–æ—Ç—Ä—è –Ω–∞ –ø–∞–¥–µ–Ω–∏–µ –±–æ–ª–µ–µ 5.5%, –≤—ã—Ö–æ–¥–∏—Ç –≤ –ø–ª—é—Å –ø–æ—Å–ª–µ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –°–î –Ω–µ –≤—ã–ø–ª–∞—á–∏–≤–∞—Ç—å –¥–∏–≤–∏–¥–µ–Ω–¥—ã –∑–∞ 2022 –≥–æ–¥ –∏–∑-–∑–∞ —É–±—ã—Ç–∫–∞. –ú–æ—Å–±–∏—Ä–∂–∞ –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç —Ä–æ—Å—Ç –Ω–∞ 4.4% –ø–æ—Å–ª–µ —É—Ç–≤–µ—Ä–∂–¥–µ–Ω–∏—è –¥–∏–≤–∏–¥–µ–Ω–¥–∞. –†—É—Å–ì–∏–¥—Ä–æ –∏ –°–æ–≤–∫–æ–º—Ñ–ª–æ—Ç —Ç–∞–∫–∂–µ –ø–æ–∫–∞–∑—ã–≤–∞—é—Ç —Ä–æ—Å—Ç, –∞ –†–æ—Å—Ç–µ–ª–µ–∫–æ–º –∏ –õ—É–∫–æ–π–ª –æ—Ç–º–µ—á–∞—é—Ç—Å—è –∑–∞ —Ä–æ—Å—Ç –≤—ã—Ä—É—á–∫–∏ –∏ –ø–∞–¥–µ–Ω–∏–µ –ø–æ—Å–ª–µ –¥–∏–≤–∏–¥–µ–Ω–¥–Ω–æ–π –æ—Ç—Å–µ—á–∫–∏ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–µ–Ω–Ω–æ."
DataSet.loc[3, 'summarize'] = "–í –†–æ—Å—Å–∏–∏ –≤–≤–µ–¥–µ–Ω–∞ –Ω–æ–≤–∞—è —Ñ–æ—Ä–º–∞ –Ω–∞–ª–æ–≥–∞ - Windfall Tax (–Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å), –∫–æ—Ç–æ—Ä—ã–π –±—É–¥–µ—Ç –≤–∑–∏–º–∞—Ç—å—Å—è —Å –∫–æ–º–ø–∞–Ω–∏–π, –ø—Ä–µ–≤—ã—Å–∏–≤—à–∏—Ö —Å—Ä–µ–¥–Ω—é—é –∞—Ä–∏—Ñ–º–µ—Ç–∏—á–µ—Å–∫—É—é –ø—Ä–∏–±—ã–ª—å –∑–∞ 2021 –∏ 2022 –≥–æ–¥—ã –ø–æ —Å—Ä–∞–≤–Ω–µ–Ω–∏—é —Å 2018 –∏ 2019 –≥–æ–¥–∞–º–∏. –≠—Ç–æ—Ç –Ω–∞–ª–æ–≥ –±—É–¥–µ—Ç –ø—Ä–∏–º–µ–Ω—è—Ç—å—Å—è –∫ —Ä–æ—Å—Å–∏–π—Å–∫–∏–º –∏ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–º –∫–æ–º–ø–∞–Ω–∏—è–º, –≤–µ–¥—É—â–∏–º –¥–µ—è—Ç–µ–ª—å–Ω–æ—Å—Ç—å –≤ –†–æ—Å—Å–∏–∏, —Å –∏—Å–∫–ª—é—á–µ–Ω–∏–µ–º –Ω–µ—Ñ—Ç—è–Ω—ã—Ö –∏ –Ω–µ—Ñ—Ç–µ–ø–µ—Ä–µ—Ä–∞–±–∞—Ç—ã–≤–∞—é—â–∏—Ö –∫–æ–º–ø–∞–Ω–∏–π, —É–≥–æ–ª—å–Ω—ã—Ö –∫–æ–º–ø–∞–Ω–∏–π, –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª–µ–π –°–ü–ì, –º–∞–ª—ã—Ö –∏ —Å—Ä–µ–¥–Ω–∏—Ö –ø—Ä–µ–¥–ø—Ä–∏—è—Ç–∏–π (–ú–°–ü), –∞ —Ç–∞–∫–∂–µ –∫–æ–º–ø–∞–Ω–∏–π —Å —á–∏—Å—Ç–æ–π –¥–æ–Ω–∞–ª–æ–≥–æ–≤–æ–π –ø—Ä–∏–±—ã–ª—å—é –∑–∞ 2021-2022 –≥–æ–¥—ã –º–µ–Ω–µ–µ 1 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π. –†–∞–∑–º–µ—Ä –Ω–∞–ª–æ–≥–∞ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç 10% –æ—Ç —Å—É–º–º—ã –ø—Ä–µ–≤—ã—à–µ–Ω–∏—è –ø—Ä–∏–±—ã–ª–∏ –Ω–∞–¥ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–µ–º –∑–∞ 2018-2019 –≥–æ–¥—ã, —Å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å—é –¥–æ—Å—Ä–æ—á–Ω–æ–π —É–ø–ª–∞—Ç—ã —Å 1 –æ–∫—Ç—è–±—Ä—è –ø–æ 30 –Ω–æ—è–±—Ä—è 2023 –≥–æ–¥–∞ –≤ —Ä–∞–∑–º–µ—Ä–µ 5%. –ó–∞–∫–æ–Ω–æ–ø—Ä–æ–µ–∫—Ç –≤—Å—Ç—É–ø–∏—Ç –≤ —Å–∏–ª—É —Å 1 —è–Ω–≤–∞—Ä—è 2024 –≥–æ–¥–∞. –°—Ä–µ–¥–∏ –∫–æ–º–ø–∞–Ω–∏–π, –∫–æ—Ç–æ—Ä—ã–µ –º–æ–≥—É—Ç –∑–∞–ø–ª–∞—Ç–∏—Ç—å —ç—Ç–æ—Ç –Ω–∞–ª–æ–≥, –Ω–∞–∏–±–æ–ª—å—à–∏–π —Ä–∞–∑–æ–≤—ã–π —Å–±–æ—Ä –æ–∂–∏–¥–∞–µ—Ç—Å—è —É –°–±–µ—Ä–±–∞–Ω–∫–∞ (20,4 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π), –ù–æ—Ä–Ω–∏–∫–µ–ª—è (18,3 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π) –∏ –ù–õ–ú–ö (15 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π), –∞ —Ç–∞–∫–∂–µ —É –§–æ—Å–ê–≥—Ä–æ (12,1 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π), –†—É—Å–∞–ª–∞ (10,4 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π), –°–µ–≤–µ—Ä—Å—Ç–∞–ª–∏ (9,5 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π), –ú–ú–ö (9,2 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π), –í–¢–ë (6,8 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π), –ü–æ–ª—é—Å–∞ (6,3 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π) –∏ –ü–ò–ö (5,3 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π)."
DataSet.loc[4, 'summarize'] = "–ê–≤—Ç–æ—Ä —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–µ—Ç –æ —Å–≤–æ–µ–º –æ–ø—ã—Ç–µ –∏–Ω–≤–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏—è –≤ —Ä–æ—Å—Å–∏–π—Å–∫–∏–π —Ä—ã–Ω–æ–∫, —É–ø–æ–º–∏–Ω–∞—è —Ä–∞–∑–ª–∏—á–Ω—ã–µ –ø–∞–∫–µ—Ç—ã –ø—Ä–µ—Ñ–æ–≤, –∞–∫—Ü–∏–∏ –∏ –¥—Ä—É–≥–∏–µ —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–µ –∏–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç—ã, –≤–∫–ª—é—á–∞—è –°—É—Ä–≥—É—Ç–Ω–µ—Ñ—Ç–µ–≥–∞–∑, –°–±–µ—Ä–±–∞–Ω–∫, –ù–æ—Ä–Ω–∏–∫–µ–ª—å, –§–æ—Å–ê–≥—Ä–æ, –∞ —Ç–∞–∫–∂–µ –û–§–ó, –±–æ–Ω–¥—ã –£—Ä–∞–ª—å—Å–∫–æ–π —Å—Ç–∞–ª–∏, –≥–¥—Ä –†—É—Å–∞–≥—Ä–æ, –∑–∞–º–µ—â–∞–π–∫–∏ –ì–∞–∑–ø—Ä–æ–º–∞ –∏ GLDRUB_TOM. –û–Ω –≤—ã—Ä–∞–∂–∞–µ—Ç –æ–ø–∞—Å–µ–Ω–∏—è –æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω–æ –∞–∫—Ü–∏–π –í–¢–ë, –∫—Ä–∏—Ç–∏–∫—É—è —Ç–æ–ø-–º–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –±–∞–Ω–∫–∞ –∑–∞ –æ–±–µ—â–∞–Ω–∏—è –¥–∏–≤–∏–¥–µ–Ω–¥–æ–≤. –í –ø—Ä–æ–¥–æ–ª–∂–µ–Ω–∏–∏ –∏—Å—Ç–æ—Ä–∏—è —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–µ—Ç –æ —Ç—Ä—É–¥–Ω–æ—Å—Ç—è—Ö, —Å –∫–æ—Ç–æ—Ä—ã–º–∏ —Å—Ç–∞–ª–∫–∏–≤–∞—é—Ç—Å—è –∏–Ω–≤–µ—Å—Ç–æ—Ä—ã –Ω–∞ —Ä–æ—Å—Å–∏–π—Å–∫–æ–º —Ä—ã–Ω–∫–µ, –≤–∫–ª—é—á–∞—è –ø—Ä–æ–±–ª–µ–º—ã —Å –æ–ø–µ—Ä–∞—Ü–∏—è–º–∏ –∫—É–ø–ª–∏/–ø—Ä–æ–¥–∞–∂–∏ –ø–∞–µ–≤ –ü–ò–§–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –≤—ã–Ω—É–∂–¥–∞—é—Ç –¥–µ—Ä–∂–∞—Ç—å –±–æ–ª—å—à–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –Ω–∞–ª–∏—á–Ω—ã—Ö –¥–µ–Ω–µ–≥, –Ω–µ –≤–ª–æ–∂–µ–Ω–Ω—ã—Ö –≤ –±—É–º–∞–≥–∏, —á—Ç–æ –ø—Ä–∏–≤–æ–¥–∏—Ç –∫ –Ω–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ–º—É —Ä–æ—Å—Ç—É –ü–ò–§–æ–≤ –≤—Å–ª–µ–¥ –∑–∞ —Ä—ã–Ω–∫–æ–º. –¢–∞–∫–∂–µ —É–ø–æ–º–∏–Ω–∞—é—Ç—Å—è –ø–æ—Ç–µ—Ä–∏ –Ω–∞ –∑–∞–º–æ—Ä–æ–∑–∫–µ –∞–∫—Ü–∏–π –∏ —Ñ–æ–Ω–¥–æ–≤ –Ω–∞ –°–®–ê, –∫–æ—Ç–æ—Ä—ã–µ —Å—Ç–∞–ª–∏ –±–æ–ª—å—à–∏–º —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–º —É—Ä–æ–∫–æ–º –¥–ª—è –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤."
DataSet.loc[5, 'summarize'] = "–í —Ç–µ–∫—Å—Ç–µ –æ–±—Å—É–∂–¥–∞—é—Ç—Å—è —Ç–µ–∫—É—â–∏–µ —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏–µ —Å–æ–±—ã—Ç–∏—è –∏ —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–µ –Ω–æ–≤–æ—Å—Ç–∏, –≤–∫–ª—é—á–∞—è –∏–∑–º–µ–Ω–µ–Ω–∏—è –≤ –∫—É—Ä—Å–µ –¥–æ–ª–ª–∞—Ä–∞ –∏ –∞–∫—Ü–∏–π —Ä–∞–∑–ª–∏—á–Ω—ã—Ö –∫–æ–º–ø–∞–Ω–∏–π. –ü–æ—Å–ª–µ —Å–∫–∞—á–∫–æ–≤ –≤ —Ñ–µ–≤—Ä–∞–ª–µ, –¥–æ–ª–ª–∞—Ä –º–æ–≥ –±—ã—Ç—å –ø—Ä–∏–æ–±—Ä–µ—Ç–µ–Ω –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏ –±–µ—Å–ø–ª–∞—Ç–Ω–æ –∑–∞ 50-55 —Ä—É–±–ª–µ–π, —á—Ç–æ, –ø–æ –º–Ω–µ–Ω–∏—é –∞–≤—Ç–æ—Ä–∞, —É–∫–∞–∑—ã–≤–∞–µ—Ç –Ω–∞ –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–µ –∑–∞–Ω–∏–∂–µ–Ω–∏–µ –∫—É—Ä—Å–∞ –≤–∞–ª—é—Ç—ã. –ê–≤—Ç–æ—Ä –ø—Ä–µ–¥–ø–æ–ª–∞–≥–∞–µ—Ç, —á—Ç–æ —Ä–æ—Å—Ç –∫—É—Ä—Å–∞ –¥–æ–ª–ª–∞—Ä–∞ –º–æ–∂–µ—Ç –ø—Ä–æ–¥–æ–ª–∂–∞—Ç—å—Å—è, –∏ –∫—É—Ä—Å –º–æ–∂–µ—Ç –¥–∞–∂–µ –æ–ø—É—Å—Ç–∏—Ç—å—Å—è –Ω–∏–∂–µ —Ç–µ–∫—É—â–µ–≥–æ —É—Ä–æ–≤–Ω—è. –†–æ—Å—Ç –∏–º–ø–æ—Ä—Ç–∞ –∏ –≤—ã—Ö–æ–¥ –Ω–µ–¥—Ä—É–∂–µ—Å—Ç–≤–µ–Ω–Ω—ã—Ö –Ω–µ—Ä–µ–∑–∏–¥–µ–Ω—Ç–æ–≤ —Å —Ä—ã–Ω–∫–∞ —Ç–∞–∫–∂–µ –æ–∫–∞–∂—É—Ç –ø–æ–¥–¥–µ—Ä–∂–∫—É –¥–æ–ª–ª–∞—Ä—É. –í —Ç–µ–∫—Å—Ç–µ —É–ø–æ–º–∏–Ω–∞—é—Ç—Å—è –ø–µ—Ä–≤—ã–µ —Å–ª–æ–≤–µ—Å–Ω—ã–µ –∏–Ω—Ç–µ—Ä–≤–µ–Ω—Ü–∏–∏, —Å–≤—è–∑–∞–Ω–Ω—ã–µ —Å –∫—É—Ä—Å–æ–º —Ä—É–±–ª—è, –∏ –ø—Ä–µ–¥–ø–æ–ª–∞–≥–∞–µ–º—ã–µ —Å–∞–Ω–∫—Ü–∏–∏ –ø—Ä–æ—Ç–∏–≤ –õ–∏—Å–∏–Ω–∞ –∏ –ù–õ–ú–ö, –∫–æ—Ç–æ—Ä—ã–µ –º–æ–≥—É—Ç –±—ã—Ç—å –≤–∫–ª—é—á–µ–Ω—ã –≤ 11 –ø–∞–∫–µ—Ç —Å–∞–Ω–∫—Ü–∏–π –ï–≤—Ä–æ—Å–æ—é–∑–∞. –¢–∞–∫–∂–µ –æ–±—Å—É–∂–¥–∞—é—Ç—Å—è –∞–∫—Ü–∏–∏ –∫–æ–º–ø–∞–Ω–∏–π, —Ç–∞–∫–∏—Ö –∫–∞–∫ OZON, FIVE, X5 Group, –¢–∞—Ç–Ω–µ—Ñ—Ç—å, –°–µ–≥–µ–∂–∞, Whoosh, –∏ –õ—É–∫–æ–π–ª, –∞ —Ç–∞–∫–∂–µ –ø–ª–∞–Ω—ã –∫–æ–º–ø–∞–Ω–∏–π –Ω–∞ —Ä–∞–∑–≤–∏—Ç–∏–µ –∏ –∏–∑–º–µ–Ω–µ–Ω–∏–µ —é—Ä–∏—Å–¥–∏–∫—Ü–∏–∏."
DataSet.loc[6, 'summarize'] = "–ò—Ç–æ–≥–∏ –Ω–µ–¥–µ–ª–∏ —Å 27.03.23 –ø–æ 01.04.23 –≤–∫–ª—é—á–∞—é—Ç –æ–±–∑–æ—Ä—ã –ø–æ –∫—Ä–µ–ø–∫–æ–º—É –≥–∞–∑–æ–≤–∏–∫—É –ù–æ–≤–∞—Ç—ç–∫ –∏ Globaltrans, –∞ —Ç–∞–∫–∂–µ –æ—Ç—á–µ—Ç—ã –æ—Ç –∫–æ–º–ø–∞–Ω–∏–π –°–µ–≥–µ–∂–∞ –∏ –¶–ò–ê–ù. –í —Ç–µ–∫—Å—Ç–µ —Ç–∞–∫–∂–µ —É–ø–æ–º–∏–Ω–∞—é—Ç—Å—è –≤–∞–∂–Ω—ã–µ —Å–æ–±—ã—Ç–∏—è, —Ç–∞–∫–∏–µ –∫–∞–∫ –ø–ª–∞–Ω—ã –∫–∏—Ç–∞–π—Å–∫–æ–≥–æ –≥–∏–≥–∞–Ω—Ç–∞ Aliababa –Ω–∞ –ø—Ä–æ–≤–µ–¥–µ–Ω–∏–µ IPO –¥–ª—è —à–µ—Å—Ç–∏ —Å–µ–≥–º–µ–Ω—Ç–æ–≤ —Å–≤–æ–µ–≥–æ –±–∏–∑–Ω–µ—Å–∞, –ø–æ–ø—ã—Ç–∫–∏ —Å–ø–µ–∫—É–ª—è–Ω—Ç–æ–≤ –ø–æ–¥–±–∏—Ç—å –æ–±–∑–æ—Ä–æ–º –±–∞–Ω–∫ Metropolitan Bank Holding Corp., –∞ —Ç–∞–∫–∂–µ –∑–∞—è–≤–ª–µ–Ω–∏—è –§–†–° –°–®–ê –æ –≤–ª–∏—è–Ω–∏–∏ –ø–æ–≤—ã—à–µ–Ω–∏—è –ø—Ä–æ—Ü–µ–Ω—Ç–Ω—ã—Ö —Å—Ç–∞–≤–æ–∫ –Ω–∞ —ç–∫–æ–Ω–æ–º–∏–∫—É –∏ –∏–Ω—Ñ–ª—è—Ü–∏—é. –ú–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ —ç–Ω–µ—Ä–≥–µ—Ç–∏–∫–∏ –†–æ—Å—Å–∏–∏ —É–∫–∞–∑—ã–≤–∞–µ—Ç –Ω–∞ –Ω–µ—Ç–æ—á–Ω–æ—Å—Ç—å –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö —Ä–µ—Ñ–µ—Ä–µ–Ω—Å–æ–≤ –ø–æ —Ü–µ–Ω–∞–º –Ω–∞ –Ω–µ—Ñ—Ç—å –∏–∑ –†–æ—Å—Å–∏–∏ –∏ –Ω–∞—á–∏–Ω–∞–µ—Ç —Ä–∞–±–æ—Ç—É –ø–æ —Ñ–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–∏—é –æ—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω—ã—Ö –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä–æ–≤. –¢–∞–∫–∂–µ –ø–ª–∞–Ω–∏—Ä—É–µ—Ç—Å—è –ø—Ä–æ–≤–µ–¥–µ–Ω–∏–µ –≤–µ–±–∏–Ω–∞—Ä–∞ –Ω–∞ —Ç–µ–º—É –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤—ã 2023 –≥–æ–¥–∞ –¥–ª—è —Ä—ã–Ω–∫–æ–≤ –†–§ –∏ –°–®–ê."
DataSet.loc[7, 'summarize'] = "–í—ã—Ä—É—á–∫–∞ VK –≤ 1 –∫–≤–∞—Ä—Ç–∞–ª–µ 2023 –≥–æ–¥–∞ —Å–æ—Å—Ç–∞–≤–∏–ª–∞ 25,9 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π, —á—Ç–æ –Ω–∞ 16,2% –º–µ–Ω—å—à–µ, —á–µ–º –≤ –ø—Ä–µ–¥—ã–¥—É—â–µ–º –∫–≤–∞—Ä—Ç–∞–ª–µ. –≠—Ç–æ —Å–≤–∏–¥–µ—Ç–µ–ª—å—Å—Ç–≤—É–µ—Ç –æ –∫–æ–Ω—Å–µ–Ω—Å—É—Å-–ø—Ä–æ–≥–Ω–æ–∑–µ, —Å–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω–æ–º –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å–æ–º –Ω–∞ –æ—Å–Ω–æ–≤–µ –æ–ø—Ä–æ—Å–∞ –∞–Ω–∞–ª–∏—Ç–∏–∫–æ–≤ –∏–Ω–≤–µ—Å—Ç–∫–æ–º–ø–∞–Ω–∏–π –∏ –±–∞–Ω–∫–æ–≤. –í —Ü–µ–ª–æ–º, –≤—ã—Ä—É—á–∫–∞ VK –≤ 2023 –≥–æ–¥—É —Å–æ—Å—Ç–∞–≤–∏–ª–∞ –æ–∫–æ–ª–æ 133 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π, —á—Ç–æ –Ω–∞ 35% –±–æ–ª—å—à–µ, —á–µ–º –≤ –ø—Ä–µ–¥—ã–¥—É—â–µ–º –≥–æ–¥—É, –∏ —è–≤–ª—è–µ—Ç—Å—è –Ω–∞–∏–≤—ã—Å—à–∏–º –ø–æ–∫–∞–∑–∞—Ç–µ–ª–µ–º –∑–∞ –Ω–∞–±–ª—é–¥–∞–µ–º—ã–π –ø–µ—Ä–∏–æ–¥ 1."
DataSet.loc[8, 'summarize'] = "–ò–Ω–¥–µ–∫—Å –ú–æ—Å–ë–∏—Ä–∂–∏ –≤ —Å—Ä–µ–¥—É –ø—Ä–∏–±–∞–≤–∏–ª 0,8%, –Ω–µ—Å–º–æ—Ç—Ä—è –Ω–∞ —Å–Ω–∏–∂–µ–Ω–∏–µ –∫–æ—Ç–∏—Ä–æ–≤–æ–∫ –í–¢–ë –Ω–∞ 6,3% –ø–æ—Å–ª–µ —Å–∏–ª—å–Ω–æ–≥–æ —Ä–æ—Å—Ç–∞ –≤ –ø—Ä–µ–¥—ã–¥—É—â–∏–µ —Å–µ—Å—Å–∏–∏. –ë–∞–Ω–∫ –æ–ø—É–±–ª–∏–∫–æ–≤–∞–ª –æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –∑–∞ 2022 –≥–æ–¥, –≤ –∫–æ—Ç–æ—Ä–æ–π —Å–æ–æ–±—â–∏–ª –æ –Ω–µ–ø–ª–∞–Ω–∏—Ä—É–µ–º–æ–π –≤—ã–ø–ª–∞—Ç–µ –¥–∏–≤–∏–¥–µ–Ω–¥–æ–≤ –≤ –±–ª–∏–∂–∞–π—à–µ–º –±—É–¥—É—â–µ–º. –í –ª–∏–¥–µ—Ä–∞—Ö —Ä–æ—Å—Ç–∞ –æ–∫–∞–∑–∞–ª–∏—Å—å –∞–∫—Ü–∏–∏ –†—É—Å–ì–∏–¥—Ä–æ, –∞ —Ç–∞–∫–∂–µ –∞–∫—Ü–∏–∏ –ö–ê–ú–ê–ó–∞, –°–æ–ª–ª–µ—Ä—Å–∞ –∏ —Ä–∞—Å–ø–∏—Å–∫–∏ –ì–ª–æ–±–∞–ª—Ç—Ä–∞–∫. –í–ª–∞–¥–µ–ª–µ—Ü –ì–ª–æ–±–∞–ª—Ç—Ä–∞–∫–∞ –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ –ø—Ä–æ–¥–∞–∂–µ –±–∏–∑–Ω–µ—Å–∞, –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω—ã–º –ø–æ–∫—É–ø–∞—Ç–µ–ª–µ–º –º–æ–∂–µ—Ç —Å—Ç–∞—Ç—å –≥—Ä—É–ø–ø–∞ ¬´–ú–æ–Ω–æ–ø–æ–ª–∏—è¬ª. –ê–∫—Ü–∏–∏ –ì—Ä—É–ø–ø—ã –ì–ê–ó —Ç–∞–∫–∂–µ –ø–æ–∫–∞–∑–∞–ª–∏ —Ä–æ—Å—Ç –ø–æ—Å–ª–µ –Ω–æ–≤–æ—Å—Ç–µ–π –æ–± –∏—Å–∫–µ –∫ Volkswagen –Ω–∞ 15 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π."
DataSet.loc[9, 'summarize'] = "Bashneft –ø–ª–∞–Ω–∏—Ä—É–µ—Ç –≤—ã–ø–ª–∞—Ç–∏—Ç—å –¥–∏–≤–∏–¥–µ–Ω–¥—ã –∑–∞ 2022 –≥–æ–¥ –≤ —Ä–∞–∑–º–µ—Ä–µ 35.5 –º–∏–ª–ª–∏–∞—Ä–¥–æ–≤ —Ä—É–±–ª–µ–π –ø–æ —Å—Ç–∞–≤–∫–µ 199.89 —Ä—É–±–ª–µ–π –∑–∞ –∞–∫—Ü–∏—é, —á—Ç–æ –≤–∫–ª—é—á–∞–µ—Ç 29.55 –º–∏–ª–ª–∏–∞—Ä–¥–æ–≤ —Ä—É–±–ª–µ–π –Ω–∞ –æ–±—ã–∫–Ω–æ–≤–µ–Ω–Ω—ã—Ö –∞–∫—Ü–∏—è—Ö –∏ 5.95 –º–∏–ª–ª–∏–∞—Ä–¥–æ–≤ —Ä—É–±–ª–µ–π –Ω–∞ –ø—Ä–µ–¥–ø–æ—á—Ç–∏—Ç–µ–ª—å–Ω—ã—Ö –∞–∫—Ü–∏—è—Ö —Ç–∏–ø–∞ A 6."
DataSet.loc[10, 'summarize'] = "#MRKU = +5% = –º–∞–∫—Å –∑–∞ 7 –º–µ—Å"
DataSet.loc[11, 'summarize'] = "–í 2022 –≥–æ–¥—É ¬´–Ø–Ω–¥–µ–∫—Å¬ª –ø–æ—á—Ç–∏ –≤–¥–≤–æ–µ —É–≤–µ–ª–∏—á–∏–ª –ø—Ä–æ–¥–∞–∂–∏ –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫–æ–π —ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∏ –∏ –≤–ø–µ—Ä–≤—ã–µ —Å—Ç–∞–ª –ª–∏–¥–µ—Ä–æ–º –≤ —ç—Ç–æ–º —Å–µ–≥–º–µ–Ω—Ç–µ, –∑–∞–Ω–∏–º–∞—è –¥–æ–ª—é —Ä—ã–Ω–∫–∞ –≤ 13,2% –ø–æ —á–∏—Å–ª—É –ø—Ä–æ–¥–∞–Ω–Ω—ã—Ö –≥–∞–¥–∂–µ—Ç–æ–≤. –≠—Ç–æ —Å—Ç–∞–ª–æ –≤–æ–∑–º–æ–∂–Ω—ã–º –Ω–∞ —Ñ–æ–Ω–µ —É—Ö–æ–¥–∞ –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö –≤–µ–Ω–¥–æ—Ä–æ–≤ —Å —Ä–æ—Å—Å–∏–π—Å–∫–æ–≥–æ —Ä—ã–Ω–∫–∞, —á—Ç–æ –ø—Ä–∏–≤–µ–ª–æ –∫ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—é –æ–±—ä–µ–º–∞ –ø—Ä–æ–¥–∞–∂ –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫–æ–π —ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∏, –∫–æ–º–ø—å—é—Ç–µ—Ä–Ω–æ–π —Ç–µ—Ö–Ω–∏–∫–∏ –∏ –º–æ–±–∏–ª—å–Ω—ã—Ö —É—Å—Ç—Ä–æ–π—Å—Ç–≤ –Ω–∞ 15,4% –¥–æ –æ–∫–æ–ª–æ 2,2 —Ç—Ä–ª–Ω —Ä—É–±–ª–µ–π. –í—Å–µ —Ä–æ—Å—Å–∏–π—Å–∫–∏–µ –º–∞—Ä–∫–∏, —Å–ø–æ—Å–æ–±–Ω—ã–µ –ø—Ä–æ–¥–æ–ª–∂–∞—Ç—å –∏–º–ø–æ—Ä—Ç —ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∏ –∏–ª–∏ –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç—å –µ–µ –≤–Ω—É—Ç—Ä–∏ —Å—Ç—Ä–∞–Ω—ã —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º –≤–≤–µ–∑–µ–Ω–Ω—ã—Ö –∫–æ–º–ø–ª–µ–∫—Ç—É—é—â–∏—Ö, —Å—Ç–∞–ª–∏ –±–µ–Ω–µ—Ñ–∏—Ü–∏–∞—Ä–∞–º–∏ 2022 –≥–æ–¥–∞. –°—Ä–µ–¥–∏ –Ω–∏—Ö ¬´–Ø–Ω–¥–µ–∫—Å¬ª, Hiper, F+, ¬´–ì—Ä–∞–≤–∏—Ç–æ–Ω¬ª, Nerpa, ICL, –±–æ–ª—å—à–∏–Ω—Å—Ç–≤–æ –∏–∑ –∫–æ—Ç–æ—Ä—ã—Ö —Å–≤—è–∑–∞–Ω—ã —Å –∫—Ä—É–ø–Ω—ã–º–∏ —Ä–æ—Å—Å–∏–π—Å–∫–∏–º–∏ –¥–∏—Å—Ç—Ä–∏–±—É—Ç–æ—Ä–∞–º–∏. –í 2023 –≥–æ–¥—É —Ä—ã–Ω–æ–∫ –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫–æ–π —ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∏ –æ–∂–∏–≤–∏–ª—Å—è, –Ω–æ, –∫–∞–∫ –ø—Ä–µ–¥–ø–æ–ª–∞–≥–∞–µ—Ç—Å—è, –Ω–µ —Å–º–æ–∂–µ—Ç –ø—Ä–∏–±–ª–∏–∑–∏—Ç—å—Å—è –∫ –ø–æ–∫–∞–∑–∞—Ç–µ–ª—è–º 2020‚Äì2021 –≥–æ–¥–æ–≤ ."
DataSet.loc[12, 'summarize'] = "–°–µ—Ä–±–∏—è –ø–ª–∞–Ω–∏—Ä—É–µ—Ç —Å—Ç–∞—Ç—å –ø–µ—Ä–≤–æ–π —Å—Ç—Ä–∞–Ω–æ–π –≤ –ï–≤—Ä–æ–ø–µ, –≥–¥–µ –±—É–¥—É—Ç –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å—Å—è –±–µ—Å–ø–∏–ª–æ—Ç–Ω—ã–µ –∞–≤—Ç–æ–º–æ–±–∏–ª–∏, –±–ª–∞–≥–æ–¥–∞—Ä—è —Å–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤—É —Å —Ä–æ—Å—Å–∏–π—Å–∫–∏–º –Ø–Ω–¥–µ–∫—Å–æ–º. –≠—Ç–æ—Ç –ø—Ä–æ–µ–∫—Ç –≤–∫–ª—é—á–∞–µ—Ç –≤ —Å–µ–±—è —Ä–∞–±–æ—Ç—É —Å–ø–µ—Ü–∏–∞–ª–∏—Å—Ç–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ —É–∂–µ –¥–µ–º–æ–Ω—Å—Ç—Ä–∏—Ä—É—é—Ç –≤—ã—Å–æ–∫–∏–π —É—Ä–æ–≤–µ–Ω—å –ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª–∏–∑–º–∞ –≤ –æ–±–ª–∞—Å—Ç–∏ –±–µ—Å–ø–∏–ª–æ—Ç–Ω—ã—Ö —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π. –ü—Ä–µ–∑–∏–¥–µ–Ω—Ç –°–µ—Ä–±–∏–∏ –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –í—É—á–∏—á –≤—ã—Ä–∞–∑–∏–ª –≤–æ—Å—Ö–∏—â–µ–Ω–∏–µ —Ä–∞–±–æ—Ç–æ–π –Ø–Ω–¥–µ–∫—Å–∞, —Å—Ä–∞–≤–Ω–∏–≤ –µ–≥–æ —Å –∞–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∏–º Google, –ø–æ–¥—á–µ—Ä–∫–Ω—É–≤, —á—Ç–æ —Ä—É—Å—Å–∫–∏–π –Ø–Ω–¥–µ–∫—Å –∏–≥—Ä–∞–µ—Ç –∫–ª—é—á–µ–≤—É—é —Ä–æ–ª—å –≤ —Ä–∞–∑–≤–∏—Ç–∏–∏ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π –≤ —Å—Ç—Ä–∞–Ω–µ."
DataSet.loc[13, 'summarize'] = "—Å—É–¥ —á–∞—Å—Ç–∏—á–Ω–æ —Å–Ω—è–ª –∞—Ä–µ—Å—Ç —Å –∞–∫—Ç–∏–≤–æ–≤ Volkswagen –≤ –†–æ—Å—Å–∏–∏"
DataSet.loc[14, 'summarize'] = "–ì—Ä—É–ø–ø–∞ –ì–ê–ó (GAZA) –∏–º–µ–µ—Ç —Ü–µ–ª–µ–≤—É—é —Ü–µ–Ω—É –≤ —Å–ª—É—á–∞–µ –ø–æ–ª–Ω–æ–≥–æ —É–¥–æ–≤–ª–µ—Ç–≤–æ—Ä–µ–Ω–∏—è –∏—Å–∫–∞ –≤ 1250 —Ä—É–±–ª–µ–π, —á—Ç–æ –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è–µ—Ç –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª —Ä–æ—Å—Ç–∞ –Ω–∞ 80% –æ—Ç —Ç–µ–∫—É—â–µ–π —Ü–µ–Ω—ã –∞–∫—Ü–∏–∏. –≠—Ç–æ –æ—Ü–µ–Ω–∫–∞, –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–Ω–∞—è –≤ –∫–æ–Ω—Ç–µ–∫—Å—Ç–µ –∞–Ω–∞–ª–∏–∑–∞ –∏ –ø—Ä–æ–≥–Ω–æ–∑–æ–≤ –Ω–∞ 2023 –≥–æ–¥, –≥–¥–µ –∞–∫—Ü–∏–∏ –ì—Ä—É–ø–ø—ã –ì–ê–ó —Ä–∞—Å—Å–º–∞—Ç—Ä–∏–≤–∞—é—Ç—Å—è –∫–∞–∫ –æ–¥–∏–Ω –∏–∑ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω–æ –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–æ–Ω–Ω—ã—Ö –≤–∞—Ä–∏–∞–Ω—Ç–æ–≤ "
DataSet.loc[15, 'summarize'] = "–ù–∞ —Å–µ–≥–æ–¥–Ω—è—à–Ω–∏–π –¥–µ–Ω—å, 1 –º–∞—è 2023 –≥–æ–¥–∞, –ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏ —Å 1 –º–∞—è –ø–æ–≤—ã—à–∞–µ—Ç –º–∞–∫—Ä–æ–ø—Ä—É–¥–µ–Ω—Ü–∏–∞–ª—å–Ω—ã–µ —Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è –ø–æ –∏–ø–æ—Ç–µ—á–Ω—ã–º –∫—Ä–µ–¥–∏—Ç–∞–º. –≠—Ç–∞ –º–µ—Ä–∞ –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∞ –Ω–∞ –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–µ —Ä–∏—Å–∫–æ–≤ –∑–∞–µ–º—â–∏–∫–æ–≤ –∏ –±–∞–Ω–∫–æ–≤. –í —Ç–æ –∂–µ –≤—Ä–µ–º—è, —Ç–æ—Ä–≥–∏ –Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–æ–π –±–∏—Ä–∂–µ –Ω–µ –ø—Ä–æ–≤–æ–¥—è—Ç—Å—è, –∞ –Ω–∞ –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥—Å–∫–æ–π –±–∏—Ä–∂–µ —Ç–æ—Ä–≥–∏ –ø—Ä–æ–≤–æ–¥—è—Ç—Å—è, –Ω–æ —Å –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—è–º–∏. –í —ç–∫–æ–Ω–æ–º–∏–∫–µ –æ–∂–∏–¥–∞—é—Ç—Å—è –¥–∞–Ω–Ω—ã–µ –ø–æ Manufacturing PMI –∞–ø—Ä–∏–ª—è –∏–∑ –Ø–ø–æ–Ω–∏–∏, –ò–Ω–¥–∏–∏, –ö–∞–Ω–∞–¥—ã, –°–®–ê, –∞ —Ç–∞–∫–∂–µ –ø–æ Construction Spending –º–∞—Ä—Ç–∞ –∏ ISM Manufacturing PMI –∞–ø—Ä–∏–ª—è –∏–∑ –°–®–ê. –ü–æ–ø—É–ª—è—Ä–Ω—ã–µ –æ—Ç—á–µ—Ç–Ω–æ—Å—Ç–∏ –ø–æ –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫–∏–º –∫–æ–º–ø–∞–Ω–∏—è–º –≤–∫–ª—é—á–∞—é—Ç –æ—Ç—á–µ—Ç—ã –æ—Ç Norwegian Cruise Line, Microstrategy, NXPI Semiconductors, –∏ MGM Resorts."
DataSet.loc[16, 'summarize'] = "–ú–æ—Å–±–∏—Ä–∂–∞ –Ω–∞–∑–≤–∞–ª–∞ —Å–∞–º—ã–µ –ø–æ–ø—É–ª—è—Ä–Ω—ã–µ –∞–∫—Ü–∏–∏ –º–∞—Ä—Ç–∞ 2023 –≥–æ–¥–∞ —Å—Ä–µ–¥–∏ —á–∞—Å—Ç–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ —Å–æ—Å—Ç–∞–≤–ª—è—é—Ç 31,9% –ø–æ—Ä—Ç—Ñ–µ–ª—è. –°—Ä–µ–¥–∏ –Ω–∏—Ö: SBER: 31,9% SBERp: 8,5% GAZP: 20,6% LKOH: 9,5% GMKN: 8% ROSN: 4,9% SNGSp: 4,8% NVTK: 4,2% YNDX: 4,1% CHMF: 3,5% –≠—Ç–∏ –¥–∞–Ω–Ω—ã–µ –æ—Ç—Ä–∞–∂–∞—é—Ç –ø—Ä–µ–¥–ø–æ—á—Ç–µ–Ω–∏—è —á–∞—Å—Ç–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤ –≤ –º–∞—Ä—Ç–µ 2023 –≥–æ–¥–∞, –ø–æ–¥—á–µ—Ä–∫–∏–≤–∞—è –≤–∞–∂–Ω–æ—Å—Ç—å —ç—Ç–∏—Ö –∞–∫—Ü–∏–π –≤ –ø–æ—Ä—Ç—Ñ–µ–ª—è—Ö –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤."
DataSet.loc[17, 'summarize'] = "GAZA = +6%"
DataSet.loc[18, 'summarize'] = "GAZA = +15% = –º–∞–∫—Å –∑–∞ 2 –≥–æ–¥–∞"
DataSet.loc[19, 'summarize'] = "–í —Å—Ç–∞—Ç—å–µ –ë–ö–° –ú–∏—Ä –ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–π –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω—ã –≥–æ–ª—É–±—ã–µ —Ñ–∏—à–∫–∏ —Å –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª–æ–º —Ä–æ—Å—Ç–∞ –±–æ–ª–µ–µ 50% –Ω–∞ —Ä—ã–Ω–∫–µ –∞–∫—Ü–∏–π –†–æ—Å—Å–∏–∏ –≤ 2023 –≥–æ–¥—É. –°—Ä–µ–¥–∏ –Ω–∏—Ö: –Ø–Ω–¥–µ–∫—Å (YNDX): –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è –ü–æ–∫—É–ø–∞—Ç—å —Å —Ü–µ–ª—å—é –Ω–∞ –≥–æ–¥ –≤ 3500 —Ä—É–±–ª–µ–π, —á—Ç–æ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –ø—Ä–∏—Ä–æ—Å—Ç –≤ 82%. –¢–∞—Ç–Ω–µ—Ñ—Ç—å (TATN): –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è –ü–æ–∫—É–ø–∞—Ç—å —Å —Ü–µ–ª—å—é –Ω–∞ –≥–æ–¥ –≤ 610 —Ä—É–±–ª–µ–π, —á—Ç–æ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –ø—Ä–∏—Ä–æ—Å—Ç –≤ 56%. –ü–æ–ª—é—Å (PLZL): –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è –ü–æ–∫—É–ø–∞—Ç—å —Å —Ü–µ–ª—å—é –Ω–∞ –≥–æ–¥ –≤ 15000 —Ä—É–±–ª–µ–π, —á—Ç–æ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –ø—Ä–∏—Ä–æ—Å—Ç –≤ 55%. –≠—Ç–∏ –∫–æ–º–ø–∞–Ω–∏–∏ –≤—ã–¥–µ–ª—è—é—Ç—Å—è –∫–∞–∫ –ª–∏–¥–µ—Ä—ã —Å–≤–æ–∏—Ö –æ—Ç—Ä–∞—Å–ª–µ–π, –≤–µ–¥—É—Ç –ø—Ä–æ–∑—Ä–∞—á–Ω—É—é —Ñ–∏–Ω–∞–Ω—Å–æ–≤—É—é –ø–æ–ª–∏—Ç–∏–∫—É, –∏–º–µ—é—Ç –¥—Ä–∞–π–≤–µ—Ä—ã –¥–ª—è –≤–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏—è –∞–∫—Ü–∏–π –∏ –ø–æ–∑–∏—Ç–∏–≤–Ω—É—é –¥–∏–Ω–∞–º–∏–∫—É –±–∏–∑–Ω–µ—Å-–ø–æ–∫–∞–∑–∞—Ç–µ–ª–µ–π. –ê–≤—Ç–æ—Ä —Å—Ç–∞—Ç—å–∏ –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –∏–Ω–≤–µ—Å—Ç–æ—Ä–∞–º —Ä–∞—Å—Å–º–æ—Ç—Ä–µ—Ç—å —ç—Ç–∏ –∞–∫—Ü–∏–∏ –≤ –∫–∞—á–µ—Å—Ç–≤–µ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω–æ –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤–Ω—ã—Ö –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–π"

DataSet.head(5)

Unnamed: 0,CleanSentences,summarize
0,#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 2022–≥ ...,–°–µ–ª–∏–≥–¥–∞—Ä –Ω–µ –≤—ã–ø–ª–∞—á–∏–≤–∞–µ—Ç –¥–∏–≤–∏–¥–µ–Ω–¥—ã –∑–∞ 2022 –≥–æ–¥.
1,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...,Ozon –∑–∞–ø—É—Å–∫–∞–µ—Ç –Ω–æ–≤—ã–π —Ñ–∏–Ω—Ç–µ—Ö-–ø—Ä–æ–¥—É–∫—Ç ¬´–î–µ–Ω—å–≥–∏ –Ω–∞...
2,‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—è–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ –í–¢–ë +5...,–í–¢–ë –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç —Ä–æ—Å—Ç –∞–∫—Ü–∏–π –Ω–∞ 5.1% –ø–æ—Å–ª–µ –ø–∞–¥–µ–Ω–∏...
3,‚Äã‚ÄãWindfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ ...,–í –†–æ—Å—Å–∏–∏ –≤–≤–µ–¥–µ–Ω–∞ –Ω–æ–≤–∞—è —Ñ–æ—Ä–º–∞ –Ω–∞–ª–æ–≥–∞ - Windfall...
4,"–£ –Ω–∞—Å –±—ã–ª–æ 2 –ø–∞–∫–µ—Ç–∞ –ø—Ä–µ—Ñ–æ–≤ –°—É—Ä–≥—É—Ç–Ω–µ—Ñ—Ç–µ–≥–∞–∑–∞, 75...",–ê–≤—Ç–æ—Ä —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–µ—Ç –æ —Å–≤–æ–µ–º –æ–ø—ã—Ç–µ –∏–Ω–≤–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏...


In [42]:
from sklearn.model_selection import train_test_split

train_data, valid_data = train_test_split(DataSet, test_size=0.25, random_state=42)

In [50]:
%%time

# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –∞—Ä–≥—É–º–µ–Ω—Ç–æ–≤ –æ–±—É—á–µ–Ω–∏—è
training_args = TrainingArguments(
    output_dir='./results',          # –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏—è –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤
    num_train_epochs=3,              # –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö –æ–±—É—á–µ–Ω–∏—è
    per_device_train_batch_size=16, # —Ä–∞–∑–º–µ—Ä –±–∞—Ç—á–∞ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è
    per_device_eval_batch_size=64,   # —Ä–∞–∑–º–µ—Ä –±–∞—Ç—á–∞ –¥–ª—è –æ—Ü–µ–Ω–∫–∏
    warmup_steps=500,                # –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —à–∞–≥–æ–≤ –¥–ª—è —Ä–∞–∑–æ–≥—Ä–µ–≤–∞
    weight_decay=0.01,               # –∫–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç —Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏–∏
    logging_dir='./logs',            # –¥–∏—Ä–µ–∫—Ç–æ—Ä–∏—è –¥–ª—è –ª–æ–≥–æ–≤
    logging_steps=10,                # —à–∞–≥–∏ –¥–ª—è –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è
)

# –°–æ–∑–¥–∞–Ω–∏–µ —Ç—Ä–µ–Ω–µ—Ä–∞
trainer = Trainer(
    model=model,                         # –º–æ–¥–µ–ª—å
    args=training_args,                 # –∞—Ä–≥—É–º–µ–Ω—Ç—ã –æ–±—É—á–µ–Ω–∏—è
    train_dataset=train_data,         # –æ–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞
    eval_dataset=valid_data,            # –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω–∞—è –≤—ã–±–æ—Ä–∫–∞
    tokenizer=tokenizer,                 # —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä
)

# –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
trainer.train()

# –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏
trainer.evaluate()

# –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
model.save_pretrained('./fine_tuned_model')


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`