# Preparation

In [28]:
! pip install razdel rouge transformers sentencepiece --quiet

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from razdel import sentenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import numpy as np
import json
import re
import tarfile
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
import nltk
from nltk.util import ngrams

In [30]:
# Delete any copy left over from a previous run before downloading again
# (presumably so the fresh download is always stored as gazeta_raw.txt).
!rm -f gazeta_raw.txt
# Fetch the raw dataset; later cells read it line by line as JSON records.
!wget https://www.dropbox.com/s/4fxj5wmt7tjr5f2/gazeta_raw.txt

--2025-10-20 16:39:53--  https://www.dropbox.com/s/4fxj5wmt7tjr5f2/gazeta_raw.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/scl/fi/y8n5zf87bxpw1ob1pbkfl/gazeta_raw.txt?rlkey=zjbt0qur4f11svazg2d8ackba [following]
--2025-10-20 16:39:54--  https://www.dropbox.com/scl/fi/y8n5zf87bxpw1ob1pbkfl/gazeta_raw.txt?rlkey=zjbt0qur4f11svazg2d8ackba
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucdbb8d2d221397e1c75213d5b33.dl.dropboxusercontent.com/cd/0/inline/CzlZrRsqUVn68WywQ9KH1oieMWHrnlZmmDDObJ_AeciHe8If6W6WLH92NHO5GY5crUyeNQFvPN4xn4tCAUIll3rNREUBMY9eCfdgsvveL5cZlcF2Tl93p8FLb1j4RfqlDQ0/file# [following]
--2025-10-20 16:39:54--  https://ucdbb8d2d221397e1c75213d5b33.dl.dropboxusercontent.com/cd/0/inline/CzlZrRsqUVn68Wyw

# Extractive summarization

In [31]:
# Sentence-embedding model used by extract_summary to score sentences.
EXTRACTIVE_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(EXTRACTIVE_MODEL_NAME)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f0fe8c5c-9c87-4256-9c0c-24e59757a9e9)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/adapter_config.json
Retrying in 1s [Retry 1/5].


In [51]:
def extract_summary(text, top_n=3, limit=300):
    """Build an extractive summary from the highest-scoring sentences of `text`.

    Each sentence is scored as
    ``0.7 * cosine(sentence, centroid) + 0.3 * positional_weight``, where the
    centroid is the mean sentence embedding and the positional weight decays
    linearly from 1.0 for the first sentence towards 0.2 for the last one.
    The `top_n` best sentences are emitted in their original order.

    Args:
        text: Source document.
        top_n: Number of sentences to select.
        limit: Maximum length of the returned summary, in characters.

    Returns:
        The summary string. Texts with at most `top_n` sentences are returned
        whole, hard-cut to `limit` characters. Otherwise only whole sentences
        are kept while they fit in `limit`; if not even the first selected
        sentence fits, the first `limit` characters of `text` are returned.
    """
    sentences = [s.text for s in sentenize(text)]
    if len(sentences) <= top_n:
        return " ".join(sentences)[:limit]

    sentence_embeddings = model.encode(sentences)

    # Semantic centroid: similarity of every sentence to the mean embedding.
    centroid = np.mean(sentence_embeddings, axis=0, keepdims=True)
    centroid_similarity = cosine_similarity(sentence_embeddings, centroid).ravel()

    # Positional weight: earlier sentences get a higher weight.
    positional_weight = 1.0 - (np.arange(len(sentences)) * 0.8 / len(sentences))

    # Combine both signals into one score per sentence.
    combined_scores = 0.7 * centroid_similarity + 0.3 * positional_weight

    # Take the top_n sentences with the highest score, restored to text order.
    top_indices = np.argsort(combined_scores)[-top_n:]
    selected_sentences = [sentences[i] for i in sorted(top_indices)]

    summary = " ".join(selected_sentences)

    # Cut to the limit on sentence boundaries.
    if len(summary) > limit:
        kept_sentences = []
        used_chars = 0
        for sent in selected_sentences:
            # A joining space is only needed in front of non-first sentences.
            extra_chars = len(sent) + (1 if kept_sentences else 0)
            if used_chars + extra_chars > limit:
                break
            kept_sentences.append(sent)
            used_chars += extra_chars
        summary = " ".join(kept_sentences)

    # Nothing fitted: fall back to a hard cut that still honours `limit`.
    if not summary.strip():
        summary = text[:limit]

    return summary.strip()


# Abstractive summarization

In [33]:
# Tokenizer and seq2seq model used by abstract_summary, both loaded from the
# same checkpoint.
ABSTRACTIVE_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(ABSTRACTIVE_MODEL_NAME)
model2 = AutoModelForSeq2SeqLM.from_pretrained(ABSTRACTIVE_MODEL_NAME)



In [50]:

def abstract_summary(text, max_len=300):
    """Generate an abstractive summary of `text` with the seq2seq model `model2`.

    The text is whitespace-normalised, tokenized (truncated and padded to 512
    tokens) and summarised with beam search (4 beams, at most 84 generated
    tokens, no repeated bigrams).

    Args:
        text: Source document.
        max_len: Maximum length of the returned summary, in characters.

    Returns:
        The decoded summary cut to `max_len` characters; "" for blank input.
        If the model decodes to an empty string, the first `max_len`
        characters of the normalised text are returned instead.
    """
    # Collapse every whitespace run (newlines included) into a single space.
    cleaned_text = re.sub(r'\s+', ' ', text.strip())
    if not cleaned_text:
        return ""

    encoded = tokenizer(
        [cleaned_text],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # The input is padded, so pass the attention mask explicitly instead of
    # leaving generate() to infer which positions are padding.
    output_ids = model2.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Empty generation: fall back to the head of the text, honouring max_len.
    if not summary.strip():
        summary = cleaned_text[:max_len]

    return summary[:max_len]


# Validation

In [35]:
def calculate_rouge_2(references, hypotheses):
    """Return the averaged ROUGE-2 scores of `hypotheses` against `references`.

    Args:
        references: Reference summaries.
        hypotheses: Generated summaries, aligned with `references`.

    Returns:
        The 'rouge-2' entry of the averaged scores produced by
        ``Rouge.get_scores(..., avg=True)``.
    """
    averaged_scores = Rouge().get_scores(hypotheses, references, avg=True)
    return averaged_scores['rouge-2']


In [42]:
def prepare_gazeta_data(input_path,
                        output_texts_path,
                        output_refs_path,
                        max_samples=100,
                        summary_limit=300):
    """Select a sample of (text, summary) pairs from a JSON-lines file.

    Every non-blank line of `input_path` is parsed as one JSON record. Records
    that have both 'text' and 'summary' and whose summary is shorter than
    `summary_limit` characters are selected, in file order, up to
    `max_samples`. The selected texts and summaries are written as two JSON
    arrays and a short report is printed.

    Args:
        input_path: Path of the JSON-lines input file.
        output_texts_path: Where the JSON array of texts is written.
        output_refs_path: Where the JSON array of reference summaries is written.
        max_samples: Maximum number of selected records; slice semantics
            (None keeps every match).
        summary_limit: Exclusive upper bound on summary length, in characters.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # With None or a negative value the final slice needs every match, so only
    # a non-negative max_samples lets us stop collecting early.
    collect_all = max_samples is None or max_samples < 0

    total_records = 0
    texts = []
    references = []
    with open(input_path, "r", encoding="utf-8") as r:
        for line in r:
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue
            rec = json.loads(line)
            total_records += 1

            # Keep counting records, but store nothing once the sample is full,
            # so the whole dataset is never held in memory.
            if not collect_all and len(texts) >= max_samples:
                continue
            if 'text' in rec and 'summary' in rec and len(rec['summary']) < summary_limit:
                texts.append(rec['text'])
                references.append(rec['summary'])

    texts = texts[:max_samples]
    references = references[:max_samples]

    with open(output_texts_path, "w", encoding="utf-8") as f_texts:
        json.dump(texts, f_texts, ensure_ascii=False, indent=2)

    with open(output_refs_path, "w", encoding="utf-8") as f_refs:
        json.dump(references, f_refs, ensure_ascii=False, indent=2)

    print(f"Всего записей в исходном файле: {total_records}")
    print(f"Отобрано записей (саммари < {summary_limit}): {len(texts)}")
    print(f"Сохранено {len(texts)} текстов в: {output_texts_path}")
    print(f"Сохранено {len(references)} эталонных саммари в: {output_refs_path}")

    return None


In [37]:
def evaluate_summarizers(texts_path,
                         refs_path,
                         abstract_func,
                         extract_func,
                         sample_examples = 3):
    """Run both summarizers over a saved sample and report averaged ROUGE-2.

    Args:
        texts_path: JSON file holding a list of source texts.
        refs_path: JSON file holding the list of reference summaries,
            aligned with the texts.
        abstract_func: Callable mapping a text to its abstractive summary.
        extract_func: Callable mapping a text to its extractive summary.
        sample_examples: How many examples to print after the metrics.

    Returns:
        Tuple ``(abs_scores, ext_scores)`` of averaged ROUGE-2 score dicts
        (keys 'p', 'r', 'f') for the abstractive and extractive summarizer.

    Raises:
        AssertionError: If the two files hold a different number of items.
    """
    with open(texts_path, "r", encoding="utf-8") as f:
        texts = json.load(f)
    with open(refs_path, "r", encoding="utf-8") as f:
        references = json.load(f)

    assert len(texts) == len(references), "Кол-во текстов и summary не совпадает"

    print(f"Загружено {len(texts)} примеров")

    abstract_hypotheses = []
    extract_hypotheses = []

    for position, text in enumerate(texts, start=1):
        print(f"Обработка {position}/{len(texts)}")
        abstract_hypotheses.append(abstract_func(text))
        extract_hypotheses.append(extract_func(text))

    # Reuse the notebook's ROUGE-2 helper instead of duplicating its logic.
    abs_scores = calculate_rouge_2(references, abstract_hypotheses)
    ext_scores = calculate_rouge_2(references, extract_hypotheses)

    print("Средние метрики ROUGE-2:")
    for heading, scores in (("Абстрактная модель:", abs_scores),
                            ("Экстрактная модель:", ext_scores)):
        print(heading)
        print(f"  Precision: {scores['p']:.4f}")
        print(f"  Recall:    {scores['r']:.4f}")
        print(f"  F1:        {scores['f']:.4f}")

    print("Примеры:")
    for i in range(min(sample_examples, len(texts))):
        print(f"--- Пример {i+1} ---")
        print(f"Исходный текст (первые 200 символов): {texts[i][:200]}")
        print(f"Эталонное summary:{references[i]}")
        print(f"Абстрактное summary:{abstract_hypotheses[i]}")
        print(f"Экстрактное summary:{extract_hypotheses[i]}")
        print("-" * 20)

    return abs_scores, ext_scores


In [48]:
# Raw dataset location and the JSON files that receive the evaluation sample.
input_path = 'gazeta_raw.txt'
output_texts_path = '/content/texts.json'
output_refs_path = '/content/references.json'

# Write the first 10 matching (text, summary) pairs to the two output files.
prepare_gazeta_data(
    input_path=input_path,
    output_texts_path=output_texts_path,
    output_refs_path=output_refs_path,
    max_samples=10,
)

Всего записей в исходном файле: 75198
Отобрано записей (саммари < 300): 10
Сохранено 10 текстов в: /content/texts.json
Сохранено 10 эталонных саммари в: /content/references.json


In [52]:
# Evaluate on exactly the files written by prepare_gazeta_data: reuse the path
# variables rather than relative names that only resolve to the same files
# when the working directory happens to be /content.
abs_scores, ext_scores = evaluate_summarizers(
    texts_path=output_texts_path,
    refs_path=output_refs_path,
    abstract_func=abstract_summary,
    extract_func=extract_summary,
    sample_examples=3
)


Загружено 10 примеров.

Обработка 1/10
Обработка 2/10
Обработка 3/10
Обработка 4/10
Обработка 5/10
Обработка 6/10
Обработка 7/10
Обработка 8/10
Обработка 9/10
Обработка 10/10
Средние метрики ROUGE-2:
Абстрактная модель:
  Precision: 0.0000
  Recall:    0.0000
  F1:        0.0000
Экстрактная модель:
  Precision: 0.0147
  Recall:    0.0178
  F1:        0.0161
Примеры:
--- Пример 1 ---
Исходный текст (первые 200 символов): «У меня больше нет друзей», «Я хочу умереть», «Это страшный сон» — подобными сообщениями буквально кишат сегодня форумы и чаты, в которых пользователи делятся впечатлениями о проблемах с «аськой». Пер
Эталонное summary:Популярный интернет-пейджер ICQ потерпел глобальный сбой — значительная часть пользователей по всему миру вдруг потеряла данные о своих «друзьях». На диверсию против ICQ происшедшее не похоже, хотя служба технической поддержки компании ICQ пока молчит.
Абстрактное summary:Интернет-пейджер ICQ, который считается одним из крупнейших в мире интернет-сервисов