# Саммаризация

In [1]:
!pip install -q transformers datasets torch scipy scikit-learn accelerate evaluate nltk rouge_score sentencepiece sacrebleu

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    Seq2SeqTrainingArguments
)
import torch
from torch.utils.data import Dataset
import random

class SimpleDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer):
        # Add special summarization token
        tokenizer.add_special_tokens({'additional_special_tokens': ['<summarize>']})
        
        self.inputs = tokenizer(
            ["<summarize> " + text for text in texts],
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        self.targets = tokenizer(
            summaries,
            max_length=150,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

    def __len__(self):
        return len(self.targets["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx]
        }

def train_summarizer():
    # Load model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small").cuda()
    
    # Add special token and resize embeddings
    tokenizer.add_special_tokens({'additional_special_tokens': ['<summarize>']})
    model.resize_token_embeddings(len(tokenizer))

    # Load 1% of dataset first
    dataset = load_dataset("xsum", split="train[:100%]", trust_remote_code=True)
    
    # Take share of the loaded data if need
    total_examples = len(dataset)
    subsample_size = total_examples
    
    # Randomly sample indices
    all_indices = list(range(total_examples))
    selected_indices = random.sample(all_indices, subsample_size)
    
    # Get subsampled data
    texts = [dataset[i]["document"] for i in selected_indices]
    summaries = [dataset[i]["summary"] for i in selected_indices]
    
    # Create dataset
    train_size = int(len(texts) * 0.95)
    train_dataset = SimpleDataset(texts[:train_size], summaries[:train_size], tokenizer)
    eval_dataset = SimpleDataset(texts[train_size:], summaries[train_size:], tokenizer)
    
    # Training configuration
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        report_to="tensorboard",
        learning_rate=1e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=1,
        predict_with_generate=True,
        logging_dir="./logs",
        logging_steps=5,
        push_to_hub=False,
        save_strategy="epoch"
    )
    
    # Training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )
    trainer.train()
    trainer.save_model("./simple_summarizer")
    return model, tokenizer

def generate_summary(text, model, tokenizer):
    inputs = tokenizer("<summarize> " + text, return_tensors="pt", max_length=512, truncation=True)
    
    # Move inputs to CUDA
    inputs = {k: v.cuda() for k, v in inputs.items()}
    
    summary_ids = model.generate(
        inputs["input_ids"],
        max_new_tokens =150,
        min_new_tokens =10,
        num_beams=4
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [3]:
def main():
    # Train model
    model, tokenizer = train_summarizer()

    test_texts = [
        """
        Artificial intelligence has significantly changed our lives. Today, AI is used in medicine, education, industry, and many other fields. Machine learning technologies help diagnose diseases, create new materials, and optimize production processes. However, there are concerns about the impact of AI on the job market and data privacy.
        """,
        """
        Climate change is one of the most pressing issues of our time. Rising global temperatures, melting ice caps, and extreme weather events are clear signs of a changing climate. Governments and organizations worldwide are working to reduce carbon emissions and transition to renewable energy sources.
        """,
        """
        The Internet of Things (IoT) is transforming the way we live and work. By connecting everyday devices to the internet, IoT enables smarter homes, more efficient industries, and improved healthcare. However, security and privacy concerns remain significant challenges.
        """,
        """
        Quantum computing is a revolutionary technology that promises to solve complex problems beyond the reach of classical computers. By leveraging the principles of quantum mechanics, quantum computers can perform calculations at unprecedented speeds. This technology has potential applications in cryptography, drug discovery, and optimization.
        """,
        """
        Renewable energy sources like solar, wind, and hydro power are essential for a sustainable future. These clean energy sources reduce greenhouse gas emissions and dependence on fossil fuels. Governments and businesses are investing heavily in renewable energy infrastructure to combat climate change.
        """,
        """
        Blockchain technology is revolutionizing industries by providing a secure and transparent way to record transactions. Originally developed for cryptocurrencies like Bitcoin, blockchain is now being used in supply chain management, healthcare, and finance. Its decentralized nature ensures data integrity and reduces the risk of fraud.
        """,
        """
        Space exploration has always fascinated humanity. Recent advancements in rocket technology and space travel have brought us closer to exploring other planets. Missions to Mars and beyond aim to uncover the mysteries of the universe and potentially find extraterrestrial life.
        """,
        """
        Cybersecurity is a critical concern in the digital age. With the increasing number of cyberattacks, protecting sensitive data and systems has become a top priority for organizations. Advanced technologies like AI and machine learning are being used to detect and prevent cyber threats.
        """,
        """
        The rise of electric vehicles (EVs) is transforming the automotive industry. EVs offer a cleaner and more sustainable alternative to traditional gasoline-powered cars. With advancements in battery technology and charging infrastructure, EVs are becoming more accessible to consumers.
        """,
        """
        Biotechnology is driving innovation in medicine, agriculture, and environmental science. Techniques like gene editing and synthetic biology are enabling scientists to develop new treatments, improve crop yields, and address environmental challenges. The potential of biotechnology to improve human life is immense.
        """
    ]

    for i, test_text in enumerate(test_texts):
        summary = generate_summary(test_text, model, tokenizer)
        print(f"\nExample {i+1}:")
        print("\nOriginal Text:", test_text)
        print("\nSummary:", summary)


if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

(…)SUM-EMNLP18-Summary-Data-Original.tar.gz:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.5871,0.530855



Example 1:

Original Text: 
        Artificial intelligence has significantly changed our lives. Today, AI is used in medicine, education, industry, and many other fields. Machine learning technologies help diagnose diseases, create new materials, and optimize production processes. However, there are concerns about the impact of AI on the job market and data privacy.
        

Summary: AI has changed the lives of many people in the United States.

Example 2:

Original Text: 
        Climate change is one of the most pressing issues of our time. Rising global temperatures, melting ice caps, and extreme weather events are clear signs of a changing climate. Governments and organizations worldwide are working to reduce carbon emissions and transition to renewable energy sources.
        

Summary: Climate change is one of the most pressing issues of our time.

Example 3:

Original Text: 
        The Internet of Things (IoT) is transforming the way we live and work. By connecting everyday 