In [1]:
!pip install datasets
!pip install transformers datasets torch
!pip install transformers datasets torch scipy scikit-learn
!pip install accelerate
!pip install evaluate
!pip install transformers datasets evaluate torch nltk rouge_score
!pip install sentencepiece
!pip install sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c90ea09b88380dae7d6dfcc0bea706c8ff354909da0fec174784f2f19034fd47
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting sacrebleu
  

# Упрощение текста

In [2]:
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    Seq2SeqTrainingArguments
)
import torch
from torch.utils.data import Dataset
import random

class SimplificationDataset(Dataset):
    def __init__(self, original_texts, simple_texts, tokenizer):
        self.inputs = tokenizer(
            ["simplify: " + text for text in original_texts],
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        self.targets = tokenizer(
            simple_texts,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )

    def __len__(self):
        return len(self.targets["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx]
        }

def train_simplifier():
    # Load model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small").cuda()

    # Load Wiki-Auto dataset for text simplification
    dataset = load_dataset("wiki_auto", "manual", split="train", trust_remote_code=True)

    # Take share of the loaded data if needed
    total_examples = len(dataset)
    subsample_size = total_examples

    # Randomly sample indices
    all_indices = list(range(total_examples))
    selected_indices = random.sample(all_indices, subsample_size)

    # Get subsampled data
    original_texts = [dataset[i]["normal_sentence"] for i in selected_indices]
    simple_texts = [dataset[i]["simple_sentence"] for i in selected_indices]


    # Create dataset
    train_size = int(len(original_texts) * 0.95)
    train_dataset = SimplificationDataset(
        original_texts[:train_size],
        simple_texts[:train_size],
        tokenizer
    )
    eval_dataset = SimplificationDataset(
        original_texts[train_size:],
        simple_texts[train_size:],
        tokenizer
    )

    # Training configuration
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        report_to="tensorboard",
        learning_rate=1e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=1,
        predict_with_generate=True,
        logging_dir="./logs",
        logging_steps=5,
        push_to_hub=False,
        save_strategy="epoch"
    )

    # Training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()
    trainer.save_model("./simplifier")
    return model, tokenizer

def simplify_text(text, model, tokenizer):
    inputs = tokenizer("simplify: " + text, return_tensors="pt", max_length=512, truncation=True)
    # Move inputs to CUDA
    inputs = {k: v.cuda() for k, v in inputs.items()}

    simple_ids = model.generate(
        inputs["input_ids"],
        max_length=512,
        min_length=10,
        num_beams=4,
        length_penalty=1.0,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        early_stopping=True,
        do_sample = True,
    )
    return tokenizer.decode(simple_ids[0], skip_special_tokens=True)

def main():
    # Обучаем модель (заглушка, предполагается, что функция train_simplifier определена)
    print("Training the simplification model...")
    model, tokenizer = train_simplifier()
    print("Model training completed.\n")

    # Тестовые тексты для упрощения
    test_texts = [
        """
        The photosynthetic process assimilates carbon dioxide and releases molecular oxygen as a byproduct through the light-dependent reactions of photosynthesis.
        """,
        """
        Quantum mechanics provides a mathematical description of the wave-particle duality and interactions of matter and energy.
        """,
        """
        The theory of relativity, formulated by Albert Einstein, fundamentally altered our understanding of space, time, and gravitation.
        """,
        """
        Mitochondria are often referred to as the powerhouses of the cell due to their role in generating adenosine triphosphate (ATP), the cell's primary energy currency.
        """,
        """
        The Industrial Revolution, which began in the late 18th century, marked a major turning point in history, with significant advancements in manufacturing and technology.
        """,
        """
        Neural networks, a cornerstone of modern artificial intelligence, are computational models inspired by the structure and function of the human brain.
        """
    ]

    # Упрощаем тексты и выводим результаты
    print("Simplifying texts:\n")
    for i, text in enumerate(test_texts, 1):
        print(f"Original text {i}:")
        print(text.strip())
        print("\nSimplified text {i}:")
        simple_text = simplify_text(text, model, tokenizer)  # Предполагается, что функция simplify_text определена
        print(simple_text)
        print("-" * 10)  # Разделитель для удобства чтения

In [3]:
main()

Training the simplification model...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.4k [00:00<?, ?B/s]

wiki_auto.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/113M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.00M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/373801 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/73249 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/118074 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0543,0.036866


Model training completed.

Simplifying texts:

Original text 1:
The photosynthetic process assimilates carbon dioxide and releases molecular oxygen as a byproduct through the light-dependent reactions of photosynthesis.

Simplified text {i}:
This means one molecule of ozone is made of 3 oxygen atoms.
----------
Original text 2:
Quantum mechanics provides a mathematical description of the wave-particle duality and interactions of matter and energy.

Simplified text {i}:
It can also be used to measure resistance and amps hence the name of some meters "Agnes".
----------
Original text 3:
The theory of relativity, formulated by Albert Einstein, fundamentally altered our understanding of space, time, and gravitation.

Simplified text {i}:
Some examples of "minor planets" are asteroids, comets and trans-Neptunian objects.
----------
Original text 4:
Mitochondria are often referred to as the powerhouses of the cell due to their role in generating adenosine triphosphate (ATP), the cell's prima