In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")


  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 42.0/42.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config.json: 100%|██████████| 1.38k/1.38k [00:00<?, ?B/s]
source.spm: 100%|██████████| 803k/803k [00:00<00:00, 1.39MB/s]
target.spm: 100%|██████████| 1.08M/1.08M [00:00<00:00, 3.28MB/s]
vocab.json: 100%|██████████| 2.60M/2.60M [00:00<00:00, 3.45MB/s]
pytorch_model.bin: 100%|██████████| 307M/307M [00:42<00:00, 7.18MB/s] 
generation_config.json: 100%|██████████| 293/293 [00:00<00:00, 233kB/s]


In [2]:

# Пути к файлам с данными
train_data_file = "train_data.txt"
val_data_file = "val_data.txt"

# Функция для чтения данных из файла
def read_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.readlines()
    return data

# Чтение данных для обучения и валидации
train_data = read_data(train_data_file)
val_data = read_data(val_data_file)

# Преобразование данных в формат, понимаемый Seq2SeqDataset
def process_data(data):
    processed_data = []
    for line in data:
        source_text, target_text = line.strip().split(":")
        processed_data.append({"source_text": source_text.strip(), "target_text": target_text.strip()})
    return processed_data

train_processed_data = process_data(train_data)
val_processed_data = process_data(val_data)

from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_source_length, max_target_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source_text = self.data[index]["source_text"]
        target_text = self.data[index]["target_text"]

        # Токенизация и добавление специальных токенов
        source_inputs = self.tokenizer.encode_plus(source_text, max_length=self.max_source_length, padding="max_length", truncation=True, return_tensors="pt")
        target_inputs = self.tokenizer.encode_plus(target_text, max_length=self.max_target_length, padding="max_length", truncation=True, return_tensors="pt")

        source_ids = source_inputs["input_ids"].squeeze()
        source_mask = source_inputs["attention_mask"].squeeze()
        target_ids = target_inputs["input_ids"].squeeze()

        return {
            "input_ids": source_ids,
            "attention_mask": source_mask,
            "labels": target_ids,
        }

# Создание датасетов
train_dataset = CustomDataset(train_processed_data, tokenizer, max_source_length=128, max_target_length=128)
val_dataset = CustomDataset(val_processed_data, tokenizer, max_source_length=128, max_target_length=128)




In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10,
    num_train_epochs=10,
    overwrite_output_dir=True,
    # Добавьте любые другие аргументы, необходимые для fine-tuning
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # ваш тренировочный датасет
    eval_dataset=val_dataset,     # ваш валидационный датасет
    tokenizer=tokenizer
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 10%|█         | 10/100 [00:20<02:40,  1.79s/it]

{'loss': 0.0369, 'grad_norm': 1.88331139087677, 'learning_rate': 4.5e-05, 'epoch': 1.0}



 10%|█         | 10/100 [00:21<02:40,  1.79s/it]Checkpoint destination directory ./results\checkpoint-10 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.06804201751947403, 'eval_runtime': 0.3696, 'eval_samples_per_second': 5.411, 'eval_steps_per_second': 2.705, 'epoch': 1.0}


 20%|██        | 20/100 [00:42<02:24,  1.81s/it]

{'loss': 0.0257, 'grad_norm': 0.3814609944820404, 'learning_rate': 4e-05, 'epoch': 2.0}



 20%|██        | 20/100 [00:42<02:24,  1.81s/it]Checkpoint destination directory ./results\checkpoint-20 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.028063183650374413, 'eval_runtime': 0.3517, 'eval_samples_per_second': 5.687, 'eval_steps_per_second': 2.843, 'epoch': 2.0}


 30%|███       | 30/100 [01:03<02:04,  1.78s/it]

{'loss': 0.0133, 'grad_norm': 0.8128180503845215, 'learning_rate': 3.5e-05, 'epoch': 3.0}



 30%|███       | 30/100 [01:03<02:04,  1.78s/it]Checkpoint destination directory ./results\checkpoint-30 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.021428033709526062, 'eval_runtime': 0.3676, 'eval_samples_per_second': 5.441, 'eval_steps_per_second': 2.721, 'epoch': 3.0}


 40%|████      | 40/100 [01:24<01:45,  1.75s/it]

{'loss': 0.0093, 'grad_norm': 1.9912145137786865, 'learning_rate': 3e-05, 'epoch': 4.0}



 40%|████      | 40/100 [01:24<01:45,  1.75s/it]Checkpoint destination directory ./results\checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.02559759095311165, 'eval_runtime': 0.3719, 'eval_samples_per_second': 5.377, 'eval_steps_per_second': 2.689, 'epoch': 4.0}


 50%|█████     | 50/100 [01:45<01:27,  1.76s/it]

{'loss': 0.007, 'grad_norm': 0.5645416975021362, 'learning_rate': 2.5e-05, 'epoch': 5.0}



 50%|█████     | 50/100 [01:45<01:27,  1.76s/it]Checkpoint destination directory ./results\checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.015319288708269596, 'eval_runtime': 0.3561, 'eval_samples_per_second': 5.616, 'eval_steps_per_second': 2.808, 'epoch': 5.0}


 60%|██████    | 60/100 [02:06<01:10,  1.75s/it]

{'loss': 0.0048, 'grad_norm': 0.1799599975347519, 'learning_rate': 2e-05, 'epoch': 6.0}



 60%|██████    | 60/100 [02:06<01:10,  1.75s/it]Checkpoint destination directory ./results\checkpoint-60 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.011564637534320354, 'eval_runtime': 0.3722, 'eval_samples_per_second': 5.374, 'eval_steps_per_second': 2.687, 'epoch': 6.0}


 70%|███████   | 70/100 [02:27<00:52,  1.76s/it]

{'loss': 0.0037, 'grad_norm': 0.18561480939388275, 'learning_rate': 1.5e-05, 'epoch': 7.0}



 70%|███████   | 70/100 [02:27<00:52,  1.76s/it]Checkpoint destination directory ./results\checkpoint-70 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.009153323248028755, 'eval_runtime': 0.3866, 'eval_samples_per_second': 5.173, 'eval_steps_per_second': 2.587, 'epoch': 7.0}


 80%|████████  | 80/100 [02:48<00:35,  1.77s/it]

{'loss': 0.0034, 'grad_norm': 0.6635796427726746, 'learning_rate': 1e-05, 'epoch': 8.0}



 80%|████████  | 80/100 [02:48<00:35,  1.77s/it]Checkpoint destination directory ./results\checkpoint-80 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.009955587796866894, 'eval_runtime': 0.3766, 'eval_samples_per_second': 5.31, 'eval_steps_per_second': 2.655, 'epoch': 8.0}


 90%|█████████ | 90/100 [03:09<00:17,  1.74s/it]

{'loss': 0.0036, 'grad_norm': 0.15796494483947754, 'learning_rate': 5e-06, 'epoch': 9.0}



 90%|█████████ | 90/100 [03:09<00:17,  1.74s/it]Checkpoint destination directory ./results\checkpoint-90 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.010440190322697163, 'eval_runtime': 0.3587, 'eval_samples_per_second': 5.576, 'eval_steps_per_second': 2.788, 'epoch': 9.0}


100%|██████████| 100/100 [03:30<00:00,  1.76s/it]

{'loss': 0.0037, 'grad_norm': 0.23933488130569458, 'learning_rate': 0.0, 'epoch': 10.0}



100%|██████████| 100/100 [03:30<00:00,  1.76s/it]Checkpoint destination directory ./results\checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.010417242534458637, 'eval_runtime': 0.3582, 'eval_samples_per_second': 5.584, 'eval_steps_per_second': 2.792, 'epoch': 10.0}


100%|██████████| 100/100 [03:32<00:00,  2.12s/it]

{'train_runtime': 212.4354, 'train_samples_per_second': 1.789, 'train_steps_per_second': 0.471, 'train_loss': 0.011130043976008893, 'epoch': 10.0}





TrainOutput(global_step=100, training_loss=0.011130043976008893, metrics={'train_runtime': 212.4354, 'train_samples_per_second': 1.789, 'train_steps_per_second': 0.471, 'train_loss': 0.011130043976008893, 'epoch': 10.0})

In [4]:
def interact_with_model(model, tokenizer, input_text):
    # Предобработка входных данных
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Передача данных в модель
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

    # Декодирование выходных данных
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output_text


In [12]:
input_text = "плз"
output_text = interact_with_model(model, tokenizer, input_text)
print("Восстановленное сокращение:", output_text)

Восстановленное сокращение: плаз
