In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Пути к файлам с данными
train_data_file = "train_data.txt"
val_data_file = "val_data.txt"

# Функция для чтения данных из файла
def read_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.readlines()
    return data

# Чтение данных для обучения и валидации
train_data = read_data(train_data_file)
val_data = read_data(val_data_file)

# Преобразование данных в формат, понимаемый Seq2SeqDataset
def process_data(data):
    processed_data = []
    for line in data:
        source_text, target_text = line.strip().split(":")
        processed_data.append({"source_text": source_text.strip(), "target_text": target_text.strip()})
    return processed_data

train_processed_data = process_data(train_data)
val_processed_data = process_data(val_data)

from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_source_length, max_target_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source_text = self.data[index]["source_text"]
        target_text = self.data[index]["target_text"]

        # Токенизация и добавление специальных токенов
        source_inputs = self.tokenizer.encode_plus(source_text, max_length=self.max_source_length, padding="max_length", truncation=True, return_tensors="pt")
        target_inputs = self.tokenizer.encode_plus(target_text, max_length=self.max_target_length, padding="max_length", truncation=True, return_tensors="pt")

        source_ids = source_inputs["input_ids"].squeeze()
        source_mask = source_inputs["attention_mask"].squeeze()
        target_ids = target_inputs["input_ids"].squeeze()

        return {
            "input_ids": source_ids,
            "attention_mask": source_mask,
            "labels": target_ids,
        }

# Создание датасетов
train_dataset = CustomDataset(train_processed_data, tokenizer, max_source_length=128, max_target_length=128)
val_dataset = CustomDataset(val_processed_data, tokenizer, max_source_length=128, max_target_length=128)

In [3]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=2,
    num_train_epochs=50,
    overwrite_output_dir=True,
    # Добавьте любые другие аргументы, необходимые для fine-tuning
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # ваш тренировочный датасет
    eval_dataset=val_dataset,     # ваш валидационный датасет
    tokenizer=tokenizer
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 10%|█         | 10/100 [00:19<02:39,  1.77s/it]

{'loss': 1.4707, 'grad_norm': 3.0501062870025635, 'learning_rate': 4.5e-05, 'epoch': 1.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.1302180290222168, 'eval_runtime': 0.3667, 'eval_samples_per_second': 5.454, 'eval_steps_per_second': 2.727, 'epoch': 1.0}


 20%|██        | 20/100 [00:41<02:27,  1.85s/it]

{'loss': 0.1836, 'grad_norm': 1.3888813257217407, 'learning_rate': 4e-05, 'epoch': 2.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.10349459946155548, 'eval_runtime': 0.3725, 'eval_samples_per_second': 5.369, 'eval_steps_per_second': 2.684, 'epoch': 2.0}


 30%|███       | 30/100 [01:03<02:14,  1.92s/it]

{'loss': 0.1348, 'grad_norm': 1.1115838289260864, 'learning_rate': 3.5e-05, 'epoch': 3.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.06846167147159576, 'eval_runtime': 0.3996, 'eval_samples_per_second': 5.005, 'eval_steps_per_second': 2.502, 'epoch': 3.0}


 40%|████      | 40/100 [01:25<01:46,  1.77s/it]

{'loss': 0.0972, 'grad_norm': 1.530106544494629, 'learning_rate': 3e-05, 'epoch': 4.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.05031341314315796, 'eval_runtime': 0.3604, 'eval_samples_per_second': 5.549, 'eval_steps_per_second': 2.774, 'epoch': 4.0}


 50%|█████     | 50/100 [01:46<01:28,  1.77s/it]

{'loss': 0.0745, 'grad_norm': 0.9737552404403687, 'learning_rate': 2.5e-05, 'epoch': 5.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.04388190805912018, 'eval_runtime': 0.3562, 'eval_samples_per_second': 5.615, 'eval_steps_per_second': 2.808, 'epoch': 5.0}


 60%|██████    | 60/100 [02:07<01:11,  1.78s/it]

{'loss': 0.0571, 'grad_norm': 0.6224616765975952, 'learning_rate': 2e-05, 'epoch': 6.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.038413237780332565, 'eval_runtime': 0.3491, 'eval_samples_per_second': 5.73, 'eval_steps_per_second': 2.865, 'epoch': 6.0}


 70%|███████   | 70/100 [02:34<01:00,  2.02s/it]

{'loss': 0.0517, 'grad_norm': 0.9510853290557861, 'learning_rate': 1.5e-05, 'epoch': 7.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.03591769561171532, 'eval_runtime': 0.3664, 'eval_samples_per_second': 5.459, 'eval_steps_per_second': 2.73, 'epoch': 7.0}


 80%|████████  | 80/100 [02:54<00:35,  1.75s/it]

{'loss': 0.0446, 'grad_norm': 0.9817093014717102, 'learning_rate': 1e-05, 'epoch': 8.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.032530754804611206, 'eval_runtime': 0.3623, 'eval_samples_per_second': 5.521, 'eval_steps_per_second': 2.76, 'epoch': 8.0}


 90%|█████████ | 90/100 [03:15<00:17,  1.75s/it]

{'loss': 0.0398, 'grad_norm': 0.9286895990371704, 'learning_rate': 5e-06, 'epoch': 9.0}


                                                
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.03183097764849663, 'eval_runtime': 0.3614, 'eval_samples_per_second': 5.534, 'eval_steps_per_second': 2.767, 'epoch': 9.0}


100%|██████████| 100/100 [03:36<00:00,  1.76s/it]

{'loss': 0.0351, 'grad_norm': 0.8112730979919434, 'learning_rate': 0.0, 'epoch': 10.0}


                                                 
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


{'eval_loss': 0.03182034566998482, 'eval_runtime': 0.3526, 'eval_samples_per_second': 5.672, 'eval_steps_per_second': 2.836, 'epoch': 10.0}


100%|██████████| 100/100 [03:38<00:00,  2.19s/it]

{'train_runtime': 218.5028, 'train_samples_per_second': 1.739, 'train_steps_per_second': 0.458, 'train_loss': 0.21889390975236891, 'epoch': 10.0}





TrainOutput(global_step=100, training_loss=0.21889390975236891, metrics={'train_runtime': 218.5028, 'train_samples_per_second': 1.739, 'train_steps_per_second': 0.458, 'train_loss': 0.21889390975236891, 'epoch': 10.0})

In [3]:
def interact_with_model(model, tokenizer, input_text):
    # Предобработка входных данных
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Передача данных в модель
    output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

    # Декодирование выходных данных
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output_text


In [4]:
input_text = "нзч"
output_text = interact_with_model(model, tokenizer, input_text)
print("Восстановленное сокращение:", output_text)

Восстановленное сокращение: nsch


##  Сохранение

In [5]:
# Путь для сохранения модели
output_model_dir = "./trained_model"

# Сохранение модели и токенизатора
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.json',
 './trained_model\\source.spm',
 './trained_model\\target.spm',
 './trained_model\\added_tokens.json')