In [None]:
from tokenizers.models import Unigram
import transformers
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
import pandas as pd
import nltk
#nltk.download('punkt')
import string
import os
import random
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer



# Создание посимвольного токенизатора:


In [None]:
# Start from an untrained Unigram model; its vocabulary is learned by tokenizer.train() further down.
tokenizer = Tokenizer(Unigram())

In [None]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
 
# Normalization applied before tokenization: NFD Unicode decomposition, then accent stripping.
normalizer = normalizers.Sequence([NFD(), StripAccents()])
tokenizer.normalizer = normalizer

In [None]:
from tokenizers.pre_tokenizers import Whitespace

# Use the library's Whitespace pre-tokenizer to split raw text before the Unigram model sees it.
tokenizer.pre_tokenizer = Whitespace()

In [None]:
from tokenizers.trainers import UnigramTrainer

# Trainer for the Unigram model; the five bracketed tokens are registered as special tokens.
trainer = UnigramTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [None]:
# Both CSV splits (test and train) serve as the training corpus for the tokenizer.
# NOTE(review): the files are passed as raw text, so presumably the CSV header and
# delimiters are part of the corpus as well — confirm this is intended.
files = [f"./dataset_{split}_eq.csv" for split in ["test", "train"]]

# Fit the tokenizer on these files using the UnigramTrainer created earlier.
tokenizer.train(files, trainer)

In [None]:
# Persist the trained tokenizer to a single JSON file.
tokenizer.save("./tokenizer-equations-all.json")

Далее из созданного файла *tokenizer-equations-all.json* были удалены "лишние" символы. Оставлены только те, которые были использованы для конкретной задачи (символы, используемые для квадратного уравнения).

# Загрузка посимвольного кастомного токенизатора

In [1]:
# Load the tokenizer definition from disk (presumably the manually trimmed JSON file)
# and wrap it so it can be used through the transformers tokenizer API.
equation_tokenizer_backend = Tokenizer.from_file("./tokenizer-equations.json")
tokenizer = PreTrainedTokenizerFast(tokenizer_object=equation_tokenizer_backend)

NameError: name 'PreTrainedTokenizerFast' is not defined

In [None]:
# Register '[PAD]' as the padding token when the loaded tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# NOTE(review): this sets a plain `padding` attribute on the tokenizer object. Padding is
# requested explicitly with `padding=True` inside preprocess_function, so this assignment
# presumably has no effect — confirm before relying on it.
tokenizer.padding = True

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
SEED = 42


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and, when
    available, all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN
    into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Covers the current device as well as any additional GPUs.
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning (benchmark=True) may select different kernels from run to run,
    # which defeats the deterministic setting above, so it is switched off.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [None]:
# Load both CSV files with the `datasets` CSV loader; the test file is registered
# under the split name 'val' and is used as the evaluation set during training.
dataset = load_dataset('csv', data_files={'train': 'dataset_train_eq.csv',
                                'val': 'dataset_test_eq.csv'})

Переведем исходный датасет из квадратных уравнений в датасет из токенов, чтобы узнать максимальную и минимальную длину входной и выходной последовательности

In [5]:
import pandas as pd

# Read the same two CSV files as pandas DataFrames for the length analysis that follows.
res_train = pd.read_csv('dataset_train_eq.csv', delimiter=',')
res_test = pd.read_csv('dataset_test_eq.csv', delimiter=',')

In [6]:
# Stack the train and test rows into one frame with a fresh consecutive index.
result = pd.concat([res_train, res_test], ignore_index=True)

In [8]:
# Renamed copy of the combined frame: 'input' -> 'token_ids', 'output' -> 'labels'.
# Only the column labels change; the cells still hold the original values.
res = result.rename(columns={'input': "token_ids", "output": "labels"})

In [7]:
# Symbol-to-id lookup table. Ids are assigned consecutively from 0 in this order:
# the digits 0-9, the letters x, D, n, a, then the operators and punctuation.
_SYMBOLS_IN_ID_ORDER = "0123456789" "xDna" "+-*/^=.();"
dict_of_token = {symbol: index for index, symbol in enumerate(_SYMBOLS_IN_ID_ORDER)}

# Print the dictionary

print(dict_of_token)

{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, 'x': 10, 'D': 11, 'n': 12, 'a': 13, '+': 14, '-': 15, '*': 16, '/': 17, '^': 18, '=': 19, '.': 20, '(': 21, ')': 22, ';': 23}


In [9]:
symbol2token = dict_of_token


def tokenize(string):
    """Map every character of ``string`` to its id in ``symbol2token``.

    Args:
        string: Text whose characters are looked up one by one.

    Returns:
        List of integer ids, one per character, in the original order.

    Raises:
        KeyError: If a character has no entry in ``symbol2token``.
    """
    token_ids = []
    for symbol in string:
        token_ids.append(symbol2token[symbol])
    return token_ids

In [10]:
# Character lengths of every equation and of every solution.
# 'token_ids' is the renamed 'input' column and 'labels' is the renamed 'output'
# column, so the names below follow the data they actually measure.
input_lengths = [len(text) for text in res['token_ids']]
label_lengths = [len(text) for text in res['labels']]

# Longest input sequence.
print(max(input_lengths))

# Longest target sequence.
print(max(label_lengths))

23
106


In [None]:
# Length limits chosen with a small margin over the longest observed sequences.
max_input_length = 30
max_target_length = 120


def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        examples: Batch mapping that provides an "input" and an "output" column.

    Returns:
        The tokenizer output for the "input" column, extended with a "labels"
        entry that holds the input ids of the tokenized "output" column.
    """
    source_texts = examples["input"]

    # Source side: make sure a pad token exists, then encode.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model_inputs = tokenizer(
        source_texts,
        max_length=max_input_length,
        truncation=True,
        padding=True,
    )

    # Target side: encoded inside the tokenizer's target context.
    # NOTE(review): with padding=True the labels are filled with the pad token id
    # rather than -100 — confirm this is what the loss is expected to see.
    with tokenizer.as_target_tokenizer():
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        target_encoding = tokenizer(
            examples["output"],
            max_length=max_target_length,
            truncation=True,
            padding=True,
        )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# Run preprocess_function over every split, one batch of examples at a time.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# Write the tokenizer files into a local directory named 't5-base'.
# NOTE(review): the directory name coincides with the Hub model id 't5-base' —
# confirm this is intended and cannot be confused with the pretrained checkpoint.
tokenizer.save_pretrained('t5-base')

In [None]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# wandb.login() uses the WANDB_API_KEY environment variable or credentials stored by
# an earlier `wandb login`, and prompts for a key otherwise.
# SECURITY: never commit an API key to a notebook; any key that has been committed
# must be treated as leaked and revoked in the W&B account settings.
import wandb

wandb.login()

In [None]:
def compute_metrics4token(eval_pred):
    """Compute token-level and exact-match accuracy for generated sequences.

    Predictions and labels are decoded with the notebook's tokenizer, split on
    single spaces and compared position by position.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays; ``-100``
            marks positions that must be ignored.

    Returns:
        dict with ``token_acc`` (matching tokens divided by the number of label
        tokens) and ``answer_acc`` (share of sequences reproduced exactly),
        both rounded to 4 decimals. Both are 0.0 for an empty evaluation set.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some model outputs carry extra tensors; the token ids come first.
        predictions = predictions[0]

    # -100 is not a valid token id and cannot be decoded; map it to the pad token,
    # which is dropped again by skip_special_tokens.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    num_correct = 0   # tokens equal at the same position
    num_total = 0     # label tokens seen
    num_answer = 0    # sequences reproduced exactly
    num_sequences = 0
    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        # Tokens are compared as decoded; no sentence splitting is applied because
        # it would replace some separators and merge neighbouring tokens.
        pred_tokens = pred_text.strip().split(' ')
        label_tokens = label_text.strip().split(' ')
        if pred_tokens == label_tokens:
            num_answer += 1
        num_correct += sum(p == l for p, l in zip(pred_tokens, label_tokens))
        num_total += len(label_tokens)
        num_sequences += 1

    result = {
        'token_acc': num_correct / num_total if num_total else 0.0,
        'answer_acc': num_answer / num_sequences if num_sequences else 0.0,
    }
    # A single call keeps both metrics on the same W&B step.
    wandb.log(result)
    return {k: round(v, 4) for k, v in result.items()}


<div class="alert alert-success">

# Обучение модели

</div>

In [None]:
import wandb

# Start a Weights & Biases run in project "T5-4-6" under the "kronesine" entity.
wandb.init(project="T5-4-6", entity="kronesine")

In [None]:
# Hyperparameter sweep: train one T5 model from scratch per combination of
# (heads, layers, scheduler, learning rate, optimizer), evaluate it, and collect
# the final metrics of every combination in `df_marks`.
df_marks = pd.DataFrame()
sweep_records = []  # one dict per finished combination; df_marks is rebuilt from it
model_name = 'АДАМВ'
for head in [4]:
    for layer in [6]:
        for lr_scheduler_type in ['linear']:
            for lr in [1.4e-4, 1e-1]:
                for optimizer in ['adamw_torch', 'adafactor']:
                    # Used as the W&B run name, the output directory and the result label.
                    run_name = f"{model_name}-head_{head}-layer_{layer}-optim_{optimizer}-lr_{lr}-lr_scheduler_type_{lr_scheduler_type}"
                    wandb.init(name=run_name, project="T5-base-4-2", entity="kronesine")
                    try:
                        # NOTE(review): vocab_size is fixed at 37 — confirm it is at least len(tokenizer).
                        # NOTE(review): pad/eos ids are left at the T5Config defaults — confirm they
                        # match the custom tokenizer's ids.
                        config = T5Config(decoder_start_token_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0], vocab_size=37, num_layers=layer, num_heads=head, d_kv=64, d_ff=2048, dropout_rate=0.1, max_length=120)
                        model = T5ForConditionalGeneration(config=config).to(device)
                        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
                        batch_size = 32
                        args = Seq2SeqTrainingArguments(
                            run_name,
                            overwrite_output_dir=True,
                            logging_first_step=True,
                            evaluation_strategy="epoch",
                            save_strategy="epoch",
                            learning_rate=lr,
                            lr_scheduler_type=lr_scheduler_type,
                            optim=optimizer,
                            per_device_train_batch_size=batch_size,
                            per_device_eval_batch_size=batch_size,
                            weight_decay=0.01,
                            save_total_limit=3,
                            num_train_epochs=100,
                            predict_with_generate=True,
                            # Mixed precision needs a CUDA device; fall back to fp32 on CPU.
                            fp16=torch.cuda.is_available(),
                            metric_for_best_model="token_acc",
                            greater_is_better=True,
                            load_best_model_at_end=True,
                            report_to="wandb")
                        trainer = Seq2SeqTrainer(
                            model,
                            args,
                            train_dataset=tokenized_datasets["train"],
                            eval_dataset=tokenized_datasets["val"],
                            data_collator=data_collator,
                            tokenizer=tokenizer,
                            compute_metrics=compute_metrics4token
                        )
                        trainer.train()
                        print(run_name)
                        trainer.save_model()
                        out = trainer.evaluate()
                        sweep_records.append({'dict_metrics': [out], 'combination': run_name})
                        # Rebuilt after every combination so partial results survive an interruption.
                        df_marks = pd.DataFrame(sweep_records)
                    finally:
                        # Close the run so the next combination never logs into this one,
                        # even when training fails part-way.
                        wandb.finish()

**Best model with parameters:**

head_4-layer_6-optim_adamw_torch-lr_0.00014-lr_scheduler_type_linear
