In [34]:
%load_ext autoreload
%autoreload 2

from mint.model.transformer import Transformer, TransformerConfig
from mint.common import create_config, to_dict
from mint.trainer import Trainer, TrainerConfig
from datasets import load_from_disk
from transformers import AutoTokenizer
import os
from mint.translator import Translator

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
DATASET_PATH = "../preprocessed_dataset"

# Load the dataset saved on disk under DATASET_PATH.
dataset_dir = os.path.join(DATASET_PATH, "paired_tokenized_dataset")
dataset = load_from_disk(dataset_dir)

# Load the source tokenizer first, then the target tokenizer, each from its
# own sub-directory of DATASET_PATH.
source_tokenizer, target_tokenizer = (
    AutoTokenizer.from_pretrained(os.path.join(DATASET_PATH, subdir))
    for subdir in ("source_tokenizer", "target_tokenizer")
)

# Both tokenizers use the same padding token string.
for tokenizer in (source_tokenizer, target_tokenizer):
    tokenizer.pad_token = "<pad>"

In [36]:
config = create_config(TransformerConfig())

# Values written to the `glob` section of the configuration.
for field, value in {
    "d_model": 512,
    "n_heads": 8,
    "max_seq_len": 128,
    "d_feedforward": 2048,
    "p_dropout": 0.1,
}.items():
    setattr(config.glob, field, value)

# The encoder and decoder sections get the same block count and vocabulary size.
# NOTE(review): vocab_size is hard-coded; confirm it matches the loaded tokenizers.
for section in ("encoder_config", "decoder_config"):
    stack_config = getattr(config, section)
    stack_config.n_blocks = 10
    stack_config.vocab_size = 10000

model = Transformer(**to_dict(config))

# Sum the element counts of every tensor yielded by model.parameters().
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params}")

Number of parameters: 126471680


In [None]:
trainer_config = create_config(TrainerConfig())

# Name under which the logger section records this run.
setattr(trainer_config.logger_config, "experiment_name", "exp2")

# Top-level trainer options overridden for this run.
for option, value in (("warmup_steps", 3000), ("learning_rate", 1e-4)):
    setattr(trainer_config, option, value)

trainer_kwargs = to_dict(trainer_config)
trainer = Trainer(model, dataset, **trainer_kwargs)

# NOTE(review): 10 is presumably the number of epochs — confirm against Trainer.train.
trainer.train(10)

avg loss: 2.1740:  12%|█▏        | 3375/28140 [18:10<2:12:58,  3.10it/s]

In [17]:
# Wrap the model and both tokenizers in a Translator.
# NOTE(review): confirm Translator switches the model to eval mode before decoding.
translator = Translator(
    model,
    source_tokenizer,
    target_tokenizer,
)

# Kept as the final expression so the notebook displays the returned value.
sample_text = "Good evening"
translator.translate(sample_text, max_length=128)

['Dobré večera<|endoftext|>']