In [1]:
%load_ext autoreload
%autoreload 2
    
%load_ext tensorboard
import os
import sys

from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer, HfArgumentParser

# The project modules live outside the notebook directory, so the search path
# has to be extended before they can be imported.
sys.path.append('../paraphrase/')
from paraphraser_args import ModelArguments, DataTrainingArguments, TrainingArguments
from paraphraser_dataloader import load_dataset_pseudo
from paraphraser_trainer import ParaphraserTrainer

In [2]:
# ---------------------------------------------------------------------------
# Run configuration.
# Numeric settings are written as strings on purpose: everything below is
# handed to the argument parser as a list of command-line style tokens.
# ---------------------------------------------------------------------------
data_dir = "../data/pseudo"
output_dir = "../models/"

task = "formality_diff"  # alternative: "shakespeare"
model_name = "t5-small"
model_nick = "t5_transfer_diff"  # alternative: "t5_transfer_shakespeare"

epochs = "5"
train_batch_size = "16"
eval_batch_size = "16"
save_log_steps = "800"  # used for both the logging and the checkpoint interval

# Flag/value pairs, one per line; "--overwrite_cache" is a bare switch.
cli_args = [
    "--model_name_or_path", model_name,
    "--model_nick", model_nick,
    "--data_dir", data_dir,
    "--output_dir", os.path.join(output_dir, model_nick),
    "--cache_dir", os.path.join(output_dir,"cache"),
    "--overwrite_cache",
    "--per_device_train_batch_size", train_batch_size,
    "--per_device_eval_batch_size", eval_batch_size,
    "--max_seq_len", "64",
    "--gradient_accumulation_steps", "1",
    "--num_train_epochs", epochs,
    "--logging_steps", save_log_steps,
    "--save_steps", save_log_steps,
    "--data_parallel", "True",
    "--meta_task", "transfer",
    "--meta_task_type", "intra",
]

# Split the token list into the three argument dataclasses used downstream.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(cli_args)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [3]:
# Load the pretrained tokenizer and model named by the parsed model arguments.
# AutoModelForSeq2SeqLM is used instead of the deprecated AutoModelWithLMHead;
# it is the Auto class intended for encoder-decoder checkpoints such as T5.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_args.model_name_or_path)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /home/vivek/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
    

In [4]:
# Datasets are only built for the 'intra' meta-task type; any other value
# leaves train_dataset / dev_dataset undefined.
if training_args.meta_task_type == 'intra':

    def _load_split(split_name):
        """Load one split ("train" or "dev") for every '+'-separated task name."""
        return load_dataset_pseudo(
            data_args.data_dir,
            tokenizer,
            mode=split_name,
            tasks=task.split('+'),
            n_proc=2048,
        )

    train_dataset = _load_split("train")
    dev_dataset = _load_split("dev")

100%|██████████| 16/16 [00:05<00:00,  2.96it/s]
100%|██████████| 4/4 [00:01<00:00,  3.40it/s]


In [5]:
# Sanity check: decode fields 0 and 2 of the first training example back to
# text (presumably the source prompt and the target sentence).
tuple(
    tokenizer.decode(train_dataset[0][field], skip_special_tokens=True)
    for field in (0, 2)
)

("transfer: my lord, it's not necessary to apologize. | input: low | output: mid",
 'My lord, there needs no such apology.')

In [6]:
# The trainer takes the three argument dataclasses bundled in a list,
# in the order training / model / data.
trainer_args = [training_args, model_args, data_args]
trainer = ParaphraserTrainer(
    trainer_args,
    model,
    tokenizer,
    train_dataset,
    dev_dataset,
)
trainer.train()

***** Running training *****
Num examples = 31081
Num Epochs = 5.0
Total train batch size = 32
Gradient Accumulation steps = 1
Total optimization steps = 4860.0
Logging steps = 800
Save steps = 800
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

step 0/0 loss=28.84:   0%|          | 0/972 [00:03<?, ?it/s][A
step 0/0 loss=28.84:   0%|          | 1/972 [00:03<52:09,  3.22s/it][A
step 1/1 loss=28.85:   0%|          | 1/972 [00:03<52:09,  3.22s/it][A
step 1/1 loss=28.85:   0%|          | 2/972 [00:03<39:06,  2.42s/it][A
step 2/2 loss=27.84:   0%|          | 2/972 [00:04<39:06,  2.42s/it][A
step 2/2 loss=27.84:   0%|          | 3/972 [00:04<30:01,  1.86s/it][A
step 3/3 loss=26.48:   0%|          | 3/972 [00:04<30:01,  1.86s/it][A
step 3/3 loss=26.48:   0%|          | 4/972 [00:04<23:36,  1.46s/it][A
step 4/4 loss=25.39:   0%|          | 4/972 [00:05<23:36,  1.46s/it][A
step 4/4 loss=25.39:   1%|          | 5/972 [00:05<19:08,  1.19s/it][A
step 5/5 loss=24.24:   1%|          | 5/972 

KeyboardInterrupt: 

In [3]:
# Display the meta task that was parsed from the "--meta_task" argument.
training_args.meta_task

'transfer'

In [4]:
# Eval
# The tokenizer comes from the base checkpoint name, while the model is loaded
# from output_dir/model_nick, the same path that was passed as "--output_dir".
# AutoModelForSeq2SeqLM is used instead of the deprecated AutoModelWithLMHead;
# it is the Auto class intended for encoder-decoder checkpoints such as T5.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
model = AutoModelForSeq2SeqLM.from_pretrained(os.path.join(output_dir, model_nick))

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /home/vivek/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
    

In [8]:
# Hand-written prompt for a quick qualitative check of the model. The
# commented-out lines are alternative prompt formats kept for comparison.
# NOTE(review): the decoded training sample uses the form
# "| input: low | output: mid"; confirm the active prompt below matches the
# format the loaded checkpoint was trained on.
# example = "transfer: haha, u really think so, huh? | input: low | output: high </s>"
example = "transfer: haha, u really think so? | input to output: high </s>"
# example = "transfer: did you hear about his problems, lol | input formality: low | output formality: high"
# Tokenize the prompt and return PyTorch tensors; `t` is consumed by the generation cell.
t = tokenizer(example, return_tensors='pt')

In [9]:
# Generate with 12 beams and keep 3 returned sequences for the tokenized prompt.
gen = model.generate(
    input_ids=t.input_ids,
    attention_mask=t.attention_mask,
    num_beams=12,
    num_return_sequences=3,
)

In [10]:
# Turn each generated id sequence back into text, dropping special tokens.
[tokenizer.decode(sequence, skip_special_tokens=True) for sequence in gen]

['do you think so?',
 'do you really think so?',
 "he thinks he's really going to think about it?"]