## Prepare the data

### Set up the tokenizer

In [1]:
from transformers import AutoTokenizer

# Pretrained checkpoint the tokenizer is loaded from.
MPATH = "microsoft/DialoGPT-medium"
# Extra tokens registered with the tokenizer. Keep this order stable —
# presumably it determines the ids the new tokens receive (TODO confirm).
SPECIAL_TOKENS = ["<persuader>", "<persuadee>"]

tokenizer = AutoTokenizer.from_pretrained(MPATH)
tokenizer.add_special_tokens(dict(additional_special_tokens=SPECIAL_TOKENS))
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

### Set up the data modules

In [2]:
from datautils import DialogDataModule, DialogBatcher


# Corpus directory, relative to the notebook. Forward slashes are used so the
# path resolves on Linux/macOS as well as Windows (which accepts them too),
# matching the separator style of SAVE_PATH.
DPATH = "../data/persuasionforgood_corpus"

# The batcher receives the tokenizer configured in the tokenizer cell.
batcher = DialogBatcher(tokenizer)
dm = DialogDataModule(
    DPATH,
    batcher=batcher,
    batch_size=1,
    purpose_text="Persuade others to donate to charity in a conversation.",
)

## Prepare the model

In [4]:
from modelling import DialogAgent


# Output directory for the tokenizer files and model checkpoints.
# Plain string literal: there is nothing to interpolate.
SAVE_PATH = "../models/base"
# Checkpoint path placeholder; empty here, and not read in this cell.
CKPT = ""

# embedding_size is the tokenizer length, which includes the special tokens
# added to it in the tokenizer cell.
model = DialogAgent(
    MPATH,
    embedding_size=len(tokenizer)
)
print("Loaded fresh model")

Loaded fresh model


In [5]:
# Write the tokenizer files (including the added special tokens) into
# SAVE_PATH, the same directory the checkpoint callback is pointed at.
tokenizer.save_pretrained(SAVE_PATH)

('../models/base\\tokenizer_config.json',
 '../models/base\\special_tokens_map.json',
 '../models/base\\vocab.json',
 '../models/base\\merges.txt',
 '../models/base\\added_tokens.json',
 '../models/base\\tokenizer.json')

### Set up callbacks

In [6]:
from pytorch_lightning.callbacks import(
    EarlyStopping,
    ModelCheckpoint,
    RichModelSummary
)

# Checkpointing: keep weights-only checkpoints in SAVE_PATH, selected by the
# lowest validation loss. The filename is a template string; the
# {epoch}/{val_loss} placeholders are left for the callback to fill in, so no
# f-string is needed here.
checkpoint_callback = ModelCheckpoint(
    dirpath=SAVE_PATH,
    filename='baseagent-{epoch}-{val_loss:.2f}',
    monitor='val_loss',
    mode='min',  # explicit, to match the early-stopping criterion below
    save_weights_only=True,
)
# Early stopping: stop once val_loss has failed to improve by at least
# min_delta for `patience` consecutive validation checks.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4, patience=3,
    verbose=False,
    mode="min"
)

## Train the model

In [7]:
from pytorch_lightning import Trainer, seed_everything


# Fix the global seed before the trainer is created.
seed_everything(42, workers=True)

# Callbacks handed to the trainer: checkpointing, early stopping, and the
# rich model summary.
training_callbacks = [checkpoint_callback, early_stop_callback, RichModelSummary()]

trainer_options = dict(
    max_epochs=-1,  # no fixed epoch budget; early stopping ends the run
    deterministic=True,
    accumulate_grad_batches=16,
    callbacks=training_callbacks,
    accelerator='gpu',
    log_every_n_steps=16,
)
trainer = Trainer(**trainer_options)

trainer.fit(model, dm)

Global seed set to 42
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Enumerating the Utterances: 100%|██████████| 571/571 [00:09<00:00, 59.77it/s]
Enumerating the Utterances: 100%|██████████| 191/191 [00:02<00:00, 64.40it/s]
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]