## Prepare the data

### Set up the tokenizer

In [1]:
from transformers import AutoTokenizer

# Speaker-role marker tokens added to the vocabulary. Keep this list order
# fixed: presumably the order determines the ids assigned to the new tokens,
# so reordering would not match previously trained weights -- TODO confirm.
SPECIAL_TOKENS = ["<persuader>", "<persuadee>"]
# Model identifier used for the tokenizer here and for the model later on.
MPATH = "microsoft/DialoGPT-medium"

tokenizer = AutoTokenizer.from_pretrained(MPATH)
# Register the role markers as additional special tokens.
tokenizer.add_special_tokens({'additional_special_tokens': SPECIAL_TOKENS})
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


### Set up the data modules

In [2]:
from datautils import DialogDataModule, DialogBatcher


# Corpus directory, relative to the notebook's working directory.
# Forward slashes are used on purpose: they resolve on Windows, Linux and
# macOS alike, whereas a backslash-separated literal is only a valid path on
# Windows. This also matches the separator style used for the model paths.
DPATH = "../data/persuasionforgood_corpus"

# Selects which side of the dialogue this run trains on.
# NOTE: the spelling "pursuader" matches the keyword argument accepted by
# DialogDataModule below, so it is kept as-is.
is_pursuader = True

# The batcher receives the tokenizer; the data module receives the batcher,
# the corpus path, the batch size, the task description text and the role flag.
batcher = DialogBatcher(tokenizer)
dm = DialogDataModule(
    DPATH,
    batcher=batcher,
    batch_size=1,
    purpose_text="Convince people to donate to charity.",
    is_pursuader=is_pursuader
)

# Prepare the model

In [3]:
from modelling import DialogAgent


# Role label for this run; it is embedded in the save directory below.
if is_pursuader:
    ROLE = "persuader"
else:
    ROLE = "persuadee"

# Role-specific output directory.
SAVE_PATH = f"../models/base/{ROLE}"
# Checkpoint path placeholder; left empty here and not read in this cell.
CKPT = ""

# Instantiate the agent from the MPATH model identifier. embedding_size is the
# tokenizer's length (presumably so the embeddings cover the added special
# tokens -- confirm in modelling.DialogAgent).
model = DialogAgent(MPATH, embedding_size=len(tokenizer))
print("Loaded fresh model")

Loaded fresh model


### Set up callbacks

In [4]:
from pytorch_lightning.callbacks import(
    EarlyStopping,
    ModelCheckpoint,
    RichModelSummary
)

# Checkpointing: files go to SAVE_PATH, are tracked against `val_loss`, and
# hold weights only. The doubled braces in the filename survive the f-string
# as {epoch} / {val_loss:.2f} placeholders for the callback to fill in.
checkpoint_callback = ModelCheckpoint(
    dirpath=SAVE_PATH,
    monitor='val_loss',
    filename=f'baseagent-{ROLE}-{{epoch}}-{{val_loss:.2f}}',
    save_weights_only=True,
)

# Early stopping: halt once `val_loss` (lower is better) has failed to improve
# by at least 1e-4 for 3 checks in a row.
early_stop_callback = EarlyStopping(
    mode="min",
    monitor="val_loss",
    patience=3,
    min_delta=1e-4,
    verbose=False,
)

# Train the model

In [5]:
from pytorch_lightning import Trainer, seed_everything


# Seed the random number generators (dataloader workers included) so the run
# is repeatable.
seed_everything(42, workers=True)

trainer = Trainer(
    accelerator='gpu',
    deterministic=True,
    # No epoch cap: the run ends when the early-stopping callback fires.
    max_epochs=-1,
    # Gradients are accumulated over 16 batches per optimizer step; logging
    # uses the same interval.
    accumulate_grad_batches=16,
    log_every_n_steps=16,
    callbacks=[
        checkpoint_callback,
        early_stop_callback,
        RichModelSummary(),
    ],
)

# Train `model` on the data provided by `dm`.
trainer.fit(model, dm)

Global seed set to 42
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Enumerating the Utterances: 100%|██████████| 762/762 [00:11<00:00, 67.41it/s]
Enumerating the Utterances: 100%|██████████| 255/255 [00:03<00:00, 71.31it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


                                                                           

  rank_zero_warn(


Epoch 5: 100%|██████████| 7939/7939 [48:33<00:00,  2.72it/s, v_num=19, train_loss=2.610, val_loss=2.520] 
