In [None]:
!nvidia-smi

In [None]:
# Selects which part of the notebook is meant to run (allowed values listed on the right).
# NOTE(review): no cell visible in this notebook reads `to_run` — confirm it is still needed.
to_run = 'hyperparam_search' # ["hyperparam_search", "training"]

In [None]:
import torch
import random
import numpy as np
import time
import optuna
import datetime
import seaborn as sns
import pandas as pd
import os
import gc
import pathlib
import json
import math
import re
from random import randrange
import multiprocessing
import datasets
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
%matplotlib inline

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, GPTNeoForCausalLM
from transformers import AdamW, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup
from transformers import Trainer, TrainingArguments, TrainerCallback

In [None]:
# Describe a single-process "distributed" setup through environment variables
# (one local worker: rank 0 of a world of size 1).
os.environ.update({
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '9994',  # modify if RuntimeError: Address already in use
    'RANK': "0",
    'LOCAL_RANK': "0",
    'WORLD_SIZE': "1",
})

In [None]:
# Draw a fresh seed for this run; it is printed below so a run can be identified later.
seed = random.randint(0, 2 ** 32 - 1)
# Seed every RNG this notebook imports, not only Python's `random`, so that the
# printed seed actually describes the whole run.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Token length of one training chunk; also used as `max_length` when generating.
block_size = 64
datasets.logging.set_verbosity(datasets.logging.ERROR)
# Tell pytorch to run this model on the GPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
# device_name = "cpu"
device = torch.device(device_name)
print(f"Will use {device_name} for training with seed: {seed}")

In [None]:
# Pick the working directory: a mounted Google Drive folder on Colab, or
# /opt/awsw when that directory exists (local Docker run).
if not os.path.isdir("/opt/awsw"):
  from google.colab import drive
  drive.mount('/content/drive')
  work_dir = os.path.join("/content", "drive", "MyDrive", "endless_awsw")
else:
  # In case we run this locally (in Docker)
  work_dir = os.path.join("/opt", "awsw")

# Split data

In [None]:
# Build data_train.txt / data_test.txt from the raw story script.
#
# Consecutive lines whose first word is "c" (the player) are grouped; every
# following non-player line is stored as a "DragonReply" belonging to that
# group. Each group becomes one text line:
#   PlayerReply c "..." [PlayerReply c "..."]* DragonReply ... [DragonReply ...]*
with open(os.path.join(work_dir, "awsw_story_input.txt")) as f:
    data = f.read()
lines = data.split("\n")

# JSON-encoded list of player lines -> list of dragon lines that followed them.
player_dragon_pairs = {}
last_player_talk = []       # player lines collected since the last dragon reply
last_player_key = None      # JSON key of the group the current dragon lines belong to
closed_player_talk = False  # True once a dragon line has followed the collected player lines
re_player_talk = re.compile(r'c "(.*?)"')
for line in lines:
    line = line.strip()
    line_split = line.split(" ")
    # Skip blank lines and lines that consist of a single word.
    if len(line_split) <= 1:
        continue

    if line_split[0] == "c":
        if closed_player_talk:
            # A new player turn starts after dragon replies: begin a new group.
            closed_player_talk = False
            last_player_talk = []
        # Keep only the quoted text of the player line.
        last_player_talk.append(re_player_talk.sub(r"\1", line))
    else:
        if not closed_player_talk:
            # First dragon line after the player lines: freeze the group key.
            last_player_key = json.dumps(last_player_talk)
            player_dragon_pairs.setdefault(last_player_key, [])
            closed_player_talk = True

        player_dragon_pairs[last_player_key].append("DragonReply " + line)

train_lines = []
eval_lines = []
# Number of lines per character moved from the training set to the eval set.
eval_per_character = 0

for player_line_str, dragon_lines in player_dragon_pairs.items():
    player_lines = json.loads(player_line_str)
    compiled_line = " ".join([f'PlayerReply c "{player_line}"' for player_line in player_lines]) + " " + " ".join(dragon_lines)
    train_lines.append(compiled_line)

# Bucket the compiled lines by the character of their first dragon reply.
test_bucket = {}
for l in train_lines:
    l_split = l.split(" ")
    character = None
    for i, ls in enumerate(l_split):
        if ls == "DragonReply":
            character = l_split[i + 1]
            break
    test_bucket.setdefault(character, []).append(l)

for _ in range(eval_per_character):
    for character, bucket in test_bucket.items():
        if not bucket:
            # This character has no lines left to hold out.
            continue
        # Pop the line so it cannot be picked for the eval set a second time.
        random_line = bucket.pop(randrange(len(bucket)))
        eval_lines.append(random_line)
        if random_line in train_lines:
            train_lines.remove(random_line)

joined_eval_lines = "\n".join(eval_lines[:5])
print(f"eval_lines: {joined_eval_lines}")
joined_train_lines = "\n".join(train_lines[:5])
print(f"train_lines: {joined_train_lines}")

random.shuffle(train_lines)

# Existing files are never overwritten: delete them to regenerate the split.
if not os.path.isfile(os.path.join(work_dir, "data_train.txt")):
    with open(os.path.join(work_dir, "data_train.txt"), "w") as f:
        for l in train_lines:
            f.write(l + "\n")

if not os.path.isfile(os.path.join(work_dir, "data_test.txt")):
    with open(os.path.join(work_dir, "data_test.txt"), "w") as f:
        for l in eval_lines:
            f.write(l + "\n")

In [None]:
def get_dataset(tokenizer):
    """Load data_train.txt / data_test.txt and turn them into fixed-size token blocks.

    Pipeline (each step is a batched `dataset.map`):
      1. `map_dragon_reply_text` expands every line into one text per
         (player reply, dragon reply) combination.
      2. `encode` tokenizes each text and appends the EOS token.
      3. `group_texts` concatenates the tokens of a batch, pads the tail and
         cuts the result into chunks of `block_size` tokens.

    Parameters
    ----------
    tokenizer
        Tokenizer providing `encode`, `eos_token_id` and `pad_token_id`.

    Returns
    -------
    The mapped dataset with 'input_ids', 'attention_mask' and 'labels' columns
    for the 'train' and 'test' splits.
    """
    dataset = load_dataset('text', data_files={'train': os.path.join(work_dir, "data_train.txt"), 'test': os.path.join(work_dir, "data_test.txt")})

    # NOTE(review): AWSWDataset is never instantiated in this function — confirm
    # whether it is still needed.
    class AWSWDataset(torch.utils.data.IterableDataset):
        def __init__(self, dataset, dataset_type, do_shuffle=False):
            self.current_dataset = dataset
            self.dataset_type = dataset_type
            self.do_shuffle = do_shuffle
            self.shuffled_datasets = []
            self.current_idx = 0
            for i in range(1):
                self.current_dataset = self.current_dataset.shuffle()
                mapped_dataset = self.current_dataset.map(
                    group_texts,
                    batched=True,
                    batch_size=dataset_batch_size,
                    num_proc=dataset_map_cores
                )
                self.shuffled_datasets.append(mapped_dataset)

        def approx_len(self):
            return len(self.shuffled_datasets[0][self.dataset_type])

        def __iter__(self):
            self.current_idx = (self.current_idx + 1) % len(self.shuffled_datasets)
            return iter(self.shuffled_datasets[self.current_idx][self.dataset_type])

    def encode(batch):
        """Tokenize every text of the batch and terminate it with the EOS token."""
        result = []
        attention_mask = []
        for item in batch['text']:
            tokens = tokenizer.encode(item) + [tokenizer.eos_token_id]
            result.append(tokens)
            attention_mask.append([1] * len(tokens))
        return {
            'attention_mask': attention_mask,
            'input_ids': result
        }

    def group_texts(examples):
        """Concatenate a batch of token lists and split it into `block_size` chunks."""
        # Flatten in linear time (`sum(lists, [])` copies the accumulator on every step).
        concatenated_examples = {
            k: [value for sequence in examples[k] for value in sequence]
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Pad the end so the last chunk is also exactly `block_size` long.
        to_add = (math.ceil(total_length / block_size) * block_size) - total_length
        if to_add > 0:
            concatenated_examples['input_ids'] += [tokenizer.pad_token_id] * to_add
            concatenated_examples['attention_mask'] += [0] * to_add
            total_length += to_add
        # Split by chunks of block_size.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        # Labels mirror the inputs, except on padding (attention_mask == 0),
        # which gets -100 so that those positions are ignored by the loss.
        result["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(result["input_ids"], result["attention_mask"])
        ]
        return result

    def map_dragon_reply_text(batch):
        """Expand each line into one text per (player reply, dragon reply) pair."""
        result = {'text': []}
        for item in batch['text']:
            item_split = item.split(" ")
            player_replies = []
            dragon_replies = []
            current_reply = []
            handling_reply = None  # marker of the reply currently being collected
            for token in item_split:
                if token == "PlayerReply":
                    if handling_reply is None:
                        handling_reply = "PlayerReply"
                    elif handling_reply == "PlayerReply":
                        # A new player reply starts: store the previous one.
                        player_replies.append(" ".join(current_reply))
                        current_reply = []
                elif token == "DragonReply":
                    # A new dragon reply starts: store whatever was being collected.
                    if handling_reply == "DragonReply":
                        dragon_replies.append(" ".join(current_reply))
                    elif handling_reply == "PlayerReply":
                        player_replies.append(" ".join(current_reply))

                    handling_reply = "DragonReply"
                    current_reply = []

                # Tokens seen before the first marker are dropped.
                if handling_reply is not None:
                    current_reply.append(token)

            # There's always a dragon reply at the end.
            dragon_replies.append(" ".join(current_reply))
            for player_reply in player_replies:
                for dragon_reply in dragon_replies:
                    result['text'].append(player_reply + " " + dragon_reply)

        return result

    dataset_map_cores = min(multiprocessing.cpu_count(), 10)
    dataset_batch_size = 1000

    dataset = dataset.map(
        map_dragon_reply_text,
        batched=True,
        batch_size=dataset_batch_size,
        num_proc=dataset_map_cores
    )

    dataset = dataset.map(
        encode,
        batched=True,
        batch_size=dataset_batch_size,
        remove_columns=["text"],
        num_proc=dataset_map_cores
    )

    return dataset.map(
        group_texts,
        batched=True,
        batch_size=dataset_batch_size,
        num_proc=dataset_map_cores
    )

In [None]:
def get_model(name):
    """Load the tokenizer and causal language model for `name` and move the model to `device`.

    'distilgpt2' is loaded as GPT2LMHeadModel, any other name as
    GPTNeoForCausalLM. The tokenizer gets explicit BOS/EOS/PAD tokens and the
    embedding matrix is resized to the tokenizer's vocabulary.

    Parameters
    ----------
    name : str
        Pretrained model identifier, used for both tokenizer and model.

    Returns
    -------
    (model, tokenizer)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(name, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium
    special_token_ids = dict(
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Dropout has to be part of the config the model is built from: changing
    # `model.config` afterwards does not touch the already-built dropout layers.
    # GPT-2 and GPT-Neo use different config names for the same settings.
    dropout = 0.01
    if name == 'distilgpt2':
        model = GPT2LMHeadModel.from_pretrained(name, attn_pdrop=dropout, embd_pdrop=dropout, **special_token_ids)
    else:
        model = GPTNeoForCausalLM.from_pretrained(name, attention_dropout=dropout, embed_dropout=dropout, **special_token_ids)

    # Make room for the added special tokens before moving the weights to the device.
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)
    print(f"Model attached to {device_name}")
    return model, tokenizer

def random_model_folder():
    """Create `<work_dir>/models/<unix timestamp>` if needed and return its path."""
    folder = os.path.join(work_dir, "models", f"{int(time.time())}")
    os.makedirs(folder, exist_ok=True)
    return folder

def train_model(params_dict):
    """Fine-tune a model with the given hyper-parameters and return the last logged loss.

    Parameters
    ----------
    params_dict : dict
        Keys read here: 'model_name', 'lr', 'warmup_factor', 'scheduler',
        'freeze_layer_rate' and 'to_freeze_count'; additionally 'cycles' for the
        hard-restarts scheduler, or 'lr_end' and 'power' for the polynomial one.

    Returns
    -------
    The last 'loss' entry of the trainer's log history, or None when no loss
    was logged during training.
    """
    model_name = params_dict['model_name']
    model, tokenizer = get_model(model_name)
    named_parameters = list(model.named_parameters())
    dataset = get_dataset(tokenizer)
    lr = params_dict['lr']
    batch_size = 64
    train_len = len(dataset['train'])
    num_training_steps = math.ceil(train_len / batch_size)  # steps per epoch
    num_epoch = 10
    num_total_steps = num_training_steps * num_epoch
    # Warm-up length is expressed in epochs via 'warmup_factor'.
    num_warmup_steps = num_training_steps * params_dict['warmup_factor']
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler_str = params_dict['scheduler']
    # Stays None when `scheduler_str` matches none of the names below.
    scheduler = None
    if scheduler_str == "cosine_schedule_with_warmup":
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_total_steps)
    elif scheduler_str == "cosine_with_hard_restarts_schedule_with_warmup":
        cycles = params_dict['cycles']
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_total_steps, cycles)
    elif scheduler_str == "polynomial_decay_schedule_with_warmup":
        lr_end = params_dict['lr_end']
        power = params_dict['power']
        scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, num_total_steps, power=power, lr_end=lr_end)

    last_loss = None

    # NOTE(review): AWSWTrainer is defined but the plain `Trainer` is what gets
    # instantiated in train() below — confirm which one is intended.
    class AWSWTrainer(Trainer):
        def _get_train_sampler(self):
            return None

    class AWSWTrainerCallback(TrainerCallback):
        """Freezes part of the model while the learning rate is high and plots the run at the end."""

        def __init__(self, optimizer):
            self.old_freeze_part_layers = False
            self.optimizer = optimizer

        def on_train_end(self, args, state, control, **kwargs):
            nonlocal last_loss
            learning_rate_history = [h['learning_rate'] for h in state.log_history if 'learning_rate' in h]
            loss_history = [h['loss'] for h in state.log_history if 'loss' in h]
            fig, axs = plt.subplots(2)
            fig.suptitle('Learning rate and loss')
            axs[0].plot(learning_rate_history)
            axs[1].plot(loss_history)
            # A run shorter than `logging_steps` logs no loss at all.
            if loss_history:
                last_loss = loss_history[-1]

        def on_step_begin(self, args, state, control, **kwargs):
            # Freeze a part of the model while the learning rate is above the threshold.
            learning_rate = self.optimizer.param_groups[0]['lr']
            freeze_layer_rate = params_dict['freeze_layer_rate']
            freeze_part_layers = learning_rate > freeze_layer_rate
            if self.old_freeze_part_layers != freeze_part_layers:
                print(f"set freeze_part_layers: {freeze_part_layers}")
                to_freeze_count = params_dict['to_freeze_count']
                # Everything except the last `to_freeze_count` parameters. An explicit
                # end index is used because `[:-0]` would select nothing.
                frozen_end = max(len(named_parameters) - to_freeze_count, 0)
                for name, param in named_parameters[:frozen_end]:
                    param.requires_grad = not freeze_part_layers
                self.old_freeze_part_layers = freeze_part_layers

    def train(model, dataset):
        training_args = TrainingArguments(
            random_model_folder(),
            seed=seed,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epoch,
            logging_steps=50,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset['train'],
            optimizers=(optimizer, scheduler),
            callbacks=[AWSWTrainerCallback(optimizer)]
        )
        trainer.train()
        del training_args
        del trainer
        gc.collect()
        # Only tear the process group down when one was actually created.
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()
        torch.cuda.empty_cache()

    train(model, dataset)
    del model
    del dataset
    del tokenizer
    del optimizer
    return last_loss

# Create an Optuna study and run 100 trials of `objective`.
# NOTE(review): `objective` is not defined in any cell visible in this notebook,
# so this raises NameError on a fresh kernel — presumably it should sample a
# params_dict and return train_model(params_dict); confirm and restore it.
study = optuna.create_study()
study.optimize(objective, n_trials=100)

In [None]:
# Print the best parameter set found by the study.
print(study.best_params)
# The dict below is presumably the result of an earlier search, kept for reference.
#{'model_name': 'distilgpt2', 'lr': 0.004019767138102094, 'warmup_factor': 1, 'scheduler': 'cosine_schedule_with_warmup', 'freeze_layer_rate': 0.000872986593035212, 'to_freeze_count': 50}
# Last expression of the cell: the optimization-history plot of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Step counts used by get_optimizer_and_scheduler(): steps per epoch, total
# steps over `num_epoch` epochs, and a warm-up of two epochs' worth of steps.
# NOTE(review): in the visible cells `dataset` is only assigned as a local
# inside train_model(); this cell needs a module-level `dataset` — confirm where
# it is created.
batch_size = 64
train_len = len(dataset['train'])
num_training_steps = math.ceil(train_len / batch_size)
num_epoch = 4
num_total_steps = num_training_steps * num_epoch
num_warmup_steps = num_training_steps * 2
print(f"train_len: {train_len} num_training_steps: {num_training_steps} num_total_steps: {num_total_steps}")
def get_optimizer_and_scheduler(params):
    """Return an AdamW optimizer (lr=0.001) over `params` and its cosine warm-up schedule.

    The schedule uses the module-level `num_warmup_steps` and `num_total_steps`.
    """
    adamw = AdamW(params, lr=0.001)
    # Alternatives that were tried:
    #   get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, num_total_steps, power=0.5, lr_end=1e-10)
    #   get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_total_steps, 4)
    cosine_schedule = get_cosine_schedule_with_warmup(adamw, num_warmup_steps, num_total_steps)
    return adamw, cosine_schedule

In [None]:
# Preview the learning-rate schedule: step a throw-away optimizer/scheduler pair
# (built on a dummy tensor) through all training steps and plot the rate.
lrs = []
optimizer, scheduler = get_optimizer_and_scheduler([torch.tensor(0.1)])
for i in range(num_total_steps):
    optimizer.step()
    scheduler.step()
    lrs.append(optimizer.param_groups[0]["lr"])
fig, ax = plt.subplots()
ax.plot(lrs)
ax.set(title="Learning rate schedule", xlabel="step", ylabel="learning rate")
plt.show()
# Drop the preview objects so they are not mistaken for the real training ones.
del lrs
del optimizer
del scheduler

In [None]:
%%bash
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

# Testing

We created a few test cases, each pairing a past exchange (for context) with a present prompt (the player's input), to see how the model reacts. This way, we can compare models across different training iterations.

In [None]:
def generate_dragon_reply(past, prompt, top_k=None, top_p=None):
    """Generate the dragon's reply to a player input, given some past context.

    Parameters
    ----------
    past : str
        Earlier conversation, already in PlayerReply/DragonReply format.
    prompt : str
        The player's current input (plain text, without markers).
    top_k, top_p : optional
        Sampling settings; sampling is enabled only when both are given.

    Returns
    -------
    str
        The decoded text generated after the prompt, stripped of surrounding
        whitespace.
    """
    model.eval()
    prompt = f'{past} PlayerReply c "{prompt}" DragonReply'
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    sample_outputs = model.generate(
        generated,
        do_sample=(top_k is not None and top_p is not None),
        top_p=top_p,
        top_k=top_k,
        pad_token_id=tokenizer.eos_token_id,
        max_length=block_size,
        num_return_sequences=1
    )
    # Cut the prompt off by token count: the decoded text is not guaranteed to
    # reproduce the prompt character for character, so slicing the decoded
    # string by len(prompt) can eat into (or leave residue before) the reply.
    reply_ids = sample_outputs[0][generated.shape[1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=False).strip()

# Fixed test cases as (past context, player input) pairs.
prompts = [
    ('PlayerReply c "Hey Remy!" DragonReply Ry "Hey!"', "How are you?"),
    ('PlayerReply c "I was with Lorem today." DragonReply Ad "That\'s awesome. He\'s a cute fellow."', "What do you think of Lorem?"),
    ('DragonReply m "In Tatsu park, Adine and I sat down."', "Oh my god, Adine. What is this?"),
    ('DragonReply m "I sat down on a chair in Anna\'s lab."', "What will we do here?"),
]

# Set a fixed seed to make sure we get the same response every time.
torch.manual_seed(80085)
# No top_k/top_p is passed here, so generate_dragon_reply runs with sampling disabled.
for past, prompt in prompts:
    reply = generate_dragon_reply(past, prompt)
    print(f"Prompt: {prompt}\nReply: {reply}\n\n")

# Sampling test

Which combination of `top_k` and `top_p` gives the best replies?

In [None]:
# Try 100 random (top_k, top_p) combinations on the fixed prompts. The torch
# seed is reset before every combination.
for i in range(100):
    torch.manual_seed(80085)
    top_k, top_p = random.randint(0, 100), round(random.uniform(0, 1), 2)
    header = f"[Test {i + 1} top_k: {top_k}, top_p: {top_p}]"
    for past, prompt in prompts:
        reply = generate_dragon_reply(past, prompt, top_k=top_k, top_p=top_p)
        print(f"{header} -> Prompt: {prompt}\nReply: {reply}\n")
    print("-------------")

In [None]:
def generate_reply(prompt):
    """Sample three dragon replies for a player input and print them.

    Parameters
    ----------
    prompt : str
        The player's input (plain text, without markers).

    Returns
    -------
    None. The formatted prompt, its token ids and the three decoded samples
    (prompt included) are printed.
    """
    model.eval()
    prompt = f'PlayerReply c "{prompt}" DragonReply'
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)
    print(prompt, generated)

    sample_outputs = model.generate(
        generated,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        # `max_length` is the length limit argument (was misspelled `top_length`).
        max_length=block_size,
        top_p=0.95,
        num_return_sequences=3
    )

    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=False)))

print("What to say?")
# generate_reply() prints its results itself and returns None, so its return
# value is not printed.
generate_reply(input())