In [1]:
!nvidia-smi

Sun Sep 12 14:29:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   46C    P8     6W /  N/A |    598MiB /  5934MiB |     14%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers



In [3]:
import torch
import random
import numpy as np
import time
import datetime
import seaborn as sns
import pandas as pd
import os
import pathlib
import json
import re
from random import randrange

import matplotlib.pyplot as plt
%matplotlib inline

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

In [4]:
seed = 29384
# Tell pytorch to run this model on the GPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [5]:
if os.path.isdir("/opt/awsw"):
  # In case we run this locally (in Docker)
  work_dir = os.path.join("/opt", "awsw")
else:
  from google.colab import drive
  drive.mount('/content/drive')
  work_dir = os.path.join("/content", "drive", "MyDrive", "endless_awsw")

models_dir = os.path.join(work_dir, "models")

if not os.path.isdir(models_dir):
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
    
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium
model = GPT2LMHeadModel.from_pretrained('distilgpt2', pad_token_id = tokenizer.eos_token_id)
print(f"Loading empty, pre-trained model.")

model.to(device)
model.resize_token_embeddings(len(tokenizer))
print(f"Model attached to {device_name}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading empty, pre-trained model.
Model attached to cuda:0


# Split data

In [6]:
with open("awsw_story_input.txt") as f:
    data = f.read()
lines = data.split("\n")
player_dragon_pairs = {}
last_player_talk = []
closed_player_talk = False
re_player_talk = re.compile(r'c "(.*?)"')
for line in lines:
    line = line.strip()
    line_split = line.split(" ")
    if len(line_split) <= 1:
        continue
    
    if line_split[0] == "c":
        if closed_player_talk:
            closed_player_talk = False
            last_player_talk = []
        last_player_talk.append(re.sub(re_player_talk, r"\1", line))
    else:
        if not closed_player_talk:
            last_player_talk = json.dumps(last_player_talk)
            if not last_player_talk in player_dragon_pairs:
                player_dragon_pairs[last_player_talk] = []
            closed_player_talk = True
            
        line = "DragonReply " + line
        if last_player_talk is not None:
            player_dragon_pairs[last_player_talk].append(line)
    
train_lines = []
eval_lines = []
eval_per_character = 200

for player_line_str in player_dragon_pairs.keys():
    player_lines = json.loads(player_line_str)
    dragon_lines = player_dragon_pairs[player_line_str]
    compiled_line = " ".join([f'PlayerReply c "{player_line}"' for player_line in player_lines]) + " " + " ".join(dragon_lines)
    train_lines.append(compiled_line)
    
test_bucket = {}
for l in train_lines:
    l_split = l.split(" ")
    character = None
    for i, ls in enumerate(l_split):
        if ls == "DragonReply":
            character = l_split[i + 1]
            break
    if not character in test_bucket:
        test_bucket[character] = []
    test_bucket[character].append(l)
    
for i in range(eval_per_character):
    for character in test_bucket.keys():
        random_line = test_bucket[character][randrange(len(test_bucket[character]))]
        eval_lines.append(random_line)
        for i2, t in enumerate(train_lines):
            if t == random_line:
                del train_lines[i2]
                break
    
joined_eval_lines = "\n".join(eval_lines[:5])
print(f"eval_lines: {joined_eval_lines}")
joined_train_lines = "\n".join(train_lines[:5])
print(f"train_lines: {joined_train_lines}")

if not os.path.isfile("data_train.txt"):
    with open("data_train.txt", "w") as f:
        for l in train_lines:
            f.write(l + "\n")
            
if not os.path.isfile("data_test.txt"):
    with open("data_test.txt", "w") as f:
        for l in eval_lines:
            f.write(l + "\n")

eval_lines: PlayerReply c "Oh, well." DragonReply Ry "Just look at the time. I think we could start heading to the festival now."
PlayerReply c "That depends on why you're making it in the first place. Last time you said that you had nothing to lose either way." DragonReply Lo think "Well, maybe I was wrong about that. I do have something to lose."
PlayerReply c "Kiss him." DragonReply m "We met, and my arms enveloped his neck as our lips touched. For a few seconds, we were closer than ever before. During the kiss, he used a lot more tongue than I expected." DragonReply m "Just after we parted, he finished by giving me a small lick on the cheek." DragonReply Ry "How was that?"
PlayerReply c "I think it's time to go, anyway." DragonReply Br "Alright, let's do this." DragonReply m "Soon, we arrived at our destination and everyone chose their hiding places. It wasn't easy, especially since we didn't know where Reza would come from, but with a bit of scouting help from Adine, they all foun

In [7]:
from datasets import load_dataset
dataset = load_dataset('text', data_files={'train': os.path.join(work_dir, "data_train.txt"), 'test': os.path.join(work_dir, "data_test.txt")})
def encode(batch):
    encoded = tokenizer([f"{text}<|endoftext|>" for text in batch['text']])
    return encoded

def group_texts(examples):
    # Make a max size
    block_size = 128
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

dataset = dataset.map(
    encode,
    batched=True,
    batch_size=1000,
    remove_columns=["text"],
    num_proc=4
)

dataset = dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4
)

dataset

Using custom data configuration default-236e7013fad64c48


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/awsw-dev/.cache/huggingface/datasets/text/default-236e7013fad64c48/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/awsw-dev/.cache/huggingface/datasets/text/default-236e7013fad64c48/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


Token indices sequence length is longer than the specified maximum sequence length for this model (1620 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1953 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1162 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1162 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1027 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 1025
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 2414
    })
})

In [11]:
class AWSWTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss


def train(model):
    training_args = TrainingArguments(
        models_dir,
        seed=seed,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=100,
        save_total_limit=2,
        save_steps=1000
    )
    trainer = Trainer(
        model=model, 
        args=training_args, 
        train_dataset=dataset['train'], 
        eval_dataset=dataset['test']
    )
    checkpoint_dirs = [os.path.join(models_dir, d) for d in os.listdir(models_dir) if os.path.isdir(os.path.join(models_dir, d))]
    if len(checkpoint_dirs) > 0:
        latest_checkpoint = max(checkpoint_dirs, key=os.path.getmtime)
        trainer.train(latest_checkpoint)
    else:
        trainer.train()

train(model)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Loading model from /opt/awsw/models/checkpoint-29000).
***** Running training *****
  Num examples = 1025
  Num Epochs = 100
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 51300
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 56
  Continuing training from global step 29000
  Will skip the first 56 epochs then the first 272 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/272 [00:00<?, ?it/s]

Step,Training Loss
29500,0.0476
30000,0.0475
30500,0.0465
31000,0.0454
31500,0.0462
32000,0.0442
32500,0.0424
33000,0.0428
33500,0.0415
34000,0.042


Saving model checkpoint to /opt/awsw/models/checkpoint-30000
Configuration saved in /opt/awsw/models/checkpoint-30000/config.json
Model weights saved in /opt/awsw/models/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [/opt/awsw/models/checkpoint-28000] due to args.save_total_limit
Saving model checkpoint to /opt/awsw/models/checkpoint-31000
Configuration saved in /opt/awsw/models/checkpoint-31000/config.json
Model weights saved in /opt/awsw/models/checkpoint-31000/pytorch_model.bin
Deleting older checkpoint [/opt/awsw/models/checkpoint-29000] due to args.save_total_limit
Saving model checkpoint to /opt/awsw/models/checkpoint-32000
Configuration saved in /opt/awsw/models/checkpoint-32000/config.json
Model weights saved in /opt/awsw/models/checkpoint-32000/pytorch_model.bin
Deleting older checkpoint [/opt/awsw/models/checkpoint-30000] due to args.save_total_limit
Saving model checkpoint to /opt/awsw/models/checkpoint-33000
Configuration saved in /opt/awsw/models/checkpoint-

# Testing
We set up a few out of the AWSW-universe prompts to see how well the model is responding across different iterations. Typically when a new model is trained, we can run the tests here so people can see the results without having to train the model themselves. It's also a good way to keep track on changes.

In [12]:
def generate_dragon_reply(prompt):
    model.eval()
    prompt = f'PlayerReply c "{prompt}" DragonReply'
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    sample_outputs = model.generate(
        generated, 
        eos_token_id=tokenizer.eos_token_id,
        top_k=50, 
        max_length = 128,
        top_p=0.95, 
        num_return_sequences=1
    )
    return tokenizer.decode(sample_outputs[0], skip_special_tokens=False)[len(prompt):].strip()

prompts = [
    "Can you come over to our world?",
    "Buy me coffee.",
    "We went to the store today, Lorem. Do you remember?",
    "Adine, you can fly, but how well can you run?"
]

for prompt in prompts:
    reply = generate_dragon_reply(prompt)
    print(f"Prompt: {prompt}\nReply: {reply}\n\n")

Prompt: Can you come over to our world?
Reply: Ry "I see."<|endoftext|>


Prompt: Buy me coffee.
Reply: Ry "I'm here, right now and you're about to go inside."<|endoftext|>


Prompt: We went to the store today, Lorem. Do you remember?
Reply: Lo normal "No, I have a good reason to believe that."<|endoftext|>


Prompt: Adine, you can fly, but how well can you run?
Reply: Ad think b "Well, we have a lot of time. You could take it for ourselves. If you're talking about less than a week, maybe we can fly again next year. If you're that desperate, maybe we can make a bet."<|endoftext|>




In [None]:
def generate_reply(prompt):
  model.eval()
  prompt = f'PlayerReply c "{prompt}" DragonReply'
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)
  print(prompt, generated)

  sample_outputs = model.generate(
    generated, 
    do_sample=True,   
    eos_token_id=tokenizer.eos_token_id,
    top_k=50, 
    max_length = 128,
    top_p=0.95, 
    num_return_sequences=3
  )

  for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=False)))

print("What to say?")
print(generate_reply(input()))