In [1]:
!nvidia-smi

Wed Sep 15 18:31:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   55C    P8     6W /  N/A |    347MiB /  5934MiB |      9%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers



In [3]:
import torch
import random
import numpy as np
import time
import datetime
import seaborn as sns
import pandas as pd
import os
import pathlib
import json
import re
from random import randrange

import matplotlib.pyplot as plt
%matplotlib inline

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, GPTNeoForCausalLM
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

In [4]:
seed = random.randint(0, 2 ** 32 - 1)
# Tell pytorch to run this model on the GPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Will use {device_name} for training with seed: {seed}")

Will use cuda:0 for training with seed: 3916134612


In [5]:
if os.path.isdir("/opt/awsw"):
  # In case we run this locally (in Docker)
  work_dir = os.path.join("/opt", "awsw")
else:
  from google.colab import drive
  drive.mount('/content/drive')
  work_dir = os.path.join("/content", "drive", "MyDrive", "endless_awsw")

models_dir = os.path.join(work_dir, "models")

if not os.path.isdir(models_dir):
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
    
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium
# model = GPT2LMHeadModel.from_pretrained('EleutherAI/gpt-neo-125M', pad_token_id = tokenizer.pad_token_id)
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M', pad_token_id = tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
model.config.attention_dropout = 0.2
model.config.embed_dropout = 0.2
print(f"Loading empty, pre-trained model.")

model.to(device)
model.resize_token_embeddings(len(tokenizer))
print(f"Model attached to {device_name}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading empty, pre-trained model.
Model attached to cuda:0


# Split data

In [6]:
with open("awsw_story_input.txt") as f:
    data = f.read()
lines = data.split("\n")
player_dragon_pairs = {}
last_player_talk = []
closed_player_talk = False
re_player_talk = re.compile(r'c "(.*?)"')
for line in lines:
    line = line.strip()
    line_split = line.split(" ")
    if len(line_split) <= 1:
        continue
    
    if line_split[0] == "c":
        if closed_player_talk:
            closed_player_talk = False
            last_player_talk = []
        last_player_talk.append(re.sub(re_player_talk, r"\1", line))
    else:
        if not closed_player_talk:
            last_player_talk = json.dumps(last_player_talk)
            if not last_player_talk in player_dragon_pairs:
                player_dragon_pairs[last_player_talk] = []
            closed_player_talk = True
            
        line = "DragonReply " + line
        if last_player_talk is not None:
            player_dragon_pairs[last_player_talk].append(line)
    
train_lines = []
eval_lines = []
eval_per_character = 25

for player_line_str in player_dragon_pairs.keys():
    player_lines = json.loads(player_line_str)
    dragon_lines = player_dragon_pairs[player_line_str]
    compiled_line = " ".join([f'PlayerReply c "{player_line}"' for player_line in player_lines]) + " " + " ".join(dragon_lines)
    train_lines.append(compiled_line)
    
test_bucket = {}
for l in train_lines:
    l_split = l.split(" ")
    character = None
    for i, ls in enumerate(l_split):
        if ls == "DragonReply":
            character = l_split[i + 1]
            break
    if not character in test_bucket:
        test_bucket[character] = []
    test_bucket[character].append(l)
    
for i in range(eval_per_character):
    for character in test_bucket.keys():
        random_line = test_bucket[character][randrange(len(test_bucket[character]))]
        eval_lines.append(random_line)
        for i2, t in enumerate(train_lines):
            if t == random_line:
                del train_lines[i2]
                break
    
joined_eval_lines = "\n".join(eval_lines[:5])
print(f"eval_lines: {joined_eval_lines}")
joined_train_lines = "\n".join(train_lines[:5])
print(f"train_lines: {joined_train_lines}")

if not os.path.isfile("data_train.txt"):
    with open("data_train.txt", "w") as f:
        for l in train_lines:
            f.write(l + "\n")
            
if not os.path.isfile("data_test.txt"):
    with open("data_test.txt", "w") as f:
        for l in eval_lines:
            f.write(l + "\n")

eval_lines: PlayerReply c "Yeah." DragonReply Ry "And now you're hunting the person who was supposed to be your partner. That must be really hard for you." DragonReply Lo think "Now, here is an interesting question: What species did that human turn into?" DragonReply Lo "There are a number of dragon species nowadays that aren't genetically compatible with each other. Did the human choose one of them? Did they, perhaps, procreate?" DragonReply Lo happy "A shared ancestor is one option. This would mean that the different species split after the human’s involvement in our creation." DragonReply Lo normal "If human DNA was involved in some way, this might explain how our different species came to be." DragonReply Lo happy "Take a far ancestor of ours and apply various amounts of human DNA. The result would be a number of different species, each with a different amount of resemblance to humans." DragonReply Lo relieved flip "You just want to study [player_name]!" DragonReply Lo think flip "

In [7]:
from datasets import load_dataset
dataset = load_dataset('text', data_files={'train': os.path.join(work_dir, "data_train.txt"), 'test': os.path.join(work_dir, "data_test.txt")})
def encode(batch):
    encoded = tokenizer([f"{text}<|endoftext|>" for text in batch['text']])
    return encoded

def group_texts(examples):
    # Make a max size
    block_size = 128
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

def map_dragon_reply_text(batch):
    # Make a max size
    block_size = 128
    result = {'text': []}
    for item in batch['text']:
        # PlayerReply c "A lot." PlayerReply c "A little." PlayerReply c "None at all." DragonReply An "I see."
        item_split = item.split(" ")
        player_replies = []
        dragon_replies = []
        current_reply = []
        handling_reply = None
        for token in item_split:
            if token == "PlayerReply":
                if handling_reply is None:
                    handling_reply = "PlayerReply"
                else:
                    if handling_reply == "PlayerReply":
                        # We need to store the PlayerReply
                        player_replies.append(" ".join(current_reply))
                        current_reply = []
            elif token == "DragonReply":
                if handling_reply == "DragonReply":
                    # We need to store the DragonReply
                    dragon_replies.append(" ".join(current_reply))
                    current_reply = []
                    
                if handling_reply == "PlayerReply":
                    # We need to store the PlayerReply
                    player_replies.append(" ".join(current_reply))
                    current_reply = []
                    
                handling_reply = "DragonReply"
                current_reply = []
                    
            if handling_reply is not None:
                current_reply.append(token)
                
        # There's always a dragon reply at the end.
        dragon_replies.append(" ".join(current_reply))
        for player_idx in range(len(player_replies)):
            for dragon_idx in range(len(dragon_replies)):
                result['text'].append(player_replies[player_idx] + " " + dragon_replies[dragon_idx])
                
    return result

dataset = dataset.map(
    map_dragon_reply_text,
    batched=True,
    batch_size=1000,
    num_proc=4
)

dataset = dataset.map(
    encode,
    batched=True,
    batch_size=1000,
    remove_columns=["text"],
    num_proc=4
)

dataset = dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4
)

tokenizer.decode(dataset['train'][0]['input_ids'])

Using custom data configuration default-d0c839bae2d70a4a
Reusing dataset text (/home/awsw-dev/.cache/huggingface/datasets/text/default-d0c839bae2d70a4a/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


'PlayerReply c "Hey, Remy!" DragonReply Ry "Hello, [player_name]."<|endoftext|>PlayerReply c "Is there any particular reason why you wanted to meet here?" DragonReply Ry "I enjoy Tatsu Park is all. Have you been here before?"<|endoftext|>PlayerReply c "Can\'t say I have." DragonReply Ry "I see."<|endoftext|>PlayerReply c "Can\'t say I have." DragonReply Ry "Well, what do you think of it?"<|endoftext|>PlayerReply c "A few times." DragonReply Ry "I see."<|endoftext|>PlayerReply c "A few times." DragonReply Ry "Well, what'

In [8]:
class AWSWTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

def train(model):
    training_args = TrainingArguments(
        models_dir,
        seed=seed,
        per_device_train_batch_size=5,
        per_device_eval_batch_size=5,
        num_train_epochs=100,
        save_total_limit=2,
        save_steps=1000
    )
    trainer = Trainer(
        model=model, 
        args=training_args, 
        train_dataset=dataset['train'], 
        eval_dataset=dataset['test']
    )
    checkpoint_dirs = [os.path.join(models_dir, d) for d in os.listdir(models_dir) if os.path.isdir(os.path.join(models_dir, d))]
    if len(checkpoint_dirs) > 0:
        latest_checkpoint = max(checkpoint_dirs, key=os.path.getmtime)
        trainer.train(latest_checkpoint)
    else:
        trainer.train()

train(model)

Loading model from /opt/awsw/models/checkpoint-64000).
***** Running training *****
  Num examples = 3205
  Num Epochs = 100
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 5
  Gradient Accumulation steps = 1
  Total optimization steps = 64100
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 99
  Continuing training from global step 64000
  Will skip the first 99 epochs then the first 541 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/541 [00:00<?, ?it/s]

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




# Testing

We created a few past (for context) + present prompts (player input) and see the different reactions. This way, we can test the models across different iterations.

In [9]:
def generate_dragon_reply(past, prompt, top_k=50, top_p=0.95):
    block_size = 128
    model.eval()
    prompt = f'{past} PlayerReply c "{prompt}" DragonReply'
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    sample_outputs = model.generate(
        generated, 
        pad_token_id=tokenizer.eos_token_id,
        max_length = block_size,
        do_sample=True,
        top_k=top_k, 
        top_p=top_p, 
        num_return_sequences=1
    )
    return tokenizer.decode(sample_outputs[0], skip_special_tokens=False)[len(prompt):].strip()

prompts = [
    ('PlayerReply c "Hey Remy!" DragonReply Ry "Hey!"', "How are you?"),
    ('PlayerReply c "I was with Lorem today." DragonReply Ad "That\'s awesome. He\'s a cute fellow."', "What do you think of Lorem?"),
    ('DragonReply m "In Tatsu park, Adine and I sat down."', "Oh my god, Adine. What is this?"),
    ('DragonReply m "I sat down on a chair in Anna\'s lab."', "What will we do here?"),
]

# Set a fixed seed to make sure we get the same response every time.
torch.manual_seed(80085)
for (past, prompt) in prompts:
    reply = generate_dragon_reply(past, prompt)
    print(f"Prompt: {prompt}\nReply: {reply}\n\n")

Prompt: How are you?
Reply: Ry "Everything's still very dark, so I'll have to wait."<|endoftext|>


Prompt: What do you think of Lorem?
Reply: Ad "He's such a huge, strong person. I thought he was going to miss, but he just went ahead and stay inside and I got him some help."<|endoftext|>


Prompt: Oh my god, Adine. What is this?
Reply: Ad "This is where it ends. I guess it's not really my fault if someone else did something like that, but if something was your fault, how did you even do this, anyway?"<|endoftext|>


Prompt: What will we do here?
Reply: m "She told me that she was still working on a few maneuvers when it happened."<|endoftext|>




# Sampling test

Which combination is the best?

In [10]:
random.seed(80085)
for i in range(100):
    torch.manual_seed(80085)
    top_k = random.randint(0, 100)
    top_p = round(random.uniform(0, 1), 2)
    for (past, prompt) in prompts:
        reply = generate_dragon_reply(past, prompt, top_k = top_k, top_p = top_p)
        print(f"[Test {i + 1} top_k: {top_k}, top_p: {top_p}] -> Prompt: {prompt}\nReply: {reply}\n")
    print("-------------")

[Test 1 top_k: 64, top_p: 0.67] -> Prompt: How are you?
Reply: Ry "Oh, is everything fine? I was already wondering if you wanted to meet me, though."<|endoftext|>

[Test 1 top_k: 64, top_p: 0.67] -> Prompt: What do you think of Lorem?
Reply: Ad "He's such a huge, strong person. I thought he would never be able to create such a good batch of friends."<|endoftext|>

[Test 1 top_k: 64, top_p: 0.67] -> Prompt: Oh my god, Adine. What is this?
Reply: Ad "There's information in it that needs doing, though."<|endoftext|>

[Test 1 top_k: 64, top_p: 0.67] -> Prompt: What will we do here?
Reply: m "I had to prepare myself for this conversation. Everything was still up in the air as I heard the sound of the portal starting. I wondered if the dragons were already inside, but as I put my hands on the catch, I couldn't decide how to feel."<|endoftext|>

-------------
[Test 2 top_k: 61, top_p: 0.28] -> Prompt: How are you?
Reply: Ry "Oh, is everything fine? I was already wondering if you wanted to vis

In [11]:
def generate_reply(prompt):
  model.eval()
  prompt = f'PlayerReply c "{prompt}" DragonReply'
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)
  print(prompt, generated)

  sample_outputs = model.generate(
    generated, 
    do_sample=True,   
    eos_token_id=tokenizer.eos_token_id,
    top_k=50, 
    max_length = 128,
    top_p=0.95, 
    num_return_sequences=3
  )

  for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=False)))

print("What to say?")
print(generate_reply(input()))

What to say?


 Holy shit


PlayerReply c "Holy shit" DragonReply tensor([[14140, 36875,   269,   366, 33336,  7510,     1,  2851, 36875]],
       device='cuda:0')
0: PlayerReply c "Holy shit" DragonReply Ry "This might take a while, so make yourself at home."<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>


1: PlayerReply c "Holy shit" DragonReply m "It didn't take long to find your way back into the violent world we had to believe we were making of this city."<|endoftext|>


2: PlayerReply c "Holy shit" DragonReply Ry "This might be an interesting couple."<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>


None
