In [1]:
import os
import sys
sys.path.append("../")

import pickle
import json
import glob
from tqdm.auto import trange, tqdm
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
from transformers import pipeline
from features import merge_entries, prepare_entry
import nltk
from utills import chunker, get_num_chunks
import torch
from torch.utils.data import Dataset, random_split
import numpy as np
from nltk.tokenize import sent_tokenize

In [2]:
PREPROCESSED_DATA_PATH = '/scratch/jnw301/av_public/temp_data/pan/'
TEMP_DATA_PATH = '/scratch/jnw301/av_public/temp_data/pan/finetuning/'

In [29]:
# model_path = TEMP_DATA_PATH + 'results_finetune/checkpoint-200000/'
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
# model = GPT2LMHeadModel.from_pretrained(model_path)
# model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
path = PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl'
max_length = 1024
sent_chunk_sz = 50
max_fanfics = 50000
dataset = []
i = 0
with open(path, 'r') as f:
    for l in tqdm(f, total=max_fanfics//2):
        d = json.loads(l)
        chunks = list(chunker(sent_tokenize(merge_entries(d['pair'][0])['preprocessed']), sent_chunk_sz))
        dataset.extend([' '.join(c) for c in chunks])

        chunks = list(chunker(sent_tokenize(merge_entries(d['pair'][1])['preprocessed']), sent_chunk_sz))
        dataset.extend([' '.join(c) for c in chunks])
        i += 2
        if i > max_fanfics:
            break

In [26]:
num_records = len(dataset)
input_ids = np.memmap(TEMP_DATA_PATH + 'input_ids.npy', dtype='int32', mode='w+', shape=(num_records, max_length))
attention_mask = np.memmap(TEMP_DATA_PATH + 'attention_mask.npy', dtype='int32', mode='w+', shape=(num_records, max_length))

In [30]:
i = 0
for text in tqdm(dataset):
    encodings_dict = tokenizer('<|startoftext|>' + text + '<|endoftext|>', truncation=True,
                   max_length=max_length, padding="max_length")

    input_ids[i] = encodings_dict['input_ids']
    attention_mask[i] = encodings_dict['attention_mask']
    i += 1

  0%|          | 0/409994 [00:00<?, ?it/s]

In [31]:
with open(TEMP_DATA_PATH + 'metadata.p', 'wb') as f:
    pickle.dump((num_records, max_length), f)

In [57]:
class PANDataset(Dataset):
    def __init__(self, input_ids_path, attention_mask_path, num_records, max_length):
        
        self.input_ids = np.memmap(input_ids_path, dtype='int32', mode='r', shape=(num_records, max_length))
        self.attention_mask = np.memmap(attention_mask_path, dtype='int32', mode='r', shape=(num_records, max_length))
        self.num_records = num_records

    def __len__(self):
        return self.num_records

    def __getitem__(self, idx):
        return torch.from_numpy(np.array(self.input_ids[idx])), torch.from_numpy(np.array(self.attention_mask[idx]))

In [58]:
dataset = PANDataset(TEMP_DATA_PATH + 'input_ids.npy', TEMP_DATA_PATH + 'attention_mask.npy', num_records, max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [60]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

PyTorch: setting up devices


In [None]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 1776696
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 1776696


In [9]:
model = torch.load('./results_finetune/pytorch_model.pt')

In [21]:
text = """
<|startoftext|>Sarah smiled at me. I looked around. "Uh... Purple shirt."
"""
generated = tokenizer(text, return_tensors="pt").input_ids
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=1000, top_p=0.95, temperature=.5, num_return_sequences=5)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 
Sarah smiled at me. I looked around. "Uh... Purple shirt."
 said. "Huh?" "Huh?" "Oh, they are so much better than what I"ve seen." "I think they are." "You think that"s a good point?" "I think they"re too much to be honest. They"re so much to me." "I know. I mean, you can tell them that, but they"re still so much
1: 
Sarah smiled at me. I looked around. "Uh... Purple shirt."
 said as I pulled her close to me, "You know that." "Oh, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, my, m

In [8]:
text = """
Sarah smiled at me. I looked around. "Uh... Purple shirt." "My mummy said the goblin king should be ashamed, bringing a mortal here, and <|startoftext|>
"""
generated = tokenizer(text, return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=5, 
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=5)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 
Sarah smiled at me. I looked around. "Uh... Purple shirt." "My mummy said the goblin king should be ashamed, bringing a mortal here, and 
"I was just joking, but she didn." I laughed. I had never laughed at the way her in front of the goblin. "Oh, you mean you were talking... about her? You were talking about her? "She... you were talking..." "Oh..." I laughed. "I mean she was a bit of an old lady, but I don't know... I just... I don... don"t think she ever got to know who I am or what I mean..." Sarah said, sounding a lot more serious than before.
"I don"t think so." I laughed, but I didn't know it yet. She had been looking forward to the goblin invasion, but she had always been afraid. She had always felt a bit uneasy about the goblin people. They were all were so different. They didn"t seem like they knew who she really was, but she knew that she would get to know them. They were her friends now, though. She felt like she had been betrayed. It was the goblin king of goblins who 