In [38]:
# Global configuration: every tunable of the notebook lives in this cell.

# --- Environment switches ---
COLAB = False
DEBUG = False
USE_APEX = False
APEX_OPT_LEVEL = 'O1'

# --- Model ---
MODEL = 'gpt2-xl'  # one of: gpt2, gpt2-medium, gpt2-large, gpt2-xl
UNFREEZE_LAST_N = 6  # how many of the final transformer layers stay trainable
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",
    eos_token="<|EOS|>",
    unk_token="<|UNK|>",
    pad_token="<|PAD|>",
    sep_token="<|SEP|>",
)

# --- Data ---
MAXLEN = 768  # one of: 768, 1024, 1280, 1600
TRAIN_SIZE = 0.7

# --- Optimisation ---
# (per-device batch size, gradient-accumulation steps); both pairs multiply to 64.
TRAIN_BATCHSIZE, BATCH_UPDATE = (4, 16) if USE_APEX else (2, 32)
EPOCHS = 4
LR = 5e-4
EPS = 1e-8
WARMUP_STEPS = 1e2  # note: a float (100.0), not an int
SEED = 2020

In [44]:
# Load dataset and split into train and validation files. 
import sys
import numpy as np
import pandas as pd
import random 
import torch
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import json
from datasets import load_dataset, ClassLabel
from sklearn.model_selection import train_test_split
from util.txt_to_json import txt_to_json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from IPython.display import display, HTML
from tqdm import tqdm

# Put the directory that is expected to contain `modeling.py` on the import
# path; its location depends on whether the notebook runs on Colab.
sys.path.insert(
    1,
    './debiasing_model/self-debiasing-timo' if COLAB else './self-debiasing-timo',
)
from modeling import GPT2Wrapper

# data_set_name = "gpt2-xl-debiased-non-challenging-continuations-100-20-beston"
# for testing
data_set_name = "gpt2-xl-debiased-non-challenging-continuations-100-20-25k-first-10"

# Input (sd-output) and output (trainer data) directories differ between
# Colab and a local checkout.
if COLAB:
    sd_output_path = "./debiasing_model/sd-output/"
    trainer_data_path = "./debiasing_model/trainer_data_newton/"
else:
    sd_output_path = "./sd-output/"
    trainer_data_path = "./trainer_data_newton/"

txt_data = data_set_name + ".txt"
json_data = data_set_name + ".json"
print(trainer_data_path + json_data)
# Produce the JSON file that is read back just below.
# NOTE(review): the exact format written by txt_to_json is defined in
# util/txt_to_json.py, which is not visible from this notebook.
txt_to_json(sd_output_path + txt_data, trainer_data_path + json_data, add_prompt=True)

# TRAIN_SIZE is taken from the global configuration cell; it is deliberately
# not re-declared here so that editing the config actually changes the split.
with open(trainer_data_path + json_data, encoding='utf-8') as json_file:
    data = json.load(json_file)

s = pd.Series(data)
# Seed the shuffle with the notebook-wide SEED so the train/validation split
# is identical on every Restart & Run All.
training_data, val_data = [
    split.to_dict()
    for split in train_test_split(s, train_size=TRAIN_SIZE, random_state=SEED)
]

train_path = f"{trainer_data_path}{data_set_name}_train.json"
val_path = f"{trainer_data_path}{data_set_name}_val.json"

print(train_path)

# Write each split as JSON lines (one record per line); these are the files
# later handed to load_dataset("json", ...). Distinct loop names keep `data`
# bound to the full loaded JSON instead of being overwritten by a split.
for split_path, split_records in zip([train_path, val_path], [training_data, val_data]):
    with open(split_path, 'w', encoding='utf-8') as fp:
        for record in split_records.values():
            json.dump(record, fp)
            fp.write('\n')

./trainer_data_newton/gpt2-xl-debiased-non-challenging-continuations-100-20-25k-first-10.json
./trainer_data_newton/gpt2-xl-debiased-non-challenging-continuations-100-20-25k-first-10_train.json


## Helper functions 

In [45]:
# Show two random elements of the dataset
def show_random_elements(dataset, num_examples=2):
    """Render `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Row indices are drawn with `random.randint` and re-drawn on collision, so
    no row appears twice. Columns whose feature type is a `ClassLabel` are
    shown with their label names instead of integer ids.

    Raises:
        AssertionError: if `num_examples` exceeds `len(dataset)`.
    """
    size = len(dataset)
    assert num_examples <= size, "Can't pick more elements than there are in the dataset."

    # Rejection sampling: keep drawing until enough unique indices are collected.
    chosen = []
    while len(chosen) < num_examples:
        candidate = random.randint(0, size - 1)
        if candidate not in chosen:
            chosen.append(candidate)

    table = pd.DataFrame(dataset[chosen])
    for name, feature in dataset.features.items():
        if not isinstance(feature, ClassLabel):
            continue
        # Swap integer class ids for their human-readable names.
        table[name] = table[name].transform(lambda label_id: feature.names[label_id])
    display(HTML(table.to_html()))


def get_tokenizer(model_name):
    """Load the GPT-2 tokenizer for `model_name` and reuse its EOS token as the pad token.

    Returns the configured tokenizer.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name, use_fast=True)
    # Pad with the end-of-sequence token.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    # NOTE(review): this only sets an attribute called `padding` on the object;
    # nothing in this notebook reads it (tokenize_function passes padding=True
    # explicitly) - confirm whether it is needed.
    gpt2_tokenizer.padding = True
    return gpt2_tokenizer


def get_model(model_name, tokenizer):
    """Load `GPT2LMHeadModel` for `model_name`, using the tokenizer's EOS id as pad id.

    When the global `COLAB` flag is set, `.cuda()` is called on the model
    before it is returned.
    """
    lm = GPT2LMHeadModel.from_pretrained(
        model_name,
        pad_token_id=tokenizer.eos_token_id,
    )
    if not COLAB:
        return lm
    lm.cuda()
    return lm


def find_element_in_list(element, list_element):
    """Return the index of the first occurrence of `element` in `list_element`.

    Returns None when `list_element.index` reports the element as absent.
    """
    try:
        position = list_element.index(element)
    except ValueError:
        position = None
    return position

def tokenize_function(input):
    """Tokenize `input["text"]` and attach causal-LM labels that skip padding.

    Uses the module-level `tokenizer` with `padding=True`. The labels mirror
    `input_ids`, except that every position whose attention mask is 0 is set
    to -100, the label value the Hugging Face causal-LM loss ignores. Because
    the pad token is the EOS token here, padding cannot be recognised by token
    id, so the attention mask is used instead; an EOS inside the attended span
    is kept as a normal label.

    Args:
        input: mapping with a "text" entry - a single string or a list of
            strings (the latter when called through `datasets.map(batched=True)`).

    Returns:
        The tokenizer output with an additional "labels" entry.
    """
    ignore_index = -100

    def mask_padding(token_ids, attention):
        # Keep attended tokens; replace padded positions with the ignore index.
        return [tok if keep else ignore_index for tok, keep in zip(token_ids, attention)]

    encodings_dict = tokenizer(input["text"], padding=True)
    input_ids = encodings_dict["input_ids"]

    if "attention_mask" not in encodings_dict:
        # No mask available: fall back to the plain copy.
        encodings_dict["labels"] = input_ids.copy()
    elif input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one row of ids / mask per example.
        encodings_dict["labels"] = [
            mask_padding(row, row_mask)
            for row, row_mask in zip(input_ids, encodings_dict["attention_mask"])
        ]
    else:
        # Single example: flat list of ids.
        encodings_dict["labels"] = mask_padding(input_ids, encodings_dict["attention_mask"])
    return encodings_dict

def freeze_layer(model):
    """Freeze `model` in place, leaving only its tail trainable.

    After the call, `requires_grad` is True only for the parameters of:
      * the last `UNFREEZE_LAST_N` blocks of `model.transformer.h`,
      * `model.transformer.ln_f`,
      * `model.lm_head`.
    Every other parameter yielded by `model.parameters()` is frozen.
    Returns None.
    """
    def _set_trainable(module, flag):
        for param in module.parameters():
            param.requires_grad = flag

    # Start from a fully frozen model ...
    _set_trainable(model, False)

    # ... then re-enable the trailing transformer blocks (1-based position
    # beyond n_layer - UNFREEZE_LAST_N) ...
    tail_blocks = (
        block
        for position, block in enumerate(model.transformer.h)
        if position + 1 > model.config.n_layer - UNFREEZE_LAST_N
    )
    for block in tail_blocks:
        _set_trainable(block, True)

    # ... and finally the closing layer norm and the language-model head.
    for module in (model.transformer.ln_f, model.lm_head):
        _set_trainable(module, True)

## Set the model

In [46]:
# Load the train/validation JSON files written earlier as one dataset
# dictionary with "train" and "validation" splits.
datasets = load_dataset(
        "json", data_files={"train": train_path, "validation": val_path})

# Models
# The tokenizer comes from get_tokenizer(); the model is taken from the
# GPT2Wrapper's private `_model` attribute (get_model() is not used here).
# CUDA is only requested when running on Colab.
tokenizer = get_tokenizer(MODEL)
wrapper = GPT2Wrapper(model_name=MODEL, tokenizer=tokenizer, use_cuda=COLAB)
model = wrapper._model
# Freeze everything except the last UNFREEZE_LAST_N blocks, ln_f and lm_head.
freeze_layer(model)

# Train
# Tokenize both splits in batches and drop the raw "text" column; any other
# column in the JSON records is left in place.
tokenized_datasets = datasets.map(
    tokenize_function, batched=True, remove_columns=["text"])
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]

# Training configuration. The first positional argument is the output
# directory, "<MODEL>-ft-with-non-challenging"; the generation cell later
# loads the model from a directory with the same name.
# Evaluation and checkpointing both happen once per epoch, and
# load_best_model_at_end is enabled.
# NOTE(review): fp16 is disabled, so fp16_opt_level presumably has no effect,
# and USE_APEX only influences the batch-size constants - confirm.
training_args = TrainingArguments(
    f"{MODEL}-ft-with-non-challenging",  # output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    fp16=False,  # fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,
    load_best_model_at_end=True, 
    seed=SEED,
    #push_to_hub=True
)

# Bundle model, arguments, both tokenized splits and the tokenizer into the
# Trainer that the next cell runs.
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

Using custom data configuration default-1f9815b32d22cd4f


Downloading and preparing dataset json/default to /Users/newtonkwan/.cache/huggingface/datasets/json/default-1f9815b32d22cd4f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...



100%|██████████| 2/2 [00:00<00:00, 1980.78it/s]

100%|██████████| 2/2 [00:00<00:00, 226.68it/s]


Dataset json downloaded and prepared to /Users/newtonkwan/.cache/huggingface/datasets/json/default-1f9815b32d22cd4f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.



100%|██████████| 2/2 [00:00<00:00, 374.39it/s]
loading file https://huggingface.co/gpt2-xl/resolve/main/vocab.json from cache at /Users/newtonkwan/.cache/huggingface/transformers/8560a2df03f812b276794ae6935255d0590522553a4c8103155472b07591a21b.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2-xl/resolve/main/merges.txt from cache at /Users/newtonkwan/.cache/huggingface/transformers/18fe27e0b70062b3e45fc4e827d5449d9fe85875937594da927e48cb657366d1.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2-xl/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2-xl/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2-xl/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2-xl/resolve/main/config.json from cache at /Users/newtonkwan/.cache/huggingface/tr

In [47]:
#---------------------------------------------------#
# Run fine-tuning with the Trainer configured above, then save the model.
# save_model() is called without a path, so the Trainer chooses the
# destination (presumably the output directory given to TrainingArguments).
trainer.train()  
trainer.save_model()  
#trainer.push_to_hub()

The following columns in the training set  don't have a corresponding argument in `SelfDebiasingGPT2LMHeadModel.forward` and have been ignored: prompt. If prompt are not expected by `SelfDebiasingGPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7
  Num Epochs = 4
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 32
  Total optimization steps = 4

[A

KeyboardInterrupt: 

## Generate continuations

In [49]:
# Generate
# `path` is the directory of the fine-tuned model, `prompt_path` the JSON-lines
# prompt file; both depend on whether the notebook runs on Colab.
if COLAB:
    path = "./debiasing_model/{}-ft-with-non-challenging".format(MODEL)
    prompt_path = "./debiasing_model/sd-input/rtp-prompts.txt"
else:
    path = "./{}-ft-with-non-challenging".format(MODEL)
    #path = "newtonkwan/{}-ft-with-non-challenging".format(MODEL)
    prompt_path = "./sd-input/rtp-prompts.txt"

# get prompts
N = 1 #len(prompts)
# Read the prompt file (one JSON object per line) inside a context manager so
# the file handle is closed; blank lines are skipped instead of crashing
# json.loads.
with open(prompt_path, 'r', encoding='utf-8') as prompt_file:
    prompts = [json.loads(line) for line in prompt_file if line.strip()]

generator = pipeline('text-generation', model=path)
# NOTE(review): unlike the paths above, the output location does not switch
# on COLAB - confirm that "./sd-output/" is intended on Colab as well.
filename = "./sd-output/{}-fine-tuned-challenging-continuations-100-20_v3.txt".format(MODEL)
print("Generating continuations for {}".format(MODEL))
with open(filename, 'w', encoding='utf-8') as fp:
    # Slice rather than index by range(N) so that an N larger than the number
    # of prompts cannot raise IndexError.
    for entry in tqdm(prompts[:N]):
        prompt = entry['prompt']['text']
        # One continuation of up to 20 new tokens; the pipeline's
        # 'generated_text' is stored as-is under "sentence".
        sentence = generator(prompt, max_new_tokens = 20, num_return_sequences=1)[0]['generated_text']
        output = {"prompt": prompt, "sentence":sentence}
        json.dump(output, fp)
        fp.write('\n')

loading configuration file https://huggingface.co/newtonkwan/gpt2-xl-ft-with-non-challenging/resolve/main/config.json from cache at /Users/newtonkwan/.cache/huggingface/transformers/8c34b44fbac569ad613fcf727b82c039f8e32d22173c79f83ab388f30015afc7.f682bd7703a317fc855b7a2a9e5b24b60248f0fdb21c708bfc90c5aae2e68a69
Model config GPT2Config {
  "_name_or_path": "newtonkwan/gpt2-xl-ft-with-non-challenging",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first

Generating continuations for gpt2-xl





0it [00:00, ?it/s]
