In [4]:
# Load dataset and split into train and validation files. 
import os
import numpy as np
import pandas as pd
import random 
from torch import nn
from transformers import Trainer, TrainingArguments
import json
from datasets import load_dataset, ClassLabel
from sklearn.model_selection import train_test_split


# Convert from txt to json
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, './util')

import txt_to_json 

# Need to preprocess the gpt2-xl file first
# gpt2-medium was already preprocessed 
input_path = "./sd-output/prompted_generations_gpt2-xl_debiased.txt"
output_path = "./sd-output/gpt2-xl-debiased-continuations-challenging-100-20.json"
txt_to_json.txt_to_json(input_path, output_path)

TRAIN_SIZE = 0.7 
# two paths here. if you want to train gpt2-medium, replace the line 
# with the comment
PATH = "./sd-output/gpt2-xl-debiased-continuations-challenging-100-20.json" # ["./sd-output/gpt2-medium_debiased_continuations.json"]  
with open(PATH) as json_file:
    data = json.load(json_file)

# make train and validation datasets
s = pd.Series(data)
training_data, val_data  = [i.to_dict() for i in train_test_split(s, train_size=TRAIN_SIZE)]

# create training and validation dataset files 
name, ext = os.path.splitext(PATH)
train_path = "{name}_{uid}{ext}".format(name=name, uid="train", ext=ext)
val_path   = "{name}_{uid}{ext}".format(name=name, uid="val", ext=ext)

for path, data in zip([train_path, val_path], [training_data, val_data]):
    with open(path, 'w') as fp:
        for key in data:
            json.dump(data[key], fp)
            fp.write('\n')

## Load datasets

In [None]:
datasets = load_dataset("json", data_files={"train": train_path, "validation": val_path})

## Show random elements

In [6]:
# Show two random elements of the dataset
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=2):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [7]:
show_random_elements(datasets["train"])

Unnamed: 0,text
0,Join the worlds favorite adult social network with the sexiest naked amateurs and the most beautiful women in the world. Join now and start sharing your photos and videos with the world
1,"And he was at the back of the trailer, and justhis penis was sticking straight up, and it looked like he had a big erection,"" she says. ""I"


## Set parameters

In [8]:
DEBUG           = False

INPUT_DIR       = 'articles'

USE_APEX        = False
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2-xl' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 768  #{768, 1024, 1280, 1600}

TRAIN_SIZE      = 0.8

if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [11]:
from transformers import GPT2LMHeadModel, LogitsProcessorList, LogitsProcessor, PreTrainedTokenizer, GPT2Tokenizer
from transformers import AutoTokenizer

def get_tokenizer(model_name):
    return GPT2Tokenizer.from_pretrained(model_name, use_fast=True)  #GPT2Tokenizer.from_pretrained(model_name) 

def get_model(model_name):
    return GPT2LMHeadModel.from_pretrained(model_name) #  AutoModelForCausalLM.from_pretrained(model_checkpoint)

## Set the model

In [12]:
model_name = 'gpt2-xl' # 'gpt2-medium' 
model = get_model(model_name)
tokenizer = get_tokenizer(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding = True

## Tokenize Dataset

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=["text"])

In [None]:
def add_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

labeled_dataset = tokenized_datasets.map(add_labels, batched=True)

## Freeze the layers

In [15]:
# - Freeze selective layers:
# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):        
    #Only un-freeze the last n transformer blocks
    if i+1 > 12 - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True 

for parameter in model.transformer.ln_f.parameters():        
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():        
    parameter.requires_grad = True

## Set data 

In [16]:
train_dataset = labeled_dataset["train"]   # tokenized_datasets["train"] #lm_datasets["train"] 
val_dataset = labeled_dataset["validation"] # tokenized_datasets["validation"] # labeled_dataset["validation"] # lm_datasets["validation"]


In [17]:
training_args = TrainingArguments(
    f"{model_name}-vanilla-debiased", #output_dir="/content/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=False, #fp16=True,  
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True
)
#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
trainer.train()  
trainer.save_model()  

 25%|██▌       | 13/52 [1:12:57<3:57:49, 365.89s/it]
 25%|██▌       | 13/52 [1:23:10<3:57:49, 365.89s/it]

{'eval_loss': 1.7507842779159546, 'eval_runtime': 579.9295, 'eval_samples_per_second': 0.621, 'epoch': 0.99}


 50%|█████     | 26/52 [2:29:44<1:57:25, 270.99s/it]
 50%|█████     | 26/52 [2:38:15<1:57:25, 270.99s/it]

{'eval_loss': 1.6559697389602661, 'eval_runtime': 485.0181, 'eval_samples_per_second': 0.742, 'epoch': 1.99}


 75%|███████▌  | 39/52 [3:34:21<55:42, 257.11s/it]  
 75%|███████▌  | 39/52 [3:43:23<55:42, 257.11s/it]

{'eval_loss': 1.8777060508728027, 'eval_runtime': 515.6202, 'eval_samples_per_second': 0.698, 'epoch': 2.99}


100%|██████████| 52/52 [4:38:54<00:00, 258.00s/it]  
100%|██████████| 52/52 [4:47:11<00:00, 258.00s/it]

{'eval_loss': 2.151139974594116, 'eval_runtime': 496.9566, 'eval_samples_per_second': 0.724, 'epoch': 3.99}


100%|██████████| 52/52 [4:48:08<00:00, 332.47s/it]


{'train_runtime': 17288.4313, 'train_samples_per_second': 0.003, 'epoch': 3.99}


## Load the prompts

In [18]:
import json
import pandas as pd
from tqdm import tqdm
path = "./sd-input/rtp-prompts.txt"
prompts = []
for line in open(path, 'r'):
    prompts.append(json.loads(line))

In [19]:
from transformers import pipeline
model_name = 'gpt2-xl' #['gpt2-medium', 'gpt2-xl']
path="./{}-vanilla-debiased".format(model_name)
generator = pipeline('text-generation', model = path)

### Iterate through prompts and generate continuations

In [20]:
models = ['gpt2-xl'] # models ['gpt2-medium', 'gpt2-xl]
N = len(prompts)
for model_name in models: 
    filename = "./sd-output/{}-fine-tuned-challenging-continuations-100-20.txt".format(model_name)
    print("Generating continuations for {}".format(model_name))
    with open(filename, 'w') as fp:
        for i in tqdm(range(N)):
            prompt = prompts[i]['prompt']['text']
            sentence = generator(prompt, max_new_length = 20, num_return_sequences=1)[0]['generated_text'] 
            output = {"prompt": prompt, "sentence":sentence}
            json.dump(output, fp)
            fp.write('\n')


  0%|          | 0/1199 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generating continuations for gpt2-xl


  0%|          | 1/1199 [00:15<5:08:58, 15.47s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/1199 [00:19<3:57:37, 11.91s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/1199 [00:28<3:40:35, 11.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/1199 [00:33<3:06:10,  9.35s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 5/1199 [00:44<3:17:57,  9.95s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 6/1199 [01:02<4:02:36, 12.20s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 7/1199 [01:13<3:58:29, 12.00s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 8/1199 [01:31<4:34:03, 13.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 9/1199 