# GPT2 Fine Tuning Pipeline
Created March 16, 2022 | Last Updated: March 16, 2022

This notebook fine-tunes GPT-2 with configurable hyperparameters, calculates the model's perplexity, and generates continuations with that model. The outputs of this notebook are a GPT-2 model fine-tuned on debiased sentences and pushed to a Huggingface repository, the associated perplexity (PPL) of that model in its Huggingface Model Card, and the continuations stored in Huggingface.

### Before running the notebook: 
- Ensure you have changed your runtime to use the GPU and have High-RAM enabled to load GPT2-XL
- Upload the folder `debiasing-model` to the top level of your Google Drive before starting. It contains Timo's code and a folder of our data. 




### Imports
Run this cell first to log into Huggingface and Google Drive. 

Scroll to the bottom of the cell to find the Huggingface login, copy the token, and log in. A pop-up should appear asking you to sign in to Google. 

In [None]:
!pip install transformers
!pip install datasets
!pip install huggingface_hub
!pip install nlp
!apt install git-lfs

import json
import re
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast, pipeline, Trainer, TrainingArguments
from huggingface_hub import notebook_login, Repository
from sklearn.model_selection import train_test_split
from datasets import load_dataset as hf_load_dataset
from nlp import load_dataset
from tqdm import tqdm
from typing import List, Optional, Tuple
from torch.nn import CrossEntropyLoss

# Mount Google Drive at /content/gdrive; the data and model paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Log in with your Huggingface access token, which can be generated in the settings of your Huggingface account.
# token = ''
notebook_login()

### Configs
Important notes: 
- Change `RUN_INDEX` on each run when you're training a new model so that you don't overwrite the repo of a model trained with different hyperparameters


In [None]:
# ---------------------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------------------

# -- Environment and base model ---------------------------------------------
COLAB = True  # when True, get_model() moves the model to the GPU
MODEL = 'gpt2-xl'  # one of {gpt2, gpt2-medium, gpt2-large, gpt2-xl}
UNFREEZE_LAST_N = 2  # number of trailing transformer blocks freeze_layer() leaves trainable
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",
    eos_token="<|EOS|>",
    unk_token="<|UNK|>",
    pad_token="<|PAD|>",
    sep_token="<|SEP|>",
)

# -- Fixed trainer settings (passed to TrainingArguments) -------------------
APEX_OPT_LEVEL = 'O1'
EVAL_STRATEGY = 'epoch'
SAVE_STRATEGY = 'epoch'
BEST_MODEL = True
FP16 = False
PUSH_TO_HUB = True
SEED = 2022

# -- Tunable hyperparameters ------------------------------------------------
# 0 < DATASET_SIZE < len(training_set). In our case, 0 < size < 24999.
DATASET_SIZE = 5000
TRAIN_SIZE = 0.8  # fraction of the selected examples used for training
TRAIN_BATCHSIZE = 8  # per-device batch size (training and evaluation)
BATCH_UPDATE = 32  # gradient accumulation steps
WARMUP_STEPS = 1e2
EPOCHS = 4
LR = 5e-4
EPS = 1e-8

# -- CHANGE THIS EACH RUN! --------------------------------------------------
RUN_INDEX = 4

# Name of the Huggingface repo / local output directory for this run.
MODEL_ID = f"{MODEL}-ft-{RUN_INDEX}"

### Helper functions

In [None]:
def get_tokenizer(model_name):
    """
    Load the GPT-2 tokenizer for ``model_name`` and configure it for padding.

    The end-of-sequence token is reused as the padding token.

    :param model_name: name or path handed to ``GPT2Tokenizer.from_pretrained``
    :return: the configured tokenizer
    """
    gpt2_tok = GPT2Tokenizer.from_pretrained(model_name, use_fast=True)
    # NOTE(review): nothing visible in this notebook reads the `padding` attribute;
    # padding is requested explicitly where the tokenizer is called.
    gpt2_tok.padding = True
    gpt2_tok.pad_token = gpt2_tok.eos_token
    return gpt2_tok

def get_model(model_name, tokenizer):
    """
    Load a GPT-2 language model whose pad token id is the tokenizer's EOS id.

    :param model_name: name or path handed to ``GPT2LMHeadModel.from_pretrained``
    :param tokenizer: tokenizer whose ``eos_token_id`` is used as ``pad_token_id``
    :return: the loaded model, moved to the GPU when the ``COLAB`` flag is set
    """
    lm = GPT2LMHeadModel.from_pretrained(
        model_name,
        pad_token_id=tokenizer.eos_token_id,
    )
    if not COLAB:
        return lm
    lm.cuda()
    return lm

def tokenize_function(input):
    """
    Tokenize ``input["text"]`` with the module-level ``tokenizer`` and attach labels.

    Labels are the input ids, except that padded positions (attention mask 0) are
    set to -100. The pad token is the EOS token here, so copying ``input_ids``
    verbatim would make the language-modeling loss score every padding position
    as an EOS prediction; -100 is the label value the loss ignores.

    :param input: a dataset example or batch with a ``"text"`` entry (a string or a
        list of strings)
    :return: the tokenizer output with an added ``"labels"`` entry
    """
    encodings_dict = tokenizer(input["text"], padding=True)
    input_ids = encodings_dict["input_ids"]

    if "attention_mask" not in encodings_dict:
        # No mask to tell padding from content: fall back to plain copies.
        encodings_dict["labels"] = input_ids.copy()
        return encodings_dict

    attention_mask = encodings_dict["attention_mask"]

    def _mask_padding(ids, mask):
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        # Single example: a flat list of ids.
        labels = _mask_padding(input_ids, attention_mask)

    encodings_dict["labels"] = labels
    return encodings_dict

def freeze_layer(model, unfreeze_last_n=None):
    """
    Freeze ``model`` in place, leaving only its top layers trainable.

    Every parameter is frozen first. Then the last ``unfreeze_last_n`` transformer
    blocks, the final layer norm (``transformer.ln_f``) and the LM head
    (``lm_head``) are made trainable again.

    NOTE(review): GPT-2's ``lm_head`` weight is normally tied to the input token
    embedding, so un-freezing the head presumably un-freezes the embedding matrix
    as well - confirm that this is intended.

    :param model: a model exposing ``parameters()``, ``config.n_layer``,
        ``transformer.h``, ``transformer.ln_f`` and ``lm_head``
    :param unfreeze_last_n: number of trailing transformer blocks to keep trainable;
        ``None`` (the default) uses the module-level ``UNFREEZE_LAST_N``
    """
    if unfreeze_last_n is None:
        unfreeze_last_n = UNFREEZE_LAST_N

    # Freeze everything ...
    for parameter in model.parameters():
        parameter.requires_grad = False

    # ... then re-enable gradients for the last n blocks, the final norm and the head.
    first_trainable = model.config.n_layer - unfreeze_last_n
    trainable_modules = [
        block
        for index, block in enumerate(model.transformer.h)
        if index >= first_trainable
    ]
    trainable_modules.append(model.transformer.ln_f)
    trainable_modules.append(model.lm_head)

    for module in trainable_modules:
        for parameter in module.parameters():
            parameter.requires_grad = True

def txt_to_json(input_path : str, output_path : str = 'debiased_continuations.json', add_prompt = False):
    """
    Produce a JSON file which contains the merged prompt & generated continuation sentences found in the input .txt file.

    The input is read as UTF-8 with one JSON object per line. Each object must have a
    ``'prompt'`` string and a ``'continuations'`` list whose first element has a
    ``'text'`` string. Blank lines are skipped.

    :param input_path: path of the JSON-lines .txt file to read
    :param output_path: path of the JSON file to write (a single JSON list)
    :param add_prompt: when true, each output entry also carries the original ``'prompt'``
    :return: None; the result is written to ``output_path``
    """
    merged = []
    with open(input_path, encoding='utf-8') as file:
        for line in file:
            # json.loads tolerates the trailing newline, but not an empty line.
            if not line.strip():
                continue
            text_dict = json.loads(line)

            complete_text = text_dict['prompt'] + ' ' + text_dict['continuations'][0]['text']
            entry = {'text': complete_text}
            if add_prompt:
                entry['prompt'] = text_dict['prompt']
            merged.append(entry)

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must be
    # explicit rather than left to the platform default.
    with open(output_path, 'w', encoding='utf-8') as fout:
        json.dump(merged, fout, ensure_ascii=False)

### Main
*Remember to drop the folder titled 'debiasing-model' into the top level of your Google Drive*

In [None]:
# Load dataset
data_file   = "gpt2-xl-debiased-non-challenging-continuations-50-20-25000"
data_path   = "/content/gdrive/MyDrive/debiasing-model/rtp-nlp-data/orig-data/"
tr_val_path = "/content/gdrive/MyDrive/debiasing-model/rtp-nlp-data/train-val-data/"

# add file extensions 
txt_data  = data_file + ".txt"
json_data = data_file + ".json"

# Convert text file into JSON 
txt_to_json(data_path + txt_data, tr_val_path + json_data, add_prompt=True)
with open(tr_val_path + json_data, encoding='utf-8') as json_file:
    data = json.load(json_file)

# Set dataset size: keep only the first DATASET_SIZE examples
s = pd.Series(data)[0:DATASET_SIZE] 

# Split data into training and validation set.
# random_state=SEED makes the shuffle reproducible; without it every run trains and
# validates on a different partition.
training_data, val_data = [
    part.to_dict()
    for part in train_test_split(s, train_size=TRAIN_SIZE, random_state=SEED)
]
train_path = f"{tr_val_path}{data_file}-{DATASET_SIZE}-chunk-train.json"
val_path   = f"{tr_val_path}{data_file}-{DATASET_SIZE}-chunk-val.json"

# Write each split as JSON lines (one example per line).
# The loop variables are deliberately not named `data`, so the loaded dataset is not overwritten.
for split_path, split_records in zip([train_path, val_path], [training_data, val_data]):
    with open(split_path, 'w', encoding='utf-8') as fp:
        for record in split_records.values():
            json.dump(record, fp)
            fp.write('\n')

# Build the tokenizer and model, then freeze everything except the top layers.
# `tokenizer` must be a module-level name: tokenize_function() reads it as a global.
tokenizer = get_tokenizer(MODEL)
model = get_model(MODEL, tokenizer)
freeze_layer(model)

# Read the train/validation JSON-lines files written above
split_files = {"train": train_path, "validation": val_path}
datasets = hf_load_dataset("json", data_files=split_files)

# Tokenize both splits. The map batch size is the size of the training split, and
# tokenize_function pads each batch it receives to that batch's longest sequence.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    batch_size=len(datasets['train']),
    remove_columns=["text"],
)

# Pick out the two splits for the Trainer
train_dataset, val_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["validation"],
)

# Train: every value below comes from the CONFIG cell.
training_args = TrainingArguments(
    # where checkpoints are written (also the Hub repo name)
    output_dir=MODEL_ID,
    push_to_hub=PUSH_TO_HUB,
    seed=SEED,
    # schedule
    num_train_epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    adam_epsilon=EPS,
    # batching
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    # evaluation / checkpointing
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=SAVE_STRATEGY,
    load_best_model_at_end=BEST_MODEL,
    # mixed precision
    fp16=FP16,
    fp16_opt_level=APEX_OPT_LEVEL,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune, write the final model to output_dir, then upload it to the Hub.
trainer.train()
trainer.save_model()
trainer.push_to_hub()

Using custom data configuration default-ac0eafcd82d841aa


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-ac0eafcd82d841aa/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:01<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-ac0eafcd82d841aa/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

/content/gpt2-xl-ft-4 is already a clone of https://huggingface.co/newtonkwan/gpt2-xl-ft-4. Make sure you pull the latest changes with `repo.git_pull()`.
The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: prompt. If prompt are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4000
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 60


Epoch,Training Loss,Validation Loss
0,No log,3.554864
1,No log,1.421583
2,No log,1.296867
3,No log,1.282314


The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: prompt. If prompt are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to gpt2-xl-ft-4/checkpoint-15
Configuration saved in gpt2-xl-ft-4/checkpoint-15/config.json
Model weights saved in gpt2-xl-ft-4/checkpoint-15/pytorch_model.bin
tokenizer config file saved in gpt2-xl-ft-4/checkpoint-15/tokenizer_config.json
Special tokens file saved in gpt2-xl-ft-4/checkpoint-15/special_tokens_map.json
tokenizer config file saved in gpt2-xl-ft-4/tokenizer_config.json
Special tokens file saved in gpt2-xl-ft-4/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: prompt. If prompt are not expected by `GPT2LMHeadModel.forward`,  you can safely ignor

Upload file pytorch_model.bin:   0%|          | 1.00/5.85G [00:00<?, ?B/s]

Upload file runs/Mar17_15-14-03_499e17365edf/events.out.tfevents.1647530054.499e17365edf.1026.0:   0%|        …

remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified
remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified        
To https://huggingface.co/newtonkwan/gpt2-xl-ft-4
   1f013ac..dcf3101  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified        
remote: tput: No value for $TERM and no -T specified        
To https://huggingface.co/newtonkwan/gpt2-xl-ft-4
   dcf3101..d6cf1ec  main -> main

Saving model checkpoint to gpt2-xl-ft-4
Configuration saved in gpt2-xl-ft-4/config.json
Model weights saved in gpt2-xl-ft-4/pytorch_model.bin
tokenizer config file saved in gpt2-xl-ft-4/tokenizer_conf

### Calculate the perplexity

In [None]:
%cd "/content/gdrive/MyDrive/debiasing-model"
from modeling import ModelWrapper
# %cd "/content"

/content/gdrive/MyDrive/debiasing-model


In [None]:
# Load current model
# model_path is the local directory of the fine-tuned model (TrainingArguments used output_dir=MODEL_ID).
# Alternative location on Google Drive:
# model_path = f"/content/gdrive/MyDrive/debiasing-model/{MODEL_ID}" # if pulling from hub, f"newtonkwan/{MODEL_ID}"
# If your runtime was disconnected, re-clone the repo AFTER setting model_path to the Hub id:
# repo = Repository(local_dir=f"{MODEL_ID}", clone_from = model_path)
model_path = f"/content/{MODEL_ID}" # if pulling from hub, f"newtonkwan/{MODEL_ID}"


In [None]:
class GPT2Wrapper(ModelWrapper):
    """
    Wrapper around a GPT-2 language model that implements the plain (non-debiasing)
    operations of ``ModelWrapper``: last-token logits, generation and LM loss.
    """

    def __init__(self, model_name: str = "newtonkwan/gpt2-xl-ft-with-non-challenging-10k", tokenizer: Optional[GPT2Tokenizer] = None, use_cuda: bool = True):
        """
        :param model_name: name or path of the pretrained GPT2 model
            (default: "newtonkwan/gpt2-xl-ft-with-non-challenging-10k")
        :param tokenizer: tokenizer to use; when ``None`` a fresh "gpt2-xl" tokenizer is
            loaded. It is loaded here rather than as a default argument value, so nothing
            is downloaded when the class is defined and no tokenizer object is shared
            (and mutated) across instances.
        :param use_cuda: whether to use CUDA
        """
        super().__init__(use_cuda=use_cuda)
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
        self._tokenizer = tokenizer
        self._model = GPT2LMHeadModel.from_pretrained(model_name)
        if use_cuda:
            self._model.parallelize()
        # Reuse EOS as the padding token.
        self._tokenizer.pad_token = self._tokenizer.eos_token
        self._model.config.pad_token_id = self._tokenizer.eos_token_id

    def query_model_batch(self, input_texts: List[str]):
        """
        Return, for each input text, the logits at its last non-padded position.

        :param input_texts: the texts to encode as one padded batch
        :return: a tensor with one row of vocabulary logits per input text
        """
        # assumes ModelWrapper.__init__ sets self._device - TODO confirm
        inputs = self._tokenizer.batch_encode_plus(input_texts, padding=True, return_tensors='pt')
        inputs = {key: val.to(self._device) for key, val in inputs.items()}
        # Index of the last attended token of each example.
        output_indices = inputs['attention_mask'].sum(dim=1) - 1
        output = self._model(**inputs)['logits']
        return torch.stack([output[example_idx, last_word_idx, :] for example_idx, last_word_idx in enumerate(output_indices)])

    def generate(self, input_text: str, **kwargs):
        """
        Generate from ``input_text`` and return the decoded first returned sequence.

        :param input_text: the text to encode and feed to the model
        :param kwargs: forwarded unchanged to the model's ``generate``
        :return: the decoded text of the first generated sequence
        """
        input_ids = self._tokenizer.encode(input_text, return_tensors='pt').to(self._device)
        generated_output = self._model.generate(input_ids, **kwargs)
        output_ids = generated_output[0]
        return self._tokenizer.decode(output_ids)

    def compute_loss(self, input_ids: torch.LongTensor, labels: torch.LongTensor) -> torch.Tensor:
        """
        Compute the next-token cross-entropy loss of ``labels`` given ``input_ids``.

        Positions labelled -100 are skipped (the default ``ignore_index`` of
        ``CrossEntropyLoss``).

        :param input_ids: token ids fed to the model
        :param labels: target ids aligned with ``input_ids``
        :return: the mean cross-entropy over the non-ignored shifted positions
        """
        # Only the logits are needed; passing labels to the model would make it compute
        # a loss that is thrown away.
        lm_logits = self._model(input_ids)['logits']

        # Shift so that tokens < n predict n
        shift_logits = lm_logits[..., :-1, :].contiguous()
        # Keep labels on the logits' device (a no-op when they already match).
        shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
        # Flatten the tokens
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return loss

    def generate_self_debiasing(self, *args, **kwargs):
        """Placeholder: self-debiasing generation is not implemented by this wrapper."""
        raise NotImplementedError("GPT2Wrapper does not implement generate_self_debiasing")

    def compute_loss_self_debiasing(self, *args, **kwargs):
        """Placeholder: the self-debiasing loss is not implemented by this wrapper."""
        raise NotImplementedError("GPT2Wrapper does not implement compute_loss_self_debiasing")

    

In [None]:
def perplexity(output_filename,model='gpt2-xl',epsilon=0.01,max_length=-1,max_length_pattern=32,stride=-1,no_cuda=False,debug=False):
    """
    Compute the perplexity of ``model`` on the WikiText-2 (raw) test split using a
    sliding window, and append the result to ``output_filename``.

    :param output_filename: text file the result block is appended to
    :param model: name or path of the model; also used to load the tokenizer
    :param epsilon: not used by this function
    :param max_length: window size before ``max_length_pattern`` is subtracted; a
        non-positive value means the model's ``n_positions``
    :param max_length_pattern: number of positions subtracted from the window size
    :param stride: step between windows; a non-positive value means the window size
    :param no_cuda: when true, run on the CPU instead of CUDA
    :param debug: not used by this function
    :return: the final perplexity tensor (``None`` if the loop never ran)
    """
    tokenizer = GPT2TokenizerFast.from_pretrained(model)
    wrapper = GPT2Wrapper(model, use_cuda=not no_cuda)
    device = 'cpu' if no_cuda else 'cuda'

    # Encode the whole test split as one long token sequence.
    test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')
    n_tokens = encodings.input_ids.size(1)

    if max_length <= 0:
        max_length = wrapper._model.config.n_positions
    max_length = max_length - max_length_pattern

    if stride <= 0:
        stride = max_length

    weighted_losses = []
    ppl = None

    for i in tqdm(range(0, n_tokens, stride)):
        window_end = min(i + stride, n_tokens)
        window_start = max(i + stride - max_length, 0)
        n_scored = window_end - i  # may be different from stride on last loop

        input_ids = encodings.input_ids[:, window_start:window_end].to(device)
        # Only the last n_scored tokens count; the earlier ones are context only.
        target_ids = input_ids.clone()
        target_ids[:, :-n_scored] = -100

        with torch.no_grad():
            window_loss = wrapper.compute_loss(input_ids, labels=target_ids)
            weighted_losses.append(window_loss * n_scored)

        # Running perplexity over everything scored so far.
        ppl = torch.exp(torch.stack(weighted_losses).sum() / window_end)

        print(f'Perplexity after {i} tokens: {ppl}')

    print(f'Final perplexity: {ppl}')

    with open(output_filename, 'a', encoding='utf8') as fh:
        fh.write(f'=== RESULT [{model}] ===\n')
        fh.write(f'Perplexity:  {ppl}\n\n')

    return ppl
      

### At this point, you may run out of GPU RAM
Unfortunately, you will have to restart the kernel, re-run the first cell of imports and the config section, and then the entire perplexity section. (Don't retrain). Ideally, I would find a way to clear the RAM but I have not yet :) 

In [None]:
# Score the fine-tuned model and log the result inside the local model repo.
# max_length=992 minus the default max_length_pattern of 32 gives 960-token windows.
# Google Drive variant:
# model_path = f"/content/gdrive/MyDrive/debiasing-model/{MODEL_ID}"
# ppl = perplexity(model = model_path, output_filename = f"{model_path}/perplexity.txt",max_length=992)
ppl = perplexity(
    output_filename=f"/content/{MODEL_ID}/perplexity.txt",
    model=model_path,
    max_length=992,
)

### Add the perplexity scores to the README.md

In [None]:
# Append the perplexity score and the dataset size to the model card, then push it.
# Alternative path: readme_file = f"/content/{MODEL_ID}/README.md"
readme_file = f"{model_path}/README.md"

# The pieces are written in order, exactly as listed.
readme_chunks = [
    '\n',
    '\n',
    '### Perplexity',
    '\n',
    f"Score: {str(ppl.item())}",
    '\n',
    '\n',
    '### Dataset Size',
    '\n',
    f"Size: {DATASET_SIZE}",
]
with open(readme_file, 'a') as fp:
    fp.writelines(readme_chunks)

# Commit and upload the updated README from the local repo clone.
repo = Repository(local_dir=model_path)
repo.push_to_hub(commit_message="Add perplexity score and dataset size")

### Generate continuations and push to hub

In [None]:
# Generate one continuation per prompt with the fine-tuned model, save them as
# JSON lines inside the local model repo, and push the file to the Hub.
PATH = "/content/gdrive/MyDrive/debiasing-model/rtp-nlp-data"
prompt_file = "rtp-prompts.txt"
prompt_path = f"{PATH}/{prompt_file}"

# The prompt file holds one JSON object per line. It is opened in a `with` block so it
# is closed after reading, and decoded explicitly as UTF-8; blank lines are skipped.
with open(prompt_path, 'r', encoding='utf-8') as prompt_fp:
    prompts = [json.loads(line) for line in prompt_fp if line.strip()]
N = len(prompts)
generator = pipeline('text-generation', model=model_path, device=0)
savepath = f"{model_path}/{MODEL_ID}-ft-continuations.txt"

print("Generating continuations for {}".format(MODEL))
with open(savepath, 'w', encoding='utf-8') as fp:
    for record in tqdm(prompts):
        prompt = record['prompt']['text']
        # return_full_text=False keeps only the newly generated text, without the prompt.
        continuation = generator(prompt, max_new_tokens = 20, num_return_sequences=1, return_full_text=False)[0]['generated_text'] 
        output = {"prompt": prompt, "continuation":continuation}
        json.dump(output, fp)
        fp.write('\n')

repo = Repository(local_dir=model_path)
repo.push_to_hub(commit_message="Add continuations")