# PART II: Calculating Perplexity and Generating Continuations
Created March 20, 2022 | Last Updated: March 20, 2022

This is the Part II notebook (Part I covers fine-tuning). It calculates the perplexity (PPL) of a fine-tuned model and generates that model's continuations for the challenging dataset. Both the perplexity score and the continuations are pushed to the Hugging Face Hub. 

### Before running the notebook: 
- Ensure you have changed your runtime to use the GPU and have High-RAM enabled to load GPT2-XL




In [None]:
token = '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'
! git clone https://$token@github.com/beston91/debiasing_model.git

!pip install transformers
!pip install datasets
!pip install huggingface_hub
!pip install nlp
!apt install git-lfs

import json
import re
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast, pipeline, Trainer, TrainingArguments
from huggingface_hub import notebook_login, Repository
from sklearn.model_selection import train_test_split
from datasets import load_dataset as hf_load_dataset
from nlp import load_dataset
from tqdm import tqdm
from typing import List, Optional, Tuple
from torch.nn import CrossEntropyLoss
import sys

from google.colab import drive
# Mount Google Drive inside the Colab VM.
drive.mount('/content/gdrive')

# Authenticate with the Hugging Face Hub. notebook_login() asks for the access
# token interactively, so no token is kept in the notebook.
# NOTE(review): the token that used to be hard-coded here was never read by
# any code and is exposed; revoke it in the Hugging Face account settings.
notebook_login()

### Setup paths

Enter the Hugging Face repository name, in `user/model` form, in the cell below.

In [None]:
# Hugging Face repository to evaluate, written as "<user>/<model>".
huggingface_repo = "beston91/gpt2-xl_ft_logits_25k"
# Alternative checkpoint:
# huggingface_repo = "IsaacSST/gpt2-xl-ft-d4-0.15-n-3"

# Split the repository name into its owner and model parts; the model part
# is also used as the local checkout directory.
user, MODEL_ID = huggingface_repo.split("/")
model_path = MODEL_ID

# Clone the repository into `model_path` and pull the latest revision.
repo = Repository(local_dir=model_path, clone_from=huggingface_repo)
repo.git_pull()

### Calculate the perplexity

In [None]:
# import modeling file
# Put the `self-debiasing-timo` directory of the cloned debiasing_model
# repository on the module search path so that `modeling` can be imported.
# NOTE(review): assumes the repository was cloned under /content -- confirm.
sys.path.insert(1, '/content/debiasing_model/self-debiasing-timo')
from modeling import ModelWrapper

In [None]:
class GPT2Wrapper(ModelWrapper):
    """ModelWrapper implementation backed by a GPT-2 language-model head.

    Only the plain (non self-debiasing) operations are implemented; the
    self-debiasing variants are placeholders that raise NotImplementedError.
    """

    def __init__(self, model_name: str = "newtonkwan/gpt2-xl-ft-with-non-challenging-10k", tokenizer: Optional[GPT2Tokenizer] = None, use_cuda: bool = True):
        """
        :param model_name: name or path of the pretrained GPT2 model to load
            (default: "newtonkwan/gpt2-xl-ft-with-non-challenging-10k")
        :param tokenizer: tokenizer to use; when omitted, the "gpt2-xl"
            tokenizer is loaded
        :param use_cuda: whether to use CUDA
        """
        super().__init__(use_cuda=use_cuda)
        # Build the default tokenizer here rather than in the signature: a
        # default argument is evaluated once when the class is defined, which
        # would load the tokenizer at definition time and share the same
        # object (mutated below) between every instance.
        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
        self._tokenizer = tokenizer
        self._model = GPT2LMHeadModel.from_pretrained(model_name)
        if use_cuda:
            self._model.parallelize()
        # GPT-2 has no dedicated padding token, so reuse end-of-sequence.
        self._tokenizer.pad_token = self._tokenizer.eos_token
        self._model.config.pad_token_id = self._tokenizer.eos_token_id

    def query_model_batch(self, input_texts: List[str]):
        """Return the logits at the last non-padded position of each input text.

        :param input_texts: the texts to feed to the model as one padded batch
        :return: a tensor with one row of vocabulary logits per input text
        """
        inputs = self._tokenizer.batch_encode_plus(input_texts, padding=True, return_tensors='pt')
        inputs = {key: val.to(self._device) for key, val in inputs.items()}
        # Index of the last attended token of every example.
        # NOTE(review): this assumes padding is appended on the right -- confirm
        # against the tokenizer's padding side.
        output_indices = inputs['attention_mask'].sum(dim=1) - 1
        output = self._model(**inputs)['logits']
        return torch.stack([output[example_idx, last_word_idx, :] for example_idx, last_word_idx in enumerate(output_indices)])

    def generate(self, input_text: str, **kwargs):
        """Generate from ``input_text`` and return the decoded first sequence.

        :param input_text: the text to encode and pass to the model
        :param kwargs: forwarded unchanged to the model's ``generate`` method
        :return: the decoded first generated sequence
        """
        input_ids = self._tokenizer.encode(input_text, return_tensors='pt').to(self._device)
        generated_output = self._model.generate(input_ids, **kwargs)
        output_ids = generated_output[0]
        return self._tokenizer.decode(output_ids)

    def compute_loss(self, input_ids: torch.LongTensor, labels: torch.LongTensor) -> torch.Tensor:
        """Compute the mean next-token cross-entropy of ``labels`` given ``input_ids``.

        :param input_ids: token ids fed to the model
        :param labels: target token ids; positions set to -100 are skipped
            because that is CrossEntropyLoss's default ``ignore_index``
        :return: scalar loss tensor
        """
        outputs = self._model(input_ids, labels=labels)
        # With labels supplied, element 0 is the model's own loss and
        # element 1 holds the logits.
        lm_logits = outputs[1]

        # Shift so that tokens < n predict n
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return loss

    # NOTE(review): the two methods below are placeholders, presumably present
    # to satisfy the ModelWrapper interface -- confirm. They accept `self` and
    # arbitrary arguments so that a call on an instance fails with a clear
    # NotImplementedError rather than a confusing TypeError.
    def generate_self_debiasing(self, *args, **kwargs):
        """Placeholder: self-debiasing generation is not implemented."""
        raise NotImplementedError("GPT2Wrapper does not implement generate_self_debiasing")

    def compute_loss_self_debiasing(self, *args, **kwargs):
        """Placeholder: self-debiasing loss computation is not implemented."""
        raise NotImplementedError("GPT2Wrapper does not implement compute_loss_self_debiasing")


In [None]:
def perplexity(output_filename, model='gpt2-xl', epsilon=0.01, max_length=-1, max_length_pattern=32, stride=-1, no_cuda=False, debug=False):
    """Compute a model's perplexity on the WikiText-2 (raw) test split.

    The tokenized test set is scored in windows with a sliding-window scheme.
    The running and final perplexity are printed, and the final value is
    appended to ``output_filename``.

    :param output_filename: file the result is appended to
    :param model: model name or path, used for both the tokenizer and the model
    :param epsilon: unused; kept for interface compatibility
    :param max_length: maximum window length in tokens; a non-positive value
        selects the model's ``n_positions``
    :param max_length_pattern: number of tokens subtracted from ``max_length``
        to obtain the effective window length
    :param stride: step between consecutive windows; a non-positive value
        selects the effective window length (non-overlapping windows)
    :param no_cuda: when true, the wrapper is created with ``use_cuda=False``
    :param debug: unused; kept for interface compatibility
    :return: the final perplexity as a tensor, or ``None`` if no window was scored
    :raises ValueError: if the effective window length is not positive
    """
    tokenizer = GPT2TokenizerFast.from_pretrained(model)
    wrapper = GPT2Wrapper(model, use_cuda=not no_cuda)
    # Send inputs to the device the wrapper uses in its own methods instead
    # of hard-coding 'cuda' here.
    device = wrapper._device

    test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')
    num_tokens = encodings.input_ids.size(1)

    max_length = (max_length if max_length > 0 else wrapper._model.config.n_positions) - max_length_pattern
    if max_length <= 0:
        raise ValueError(
            f'effective window length must be positive, got {max_length} '
            f'(max_length_pattern={max_length_pattern})'
        )

    if stride <= 0:
        stride = max_length

    lls = []
    ppl = None

    for i in tqdm(range(0, num_tokens, stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, num_tokens)
        trg_len = end_loc - i  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        if input_ids.size(1) < 2:
            # A one-token window has no next-token target: the shifted labels
            # are empty and the mean loss would be NaN, which would turn the
            # whole result into NaN.
            continue
        target_ids = input_ids.clone()
        # Only the last trg_len tokens are scored; earlier ones are context.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            loss = wrapper.compute_loss(input_ids, labels=target_ids)
            # The loss is a mean, so scale it back to a per-window total.
            log_likelihood = loss * trg_len

        lls.append(log_likelihood)

        ppl = torch.exp(torch.stack(lls).sum() / end_loc)

        print(f'Perplexity  after {i} tokens: {ppl}')

    print(f'Final perplexity: {ppl}')

    with open(output_filename, 'a', encoding='utf8') as fh:
        fh.write(f'=== RESULT [{model}] ===\n')
        fh.write(f'Perplexity:  {ppl}\n\n')

    return ppl

In [None]:
# Score the model checked out at `model_path`; the result is also appended to
# /content/<MODEL_ID>/perplexity.txt.
ppl = perplexity(
    output_filename=f"/content/{MODEL_ID}/perplexity.txt",
    model=model_path,
    max_length=992,
)

In [None]:
# Append a "Perplexity" section containing the score to the model's README.md,
# then push the change to the Hub.
readme_file = f"{MODEL_ID}/README.md"
# Build the whole section before opening the file so that a failure while
# reading the score cannot leave a dangling heading in the README.
perplexity_section = "\n".join(["", "", "### Perplexity", f"Score: {ppl.item()}"])
with open(readme_file, 'a', encoding='utf8') as fp:
    fp.write(perplexity_section)

repo.push_to_hub(commit_message="Add perplexity score")

## Generate continuations

In [None]:
# Generate continuations and push to huggingface hub
PATH = "/content/debiasing_model/model-input/prompts"
prompt_file = "rtp-prompts.txt"
prompt_path = f"{PATH}/{prompt_file}"
savepath = f"/content/{MODEL_ID}/{MODEL_ID}-ft-continuations.txt"

# The prompt file holds one JSON record per line. Read it inside a context
# manager so the handle is closed, and skip blank lines, which json.loads
# would reject.
with open(prompt_path, 'r', encoding='utf8') as prompt_fh:
    prompts = [json.loads(line) for line in prompt_fh if line.strip()]
N = len(prompts)
generator = pipeline('text-generation', model=model_path, device=0)
print("Generating continuations for {}".format(MODEL_ID))
# Write one JSON object per line: the prompt text and its generated continuation.
with open(savepath, 'w', encoding='utf8') as fp:
    for record in tqdm(prompts):
        prompt = record['prompt']['text']
        # return_full_text=False keeps only the newly generated text.
        continuation = generator(prompt, max_new_tokens=20, num_return_sequences=1, return_full_text=False)[0]['generated_text']
        output = {"prompt": prompt, "continuation": continuation}
        json.dump(output, fp)
        fp.write('\n')

repo.push_to_hub(commit_message="Add continuations")