# setup

In [None]:
!pip install transformers
!pip install datasets

In [24]:
from transformers import OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer, BertTokenizer, BertForMaskedLM, logging
from datasets import load_dataset
import numpy as np
from numpy.random import randint
import torch
from torch import tensor
from torch.nn.functional import softmax
from time import time
import matplotlib.pyplot as plt

logging.set_verbosity_error()

In [None]:
wikitext2 = load_dataset('wikitext', 'wikitext-2-v1')
wikitext2 = [x['text'].strip() for x in wikitext2['test']]

# functions

In [4]:
# generate sequences with random length from 1-100
def get_gpt2_sequences(data, batch_size=256, seq_len=100):
    response = []
        
    tokenized = [tokenizer(x)['input_ids'] for x in data]
    tokenized = [x for x in tokenized if len(x) >= seq_len]
        
    while len(response) < batch_size:
        # pick a random line
        line = tokenized[randint(len(tokenized))]

        # get random start position
        start = 0 if len(line) == seq_len else randint(len(line) - seq_len)
        end = start + seq_len
        
        window = line[start:end]
        predict_index = randint(seq_len)
        replaced = window[predict_index]
        window = window[0:predict_index]
        
        if len(window) == 0:
            continue
        
        window, attention_mask = pad_sequence(window, seq_len)
        
        r = {
            'context_length': predict_index,
            'window': window,
            'replaced_token': replaced,
            'attention_mask': attention_mask
        }
        
        response.append(r)
    return response

In [5]:
def get_bert_sequences(data, batch_size=256, seq_len=100):
  MASK_TOKEN = 103
  response = []

  tokenized = [tokenizer(x)['input_ids'] for x in data]
  tokenized = [x for x in tokenized if len(x) >= seq_len]

  while len(response) < batch_size:
    # pick a random line
    line = tokenized[randint(len(tokenized))]

    start = 0 if len(line) == seq_len else randint(len(line) - seq_len)
    end = start + seq_len

    window = line[start:end]
    predict_index = randint(seq_len)
    actual_token = window[predict_index]
    window[predict_index] = MASK_TOKEN

    r = {
      'window': window,
      'replaced_token': actual_token,
      'position': predict_index
    }
    
    response.append(r)
  return response

In [27]:
def predict_gpt2(batch, model):
    actual = [x['replaced_token'] for x in batch]
    context_lengths = [x['context_length'] for x in batch]
    context = { 'input_ids': tensor([x['window'] for x in batch]),
                'attention_mask': tensor([x['attention_mask'] for x in batch])
                }
    with torch.no_grad():
      predicted = model.generate(**context, max_length=1)
      predicted = [x[-1].item() for x in predicted]
    
    return predicted, actual, context_lengths

In [7]:
def predict_bert(batch, model):
  actual = [x['replaced_token'] for x in batch]
  positions = [x['position'] for x in batch]
  position_mask = tensor([[x] for x in positions]).to('cuda')

  with torch.no_grad():
    predictions = bert(input_ids=tensor([x['window'] for x in batch]).to('cuda'))
    predictions = predictions[0].argmax(dim=-1)
    predictions = [x.item() for x in predictions.gather(1, position_mask)]

  return predictions, actual, positions

In [8]:
def pad_sequence(sequence, length=100, pad_token=50256):
    pad_len = length - len(sequence)
    padding = [pad_token] * pad_len
    attention_mask = [0] * pad_len + [1 for x in sequence]
    
    padded = padding + sequence
    return padded, attention_mask

In [9]:
def evaluate(model, sequence_getter, predict_fn, batch_size=256):
  all_predicted, all_actual, all_context_lengths = [], [], []
  epoch = 0
  while len(all_predicted) < 10_000:
    epoch += 1
    batch = sequence_getter(wikitext2, batch_size)
    predicted, actual, context_lengths = predict_fn(batch, model)
    all_predicted += predicted
    all_actual += actual
    all_context_lengths += context_lengths

    print('\r', f'epoch {epoch}\trunning accuracy: {np.mean([1 if x==y else 0 for x,y in zip(all_predicted, all_actual)])}', end='')

  return all_predicted, all_actual, all_context_lengths

# gpt2 evaluation

In [10]:
evaluation = {}

gpt_models = [('gpt2', 256), ('gpt2-medium', 128)]

for model_name, batch_size in gpt_models:
  print(f'{"#"*10} {model_name} {"#"*10}')
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  tokenizer.padding_side = "left"
  tokenizer.pad_token = tokenizer.eos_token

  model = GPT2LMHeadModel.from_pretrained(model_name).to('cuda')
  predicted, actual, lengths = evaluate(model, get_gpt2_sequences, predict_gpt2, batch_size)
  evaluation[model_name] = {'predicted': predicted, 'actual': actual, 'pos': lengths}

del model

 epoch 40	running accuracy: 0.3212890625

# bert evaluation

In [13]:
bert_models = [('bert-base-cased', 256), ('bert-base-cased', 256), ('bert-large-cased', 128), ('bert-large-uncased', 128)]

for model_name, batch_size in bert_models:
  tokenizer = BertTokenizer.from_pretrained(model_name)
  tokenizer.padding_side = "left"
  model = BertForMaskedLM.from_pretrained(model_name).to('cuda')
  predicted, actual, mask_positions = evaluate(model, get_bert_sequences, predict_bert)
  evaluation[model_name] = {'predicted':predicted, 'actual':actual, 'mask_positions': mask_positions}

del model

 epoch 40	running accuracy: 0.63720703125

# analysis

In [None]:
accuracies = {}
models = evaluation.keys()
for m in models:
    model_data = data[m]
    model_type = data[m].pop('type', None)
    model_accuracy = [[] for x in range(100)]

    for pred, act, length in [a for a in zip(*model_data.values())]:
        model_accuracy[length].append(1 if pred == act else 0)
    accuracies[m] = [np.mean(x) if len(x) > 0 else 0 for x in model_accuracy]

In [18]:
import json
with open('eval.json', 'w') as f:
  json.dump(accuracies, f)