# Test GPT2 Logic Expressions Generation

In [1]:
import json
import os

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

## Configure Paths & Load Model

In [2]:
# Configure device: use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('DEVICE: ', DEVICE)

GENERATIVE_MODEL_PATH = os.path.join('saved_models', 'generative')

# Configure paths to the GPT2 vocabulary (.json) and merges (.txt) files
# NOTE: For the BERT model, we were using a .txt vocabulary but for GPT2 we use .json
VOCABULARY_PATH = os.path.join('data', 'vocabulary.json')
MERGES_PATH = os.path.join('data', 'merges.txt')

# Get the vocabulary size from the number of entries in the JSON vocabulary.
# Counting text lines (as was done for the BERT .txt vocabulary) is wrong for JSON: a compactly
# serialized file is a single line. NOTE(review): assumes the file is the token -> id mapping
# that GPT2Tokenizer consumes as `vocab_file` -- confirm against how the vocabulary was saved.
# The context manager closes the file handle; UTF-8 is explicit because the vocabulary
# contains non-ASCII symbols.
with open(VOCABULARY_PATH, 'r', encoding='utf-8') as vocabulary_file:
    VOCABULARY_SIZE = len(json.load(vocabulary_file))

DEVICE:  cpu


### Regenerate Tokenizer for Decoding

In [3]:
# Rebuild the GPT2 tokenizer from the saved vocabulary and merges files,
# registering the special tokens used by the logic-expression data.
tokenizer = GPT2Tokenizer(
    vocab_file=VOCABULARY_PATH,
    merges_file=MERGES_PATH,
    errors='replace',
    # Special tokens
    bos_token='[BOS]',
    eos_token='[EOS]',
    unk_token='[UNK]',
    pad_token='[PAD]',
    sep_token='[SEP]',
    mask_token='[MASK]',
)

In [4]:
# Load the saved GPT2 language-modelling head model that is used below to generate expressions.
# `pad_token_id` is set to the tokenizer's EOS id.
model = GPT2LMHeadModel.from_pretrained(
    GENERATIVE_MODEL_PATH,
    pad_token_id=tokenizer.eos_token_id,
)
print('GPT2 Num. Parameters: %d' % model.num_parameters())
# Move the model onto the configured device
model = model.to(DEVICE)

GPT2 Num. Parameters: 2702208


## Decode!

### (1) Greedy Decoding

In [13]:
def greedy_decode(start_expression, split_token='[SEP]', max_length=150):
    """Generate a continuation of a prompt and split it into expressions.

    Encodes the prompt, calls `model.generate` with only `max_length` set,
    joins the generated tokens into one string and splits it on `split_token`.

    Args:
        start_expression: Prompt as a single token string or a list of token
            strings. A bare string is wrapped in a one-element list before
            being handed to `tokenizer.encode`.
        split_token: Separator on which the decoded string is split.
        max_length: Forwarded to `model.generate` as `max_length`.

    Returns:
        list of str: The pieces of the decoded string between occurrences of
        `split_token`. The prompt tokens are part of the first piece.
    """
    prompt_tokens = start_expression if isinstance(start_expression, list) else [start_expression]
    # The ids must live on the same device as the model; leaving them on the
    # CPU makes generation fail as soon as the model sits on a GPU.
    input_ids = tokenizer.encode(prompt_tokens, return_tensors='pt').to(model.device)
    greedy_output = model.generate(input_ids, max_length=max_length)
    decoded_tokens = ''.join(tokenizer.convert_ids_to_tokens(greedy_output[0].tolist()))
    return decoded_tokens.split(split_token)

In [14]:
greedy_decode('(')

['(((~p∨p)∨~p)∨(~p∨p))',
 '((~p∨p))',
 '(~p∨p))',
 '(~p)',
 '(~p∨p)',
 '(~(((~p∨p∨p)p)∧(~q∨p)∧(~q)∧(~q)∧(~(~q∨p)∧(~r∨p)))∧(~q))∧(~r∨p))))∧(~q)∧(~r∨p)∧(~q)',
 '(~r∨p)',
 '(~r)))',
 '(((']

In [15]:
greedy_decode('[BOS]')

['[BOS]((~p∨p)∨~p)∨(~p∨p)',
 '(~p∨p)',
 '(~p∨p)',
 '(~p)',
 '(~p∨p))',
 '(~q∨p)',
 '(~q∨p)',
 '(~((~q∨p)p∧(~q)∧(~q))∧(~(~q∨~q))))))',
 '(~(~q)',
 '(~q∨p)',
 '(~q)',
 '(~r∨p))',
 '(~(~(~q∨p))',
 '(~r∨p)))',
 '(~r∨']

In [16]:
greedy_decode('p')

['p((~p∨p)∨~p)∨(~p∨p)',
 '(~p∨p))',
 '(~p∨p)',
 '(~p∨p)',
 '(~q∨p))',
 '(~q∨p)',
 '(~q∨p)',
 '(~(~q∨p)∧(~q)∧(~q)∧(~(~q∨p∧(~q)∧(~q))))))',
 '(~q)',
 '(~r∨p∧[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]∧(~q)',
 '(~r∨p))',
 '(~r∨p))',
 '(~r']

In [17]:
greedy_decode('q')

['q((~q∨q)∨(~q∨q)∧(~q∨q))∧(~q∨q))',
 '(((~q∨q)∧(~q)∧(~q∨q)∧T))∧T)',
 '((~(~q∨q)∧T∧T))∧T)',
 '((~(~((~p∨q∨~p∨q)∧(~p∨q)∧(~r∨q)∧T)∧(~p)))∧((~p)∧(~r∧(~r∨q))∧(~r∨q)))))∧']

In [18]:
greedy_decode(['p', '∨', 'p'])

['p∨p∨(~p∨p)∨(~p∨p)',
 '(p∨p)',
 '(~p∨p))',
 '(~p∨p)',
 '(~p)',
 '(~p)',
 '(~p∨p)p)',
 '(~((~p∨p)p)∧(~q)∧(~q)∧(~~r∨p)∧[PAD][PAD][PAD][PAD][PAD][PAD][PAD]∧(~q)))',
 '(~q)',
 '(~(~r∨p)',
 '(~r)',
 '(~(~r∨p)',
 '(~r∨p))',
 '(~r)',
 '(~p)',
 '(']

In [19]:
greedy_decode(['T'])

['T((~p∨p)∨~p)∨(~p∨p)',
 '(~p∨p)',
 '(~p∨p)',
 '(~p∨p)',
 '(~q∨p)',
 '(~q∨p)',
 '(~q∨p)',
 '(~q∨p)',
 '(~q∨p)',
 '(~((~q∨p∧(~q)))))',
 '(~p)',
 '(~∧(~q∨~q)',
 '(~q))',
 '(~q)',
 '(~p))',
 '(~p)',
 '((((~q∨~q∨p∧(~p∧(~']

In [20]:
greedy_decode(['F'])  # NOTE: 'F' was never seen during training, so poor generations are expected here

['F((~p∨p)∨(~p∨p)∧(~p∨p))',
 '(((~p∨p)∧(~p)∧(~p∨p)))',
 '(~q∨p))',
 '(~(~q∨p)∧T∧T)))',
 '(~(~((~q∨p)∧T∧T)∧(~q)∧(~r∨p)∧(~r∨p)))∧(~r)∧(~r∨p))∧(~r))∧(~q)∧(~q))∧(~r∨p)))))',
 '']

### (2) Beam Search

In [21]:
def beam_search_decode(start_expression, split_token='[SEP]', max_length=150, num_beams=10, early_stopping=True):
    """Generate a continuation of a prompt with beam search and split it into expressions.

    Encodes the prompt, calls `model.generate` with the given beam-search
    settings, joins the tokens of the first returned sequence into one string
    and splits it on `split_token`.

    Args:
        start_expression: Prompt as a single token string or a list of token
            strings. A bare string is wrapped in a one-element list before
            being handed to `tokenizer.encode`.
        split_token: Separator on which the decoded string is split.
        max_length: Forwarded to `model.generate` as `max_length`.
        num_beams: Forwarded to `model.generate` as `num_beams`.
        early_stopping: Forwarded to `model.generate` as `early_stopping`.

    Returns:
        list of str: The pieces of the decoded string between occurrences of
        `split_token`. The prompt tokens are part of the first piece.
    """
    prompt_tokens = start_expression if isinstance(start_expression, list) else [start_expression]
    # The ids must live on the same device as the model; leaving them on the
    # CPU makes generation fail as soon as the model sits on a GPU.
    input_ids = tokenizer.encode(prompt_tokens, return_tensors='pt').to(model.device)
    beam_search_output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
    )
    decoded_tokens = ''.join(tokenizer.convert_ids_to_tokens(beam_search_output[0].tolist()))
    return decoded_tokens.split(split_token)

In [22]:
beam_search_decode('(')

['(((~q∨q)∧(~q∨q))∧(~q∨q))∧T)',
 '(T∧T)',
 '(T∧T)',
 '(T∧(T∧(~q∨~q∨q))∧T)',
 '(T)',
 '(T∧T)',
 '(T∧T)',
 '(T∧T)∧T)',
 '(T)∧(T)∧T)',
 '(T)',
 '(T)',
 '(T)',
 '(T∧T)',
 '(T)',
 '(T)',
 '(T∧(T)',
 '(T)',
 '(T)∧(T)',
 '(T)',
 '(T)',
 '(']

In [24]:
beam_search_decode('[BOS]')

['[BOS]((~q∨q)∧(~q∨q)∧(~q∨q))∧T)',
 '(T∧T)',
 '(T∧T)',
 '(T∧(~q∨q))',
 '(T∧(~q∨q)∧T))',
 '(T∧(~q∨q)∧T)',
 '(T)',
 '(T)',
 '(T∧[PAD][PAD][PAD][PAD][PAD][PAD][PAD]∧T)',
 '(T)',
 '(T)',
 '(T)∧T)',
 '(T)',
 '(T)',
 '(T∧T)',
 '(T)',
 '(T)',
 '(T)',
 '(T)',
 '(T)',
 '((']

#### TODOs
Look into other decoding approaches, such as (i) beam search with a no-repeat n-gram penalty, (ii) top-k sampling, and (iii) top-p (nucleus) sampling.