In [2]:
# %matplotlib inline

import os
import sys
sys.path.append('../examples')

import torch
import torch.nn.functional as F
import numpy as np

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

import generate_with_calibration as gencal
# import calibrate as cal

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [3]:
def set_seed(seed=42, n_gpu=0):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator touched here.
        n_gpu: Number of CUDA devices in use. When positive, the CUDA
            generators on all devices are seeded as well.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        # Use the function's own `seed` argument; there is no `args` object
        # in this notebook, so referencing `args.seed` raised NameError.
        torch.cuda.manual_seed_all(seed)

In [5]:
# Setup: choose the compute device, load GPT-2, and tokenize the prompt.

n_gpus = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed before loading the model so the cell behaves the same on every run.
set_seed()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
model.eval()

MAX_LENGTH = 10000  # fallback cap when the model reports no position limit
length = 100        # requested number of tokens to generate

# Clamp `length` to the model's positional limit when one is defined:
#   * a negative request means "as long as the model allows";
#   * a request beyond the limit is truncated to the limit;
#   * with no positive limit, a negative request falls back to MAX_LENGTH.
max_positions = model.config.max_position_embeddings
if max_positions > 0 and (length < 0 or length > max_positions):
    length = max_positions
elif length < 0:
    length = MAX_LENGTH

vocab_size = tokenizer.vocab_size
raw_text = (
    'In a shocking finding, scientist discovered a herd of unicorns living '
    'in a remote, previously unexplored valley, in the Andes Mountains. '
    'Even more surprising to the researchers was the fact that the unicorns '
    'spoke perfect English.'
)
context_tokens = tokenizer.encode(raw_text)

print(f'VOCAB SIZE: {vocab_size}')
print(f'DEVICE: {device}')
print(f'N GPUS: {n_gpus}')

# Names defined by this cell: device, n_gpus, tokenizer, model, MAX_LENGTH,
# length, max_positions, vocab_size, raw_text, context_tokens

11/22/2019 17:08:32 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/michael/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
11/22/2019 17:08:32 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/michael/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
11/22/2019 17:08:33 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /home/michael/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d2

VOCAB SIZE: 50257
DEVICE: cpu
N GPUS: 0


In [6]:
# Display the token ids that tokenizer.encode produced for raw_text.
context_tokens

[818,
 257,
 14702,
 4917,
 11,
 11444,
 5071,
 257,
 27638,
 286,
 28000,
 19942,
 2877,
 287,
 257,
 6569,
 11,
 4271,
 31286,
 1850,
 19272,
 11,
 287,
 262,
 843,
 274,
 21124,
 13,
 3412,
 517,
 6452,
 284,
 262,
 4837,
 373,
 262,
 1109,
 326,
 262,
 28000,
 19942,
 5158,
 2818,
 3594,
 13]

In [None]:
# alpha = cal.calibrate()

In [None]:
# Draw a batch of continuations of the prompt with the calibrated sampler.
# TODO: adjust batch size param
# TODO: implement calibration
# NOTE(review): `out` / `ents` are presumably the sampled token ids and the
# associated entropies -- confirm against generate_with_calibration.
out, ents = gencal.sample_sequence_calibrated(
    model=model,
    context=context_tokens,
    length=length,
    batch_size=256,
    alpha=0.01,  # change this
    device=device,
)