In [1]:
import sys, os, re, json, random
import time
from importlib import reload

from typing import List, Dict

import numpy as np
import pandas as pd

import tensorflow as tf
print("tensorflow", tf.__version__)

import torch
print("pytorch", torch.__version__)

  from ._conv import register_converters as _register_converters


tensorflow 1.8.0
pytorch 0.4.0


Additional dependencies (install with `conda`):
- `ftfy`
- `spacy`

Also download spaCy `en` model with:
```
python -m spacy download en
```
(see https://spacy.io/usage/models)

In [2]:
import ftfy
print("ftfy", ftfy.__version__)
import spacy
print("spacy", spacy.__version__)

ftfy 5.4.1
spacy 2.0.11


# Tokenize text and apply BPE

The `TextEncoder` object handles tokenization and applying BPE to raw text, giving a list of IDs. See https://github.com/openai/finetune-transformer-lm/blob/master/text_utils.py and https://github.com/openai/finetune-transformer-lm/blob/master/utils.py#L14

In order to align with original text tokenization, we probably want a two-stage process:
1. Recover the processed spaCy tokens by a reverse lookup, and project annotations to spaCy tokenization.
2. Project annotations from the spaCy tokenization to the BPE pieces.

In [3]:
from src.openai_transformer_lm import utils as openai_utils
reload(openai_utils)

<module 'src.openai_transformer_lm.utils' from '/nfs/jsalt/home/iftenney/jiant/src/openai_transformer_lm/utils.py'>

In [4]:
# Running example: a sentence containing a word ("woodchuck") that BPE splits into two pieces.
text = "how much wood would a woodchuck chuck"
# encode() is given a list of strings; the displayed result is a list holding one ID list.
# `e` is reused by later cells as the model input.
e = openai_utils.encode([text])
e

                                                                                

[[718, 889, 2510, 636, 246, 8210, 7961, 7961]]

In [5]:
# Show the BPE pieces for the same sentence; a trailing "</w>" marks a piece that ends a word.
openai_utils.tokenize(text)

                                                                                

['how</w>',
 'much</w>',
 'wood</w>',
 'would</w>',
 'a</w>',
 'wood',
 'chuck</w>',
 'chuck</w>']

In [6]:
# Directory of the openai_utils module itself.
openai_dir = os.path.dirname(openai_utils.__file__)
print('openai_dir', openai_dir)
# Directory that later cells read the pre-trained parameter files from.
openai_data_dir = openai_utils.OPENAI_DATA_DIR
print('openai_data_dir', openai_data_dir)

openai_dir /nfs/jsalt/home/iftenney/jiant/src/openai_transformer_lm
openai_data_dir /nfs/jsalt/home/iftenney/jiant/src/openai_transformer_lm/tf_original/model


In [7]:
# Map the IDs back to BPE pieces (word-boundary markers are kept).
list(openai_utils.decode_partial(e))

[['how</w>',
  'much</w>',
  'wood</w>',
  'would</w>',
  'a</w>',
  'wood',
  'chuck</w>',
  'chuck</w>']]

In [8]:
# Map the IDs back to plain text; for this sentence the original string is recovered.
list(openai_utils.decode_full(e))

['how much wood would a woodchuck chuck']

For their model, they process the ID list by adding start and end IDs, `encoder['_start_']` and `encoder['_delimiter_']`, then pad with `clf_token = encoder['_classify_']`. Each of these is assigned `len(encoder)` at the moment it is added, so they take consecutive IDs starting at `n_vocab = len(encoder)`.

In [9]:
# NOTE(review): this bare expression is evaluated but never shown -- a cell only echoes
# its last expression. Wrap it in print() if the value is wanted.
openai_utils.N_VOCAB
# Decode an arbitrary ID sequence back to BPE pieces.
list(openai_utils.decode_partial([[246, 6264, 4144]]))

[['a</w>', 'recent</w>', 'report</w>']]

# Project Annotations

In [10]:
from src import utils
from src import retokenize

# NOTE(review): absolute, site-specific path -- this cell will not run on another
# machine without editing it.
fname = "/nfs/jsalt/share/glue_data/edges/spr2/train.edges.json"
# Materialize the loaded records into a list so they can be indexed (e.g. records[10] below).
records = list(utils.load_json_data(fname))

In [11]:
def space_tokenize_with_eow(sentence):
    """Split on whitespace and tag every token with a trailing "</w>".

    The end-of-word marker is the same one carried by the BPE pieces, so the
    two tokenizations can be aligned at word boundaries.

    Args:
        sentence: raw text string.

    Returns:
        List of strings, one per whitespace-separated token, each ending in
        "</w>". Empty for empty or all-whitespace input.
    """
    marked_tokens = []
    for word in sentence.split():
        marked_tokens.append(word + "</w>")
    return marked_tokens

In [12]:
# Align whitespace tokens (source side) with BPE pieces (target side) for the toy sentence.
text = "how much wood would a woodchuck chuck"
ta = retokenize.TokenAligner(space_tokenize_with_eow(text), openai_utils.tokenize(text))
# Printing the aligner lists, for each source token index, the target indices it maps to.
print(ta)

                                                                                

TokenAligner(7, 8):
  0 -> [0]
  1 -> [1]
  2 -> [2]
  3 -> [3]
  4 -> [4]
  5 -> [5, 6]
  6 -> [7]





In [13]:
# Same alignment on a real example from the loaded records.
record = records[10]
# NOTE: rebinds the notebook-global `text` (and `ta`) used by the toy example above.
text = record['text']
ta = retokenize.TokenAligner(space_tokenize_with_eow(text), openai_utils.tokenize(text))
print(ta)

                                                                                

TokenAligner(17, 19):
  0 -> [0]
  1 -> [1]
  2 -> [2]
  3 -> [3, 4]
  4 -> [5]
  5 -> [6]
  6 -> [7]
  7 -> [8]
  8 -> [9]
  9 -> [10]
  10 -> [11]
  11 -> [12, 13]
  12 -> [14]
  13 -> [15]
  14 -> [16]
  15 -> [17]
  16 -> [18]





In [14]:
# BPE pieces for the record text, for comparison with the alignment printed above.
openai_utils.tokenize(text)

                                                                                

['i</w>',
 'have</w>',
 'a</w>',
 'pre',
 'order</w>',
 'and</w>',
 'am</w>',
 'even</w>',
 'considering</w>',
 'getting</w>',
 'a</w>',
 'second</w>',
 'pre',
 'order</w>',
 'to</w>',
 'have</w>',
 'multiple</w>',
 'accounts</w>',
 '.</w>']

# OpenAI TensorFlow Model

Adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py#L163 to just export weights

In [15]:
from src.openai_transformer_lm import transformer_tf_simplified
reload(transformer_tf_simplified)
from src.openai_transformer_lm.tf_original import utils as openai_tf_utils
# Sanity check: the tokenizer utilities and the TF model must agree on vocabulary size.
assert openai_utils.N_VOCAB == transformer_tf_simplified.n_vocab

# Files holding the pre-trained parameters (shape manifest + numbered .npy shards).
SHAPES_FILE = os.path.join(openai_data_dir, "params_shapes.json")
PARAMS_FILE_TMPL = os.path.join(openai_data_dir, "params_{}.npy")

seed = 42
random.seed(seed)
np.random.seed(seed)

with tf.Graph().as_default():
    # tf.set_random_seed() seeds the *current default graph*. It must therefore be
    # called inside this context; calling it before entering would seed the global
    # default graph and leave the graph built here unseeded.
    tf.set_random_seed(seed)

    # Input placeholder: [batch, n_ctx, 2].
    # NOTE(review): the trailing axis of size 2 presumably carries (token id,
    # position id) -- confirm against openai_utils.prep_ids.
    X_in = tf.placeholder(tf.int32, [None, transformer_tf_simplified.n_ctx, 2])
    h = transformer_tf_simplified.model_abbreviated(tf.expand_dims(X_in, 1))

    params = openai_tf_utils.find_trainable_variables("model")
    print("Running initializer...")
    # Use the session as a context manager so its resources are released once the
    # activations have been fetched; only the NumPy result `h_val` is needed later.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        # Overwrite the freshly initialized variables with the pre-trained weights.
        transformer_tf_simplified.load_params(sess, params, SHAPES_FILE, PARAMS_FILE_TMPL)

        # Forward pass on the encoded example sentence `e`.
        h_val = sess.run(h, {X_in:openai_utils.prep_ids(e)})

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Running initializer...
Loading pre-trained params...
Assigning pre-trained params...
Done!


In [16]:
# Shape of the TF activations; presumably (batch, n_ctx, n_embd) -- TODO confirm axis meaning.
h_val.shape

(1, 512, 768)

In [17]:
# Peek at the raw TF activations (compared against the PyTorch output further down).
h_val

array([[[ 0.16202518, -0.00763676,  0.14871871, ...,  0.17555687,
         -0.0820915 ,  0.15243863],
        [-0.22960284, -0.7131108 ,  0.25970727, ...,  0.29409307,
         -0.28634194, -0.3005669 ],
        [-0.33067906, -0.11411758,  0.37495106, ..., -0.23809306,
         -1.2012712 , -0.27857336],
        ...,
        [ 0.50163245,  0.84288955,  0.8027246 , ..., -0.03076262,
          1.1621311 ,  0.7543984 ],
        [ 0.51369536,  0.7608172 ,  0.8307392 , ..., -0.05948838,
          1.0978718 ,  0.6896318 ],
        [ 0.47562498,  0.79977787,  0.8108253 , ..., -0.0525886 ,
          1.1286504 ,  0.7206507 ]]], dtype=float32)

# Huggingface PyTorch port

Code from https://github.com/huggingface/pytorch-openai-transformer-lm

In [18]:
from src.openai_transformer_lm.pytorch_huggingface import model_pytorch
reload(model_pytorch)

# Seed the Python, NumPy and PyTorch RNGs.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

args = model_pytorch.DEFAULT_CONFIG
n_special = transformer_tf_simplified.n_special
# NOTE(review): 40990 is a magic number -- presumably BPE vocab size + number of
# positions; the resulting embedding table has 40993 rows (see the parameter listing
# below). Consider deriving it from named constants -- TODO confirm.
model = model_pytorch.TransformerModel(args, vocab=40990+n_special)
loader_args = dict(n_special=n_special)
# Both locations are passed with a trailing "/" -- presumably the loader concatenates
# file names onto these prefixes; verify against load_openai_pretrained_model.
loader_args['path'] = openai_data_dir + "/"
loader_args['path_names'] = os.path.dirname(model_pytorch.__file__) + "/"
model_pytorch.load_openai_pretrained_model(model, **loader_args)

Loading weights...


In [19]:
# Show the hyperparameters the PyTorch model was built with (model_pytorch.DEFAULT_CONFIG).
args

{'n_embd': 768,
 'n_head': 12,
 'n_layer': 12,
 'embd_pdrop': 0.1,
 'attn_pdrop': 0.1,
 'resid_pdrop': 0.1,
 'afn': 'gelu',
 'clf_pdrop': 0.1}

In [20]:
# Scratch check of tensor shapes: tile arange(16) into 4 rows...
t = torch.arange(16).repeat(4,1)
# NOTE(review): result discarded -- only the last expression of a cell is displayed.
t.size()[0]
# ...then stack two copies along a new last axis, giving size [4, 16, 2].
torch.stack([t,t], dim=2).size()

torch.Size([4, 16, 2])

In [21]:
# Switch the model to evaluation mode before the forward pass.
model.eval()
# Same prepared input as the TF run, wrapped in a LongTensor.
ids = torch.LongTensor(openai_utils.prep_ids(e))
# Forward pass; detach from autograd and convert to NumPy for comparison with h_val.
h_val_pytorch = model(ids).detach().numpy()

In [22]:
# Should match h_val.shape from the TensorFlow run.
h_val_pytorch.shape

(1, 512, 768)

In [23]:
# Peek at the raw PyTorch activations for an eyeball comparison with the TF values.
h_val_pytorch

array([[[ 0.1620266 , -0.00763825,  0.14871885, ...,  0.17555933,
         -0.08209433,  0.15244102],
        [-0.22960448, -0.71311027,  0.25970703, ...,  0.2940926 ,
         -0.28634158, -0.3005667 ],
        [-0.33067977, -0.11411723,  0.37495226, ..., -0.23809452,
         -1.2012721 , -0.2785728 ],
        ...,
        [ 0.50163686,  0.8428891 ,  0.80272746, ..., -0.03076063,
          1.1621205 ,  0.7543992 ],
        [ 0.51369643,  0.76081246,  0.8307426 , ..., -0.05948911,
          1.0978615 ,  0.6896316 ],
        [ 0.47562388,  0.7997696 ,  0.81082684, ..., -0.05258931,
          1.1286354 ,  0.7206483 ]]], dtype=float32)

In [24]:
# Root-mean-square difference between the TF and PyTorch activations, over all elements.
np.sqrt(np.mean((h_val - h_val_pytorch)**2.0))

3.0493553e-05

Woohoo! Looks like the PyTorch implementation loads the weights correctly and matches the original implementation.

# TensorFlow Checkpoints

Inspect these and see if we can load the weights into the PyTorch model.

In [25]:
# Alternative checkpoint (disabled); swap with the line below to inspect it instead.
# ckpt_path = "/nfs/jsalt/home/iftenney/checkpoints/bwb_shuffled/model.ckpt-1000000"
# NOTE(review): absolute, site-specific path.
ckpt_path = "/nfs/jsalt/home/iftenney/checkpoints/orig_openai_checkpoint/init_model.ckpt"
# List (variable name, shape) pairs stored in the checkpoint.
tf.train.list_variables(ckpt_path)

[('model/h0/attn/c_attn/b', [2304]),
 ('model/h0/attn/c_attn/w', [1, 768, 2304]),
 ('model/h0/attn/c_proj/b', [768]),
 ('model/h0/attn/c_proj/w', [1, 768, 768]),
 ('model/h0/ln_1/b', [768]),
 ('model/h0/ln_1/g', [768]),
 ('model/h0/ln_2/b', [768]),
 ('model/h0/ln_2/g', [768]),
 ('model/h0/mlp/c_fc/b', [3072]),
 ('model/h0/mlp/c_fc/w', [1, 768, 3072]),
 ('model/h0/mlp/c_proj/b', [768]),
 ('model/h0/mlp/c_proj/w', [1, 3072, 768]),
 ('model/h1/attn/c_attn/b', [2304]),
 ('model/h1/attn/c_attn/w', [1, 768, 2304]),
 ('model/h1/attn/c_proj/b', [768]),
 ('model/h1/attn/c_proj/w', [1, 768, 768]),
 ('model/h1/ln_1/b', [768]),
 ('model/h1/ln_1/g', [768]),
 ('model/h1/ln_2/b', [768]),
 ('model/h1/ln_2/g', [768]),
 ('model/h1/mlp/c_fc/b', [3072]),
 ('model/h1/mlp/c_fc/w', [1, 768, 3072]),
 ('model/h1/mlp/c_proj/b', [768]),
 ('model/h1/mlp/c_proj/w', [1, 3072, 768]),
 ('model/h10/attn/c_attn/b', [2304]),
 ('model/h10/attn/c_attn/w', [1, 768, 2304]),
 ('model/h10/attn/c_proj/b', [768]),
 ('model/h10/

In [26]:
# Scratch arithmetic: the sum equals the row count of embed.weight (40993).
# NOTE(review): presumably (vocab + special tokens) + number of positions -- TODO confirm.
40481 + 512

40993

In [27]:
# Print every PyTorch parameter name with its shape, for comparison with the
# checkpoint variable listing above.
# NOTE: the loop variables `name` and `p` remain bound after the loop.
for name, p in model.named_parameters():
    print(name, list(p.shape))

embed.weight [40993, 768]
h.0.attn.c_attn.w [768, 2304]
h.0.attn.c_attn.b [2304]
h.0.attn.c_proj.w [768, 768]
h.0.attn.c_proj.b [768]
h.0.ln_1.g [768]
h.0.ln_1.b [768]
h.0.mlp.c_fc.w [768, 3072]
h.0.mlp.c_fc.b [3072]
h.0.mlp.c_proj.w [3072, 768]
h.0.mlp.c_proj.b [768]
h.0.ln_2.g [768]
h.0.ln_2.b [768]
h.1.attn.c_attn.w [768, 2304]
h.1.attn.c_attn.b [2304]
h.1.attn.c_proj.w [768, 768]
h.1.attn.c_proj.b [768]
h.1.ln_1.g [768]
h.1.ln_1.b [768]
h.1.mlp.c_fc.w [768, 3072]
h.1.mlp.c_fc.b [3072]
h.1.mlp.c_proj.w [3072, 768]
h.1.mlp.c_proj.b [768]
h.1.ln_2.g [768]
h.1.ln_2.b [768]
h.2.attn.c_attn.w [768, 2304]
h.2.attn.c_attn.b [2304]
h.2.attn.c_proj.w [768, 768]
h.2.attn.c_proj.b [768]
h.2.ln_1.g [768]
h.2.ln_1.b [768]
h.2.mlp.c_fc.w [768, 3072]
h.2.mlp.c_fc.b [3072]
h.2.mlp.c_proj.w [3072, 768]
h.2.mlp.c_proj.b [768]
h.2.ln_2.g [768]
h.2.ln_2.b [768]
h.3.attn.c_attn.w [768, 2304]
h.3.attn.c_attn.b [2304]
h.3.attn.c_proj.w [768, 768]
h.3.attn.c_proj.b [768]
h.3.ln_1.g [768]
h.3.ln_1.b [768]
h

Verify that converted checkpoint weights match the directly-loaded weights in the PyTorch model:

In [31]:
# NOTE(review): this cell was executed out of order (In [31], after In [28]) and
# depends on `p` left over from a parameter loop; it will not reproduce the same
# result under Restart & Run All in its current position.
# `p.shape == (768,)` compares a torch.Size with a tuple, yielding a single bool.
np.all(p.shape == (768,))

True

In [28]:
# Check every PyTorch parameter against the corresponding TensorFlow checkpoint variable.
for name, p in model.named_parameters():
    path = name.split(".")
    if path[0] == "h":
        # Transformer block parameter: "h.<i>.<rest...>" -> "model/h<i>/<rest/...>".
        tf_name = f"model/{path[0]}{path[1]}/" + "/".join(path[2:])
        print(f"{name} -> {tf_name}")
        var_np = tf.train.load_variable(ckpt_path, tf_name)
    elif name == "embed.weight":
        # The single PyTorch embedding table corresponds to two checkpoint variables,
        # concatenated along the row axis.
        tf_names = ["model/we", "model/pe"]
        print(f"{name} -> {tf_names}")
        vars_np = [tf.train.load_variable(ckpt_path, tf_name) for tf_name in tf_names]
        var_np = np.concatenate(vars_np, axis=0)
    else:
        raise ValueError(f"Unrecognized name: {name}")
    print(p.shape, var_np.shape)

    p_np = p.detach().numpy()
    # Checkpoint weight matrices carry a leading singleton axis (e.g. (1, 768, 2304)
    # vs. (768, 2304)). Strip it explicitly rather than relying on NumPy broadcasting,
    # which would also silently accept other broadcast-compatible shape mismatches.
    if var_np.ndim == p_np.ndim + 1 and var_np.shape[0] == 1:
        var_np = var_np[0]
    assert p_np.shape == var_np.shape, (
        f"Shape mismatch for {name}: {p_np.shape} vs {var_np.shape}")

    delta = np.abs(p_np - var_np)
    assert np.max(delta) < 1e-5, (
        f"Value mismatch for {name}: max abs diff {np.max(delta)}")

embed.weight -> ['model/we', 'model/pe']
torch.Size([40993, 768]) (40993, 768)
h.0.attn.c_attn.w -> model/h0/attn/c_attn/w
torch.Size([768, 2304]) (1, 768, 2304)
h.0.attn.c_attn.b -> model/h0/attn/c_attn/b
torch.Size([2304]) (2304,)
h.0.attn.c_proj.w -> model/h0/attn/c_proj/w
torch.Size([768, 768]) (1, 768, 768)
h.0.attn.c_proj.b -> model/h0/attn/c_proj/b
torch.Size([768]) (768,)
h.0.ln_1.g -> model/h0/ln_1/g
torch.Size([768]) (768,)
h.0.ln_1.b -> model/h0/ln_1/b
torch.Size([768]) (768,)
h.0.mlp.c_fc.w -> model/h0/mlp/c_fc/w
torch.Size([768, 3072]) (1, 768, 3072)
h.0.mlp.c_fc.b -> model/h0/mlp/c_fc/b
torch.Size([3072]) (3072,)
h.0.mlp.c_proj.w -> model/h0/mlp/c_proj/w
torch.Size([3072, 768]) (1, 3072, 768)
h.0.mlp.c_proj.b -> model/h0/mlp/c_proj/b
torch.Size([768]) (768,)
h.0.ln_2.g -> model/h0/ln_2/g
torch.Size([768]) (768,)
h.0.ln_2.b -> model/h0/ln_2/b
torch.Size([768]) (768,)
h.1.attn.c_attn.w -> model/h1/attn/c_attn/w
torch.Size([768, 2304]) (1, 768, 2304)
h.1.attn.c_attn.b -> mod

torch.Size([768]) (768,)
h.10.mlp.c_fc.w -> model/h10/mlp/c_fc/w
torch.Size([768, 3072]) (1, 768, 3072)
h.10.mlp.c_fc.b -> model/h10/mlp/c_fc/b
torch.Size([3072]) (3072,)
h.10.mlp.c_proj.w -> model/h10/mlp/c_proj/w
torch.Size([3072, 768]) (1, 3072, 768)
h.10.mlp.c_proj.b -> model/h10/mlp/c_proj/b
torch.Size([768]) (768,)
h.10.ln_2.g -> model/h10/ln_2/g
torch.Size([768]) (768,)
h.10.ln_2.b -> model/h10/ln_2/b
torch.Size([768]) (768,)
h.11.attn.c_attn.w -> model/h11/attn/c_attn/w
torch.Size([768, 2304]) (1, 768, 2304)
h.11.attn.c_attn.b -> model/h11/attn/c_attn/b
torch.Size([2304]) (2304,)
h.11.attn.c_proj.w -> model/h11/attn/c_proj/w
torch.Size([768, 768]) (1, 768, 768)
h.11.attn.c_proj.b -> model/h11/attn/c_proj/b
torch.Size([768]) (768,)
h.11.ln_1.g -> model/h11/ln_1/g
torch.Size([768]) (768,)
h.11.ln_1.b -> model/h11/ln_1/b
torch.Size([768]) (768,)
h.11.mlp.c_fc.w -> model/h11/mlp/c_fc/w
torch.Size([768, 3072]) (1, 768, 3072)
h.11.mlp.c_fc.b -> model/h11/mlp/c_fc/b
torch.Size([3072])

# AllenNLP Implementation

See code here: https://github.com/allenai/allennlp/blob/master/allennlp/modules/openai_transformer.py

This is ported from the Hugging Face implementation and looks a bit cleaner, but it isn't immediately clear how to use it: it expects a tarfile containing the weights, and we don't yet know how to generate one. We'd also need to update AllenNLP, which might break other experiments using `jiant`.