In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import os
import json
import torch
import math
import torch.nn as nn
from einops import rearrange

In [2]:
# gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
# Load the pretrained 'gpt2-large' checkpoint (LM-head model) together with the
# tokenizer published under the same name, so token ids match the model's vocabulary.
gpt2_large = GPT2LMHeadModel.from_pretrained('gpt2-large')
tokenizer_large = GPT2Tokenizer.from_pretrained('gpt2-large')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
# print(vars(gpt2).keys())
# print(vars(gpt2)['config'])
# "n_ctx": 1024,
# "n_embd": 768,
# "n_head": 12,
# "n_inner": null,
# "n_layer": 12,
# "n_positions": 1024,

# print(vars(gpt2_large)['config'])
# "n_ctx": 1024,
# "n_embd": 1280,
# "n_head": 20,
# "n_inner": null,
# "n_layer": 36,
# "n_positions": 1024,

In [3]:
# List the instance attributes stored on the loaded model object.
print(vars(gpt2_large).keys())
# print(gpt2_large._modules)



In [4]:
# From https://github.com/huggingface/transformers/issues/18282
# Inspect the model layout: the `transformer` submodule (reachable both through
# `_modules` and as a plain attribute), the number of blocks in `transformer.h`,
# the output head, and whether the token-embedding / head weights require gradients.
print(gpt2_large._modules['transformer'].wte.weight.requires_grad)
print(type(gpt2_large._modules['transformer']))
print(type(gpt2_large.transformer))
print(len(gpt2_large.transformer.h))
print(gpt2_large.lm_head)
print(gpt2_large.lm_head.weight.requires_grad)

True
<class 'transformers.models.gpt2.modeling_gpt2.GPT2Model'>
<class 'transformers.models.gpt2.modeling_gpt2.GPT2Model'>
36
Linear(in_features=1280, out_features=50257, bias=False)
True


In [37]:
# `base_model` is the inner GPT2Model; parameters() and named_parameters() both
# return generators, so they must be iterated to see their contents.
print(type(gpt2_large.base_model))
print(type(gpt2_large.base_model.parameters()))
print(type(gpt2_large.base_model.named_parameters()))

# The following named-parameter demo is based on https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L667
# for name, p in gpt2_large.base_model.named_parameters():
#     print(name)

# lm_head.weight and wte.weight hold identical values. torch.equal only compares
# shape and element values, so this alone does not prove the storage is shared.
# NOTE(review): they are presumably the same tied tensor -- an `is` comparison
# would confirm that.
print(torch.equal(gpt2_large.lm_head.weight,
                    gpt2_large.transformer.wte.weight))
# The two tensors are equal

<class 'transformers.models.gpt2.modeling_gpt2.GPT2Model'>
<class 'generator'>
<class 'generator'>
True


In [28]:
# from: https://huggingface.co/transformers/v4.5.1/training.html
# Tally the base model's parameter *tensors* (one count per tensor, not per scalar
# weight), split by whether they currently require gradients.
total_params_count = sum(1 for _ in gpt2_large.base_model.parameters())
trainable_params_count = sum(
    1 for param in gpt2_large.base_model.parameters() if param.requires_grad
)
# Every tensor is either trainable or frozen, so the remainder is the frozen count.
frozen_params_count = total_params_count - trainable_params_count

print('total params #:', total_params_count)
print('trainable params #:', trainable_params_count)
print('frozen params #:', frozen_params_count)

total params #: 436
trainable params #: 0
frozen params #: 436


In [27]:
# Freeze the output-head weight, show the last transformer block, then re-check the
# requires_grad flags of the token embedding and the head.
# NOTE(review): the recorded output shows wte.weight frozen as well, although only
# lm_head.weight is set here -- presumably both names refer to one tied tensor; confirm.
gpt2_large.lm_head.weight.requires_grad = False
print(gpt2_large.transformer.h[-1])

print(gpt2_large._modules['transformer'].wte.weight.requires_grad)
print(gpt2_large.lm_head.weight.requires_grad)

GPT2Block(
  (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
False
False


In [None]:
def _load_split_bykey(data_dir, source, split, key='text', n=np.inf):
    path = os.path.join(data_dir, f'{source}.{split}.jsonl')
    data = []
    for i, line in enumerate(open(path)):
        if i >= n:
            break
        data.append(json.loads(line)[key])
    return data

In [None]:
# webtext_train = _load_split_bykey('data/', 'webtext', 'train', key='text')
# Load only the 'length' field of each record in the webtext train split.
webtext_train_lens = _load_split_bykey('data/', 'webtext', 'train', key='length')

In [None]:
# Largest 'length' value found in the train split.
print(max(webtext_train_lens))
# Observed: no problem, the maximum is <= n_positions == 1024

In [None]:
# Examine degen data
def _load_degen_data(data_dir: str, filename: str, n=np.inf, return_type='json'):
    path = os.path.join(data_dir, filename)
    data = []
    for i, line in enumerate(open(path, 'r')):
        if i >= n:
            break
        try:
            obj = json.loads(line)
        except Exception:
            print(line)
            raise

        if return_type == 'json':
            data.append(obj)
        else:
            data.append(obj['string'])

    return data

In [None]:
# Read only the first record (n=1) of the unconditional gold file and keep it for inspection.
lines = _load_degen_data('data/data_degen/unconditional', 'unconditional_gold.jsonl', n=1)
line1 = lines[0]

In [None]:
# Number of entries stored in the record's 'tokens' field.
print(len(line1['tokens']))

In [None]:
# Re-tokenize the record's raw text, asking for PyTorch tensors ('pt') as output.
line1_encoded = tokenizer_large(line1['string'], return_tensors='pt')

In [None]:
# Display the token ids produced by the tokenizer.
line1_encoded['input_ids']

In [None]:
# Check that the token ids stored with the record match a fresh tokenization of its
# text; squeeze() removes the size-1 dimensions of the tokenizer output before comparing.
torch.equal(torch.tensor(line1['tokens'], dtype=torch.long), line1_encoded['input_ids'].squeeze())
# line1['tokens'] and line1_encoded['input_ids'] are equal

In [None]:
# Inference only: run the forward pass under no_grad so PyTorch does not record an
# autograd graph for the whole network (nothing here ever calls backward()).
# The input ids are also passed as `labels`; the returned output's `.loss` and
# `.logits` are read by the following cells.
with torch.no_grad():
    line1_output = gpt2_large(**line1_encoded, labels=line1_encoded['input_ids'])

In [None]:
# Loss value the model returned for this record.
print(line1_output.loss)

In [None]:
# Exponentiate the scalar loss.
# NOTE(review): this is the perplexity only if the loss is a mean per-token NLL -- confirm.
math.exp(line1_output.loss.item())

In [None]:
logits = line1_output.logits
target = line1_encoded['input_ids']
# Move the vocabulary axis to dim 1 (B L V -> B V L): that is the axis the
# LogSoftmax(dim=1) defined in a later cell normalizes over.
logits = rearrange(logits, 'B L V -> B V L')

# Drop the last position's logits and the first target token, so the logits at
# position t are paired with the token at position t + 1.
shift_logits = logits[..., :-1]
shift_target = target[..., 1:]

In [None]:
# reduction='none' keeps one loss value per position instead of averaging them.
criterian = nn.NLLLoss(reduction='none')
# Log-probabilities are taken over dim 1.
log_softmax = nn.LogSoftmax(dim=1)

In [None]:
# Per-position NLL on the shifted logits/targets; gradients are not needed here.
# squeeze() drops size-1 dimensions from the result.
with torch.no_grad():
    nll_loss = criterian(log_softmax(shift_logits), shift_target).squeeze()

In [None]:
# Show the per-position losses and how many there are.
print(nll_loss)
print(nll_loss.size())

In [None]:
# Values stored with the record under 'nll4tok'.
# NOTE(review): presumably per-token NLLs to compare against nll_loss -- confirm.
print(line1['nll4tok'])

In [None]:
# Same per-position NLL, but on the *unshifted* logits and targets.
# NOTE(review): without the shift, the logits at position t are scored against the
# token at the same position t -- confirm this is the intended comparison.
with torch.no_grad():
    nll_loss2 = criterian(log_softmax(logits), target).squeeze()

In [None]:
# Show the losses obtained without shifting.
print(nll_loss2)