In [1]:
# !pip install transformers
# !pip install faiss-gpu
# !pip install einops

In [2]:
from IPython import embed

from memorizing_transformers import MemorizingModel, MemorizingLMHeadModel
import random
import tqdm
import gzip
import numpy as np
import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import GPT2Tokenizer, GPT2Model


In [3]:
# constants
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
NUM_BATCHES = int(1e5)
BATCH_SIZE = 8
SEQ_LEN = 512
SEGMENTS = 5

LEARNING_RATE = 0.002
MAX_GRAD_CLIP_NORM = 0.5

VALIDATE_EVERY  = 100
GENERATE_EVERY  = 500
GENERATE_LENGTH = 512

# helpers

def cycle(loader):
    while True:
        for data in loader:
            yield data

def decode_token(token):
    return str(chr(max(32, token)))

def decode_tokens(tokens):
    return ''.join(list(map(decode_token, tokens)))


In [4]:
import pickle

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# with gzip.open('./data/enwik8.gz',"rt",encoding="utf-8") as file:
#     text = file.read()
#     X = tokenizer(text).input_ids
# with open('./data/enwik8_token.pickle', 'wb') as file:
#     pickle.dump(X, file)

with open('./data/enwik8_token.pickle', 'rb') as file:
    X = pickle.load(file)

tr_num = len(X) // 2
trX, vaX = np.split(X, [tr_num])
data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)

In [5]:
# instantiate GPT-like decoder model
# model = MemorizingTransformer(
#     num_tokens = 256, # int8 tokens, 32k (x128 = 2^7d) in the paper
#     dim = 512, # dim of token in embedding space, 1024 (x2) in the paper
#     depth = 8, # 12 
#     memorizing_layers = 4, # 9 in the paper
#     max_knn_memories = 512 * 15, 
#     num_retrieved_memories = 32, 
#     xl_memory_layers = (7, 8),
#     xl_max_memories = 512,
# )

model = MemorizingLMHeadModel.from_pretrained('gpt2').to(DEVICE)

# prepare enwik8 data

class TextSamplerDataset(Dataset):
    def __init__(self, data, seq_len):
        super().__init__()
        self.data = data
        self.seq_len = seq_len

    def __getitem__(self, index):
        rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
        full_seq = self.data[rand_start: rand_start + self.seq_len].long()
        return full_seq.to(DEVICE)

    def __len__(self):
        return self.data.size(0) // self.seq_len

# dataset and dataloader

train_dataset = TextSamplerDataset(data_train, SEQ_LEN * SEGMENTS)
train_loader  = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE, drop_last = True))
valid_dataset = TextSamplerDataset(data_val, SEQ_LEN * SEGMENTS)
valid_loader = cycle(DataLoader(valid_dataset, batch_size = BATCH_SIZE, drop_last = True))



Some weights of MemorizingLMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.9.attn.knn_attention_ratio', 'h.3.attn.knn_attention_ratio', 'h.2.attn.knn_attention_ratio', 'h.0.attn.knn_attention_ratio', 'h.10.attn.knn_attention_ratio', 'h.6.attn.knn_attention_ratio', 'h.5.attn.knn_attention_ratio', 'h.11.attn.knn_attention_ratio', 'h.8.attn.knn_attention_ratio', 'h.1.attn.knn_attention_ratio', 'h.7.attn.knn_attention_ratio', 'h.4.attn.knn_attention_ratio']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
next(train_loader)
next(valid_loader)
print(f"{model.num_parameters()//1e6}M parameters")

124.0M parameters


In [7]:
optim = torch.optim.Adam([param for (name,param) in model.named_parameters() if name == "transformer.h.5.attn.knn_attention_ratio"], lr = LEARNING_RATE)


In [8]:
model.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.29.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [9]:
# sentence_prefix = "History of Rome dates back to"
 
# input_ids = tokenizer.encode(
#     sentence_prefix,
#     add_special_tokens=False,
#     return_tensors="pt",
# ).to(DEVICE)
 
# output_ids = model.generate(
#     input_ids=input_ids,
#     do_sample=True,
#     max_length=50,  # desired output sentence length
#     pad_token_id=model.config.eos_token_id,
# )[0].tolist()
 
# generated_text = tokenizer.decode(
#     output_ids,
#     clean_up_tokenization_spaces=True)
 
# print(generated_text)


In [None]:
# training

for i in tqdm.tqdm(range(NUM_BATCHES), mininterval = 10., desc = 'training'):
    model.train()

    data = next(train_loader)

    train_loss = 0.
    seq = data

    with model.knn_memories_context(batch_size = BATCH_SIZE, num_heads=12) as knn_memories:

        for seq_segment in seq.chunk(SEGMENTS, dim = -1):
            result = model(
                input_ids = seq_segment,
                labels = seq_segment,
                knn_memories = knn_memories
            )

            loss = result.loss

            train_loss += loss.item() / SEGMENTS
            (loss / SEGMENTS).backward()

    #         output_ids = model.generate(
    #             input_ids=seq_segment,
    #             do_sample=True,
    #             max_length=20,  # desired output sentence length
    #             pad_token_id=model.config.eos_token_id,
    #         )[0].tolist()

    #         generated_text = tokenizer.decode(
    #             output_ids,
    #             clean_up_tokenization_spaces=True)

    #         print(generated_text)


        print(f'training loss: {train_loss}')
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_CLIP_NORM)
        optim.step()
        optim.zero_grad()

    if not (i % VALIDATE_EVERY):
        model.eval()

        valid_data = next(valid_loader)
        valid_loss = 0.

        with torch.no_grad():
            seq = data
            
            for seq_segment in seq.chunk(SEGMENTS, dim = -1):
                
                result = model(
                    input_ids = seq_segment,
                    labels = seq_segment,
                )

                loss = result.loss

                valid_loss += loss.item() / SEGMENTS

        print(f'valid loss: {valid_loss}')


training:   0%|          | 0/100000 [00:00<?, ?it/s]

tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
        0.5000, 0.5000, 0.5000], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
        0.5000, 0.5000, 0.5000], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
        0.5000, 0.5000, 0.5000], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
        0.5000, 0.5000, 0.5000], device='cuda:0', grad_fn=<SigmoidBackward0>)
training loss: 3.4777318954467775
valid loss: 3.087804937362671
tensor([0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498,
        0.5498, 0.5498, 0.4502], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498, 0.5498,
        0.5498, 0.5498, 0.4502], device='cuda:0', grad_fn=<Sigmoid

training:   0%|          | 2/100000 [00:10<143:30:37,  5.17s/it]

training loss: 3.5264980316162107
tensor([0.5987, 0.5983, 0.5841, 0.5987, 0.5981, 0.5984, 0.5988, 0.5920, 0.5388,
        0.5856, 0.5972, 0.4047], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.5987, 0.5983, 0.5841, 0.5987, 0.5981, 0.5984, 0.5988, 0.5920, 0.5388,
        0.5856, 0.5972, 0.4047], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.5987, 0.5983, 0.5841, 0.5987, 0.5981, 0.5984, 0.5988, 0.5920, 0.5388,
        0.5856, 0.5972, 0.4047], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.5987, 0.5983, 0.5841, 0.5987, 0.5981, 0.5984, 0.5988, 0.5920, 0.5388,
        0.5856, 0.5972, 0.4047], device='cuda:0', grad_fn=<SigmoidBackward0>)
training loss: 3.543728399276733
tensor([0.6432, 0.6422, 0.6228, 0.6442, 0.6449, 0.6450, 0.6455, 0.6316, 0.5388,
        0.6067, 0.6332, 0.3609], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.6432, 0.6422, 0.6228, 0.6442, 0.6449, 0.6450, 0.6455, 0.6316, 0.5388,
        0.6067, 0.6332, 0.3609], device='cuda:0', grad_fn=<Sigm

training:   0%|          | 5/100000 [00:23<130:30:06,  4.70s/it]

training loss: 3.60932879447937
tensor([0.7235, 0.7200, 0.6861, 0.7266, 0.7305, 0.7284, 0.7253, 0.6789, 0.5691,
        0.6330, 0.6590, 0.3035], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.7235, 0.7200, 0.6861, 0.7266, 0.7305, 0.7284, 0.7253, 0.6789, 0.5691,
        0.6330, 0.6590, 0.3035], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.7235, 0.7200, 0.6861, 0.7266, 0.7305, 0.7284, 0.7253, 0.6789, 0.5691,
        0.6330, 0.6590, 0.3035], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.7235, 0.7200, 0.6861, 0.7266, 0.7305, 0.7284, 0.7253, 0.6789, 0.5691,
        0.6330, 0.6590, 0.3035], device='cuda:0', grad_fn=<SigmoidBackward0>)
training loss: 3.499234199523926
tensor([0.7556, 0.7538, 0.7187, 0.7621, 0.7672, 0.7630, 0.7602, 0.6961, 0.5951,
        0.6452, 0.6649, 0.2806], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([0.7556, 0.7538, 0.7187, 0.7621, 0.7672, 0.7630, 0.7602, 0.6961, 0.5951,
        0.6452, 0.6649, 0.2806], device='cuda:0', grad_fn=<Sigmoi

In [1]:
optimal_weights=[0.2981, 0.9796, 0.9995, 0.9996, 0.9999, 0.6157, 0.9999, 0.9999, 0.9651,0.6707, 0.6466, 0.9562]