In [5]:
import torch
import torch.nn as nn

In [6]:
import tiktoken
# Load the 'gpt2' encoding from tiktoken and encode one sample sentence.
tokenizer = tiktoken.get_encoding('gpt2')
txt = 'This is created for testing purpose.'
ids = tokenizer.encode(txt)  # encode() yields the integer token ids printed below
print(ids)

[1212, 318, 2727, 329, 4856, 4007, 13]


In [7]:
# Manual walk-through: token embedding + positional embedding for `ids`.
n_dim = 768
n_context = 1024
embd_layer = nn.Embedding(tokenizer.n_vocab, n_dim)
pos_embd_layer = nn.Embedding(n_context, n_dim)

# Convert to a PyTorch tensor for compatibility with nn.Embedding.
# as_tensor accepts a list or an existing tensor, so re-running this cell
# (after `ids` has already been rebound to a tensor) stays warning-free.
ids = torch.as_tensor(ids, dtype=torch.long)
length = len(ids)

embd_vecs = embd_layer(ids)
# Positions 0..length-1 must be looked up in the positional table,
# not in the token table.
pos_embd_vecs = pos_embd_layer(torch.arange(length))
input_vecs = embd_vecs + pos_embd_vecs
print(input_vecs.shape)

torch.Size([7, 768])


In [8]:
# Configuration dictionary for the GPT-2 model used in this notebook.
# InputPreprocess reads 'vocab_size', 'n_embd' and 'n_ctx' from it.
GPT2_CONFIG = {
    "activation_function": "gelu_new",
    "architectures": ["GPT2LMHeadModel"],
    "attn_pdrop": 0.1,
    "bos_token_id": 50256,
    "embd_pdrop": 0.1,
    "eos_token_id": 50256,
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-5,
    "model_type": "gpt2",
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "resid_pdrop": 0.1,
    "summary_activation": None,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": True,
    "summary_type": "cls_index",
    "summary_use_proj": True,
    "task_specific_params": {
        "text-generation": {"do_sample": True, "max_length": 50},
    },
    "vocab_size": 50257,
}

In [9]:
class InputPreprocess(nn.Module):
    """Tokenize a string and embed it as token + positional vectors.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of integer token ids.
        cfg: Mapping providing ``'vocab_size'`` (rows of the token table),
            ``'n_embd'`` (embedding width) and ``'n_ctx'`` (rows of the
            positional table).
    """

    def __init__(self, tokenizer, cfg):
        super().__init__()
        self.tokenizer = tokenizer
        n_vocab = cfg['vocab_size']
        n_dim = cfg['n_embd']
        n_context = cfg['n_ctx']

        self.embedding_layer = nn.Embedding(n_vocab, n_dim)
        self.pos_embedding_layer = nn.Embedding(n_context, n_dim)

    def forward(self, x):
        """Encode ``x`` and return the summed token and positional embeddings.

        Args:
            x: Text passed to ``self.tokenizer.encode``.

        Returns:
            Tensor of shape ``(1, n_tokens, n_embd)``; the leading axis is
            added with ``unsqueeze(0)``. An input that encodes to zero tokens
            yields shape ``(1, 0, n_embd)``.

        Raises:
            IndexError: If the text encodes to more tokens than the
                positional table has rows.
        """
        # Build index tensors on the same device as the embedding weights so
        # the module keeps working after `.to(device)`.
        device = self.embedding_layer.weight.device

        # Explicit integer dtype: an empty id list would otherwise become a
        # float tensor, which nn.Embedding rejects.
        ids = torch.tensor(self.tokenizer.encode(x), dtype=torch.long, device=device)
        length = ids.shape[0]

        max_positions = self.pos_embedding_layer.num_embeddings
        if length > max_positions:
            raise IndexError(
                f"input encodes to {length} tokens, but only "
                f"{max_positions} positions are available"
            )

        embd_vecs = self.embedding_layer(ids)
        pos_embd_vecs = self.pos_embedding_layer(torch.arange(length, device=device))

        return (embd_vecs + pos_embd_vecs).unsqueeze(0)


In [10]:
# Build the preprocessing module from the GPT-2 config and run it on the
# sample sentence; the result carries a leading axis of size 1.
preprocesser = InputPreprocess(tokenizer, GPT2_CONFIG)
outputs = preprocesser(txt)
print(outputs.shape)

torch.Size([1, 7, 768])


In [11]:
# Encode a short prompt and show its token ids.
# NOTE(review): this rebinds `outputs`, which the previous cell assigned to the
# tensor returned by `preprocesser(txt)`; here it becomes a list of ids.
outputs = tokenizer.encode("Hello, i am")
print(outputs)

[15496, 11, 1312, 716]
