In [1]:
import torch
import torch.nn as nn

In [8]:
class FFN(nn.Module):
    """Feed-forward block: Linear -> GELU -> Linear.

    Both linear layers map ``cfg['n_embd']`` input features to
    ``cfg['n_embd']`` output features (no widening of the hidden layer), so
    the output has the same shape as the input.

    Args:
        cfg: Mapping that provides the integer key ``'n_embd'``. No other
            key is read by this class.
    """

    def __init__(self, cfg):
        super().__init__()
        d_in = cfg['n_embd']
        d_out = cfg['n_embd']
        self.linear1 = nn.Linear(d_in, d_out)
        self.linear2 = nn.Linear(d_out, d_out)
        # Create the activation once and register it as a submodule instead
        # of instantiating nn.GELU() on every forward pass. GELU holds no
        # parameters or buffers, so state_dict keys are unaffected.
        self.act = nn.GELU()

    def forward(self, x):
        """Apply ``linear2(GELU(linear1(x)))``.

        Args:
            x: Tensor whose last dimension has size ``cfg['n_embd']``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = self.linear1(x)
        x = self.act(x)
        x = self.linear2(x)
        return x

In [9]:
# Model hyper-parameters used by this notebook.
# NOTE(review): appears to mirror the published GPT-2 (small) config.json --
# TODO confirm the source. The visible FFN class reads only "n_embd".
GPT2_CONFIG = {
    "activation_function": "gelu_new",
    "architectures": ["GPT2LMHeadModel"],
    "attn_pdrop": 0.1,
    "bos_token_id": 50256,
    "embd_pdrop": 0.1,
    "eos_token_id": 50256,
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-5,
    "model_type": "gpt2",
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "resid_pdrop": 0.1,
    "summary_activation": None,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": True,
    "summary_type": "cls_index",
    "summary_use_proj": True,
    "task_specific_params": {
        "text-generation": {"do_sample": True, "max_length": 50},
    },
    "vocab_size": 50257,
}

In [10]:
# Smoke test: push a random batch through the feed-forward block.
# Seed first so both the random inputs and the randomly initialised Linear
# weights are the same on every run (Restart Kernel -> Run All).
torch.manual_seed(0)
inputs = torch.rand((2, 3, 768))
ffn = FFN(GPT2_CONFIG)
outputs = ffn(inputs)
# The block maps n_embd -> n_embd, so the shape must be preserved.
assert outputs.shape == inputs.shape
print(outputs.shape)
print(outputs)

torch.Size([2, 3, 768])
tensor([[[ 0.1627, -0.0431,  0.0289,  ..., -0.1600,  0.0599,  0.0576],
         [ 0.2020, -0.0993, -0.0343,  ..., -0.0478,  0.0818, -0.1814],
         [ 0.1695, -0.0965,  0.0306,  ..., -0.0689,  0.0038, -0.1218]],

        [[ 0.1746, -0.0800,  0.0484,  ..., -0.0777,  0.0434, -0.1043],
         [ 0.1032,  0.0379,  0.1231,  ..., -0.0849,  0.0231, -0.1273],
         [ 0.1432,  0.0271, -0.0313,  ..., -0.1051, -0.0058,  0.0041]]],
       grad_fn=<ViewBackward0>)
