In [2]:
# This model can pre-load pre-trained GPT-2 weights

## in any of the following configurations:
# gpt2: 124M parameters, 12 layers, 12 heads, 768 hidden dimension
# gpt2-medium: 350M parameters, 24 layers, 16 heads, 1024 hidden dimension
# gpt2-large: 774M parameters, 36 layers, 20 heads, 1280 hidden dimension
# gpt2-xl: 1558M parameters, 48 layers, 25 heads, 1600 hidden dimension

In [3]:
## Loading GPT-2
from model import GPT
from train import get_batch

## Loading word encoder/decoder
import tiktoken
# GPT-2 byte-pair encoding from tiktoken; used below to turn token ids back into text.
enc = tiktoken.get_encoding("gpt2")


def decode(x):
    """Decode token ids back into a string using the GPT-2 encoding `enc`.

    Args:
        x: Token ids to decode. The evaluation cell passes a plain Python list
            obtained from `.tolist()`.

    Returns:
        Whatever `enc.decode(x)` returns (the decoded text).
    """
    # `enc` is looked up at call time, exactly like the lambda this replaces.
    return enc.decode(x)


## Loading constants
from constants import N_LAYERS, N_HEADS, N_EMBED, BLOCK_SIZE, BIAS, DROP_OUT, VOCAB_SIZE

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Instantiate a GPT model from the hyperparameters imported from constants.py.
# Key order is kept as-is so the config printed by GPT(...) reads the same.
config = dict(
    n_heads=N_HEADS,
    n_embed=N_EMBED,
    block_size=BLOCK_SIZE,
    n_layers=N_LAYERS,
    bias=BIAS,
    dropout=DROP_OUT,
    vocab_size=VOCAB_SIZE,
)
mogpt = GPT(config)

Building GPT model with config: {'n_heads': 12, 'n_embed': 768, 'block_size': 4, 'n_layers': 12, 'bias': False, 'dropout': 0.1, 'vocab_size': 50304}
number of parameters: 123.59M


In [5]:
## Loading the pre-trained model
## The config used above is replaced by the pre-trained checkpoint's own config;
## the returned model is kept under a separate name from `mogpt`.
pretrained_mogpt = mogpt.load_pretrained_model("gpt2-medium")

Loading model_type='gpt2-medium', with configmodel_config={'gpt2': {'n_layers': 12, 'n_heads': 12, 'n_embed': 768}, 'gpt2-medium': {'n_layers': 24, 'n_heads': 16, 'n_embed': 1024}, 'gpt2-large': {'n_layers': 36, 'n_heads': 20, 'n_embed': 1280}, 'gpt2-xl': {'n_layers': 48, 'n_heads': 25, 'n_embed': 1600}}




Building GPT model with config: {'n_layers': 24, 'n_heads': 16, 'n_embed': 1024, 'vocab_size': 50257, 'block_size': 1024, 'bias': True, 'dropout': 0.1}
number of parameters: 354.82M


In [15]:
## Evaluating the model
# Fetch one 'eval' batch using the pre-trained model's config, call predict_next on it,
# then print the last `num_predictions` tokens of every target row next to the
# last `num_predictions` tokens of the corresponding prediction row.
num_predictions = 5
data, targets = get_batch('eval', pretrained_mogpt.get_config())
y = pretrained_mogpt.predict_next(data, num_predictions)

# Same slice as writing `-num_predictions:` inline in the index.
tail = slice(-num_predictions, None)
for i in range(len(y)):
    target_text = decode(targets[i, tail].tolist())
    predicted_text = decode(y[i, tail].tolist())
    print(f" targets: \t\t {target_text}, \n prediction: \t\t {predicted_text} \n *-----------------*")


 targets: 		 'll fast for company:, 
 prediction: 		  Coutthanksrorsme Gardner 
 *-----------------*
 targets: 		 , for my sake., 
 prediction: 		 LEY horizontally Sourcespeech intensity 
 *-----------------*
 targets: 		 

MIRAN, 
 prediction: 		 parse fixtureprev lith shorten 
 *-----------------*
 targets: 		 ill you have done your, 
 prediction: 		  Bark commodity playthroughious profit 
 *-----------------*
 targets: 		 ISTA:
Not, 
 prediction: 		  StarCraft goats StampGrandflies 
 *-----------------*
 targets: 		 
Mistress, how, 
 prediction: 		  invoked prosecute efforts unlawfullyfal 
 *-----------------*
 targets: 		 men in their new f, 
 prediction: 		 lords grindingpless Conquest Erdogan 
 *-----------------*
 targets: 		  desperate mart.

, 
 prediction: 		  François CMS comb capable overriding 
 *-----------------*


In [None]:
## Finetuning on new dataset
# To log to Weights & Biases, set WANDB_LOG=True and WANDB_KEY in constants.py.
# Adjust INPUT_DATA_FOLDER in constants.py to point at the new dataset.

# `torch` is not imported by any of the cells above, so import it here; without
# this the Adam line raises NameError when the notebook is run on a fresh kernel.
import torch

from train import train

# NOTE(review): lr=0.01 is a large step size for Adam on pre-trained weights —
# confirm it is intentional before a long run.
optimizer = torch.optim.Adam(pretrained_mogpt.parameters(), lr=0.01)
# Start from cleared gradients before handing the optimizer to the training loop.
optimizer.zero_grad()
train(model=pretrained_mogpt, optimizer=optimizer, num_epochs=10)

