# minGPT

## Preprocess the Training Data

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

import sys
sys.path.insert(1, '../') #Add project root directory to the path

In [6]:
# Read the pickled training corpus from disk.
import pickle

path = "../data/Training_data_input.pickle"
with open(path, mode="rb") as pickle_file:
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    train_dat = pickle.load(pickle_file)

In [8]:
# Configure root logging so that INFO-level messages (e.g. those emitted by
# mingpt) are shown with a timestamp, level and logger name.
import logging

logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

In [9]:
# make deterministic
# Uses the project's set_seed helper with a fixed seed (42) so repeated runs
# start from the same random state — presumably seeds the Python, NumPy and
# torch RNGs; confirm in mingpt.utils.
from mingpt.utils import set_seed
set_seed(42)

In [10]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Character-level dataset of overlapping, fixed-length windows.

    Each item is a pair ``(x, y)`` of ``torch.long`` tensors where ``y`` is
    ``x`` shifted one position to the right, i.e. ``y[t]`` is the symbol that
    follows ``x[t]`` in ``data``.

    Args:
        data: Sequence of hashable, sortable symbols (e.g. a ``str``) that
            supports ``len`` and slicing.
        block_size: Number of symbols in each input window.
    """

    def __init__(self, data, block_size):
        # The vocabulary is the sorted set of unique symbols, so the
        # symbol -> id mapping is deterministic for a given corpus.
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}  # symbol -> integer id
        self.itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> symbol
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of windows that have a full target sequence.

        Clamped at zero: when ``data`` is shorter than ``block_size`` the raw
        difference is negative, which would make ``len()`` raise ``ValueError``.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensors for the window starting at ``idx``.

        Raises:
            KeyError: If the window contains a symbol missing from ``stoi``.
        """
        # grab a chunk of (block_size + 1) characters from the data; the extra
        # character supplies the final target
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]

        x = torch.tensor(dix[:-1], dtype=torch.long)  # inputs: all but the last
        y = torch.tensor(dix[1:], dtype=torch.long)   # targets: shifted by one
        return x, y

In [11]:
block_size = 128  # number of characters in each training window
# Wrap the loaded corpus; the constructor prints the corpus and vocabulary sizes.
train_dataset = CharDataset(train_dat, block_size)

data has 255632 characters, 114 unique.


## Construct the GPT Model

In [12]:
from mingpt.model import GPT, GPTConfig
# Size the model to the dataset's vocabulary and window length. With
# n_layer=8, n_head=8, n_embd=512 the logged parameter count is ~2.54e7.
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=512)
model = GPT(mconf)

10/28/2020 22:42:31 - INFO - mingpt.model -   number of parameters: 2.540237e+07


## Train the Model

In [None]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
# NOTE(review): warmup_tokens repeats the batch size (512) as a literal, and
# final_tokens covers two passes over the dataset while max_epochs=1 —
# presumably the learning-rate decay then stops part-way through its
# schedule; confirm this is intended.
tconf = TrainerConfig(max_epochs=1, batch_size=512, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=4)
# The third argument is None — presumably the held-out/test dataset slot, so
# no evaluation data is used; verify against mingpt.trainer.Trainer.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

In [None]:
#Save the Model
# NOTE(review): model_save_name is not used in this cell; the weights are
# written to `path` below.
model_save_name = 'Basic-Bot_trained.pt'
path = "../models/mingpt_trained.pickle"
# Only the state_dict is saved (via torch.save), not the whole module object.
torch.save(model.state_dict(), path)

In [None]:
## Test the Model

In [None]:
# Restore the trained weights from the checkpoint file written by the save
# cell. A Colab Google Drive path ("/content/gdrive/My Drive/...") does not
# match that location, so the relative project path is used here.
path = "../models/mingpt_trained.pickle"
# map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
# machine; load_state_dict then copies the values into the model's own
# parameters, wherever they live.
model.load_state_dict(torch.load(path, map_location="cpu"))

In [None]:
from mingpt.utils import sample

context = "wtf "
# Encode the prompt to vocabulary ids, add a leading batch dimension and move
# it to the trainer's device. Every prompt character must exist in stoi.
x = torch.tensor(
    [train_dataset.stoi[ch] for ch in context],
    dtype=torch.long,
).unsqueeze(0).to(trainer.device)
# Sample a continuation (presumably 200 additional steps — confirm in
# mingpt.utils.sample) and keep the first row of the returned batch.
y = sample(model, x, 200, temperature=1.5, sample=True, top_k=10)[0]
# Decode the ids back to characters.
completion = ''.join(train_dataset.itos[int(tok)] for tok in y)
print(completion)

In [None]:
from tqdm import tqdm