In [1]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

In [2]:
# make deterministic
from mingpt.utils import set_seed
set_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
import math
from torch.utils.data import Dataset

In [29]:
class CharDataset(Dataset):

    def __init__(self, data, block_size, extractAtt=False):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
        self.attributes = extractAtt
        
        if self.attributes:
            self.dataList = self.data.split('\n') #TODO: remove later?

            self.blockIdx = []
            summation = 0
            for d in self.dataList:
                s = summation
                e = s + len(d)
                self.blockIdx.append((s,e))
                summation = e+1
    
    def __len__(self):
        if self.attributes:
            return len(self.dataList)
        else:
            return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        #chunk = self.data[idx:idx + self.block_size + 1]
        chunk = self.data[self.blockIdx[idx][0]:self.blockIdx[idx][1]]
        
        # extracts other attributes
        points = None
        if self.attributes:
            dic = json.loads(chunk)
            points = []
            for xy in zip(dic['X'], dic['Y']):
                x = xy[0]
                y = xy[1]
                x.extend([y])
                x = torch.tensor(x)
                points.append(x)
            chunk = '"'+dic['EQ']+'"'
        
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        """
        arrange data and targets so that the first i elements of x
        will be asked to predict the i-th element of y. Notice that
        the eventual language model will actually make block_size
        individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward
        pass of the network. So for example if block_size is 4, then
        we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will
        then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next
        
        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during traning will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can paralellize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single character of the 
        sequence along each batch dimension, and repeatedly always feed in a next
        character to get the next one.
        
        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward 
        passes.
        """
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        
        return x, y, points

In [30]:
import json
from tqdm import tqdm
import glob
def processDataFiles(files):
    text = ''""
    for f in tqdm(files):
        with open(f, 'r') as h: 
            lines = h.read() # don't worry we won't run out of file handles
            text += lines #json.loads(line)                
    return text

In [31]:
path = 'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TrainDatasetFixed\*.json'
files = glob.glob(path)
text = processDataFiles([files[0]])

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.35it/s]


In [32]:
# avgBlockSize = 0
# upNum = 100
# for i in tqdm(range(0,upNum)):
#     avgBlockSize += len(text.split('\n')[i])
# avgBlockSize /= upNum
# print('avg block size is {}'.format(avgBlockSize))

In [33]:
block_size = 500 # spatial extent of the model for its context

In [34]:
train_dataset = CharDataset(text, block_size, extractAtt=True) # one line of poem is roughly 50 characters

data has 243072217 characters, 43 unique.


In [35]:
sample = train_dataset.__getitem__(0)
print('X:{}\ny:{}\nPoints:{}'.format(sample[0], sample[1], sample[2]))

X:tensor([ 2,  8, 38, 31, 34,  3, 12,  9, 12,  5, 40, 12,  6, 11,  9, 17, 15,  4])
y:tensor([ 8, 38, 31, 34,  3, 12,  9, 12,  5, 40, 12,  6, 11,  9, 17, 15,  4,  2])
Points:[tensor([ 0.8300, -1.0000]), tensor([ 0.9300, -1.0000]), tensor([ 0.7200, -0.9900]), tensor([ 1.0300, -0.9800]), tensor([ 0.6200, -0.9700]), tensor([ 1.1400, -0.9500]), tensor([ 0.5200, -0.9400]), tensor([ 1.2400, -0.9100]), tensor([ 0.4100, -0.8900]), tensor([ 1.3400, -0.8600]), tensor([ 0.3100, -0.8300]), tensor([ 1.4500, -0.7900]), tensor([ 0.2100, -0.7700]), tensor([ 1.5500, -0.7200]), tensor([ 0.1000, -0.6800]), tensor([ 1.6600, -0.6300]), tensor([ 0.0000, -0.6000]), tensor([ 1.7600, -0.5400]), tensor([ 1.8600, -0.4400]), tensor([ 1.9700, -0.3300]), tensor([ 2.0700, -0.2300]), tensor([ 2.1700, -0.1200]), tensor([2.2800, 0.0000]), tensor([2.3800, 0.1100]), tensor([2.4800, 0.2200]), tensor([2.5900, 0.3300]), tensor([2.6900, 0.4400]), tensor([2.7900, 0.5300]), tensor([2.9000, 0.6300]), tensor([3.0000, 0.7100])]


In [36]:
path = 'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TestDataset\*.json'
files = glob.glob(path)
textTest = processDataFiles([files[0]])
test_dataset = CharDataset(textTest, block_size, extractAtt=True)

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.21it/s]

data has 901208 characters, 43 unique.





In [37]:
from mingpt.model import GPT, GPTConfig, PointNetConfig
embeddingSize=512
pconf = PointNetConfig(embeddingSize=embeddingSize, 
                       numberofPoints=30, 
                       numberofVars=1, 
                       numberofYs=1)
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=embeddingSize)
model = GPT(mconf, pconf)

05/11/2021 00:42:35 - INFO - mingpt.model -   number of parameters: 2.552013e+07


In [55]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=2, batch_size=1, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=0)
trainer = Trainer(model, train_dataset, None, tconf)

try:
    trainer.train()
except KeyboardInterrupt:
    print('KeyboardInterrupt')

epoch 1 iter 67: train loss 2.19714. lr 7.177734e-05:   0%|                      | 68/500001 [00:03<8:06:31, 17.13it/s]

KeyboardInterrupt





In [None]:
# alright, let's sample some character-level symbolic GPT
from mingpt.utils import sample
from gp_model import Genetic_Model
from mlp_model import MLP_Model
    
loader = torch.utils.data.DataLoader(
                                test_dataset, 
                                shuffle=False, 
                                pin_memory=True,
                                batch_size=1,
                                num_workers=0)

gpm = Genetic_Model(n_jobs=-1)
mlp = MLP_Model()
    
fName = 'res.txt'
resultDict = {}
with open(fName, 'w', encoding="utf-8") as o:
    textTestList = textTest.split('\n')
    resultDict[fName] = {'SymbolicGPT':[],
                         'MLP':[],
                         'GP':[]}
    for i, (x,y,p) in enumerate(loader):
        print('Sample {}:'.format(i))
        t = json.loads(textTestList[i])
        x = x.to(trainer.device)
        p = [x.to(trainer.device) for x in p]
        yHat = sample(model, x, 20, points=p, 
                      temperature=1.0, sample=True, 
                      top_k=10)[0]
        predicted = ''.join([train_dataset.itos[int(i)] for i in yHat])
        # filter out predicted
        predicted = predicted.split('"')[1]
        target = ''.join([train_dataset.itos[int(i)] for i in y[0]]).strip('"')
        print('Target:{}\nPredicted:{}'.format(target, predicted))
        
        Ys = t['YT']
        Yhats = []
        for xs in t['XT']:
            try:
                eqTmp = predicted + '' # copy eq
                eqTmp = eqTmp.replace(' ','')
                eqTmp = eqTmp.replace('\n','')
                for i,x in enumerate(xs):
                    # replace xi with the value in the eq
                    eqTmp = eqTmp.replace('x{}'.format(i+1), str(x))
                    if ',' in eqTmp:
                        assert 'There is a , in the equation!'
                Yhat = eval(eqTmp)
                Yhat = 0 if np.isnan(Yhat) else Yhat
                Yhat = 10000 if np.isinf(Yhat) else Yhat
            except:
                Yhat = 10000 if np.isinf(Yhat) else Yhat
            Yhats.append(Yhat)
        mseErr = mse(Ys,Yhats)
        
        
        print('MSE:{}\n'.format(mseErr))

Sample 0:
Target:-sin(1.1*x1+0.64)
Predicted:-sin(1.1*x1+0.64)
MSE:0.00011463910705842355

Sample 1:
Target:-1.42*x1+sqrt(x1+0.53)
Predicted:-1.42*x1+sqrt(x1+0.53)
MSE:0.00015629388692434343

Sample 2:
Target:sqrt(-sin(0.2*x1))
Predicted:sqrt(-sin(0.2*x1))
MSE:3.4483289580975676e-05

Sample 3:
Target:sin(sqrt(x1))
Predicted:sin(sqrt(x1))
MSE:7.747989464006532e-06

Sample 4:
Target:0.86*x1**2-0.59*x1+1.36
Predicted:0.86*x1**2-0.59*x1+1.368
MSE:0.007778901333333281

Sample 5:
Target:0.28*sqrt(-x1**2)
Predicted:0.28*sqrt(-x1**2)
MSE:0.000407333333333339

Sample 6:
Target:sin(0.07*x1**2+0.28*x1)
Predicted:sin(0.07*x1**2+0.28*x1)
MSE:0.008675065163676714

Sample 7:
Target:sin(x1)
Predicted:sin(x1)
MSE:7.36144468968795e-06

Sample 8:
Target:0.81*sqrt(-x1**2-0.95*x1+0.12)
Predicted:0.81*sqrt(-x1**2-0.95*x1+0.12)
MSE:0.00045791611437728623

Sample 9:
Target:sin(x1+0.37)
Predicted:sin(x1+0.37)
MSE:1.0393529956063066e-05

Sample 10:
Target:-0.68*x1**2+1.77*x1
Predicted:-0.68*x1**2+1.77*x1
MSE:38

In [80]:
# add a safe wrapper for numpy math functions
from numpy import *
import numpy as np

def divide(x, y):
  x = np.nan_to_num(x)
  y = np.nan_to_num(y)
  return np.divide(x,y+1e-5)

def sqrt(x):
  x = np.nan_to_num(x)
  return np.sqrt(np.abs(x)) 

# Mean square error
def mse(y, y_hat):
    y_hat = np.reshape(y_hat, [1, -1])[0]
    y_gold = np.reshape(y, [1, -1])[0]
    our_sum = 0
    for i in range(len(y_gold)):
        our_sum += (y_hat[i] - y_gold[i]) ** 2

    return our_sum / len(y_gold)