In [1]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

In [2]:
# make deterministic
from mingpt.utils import set_seed
set_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
# config
embeddingSize=512
numPoints=100 # number of points that we are going to receive to make a prediction about f given x and y
numVars=3 # the dimenstion of input points x
numYs=1 # the dimension of output points y = f(x)
blockSize = 60 # spatial extent of the model for its context
batchSize = 256
dataInfo = 'XYE_3Var_100Points'

In [5]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):

    def __init__(self, data, block_size, chars):
        data_size, vocab_size = len(data), len(chars)
        print('data has %d examples, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        
        # padding token
        self.paddingToken = '_'
        self.paddingID = self.stoi[self.paddingToken]
        self.stoi[self.paddingToken] = self.paddingID
        self.itos[self.paddingID] = self.paddingToken
        self.threshold = [-1000,1000]
        
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data # it should be a list of examples
    
    def __len__(self):
        return len(self.data)-1

    def __getitem__(self, idx):
        # grab an example from the data
        chunk = self.data[idx] # sequence of tokens including x, y, eq, etc.
        chunk = json.loads(chunk) # convert the sequence tokens to a dictionary
        
        # encode every character in the equation to an integer
        # < is SOS, > is EOS
        dix = [self.stoi[s] for s in '<'+chunk['EQ']+'>']
        inputs = dix[:-1]
        outputs = dix[1:]
        
        # add the padding to the equations
        paddingSize = max(self.block_size-len(inputs),0)
        paddingList = [self.paddingID]*paddingSize
        inputs += paddingList
        outputs += paddingList 
        
        # make sure it is not more than what should be
        inputs = inputs[:self.block_size]
        outputs = outputs[:self.block_size]
        
        # extract points from the input sequence
        points = torch.zeros(numVars+numYs, numPoints)
        for idx, xy in enumerate(zip(chunk['X'], chunk['Y'])):
            x = xy[0] + [0]*(max(numVars-len(xy[0]),0)) # padding
            y = [xy[1]] if type(xy[1])== float else xy[1]
            y = y + [0]*(max(numYs-len(y),0)) # padding
            p = x+y # because it is only one point 
            p = torch.tensor(p)
            #replace nan and inf
            p = torch.nan_to_num(p, nan=0.0, 
                                 posinf=self.threshold[1], 
                                 neginf=self.threshold[0])
            p[p>self.threshold[1]] = self.threshold[1] # clip the upper bound
            p[p<self.threshold[0]] = self.threshold[0] # clip the lower bound
            points[:,idx] = p
        
        """
        arrange data and targets so that the first i elements of x
        will be asked to predict the i-th element of y. Notice that
        the eventual language model will actually make block_size
        individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward
        pass of the network. So for example if block_size is 4, then
        we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will
        then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next
        
        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during traning will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can paralellize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single character of the 
        sequence along each batch dimension, and repeatedly always feed in a next
        character to get the next one.
        
        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward 
        passes.
        """
        inputs = torch.tensor(inputs, dtype=torch.long)
        outputs = torch.tensor(outputs, dtype=torch.long)
        return inputs, outputs, points


In [6]:
import json
from tqdm import tqdm
import glob
def processDataFiles(files):
    text = ''""
    for f in tqdm(files):
        with open(f, 'r') as h: 
            lines = h.read() # don't worry we won't run out of file handles
            text += lines #json.loads(line)                
    return text

In [7]:
#path = 'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TrainDatasetFixed\*.json'
path = 'D:/Datasets/Symbolic Dataset/Datasets/3Var_-3to3_3.1to10/Train/*.json'
files = glob.glob(path)
text = processDataFiles(files)
chars = sorted(list(set(text))+['_','T','<','>']) # extract unique characters from the text before converting the text to a list
# T is for the test data
text = text.split('\n') # convert the raw text to a set of examples
train_dataset = CharDataset(text, blockSize, chars) 

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:09<00:00,  1.64s/it]


data has 1685219 examples, 51 unique.


In [8]:
idx = np.random.randint(train_dataset.__len__())
inputs, outputs, points = train_dataset.__getitem__(idx)
print('inputs:{}'.format(inputs))
inputs = ''.join([train_dataset.itos[int(i)] for i in inputs])
outputs = ''.join([train_dataset.itos[int(i)] for i in outputs])
print('id:{}\ninputs:{}\noutputs:{}\npoints:{}'.format(idx,inputs,outputs,points))

inputs:tensor([22,  8, 15,  9, 18, 20,  5, 34, 47, 42,  3, 14,  9, 20, 17,  5, 47, 12,
         4,  5, 45, 37, 40,  3, 12,  9, 16, 12,  5, 47, 12,  8, 12,  9, 19, 12,
         4, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33])
id:121958
inputs:<-4.79*exp(3.96*x1)*sin(1.51*x1-1.81)_______________________
outputs:-4.79*exp(3.96*x1)*sin(1.51*x1-1.81)>_______________________
points:tensor([[ 1.2500e+00, -2.0200e+00, -1.4400e+00, -1.3600e+00,  2.7000e-01,
         -9.1000e-01, -1.3100e+00, -2.7000e+00,  1.9000e+00,  2.1900e+00,
         -2.6700e+00,  1.4900e+00,  1.8900e+00,  3.1000e-01, -1.7000e-01,
         -1.1700e+00,  2.4800e+00, -8.8000e-01, -2.5100e+00, -2.1500e+00,
         -8.6000e-01, -1.4200e+00, -8.4000e-01, -1.0100e+00,  1.7400e+00,
         -1.4800e+00,  2.4500e+00,  2.6300e+00, -1.2900e+00,  2.5700e+00,
          2.3600e+00, -2.3200e+00, -8.6000e-01,  1.8500e+00, -1.8500e+00,
          4.0000e-01, -6.4000e-01, -4.7000e-01, 

In [9]:
#path = 'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TestDataset\*.json'
path = 'D:/Datasets/Symbolic Dataset/Datasets/3Var_-3to3_3.1to10/Test/*.json'
files = glob.glob(path)
textTest = processDataFiles([files[0]])
textTest = textTest.split('\n') # convert the raw text to a set of examples
test_dataset = CharDataset(textTest, blockSize, chars)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 77.12it/s]

data has 1001 examples, 51 unique.





In [10]:
idx = np.random.randint(test_dataset.__len__())
inputs, outputs, points = train_dataset.__getitem__(idx)
print(points.min(), points.max())
inputs = ''.join([train_dataset.itos[int(i)] for i in inputs])
outputs = ''.join([train_dataset.itos[int(i)] for i in outputs])
print('id:{}\ninputs:{}\noutputs:{}\npoints:{}'.format(idx,inputs,outputs,points))

tensor(-1000.) tensor(1000.)
id:435
inputs:<x1+4.82*x3-2.97*exp(3.76*x1)_______________________________
outputs:x1+4.82*x3-2.97*exp(3.76*x1)>_______________________________
points:tensor([[-1.7800e+00,  2.5700e+00, -9.2000e-01,  1.1800e+00,  1.3100e+00,
         -1.0000e+00,  1.5700e+00, -1.4700e+00, -2.7200e+00,  2.8500e+00,
          1.6400e+00,  2.8900e+00, -2.6000e-01,  1.4200e+00,  5.0000e-01,
          2.3500e+00, -1.0100e+00, -9.1000e-01, -1.0600e+00, -2.1000e+00,
         -1.5000e+00,  2.3600e+00,  1.0500e+00, -4.0000e-01,  1.0400e+00,
         -1.3000e+00,  1.1000e+00, -2.0600e+00, -1.6300e+00, -6.5000e-01,
          1.9000e+00,  1.4000e+00,  7.2000e-01,  3.2000e-01, -1.0500e+00,
         -2.3200e+00,  2.2800e+00, -2.8000e+00,  8.1000e-01, -2.0000e-02,
          2.7000e-01,  8.1000e-01, -6.8000e-01, -1.6800e+00,  7.2000e-01,
         -4.3000e-01, -2.5500e+00, -5.2000e-01, -6.2000e-01,  2.6300e+00,
          4.5000e-01,  7.3000e-01, -1.6700e+00, -2.5400e+00, -1.8000e-01,
      

In [11]:
from mingpt.model import GPT, GPTConfig, PointNetConfig
pconf = PointNetConfig(embeddingSize=embeddingSize, 
                       numberofPoints=numPoints, 
                       numberofVars=numVars, 
                       numberofYs=numYs)
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=embeddingSize, padding_idx=train_dataset.paddingID)
model = GPT(mconf, pconf)

05/20/2021 14:23:34 - INFO - mingpt.model -   number of parameters: 3.056334e+07


In [None]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=50, batch_size=batchSize, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*blockSize,
                      num_workers=0, ckpt_path='./SavedModels/bestModel/checkpoint.pt')
trainer = Trainer(model, train_dataset, test_dataset, tconf)

try:
    trainer.train()
except KeyboardInterrupt:
    print('KeyboardInterrupt')

epoch 1 iter 123: train loss 1.63433. lr 5.998701e-04:   2%|▍                     | 124/6583 [02:13<1:55:50,  1.08s/it]

Input:tensor([22, 39, 41, 36,  3, 12,  8, 11,  9, 17, 16,  5, 47, 12,  4, 10, 13,  6,
        11,  9, 17, 12, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([ 8, 41, 36,  3, 47,  9, 11,  9, 20,  5,  5, 47, 13,  6, 23, 13, 23, 12,
         9, 20, 23, 23, 23, 23, 23, 23, 23, 20, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 40, 23, 23, 23, 23, 23,
        40, 40, 40, 40, 40, 41], device='cuda:0')
Input:<log(1-0.65*x1)/2+0.61______________________________________
Logit:-og(x.0.9**x2+>2>1.9>>>>>>>9>>>>>>>>>>>>>>>>>>>>n>>>>>nnnnno
Target:tensor([39, 41, 36,  3, 12,  8, 11,  9, 17, 16,  5, 47, 12,  4, 10, 13,  6, 11,
         9, 17, 12, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 393: train loss 1.01493. lr 5.986795e-04:   6%|█▎                    | 394/6583 [07:01<1:48:30,  1.05s/it]

Input:tensor([22, 34, 47, 42,  3, 13,  9, 12, 14,  5, 45, 43, 44, 46,  3, 47, 13,  6,
        11,  9, 12, 15,  4,  4, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([11, 47, 42,  3, 12,  9, 11,  5,  5, 45, 43, 44, 46,  3, 11, 13,  8, 11,
         9, 11, 13,  4,  4, 23, 23, 34, 23, 23, 34, 23, 23, 23, 23, 23, 23,  6,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 46, 12, 46], device='cuda:0')
Input:<exp(2.13*sqrt(x2+0.14))____________________________________
Logit:0xp(1.0**sqrt(02-0.02))>>e>>e>>>>>>+>>>>>>>>>>>>>>>>>>>>>t1t
Target:tensor([34, 47, 42,  3, 13,  9, 12, 14,  5, 45, 43, 44, 46,  3, 47, 13,  6, 11,
         9, 12, 15,  4,  4, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 2542: train loss 0.81672. lr 5.464640e-04:  39%|███████▋            | 2543/6583 [44:54<1:11:36,  1.06s/it]

Input:tensor([22, 11,  9, 19,  5, 45, 43, 44, 46,  3, 47, 12,  8, 11,  9, 20, 12,  4,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([45,  9, 18, 12,  3, 43, 44, 46,  3,  8, 12,  4, 11,  9, 18,  4,  4, 23,
        23, 23, 42, 37, 42, 12, 42, 23, 23,  5, 23, 23,  4, 23,  4,  4,  4,  4,
         4, 23,  4,  4, 42,  4,  4,  4,  4, 11, 13, 23, 23, 23,  4, 23, 23, 23,
        23, 23, 23, 23, 23, 23], device='cuda:0')
Input:<0.8*sqrt(x1-0.91)__________________________________________
Logit:s.71(qrt(-1)0.7))>>>pip1p>>*>>)>)))))>))p))))02>>>)>>>>>>>>>
Target:tensor([11,  9, 19,  5, 45, 43, 44, 46,  3, 47, 12,  8, 11,  9, 20, 12,  4, 23,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 2923: train loss 0.79284. lr 5.299166e-04:  44%|████████▉           | 2924/6583 [51:36<1:03:44,  1.05s/it]

Input:tensor([22, 39, 41, 36,  3, 11,  9, 20, 12,  5, 47, 13,  8, 15,  9, 14, 15,  4,
         6, 11,  9, 11, 19,  5, 34, 47, 42,  3,  8, 14,  9, 18, 17,  5, 47, 14,
         4, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([34, 41, 36,  3, 47,  9, 14, 18,  5, 47, 14,  8, 14,  9, 18, 18,  4,  6,
        11,  9, 11, 13,  5, 34, 47, 42,  3,  8, 14,  9, 11, 18,  5, 47, 14,  4,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 23], device='cuda:0')
Input:<log(0.91*x2-4.34)+0.08*exp(-3.76*x3)_______________________
Logit:eog(x.37*x3-3.77)+0.02*exp(-3.07*x3)>>>>>>>>>>>>>>>>>>>>>>>>
Target:tensor([39, 41, 36,  3, 11,  9, 20, 12,  5, 47, 13,  8, 15,  9, 14, 15,  4,  6,
        11,  9, 11, 19,  5, 34, 47, 42,  3,  8, 14,  9, 18, 17,  5, 47, 14,  4,
        23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 3516: train loss 0.81311. lr 5.004389e-04:  53%|██████████▋         | 3517/6583 [1:02:02<53:44,  1.05s/it]

Input:tensor([22, 12, 12, 16,  9, 15, 15,  5, 47, 12,  5, 47, 14,  5, 34, 47, 42,  3,
        47, 12,  4,  8, 12,  9, 15, 16,  5, 47, 12,  8, 17,  9, 13, 20,  5, 47,
        14,  6, 12,  9, 12, 13, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([ 8,  9,  9,  9, 20, 16,  5, 47, 12,  5, 47, 14,  8, 45, 47, 42,  3,  8,
        12,  4,  8, 12,  9, 20, 20,  5, 47, 12,  8, 12,  9, 16, 20,  5, 47, 14,
         8, 12,  9, 20, 13, 23, 23, 23, 23, 23,  9, 23, 23, 14, 23, 11, 12, 23,
        23, 23,  8, 23, 23,  9], device='cuda:0')
Input:<115.44*x1*x3*exp(x1)-1.45*x1-6.29*x3+1.12__________________
Logit:-...95*x1*x3-sxp(-1)-1.99*x1-1.59*x3-1.92>>>>>.>>3>01>>>->>.
Target:tensor([12, 12, 16,  9, 15, 15,  5, 47, 12,  5, 47, 14,  5, 34, 47, 42,  3, 47,
        12,  4,  8, 12,  9, 15, 16,  5, 47, 12,  8, 17,  9, 13, 20,  5, 47, 14,
         6, 12,  9, 12, 13, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 4100: train loss 0.76004. lr 4.674892e-04:  62%|████████████▍       | 4101/6583 [1:12:19<44:39,  1.08s/it]

Input:tensor([22,  8, 11,  9, 13,  5, 47, 13,  8, 11,  9, 19,  5, 47, 14,  6, 39, 41,
        36,  3,  8, 11,  9, 15,  5, 47, 12,  8, 11,  9, 15, 14,  4,  8, 14,  9,
        18, 17, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([47, 11,  9, 20, 19, 47, 12,  5, 11,  9, 15, 13, 47, 14,  8, 39, 41, 36,
         3,  8, 47,  9, 15, 17, 47, 12,  8, 11,  9, 15,  4,  4,  8, 15,  9, 15,
        23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,  8,  8, 23,  8,  8,
         8,  8,  8, 23,  8, 23], device='cuda:0')
Input:<-0.2*x2-0.8*x3+log(-0.4*x1-0.43)-3.76______________________
Logit:x0.98x1*0.42x3-log(-x.46x1-0.4))-4.4>>>>>>>>>>>>>-->----->->
Target:tensor([ 8, 11,  9, 13,  5, 47, 13,  8, 11,  9, 19,  5, 47, 14,  6, 39, 41, 36,
         3,  8, 11,  9, 15,  5, 47, 12,  8, 11,  9, 15, 14,  4,  8, 14,  9, 18,
        17, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 4349: train loss 0.72937. lr 4.524134e-04:  66%|█████████████▏      | 4350/6583 [1:16:42<39:53,  1.07s/it]

Input:tensor([22,  8, 11,  9, 16, 13,  5, 47, 14,  6, 20,  9, 13, 15,  5, 45, 43, 44,
        46,  3, 11,  9, 14, 15,  5, 47, 14,  6, 12,  4, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([34, 11,  9, 17, 19,  5, 47, 12,  6, 39,  9, 17, 19,  5, 45, 43, 44, 46,
         3, 11,  9, 13,  5,  5, 47, 12,  6, 12,  4, 23, 23, 23, 14,  9, 11, 23,
        23, 14, 23, 23, 23, 15, 18, 23, 23, 23, 23, 13, 23, 23, 23, 12, 23, 23,
        23, 11, 43, 23, 23,  9], device='cuda:0')
Input:<-0.52*x3+9.24*sqrt(0.34*x3+1)______________________________
Logit:e0.68*x1+l.68*sqrt(0.2**x1+1)>>>3.0>>3>>>47>>>>2>>>1>>>0q>>.
Target:tensor([ 8, 11,  9, 16, 13,  5, 47, 14,  6, 20,  9, 13, 15,  5, 45, 43, 44, 46,
         3, 11,  9, 14, 15,  5, 47, 14,  6, 12,  4, 23, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 4496: train loss 0.76017. lr 4.432572e-04:  68%|█████████████▋      | 4497/6583 [1:19:17<36:12,  1.04s/it]

Input:tensor([22, 34, 47, 42,  3, 11,  9, 11, 13,  5, 47, 13,  4,  8, 45, 37, 40,  3,
        13,  9, 19, 15,  5, 47, 13,  6, 11,  9, 11, 13,  4, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([34, 47, 42,  3, 45,  9, 11, 13,  5, 47, 12,  4,  5, 45, 37, 40,  3, 15,
         9, 20, 20,  5, 47, 12,  6, 15,  9, 20, 20,  4, 23, 23, 23, 23,  9,  9,
        23, 13, 23,  9,  4, 23, 23, 23, 12, 23, 12, 23, 23, 12, 12, 12, 23, 12,
        23, 23, 23, 23, 23,  4], device='cuda:0')
Input:<exp(0.02*x2)-sin(2.84*x2+0.02)_____________________________
Logit:exp(s.02*x1)*sin(4.99*x1+4.99)>>>>..>2>.)>>>1>1>>111>1>>>>>)
Target:tensor([34, 47, 42,  3, 11,  9, 11, 13,  5, 47, 13,  4,  8, 45, 37, 40,  3, 13,
         9, 19, 15,  5, 47, 13,  6, 11,  9, 11, 13,  4, 23, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 5227: train loss 0.75586. lr 3.953364e-04:  79%|███████████████▉    | 5228/6583 [1:32:09<24:11,  1.07s/it]

Input:tensor([22, 45, 37, 40,  3, 11,  9, 13,  5, 34, 47, 42,  3,  8, 12,  9, 13, 20,
         5, 47, 12,  4,  4, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([45, 37, 40,  3, 12,  9, 11,  5, 34, 47, 42,  3, 47, 12,  9, 12, 15,  5,
        47, 12,  4,  4, 23, 23, 13, 12,  8, 12,  9, 12,  9, 15, 23,  5, 23, 23,
        23, 23, 23, 15, 15, 47, 47, 47,  4, 15, 11, 11, 15, 15, 15, 23, 11, 23,
        47, 23, 23, 23, 23, 47], device='cuda:0')
Input:<sin(0.2*exp(-1.29*x1))_____________________________________
Logit:sin(1.0*exp(x1.14*x1))>>21-1.1.4>*>>>>>44xxx)400444>0>x>>>>x
Target:tensor([45, 37, 40,  3, 11,  9, 13,  5, 34, 47, 42,  3,  8, 12,  9, 13, 20,  5,
        47, 12,  4,  4, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 1 iter 6582: train loss 0.74567. lr 3.000239e-04: 100%|████████████████████| 6583/6583 [1:55:59<00:00,  1.06s/it]
05/20/2021 16:19:38 - INFO - mingpt.trainer -   test loss: 0.921243
05/20/2021 16:19:38 - INFO - mingpt.trainer -   saving ./SavedModels/bestModel/checkpoint.pt
epoch 2 iter 591: train loss 0.70428. lr 2.577837e-04:   9%|█▉                    | 592/6583 [10:25<1:46:28,  1.07s/it]

Input:tensor([22, 47, 12,  6, 34, 47, 42,  3, 11,  9, 18, 16,  5, 47, 14,  4,  8, 13,
         9, 18, 17, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([47, 12,  6, 13, 47, 42,  3, 47,  9, 20, 18,  5, 47, 14,  4,  8, 13,  9,
        12, 12, 23, 11,  8, 11,  8,  8, 15,  8,  8, 11, 11,  8, 11,  8,  8,  8,
        12,  8, 23, 47,  8, 47, 13, 47, 15,  8, 47, 15,  8, 47, 23, 23, 47,  8,
        47, 23, 47,  8, 47, 47], device='cuda:0')
Input:<x1+exp(0.75*x3)-2.76_______________________________________
Logit:x1+2xp(x.97*x3)-2.11>0-0--4--00-0---1->x-x2x4-x4-x>>x-x>x-xx
Target:tensor([47, 12,  6, 34, 47, 42,  3, 11,  9, 18, 16,  5, 47, 14,  4,  8, 13,  9,
        18, 17, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 2 iter 1720: train loss 0.69324. lr 1.802512e-04:  26%|█████▏              | 1721/6583 [30:16<1:25:20,  1.05s/it]

Input:tensor([22,  8, 45, 37, 40,  3, 13,  9, 17, 15,  5, 47, 12,  5,  5, 13,  8, 14,
         9, 12, 19,  5, 47, 12,  4, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([45, 45, 37, 40,  3, 45,  9, 11,  5,  5, 47, 13,  5, 47, 13,  6, 12,  9,
        11, 14,  5, 47, 12,  4, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
        15, 23, 23, 23, 23, 23, 23, 15, 23, 15, 13, 23, 23, 23, 23, 23, 23, 23,
        23, 23, 23, 23, 23, 23], device='cuda:0')
Input:<-sin(2.64*x1**2-3.18*x1)___________________________________
Logit:ssin(s.0**x2*x2+1.03*x1)>>>>>>>>>>>>4>>>>>>4>42>>>>>>>>>>>>>
Target:tensor([ 8, 45, 37, 40,  3, 13,  9, 17, 15,  5, 47, 12,  5,  5, 13,  8, 14,  9,
        12, 19,  5, 47, 12,  4, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 2 iter 3123: train loss 0.63775. lr 9.651987e-05:  47%|█████████▍          | 3124/6583 [55:26<1:00:56,  1.06s/it]

Input:tensor([22, 39, 41, 36,  3, 14,  9, 13, 20,  8, 12,  9, 15, 20,  5, 47, 13,  4,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([39, 41, 36,  3, 47,  9, 11, 12,  8, 12,  9, 14, 12,  5, 47, 13,  4, 23,
        39, 39, 11, 11, 11, 23, 23, 23, 23, 23, 11, 23, 23, 23, 23, 23, 11, 11,
        23, 23, 23, 23, 23, 23, 15, 23, 23, 15, 23, 23, 15, 23, 23, 23, 23, 23,
        23, 39, 23, 23, 23, 23], device='cuda:0')
Input:<log(3.29-1.49*x2)__________________________________________
Logit:log(x.01-1.31*x2)>ll000>>>>>0>>>>>00>>>>>>4>>4>>4>>>>>>l>>>>
Target:tensor([39, 41, 36,  3, 14,  9, 13, 20,  8, 12,  9, 15, 20,  5, 47, 13,  4, 23,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 2 iter 3221: train loss 0.68984. lr 9.142071e-05:  49%|██████████▊           | 3222/6583 [57:11<58:49,  1.05s/it]

Input:tensor([22, 12,  9, 11, 15,  5, 47, 12,  8, 13,  9, 18, 16,  5, 47, 13,  5, 47,
        14,  8, 18,  9, 15, 13,  5, 47, 13,  8, 11,  9, 13, 19,  5, 47, 14,  8,
        11,  9, 14, 12, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33, 33], device='cuda:0')
Logit:tensor([ 8, 12, 17,  5,  5, 47, 12,  5, 12,  9, 14, 19,  5, 47, 13,  5, 47, 14,
         8, 12,  9, 13, 19,  5, 47, 13,  8, 12,  9, 17, 19,  5, 47, 14,  8, 12,
         9, 12, 19, 23, 17,  9, 11,  9,  9, 44,  9,  9,  9,  9,  9, 46, 44,  3,
        47,  3, 46,  3,  9, 46], device='cuda:0')
Input:<1.04*x1-2.75*x2*x3-7.42*x2-0.28*x3-0.31____________________
Logit:-16**x1*1.38*x2*x3-1.28*x2-1.68*x3-1.18>6.0..r.....tr(x(t(.t
Target:tensor([12,  9, 11, 15,  5, 47, 12,  8, 13,  9, 18, 16,  5, 47, 13,  5, 47, 14,
         8, 18,  9, 15, 13,  5, 47, 13,  8, 11,  9, 13, 19,  5, 47, 14,  8, 11,
         9, 14, 12, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33, 33, 33, 33,

epoch 2 iter 3745: train loss 0.64165. lr 6.615701e-05:  57%|███████████▍        | 3746/6583 [1:06:34<51:06,  1.08s/it]

In [None]:
# model.load_state_dict(torch.load('./SavedModels/bestModel/checkpoint.pt'))
# model = model.eval().to(trainer.device)

In [None]:
# add a safe wrapper for numpy math functions
from numpy import *
import numpy as np

def divide(x, y):
  x = np.nan_to_num(x)
  y = np.nan_to_num(y)
  return np.divide(x,y+1e-5)

def sqrt(x):
  x = np.nan_to_num(x)
  return np.sqrt(np.abs(x)) 

# Mean square error
def mse(y, y_hat):
    y_hat = np.reshape(y_hat, [1, -1])[0]
    y_gold = np.reshape(y, [1, -1])[0]
    our_sum = 0
    for i in range(len(y_gold)):
        our_sum += (y_hat[i] - y_gold[i]) ** 2

    return our_sum / len(y_gold)

# Mean square error
def relativeErr(y, y_hat):
    y_hat = np.reshape(y_hat, [1, -1])[0]
    y_gold = np.reshape(y, [1, -1])[0]
    our_sum = 0
    for i in range(len(y_gold)):
        if y_gold[i] < 1: 
            # use regular MSE
            our_sum += (y_hat[i] - y_gold[i]) ** 2
        else:
            # use relative MSE
            our_sum += ((y_hat[i] - y_gold[i])/y_gold[i]) ** 2

    return our_sum / len(y_gold)

In [None]:
fName = '{}_SymbolicGPT_{}_{}_{}.txt'.format(dataInfo, 
                                             'GPT_PT_Summation', 
                                             'Padding',
                                             blockSize)

In [None]:
# alright, let's sample some character-level symbolic GPT
from mingpt.utils import sample
#from gp_model import Genetic_Model
#from mlp_model import MLP_Model
    
loader = torch.utils.data.DataLoader(
                                test_dataset, 
                                shuffle=False, 
                                pin_memory=True,
                                batch_size=1,
                                num_workers=0)

testRange = [3.1,6.0]
numTestPoints = 10
#test = np.linspace(3.1,6.0,numTestPoints)

# gpm = Genetic_Model(n_jobs=-1)
# mlp = MLP_Model()

resultDict = {}
with open(fName, 'w', encoding="utf-8") as o:
    modelName = 'SymbolicGPT'
    resultDict[fName] = {modelName:[]}
    
    for i, batch in enumerate(loader):
        inputs,outputs,points = batch
        
        print('Test Case {}.'.format(i))
        o.write('Test Case {}/{}.\n'.format(i,len(textTest)))
        
        t = json.loads(textTest[i])
        
        inputs = inputs[:,0:1].to(trainer.device)
        points = points.to(trainer.device)
        outputsHat = sample(model, inputs, blockSize, points=points,
                      temperature=1.0, sample=True, 
                      top_k=10)[0]
            
        # filter out predicted
        target = ''.join([train_dataset.itos[int(i)] for i in outputs[0]])
        predicted = ''.join([train_dataset.itos[int(i)] for i in outputsHat])
        
#         if i == 31:
#             raise
#         else:
#             continue
        
        #print(target, predicted)
        #raise

        target = target.strip(train_dataset.paddingToken).split('>')
        target = target[0] if len(target[0])>=1 else target[1]
        target = target.strip('<').strip(">")
        predicted = predicted.strip(train_dataset.paddingToken).split('>')
        predicted = predicted[0] if len(predicted[0])>=1 else predicted[1]
        predicted = predicted.strip('<').strip(">")
       
        o.write('{}\n'.format(target))
        
        print('Target:{}\nPredicted:{}'.format(target, predicted))
        
        Ys = [] #t['YT']
        Yhats = []
        for xs in t['XT']:
            try:
                eqTmp = target + '' # copy eq
                eqTmp = eqTmp.replace(' ','')
                eqTmp = eqTmp.replace('\n','')
                for i,x in enumerate(xs):
                    # replace xi with the value in the eq
                    eqTmp = eqTmp.replace('x{}'.format(i+1), str(x))
                    if ',' in eqTmp:
                        assert 'There is a , in the equation!'
                YEval = eval(eqTmp)
                YEval = 0 if np.isnan(YEval) else YEval
                YEval = 100 if np.isinf(YEval) else YEval
            except:
                YEval = 100 #TODO: Maybe I have to punish the model for each wrong template not for each point
            Ys.append(YEval)
            try:
                eqTmp = predicted + '' # copy eq
                eqTmp = eqTmp.replace(' ','')
                eqTmp = eqTmp.replace('\n','')
                for i,x in enumerate(xs):
                    # replace xi with the value in the eq
                    eqTmp = eqTmp.replace('x{}'.format(i+1), str(x))
                    if ',' in eqTmp:
                        assert 'There is a , in the equation!'
                Yhat = eval(eqTmp)
                Yhat = 0 if np.isnan(Yhat) else Yhat
                Yhat = 100 if np.isinf(Yhat) else Yhat
            except:
                Yhat = 100
            Yhats.append(Yhat)
        mseErr = relativeErr(Ys,Yhats)
        
        if type(mseErr) is np.complex128:
            mseErr = abs(mseErr.real)
            
        resultDict[fName][modelName].append(mseErr)
        
        o.write('{}:{}\n{}\n\n'.format(modelName, 
                               mseErr,
                               predicted))
        
        print('MSE:{}\n'.format(mseErr))
print('Avg MSE:{}'.format(np.mean(resultDict[fName][modelName])))

In [None]:
# plot the error frequency for model comparison
from matplotlib import pyplot as plt
num_eqns = len(resultDict[fName]['SymbolicGPT'])
num_vars = pconf.numberofVars

models = list(resultDict[fName].keys())
lists_of_error_scores = [resultDict[fName][key] for key in models]
linestyles = ["-","dashdot","dotted","--"]

eps = 0.00001
y, x, _ = plt.hist([np.log([max(min(x+eps, 1e5),1e-5) for x in e]) for e in lists_of_error_scores],
                   label=models,
                   cumulative=True, 
                   histtype="step", 
                   bins=2000, 
                   density="true")
y = np.expand_dims(y,0)
plt.figure(figsize=(15, 10))

for idx, m in enumerate(models): 
    plt.plot(x[:-1], 
           y[idx] * 100, 
           linestyle=linestyles[idx], 
           label=m)

plt.legend(loc="upper left")
plt.title("{} equations of {} variables".format(num_eqns, num_vars))
plt.xlabel("Log of Mean Square Error")
plt.ylabel("Normalized Cumulative Frequency")

name = '{}.png'.format(fName)
plt.savefig(name)