In [1]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

In [2]:
# make deterministic
from mingpt.utils import set_seed
set_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
import math
from torch.utils.data import Dataset

In [5]:
# config
embeddingSize=768
numPoints=30
numVars=1
numYs=1
paddingToken='<PAD>'
padId=0
extractAttributes = True
block_size = 500 # spatial extent of the model for its context
pointsAsList = False

In [6]:
class CharDataset(Dataset):

    def __init__(self, data, block_size, extractAtt=False, 
                 chars=None, stoi=None, itos=None, testTime=False,
                 pointsAsList=False):
        self.chars = sorted(list(set(data))+['T']) if chars==None else chars
        data_size, vocab_size = len(data), len(self.chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate([paddingToken]+self.chars if extractAtt else self.chars) } if stoi==None else stoi
        self.itos = { i:ch for i,ch in enumerate([paddingToken]+self.chars if extractAtt else self.chars) } if itos==None else itos
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
        self.attributes = extractAtt
        self.threshold = [-1000,1000]
        self.testTime = testTime
        self.pointsAsList = pointsAsList
        
        if self.attributes or self.testTime:
            self.dataList = self.data.split('\n') #TODO: remove later?

            self.blockIdx = []
            summation = 0
            for d in self.dataList:
                s = summation
                e = s + len(d)
                self.blockIdx.append((s,e))
                summation = e+1
    
    def __len__(self):
        if self.attributes or self.testTime:
            return len(self.dataList) - 1
        else:
            return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        #chunk = self.data[idx:idx + self.block_size + 1]
        if not self.attributes:
            if self.testTime:
                chunk = self.data[self.blockIdx[idx][0]:self.blockIdx[idx][1]]
            else:
                chunk = self.data[idx:idx + self.block_size + 1]
                
            dix = [self.stoi[s] for i,s in enumerate(chunk)]
            inputs = torch.tensor(dix[:-1], dtype=torch.long).contiguous()
            outputs = torch.tensor(dix[1:], dtype=torch.long).contiguous()
            return inputs, outputs
        else:
            chunk = self.data[self.blockIdx[idx][0]:self.blockIdx[idx][1]]
        
            # extracts other attributes
            points = None
            if self.attributes:
                dic = json.loads(chunk)

                if self.pointsAsList:
                    points = []
                else:
                    points = torch.zeros(numVars+numYs, numPoints)
                    
                for idx, xy in enumerate(zip(dic['X'], dic['Y'])):
                    x = xy[0] + [self.stoi[paddingToken]]*(max(numVars-len(xy[0]),0)) # padding
                    y = [xy[1]] if type(xy[1])== float else xy[1]
                    y = y + [self.stoi[paddingToken]]*(max(numYs-len(y),0)) # padding

                    p = x + y #x.extend(y)
                    p = torch.tensor(p)

                    #replace nan and inf
                    p = torch.nan_to_num(p, nan=0.0, 
                                         posinf=self.threshold[1], 
                                         neginf=self.threshold[0])
                    
                    if self.pointsAsList:
                        points.append(p)
                    else:
                        points[:,idx] = p
                        
                chunk = '"'+dic['EQ']+'"'

            # encode every character to an integer
            dix = [self.stoi[s] for i,s in enumerate(chunk) if i<self.block_size]
            paddingSize = max(self.block_size-len(dix),0)

            mask = [1] + [1 for s in dix]
            #dixX = dix + [self.stoi[paddingToken]]*paddingSize # padding
            #dix += [self.stoi[paddingToken]]*paddingSize # padding
            dix += dix * (int(paddingSize/len(dix))+1) # instead of padding use the same tokens repetitive
            dix = dix[:self.block_size] # make sure the size is correct
            mask += [0]*paddingSize

            inputs = torch.tensor(dix[:-1], dtype=torch.long).contiguous()
            mask = torch.tensor(mask[:-1], dtype=torch.long).contiguous()
            mask = mask.unsqueeze(0)
            mask = mask.T @ mask
            mask = mask.T # transpose the output mask

            """
            arrange data and targets so that the first i elements of x
            will be asked to predict the i-th element of y. Notice that
            the eventual language model will actually make block_size
            individual predictions at the same time based on this data,
            so we are being clever and amortizing the cost of the forward
            pass of the network. So for example if block_size is 4, then
            we could e.g. sample a chunk of text "hello", the integers in
            x will correspond to "hell" and in y will be "ello". This will
            then actually "multitask" 4 separate examples at the same time
            in the language model:
            - given just "h", please predict "e" as next
            - given "he" please predict "l" next
            - given "hel" predict "l" next
            - given "hell" predict "o" next

            In addition, because the DataLoader will create batches of examples,
            every forward/backward pass during traning will simultaneously train
            a LOT of predictions, amortizing a lot of computation. In particular,
            for a batched input of integers X (B, T) where B is batch size and
            T is block_size and Y (B, T), the network will during training be
            simultaneously training to make B*T predictions, all at once! Of course,
            at test time we can paralellize across batch B, but unlike during training
            we cannot parallelize across the time dimension T - we have to run
            a forward pass of the network to recover the next single character of the 
            sequence along each batch dimension, and repeatedly always feed in a next
            character to get the next one.

            So yes there is a big asymmetry between train/test time of autoregressive
            models. During training we can go B*T at a time with every forward pass,
            but during test time we can only go B at a time, T times, with T forward 
            passes.
            """        

            outputs = torch.tensor(dix[1:], dtype=torch.long).contiguous()
            
#             if not self.pointsAsList:
#                 points = torch.tensor(points).contiguous()
#                 points = points.view(numVars+numYs, numPoints)

            #assert mask.shape==outputs.shape==inputs.shape, 'M:{}-O:{}-I:{}'.format(mask.shape,outputs.shape,inputs.shape)
            assert len(mask) == self.block_size, 'Wrong mask shape: {}'.format(mask.shape)
            assert len(inputs) == self.block_size-1, 'Wrong inputs shape: {}'.format(inputs.shape)
            assert len(outputs) == self.block_size-1, 'Wrong y shape: {}'.format(outputs.shape)
            #assert len(points) == numPoints, 'Wrong #points: {}'.format(len(points))

            return inputs, outputs, points, mask

In [7]:
import json
from tqdm import tqdm
import glob
def processDataFiles(files):
    text = ''""
    for f in tqdm(files):
        with open(f, 'r') as h: 
            lines = h.read() # don't worry we won't run out of file handles
            text += lines #json.loads(line)                
    return text

In [8]:
path = 'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TrainDatasetFixed\*.json'
files = glob.glob(path)
text = processDataFiles(files[0:3]) #[files[0]])

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.44it/s]


In [9]:
# avgBlockSize = 0
# upNum = 100
# for i in tqdm(range(0,upNum)):
#     avgBlockSize += len(text.split('\n')[i])
# avgBlockSize /= upNum
# print('avg block size is {}'.format(avgBlockSize))

In [10]:
train_dataset = CharDataset(text, block_size, extractAtt=extractAttributes, pointsAsList=pointsAsList) 

data has 243072217 characters, 44 unique.


In [11]:
idx = np.random.randint(min(train_dataset.__len__(),1000))
sample = train_dataset.__getitem__(idx)
batch = sample
if extractAttributes:
    x,y,p,m = batch
    print('XS:{}\nMS:{}\nyS:{}\nPointsS:{}'.format(x.shape,m.shape,y.shape,len(p)))
    print('X:{}\nM:{}\ny:{}\nPoints:{}'.format(x,m,y,p))
else:
    x,y = batch
    print('X:{}\ny:{}\n'.format(x,y))
    xc = ''.join([train_dataset.itos[int(i)] for i in x]).strip('"')
    yc = ''.join([train_dataset.itos[int(i)] for i in y]).strip('"')
    print('x:{}\n\ny:{}'.format(xc,yc))

XS:torch.Size([499])
MS:torch.Size([500, 500])
yS:torch.Size([499])
PointsS:2
X:tensor([ 3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12, 10, 14,  6, 42, 13,  5,  5,
         3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12, 10, 14,  6, 42, 13,  5,
         5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12, 10, 14,  6, 42, 13,
         5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12, 10, 14,  6, 42,
        13,  5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12, 10, 14,  6,
        42, 13,  5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12, 10, 14,
         6, 42, 13,  5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12, 10,
        14,  6, 42, 13,  5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4, 12,
        10, 14,  6, 42, 13,  5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,  4,
        12, 10, 14,  6, 42, 13,  5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33, 36,
         4, 12, 10, 14,  6, 42, 13,  5,  5,  3,  3,  9, 40, 33, 36,  4, 40, 33,
        36,  4, 12, 10, 14,  6, 42, 13, 

In [12]:
path = 'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TestDataset\*.json'
files = glob.glob(path)
textTest = processDataFiles([files[0]])
test_dataset = CharDataset(textTest, block_size, extractAtt=extractAttributes,
                           testTime=True,
                           chars=train_dataset.chars, stoi=train_dataset.stoi, 
                           itos=train_dataset.itos,
                           pointsAsList=pointsAsList)

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.29it/s]

data has 901208 characters, 44 unique.





In [13]:
torch.cuda.is_available()

True

In [14]:
from mingpt.model import GPT, GPTConfig, PointNetConfig
pconf = PointNetConfig(embeddingSize=embeddingSize, 
                       numberofPoints=numPoints, 
                       numberofVars=numVars, 
                       numberofYs=numYs)
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size-1 if extractAttributes else train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=embeddingSize, grad_norm_clip=1.0,
                  padToken=paddingToken, padId=padId)
model = GPT(mconf, pconf) if extractAttributes else GPT(mconf)

05/15/2021 22:56:24 - INFO - mingpt.model -   number of parameters: 6.897716e+07


In [15]:
model

GPT(
  (pointNet): TNet(
    (conv1): Conv1d(2, 768, kernel_size=(1,), stride=(1,))
    (conv2): Conv1d(768, 1536, kernel_size=(1,), stride=(1,))
    (conv3): Conv1d(1536, 3072, kernel_size=(1,), stride=(1,))
    (fc1): Linear(in_features=3072, out_features=1536, bias=True)
    (fc2): Linear(in_features=1536, out_features=768, bias=True)
    (input_batch_norm): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn1): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm1d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn3): BatchNorm1d(3072, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn4): BatchNorm1d(1536, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn5): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (tok_emb): Embedding(44, 768, padding_idx=0)
  (drop): Dropout(p=0.1, inplace=False)
  (

In [16]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=150, batch_size=16, learning_rate=5e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=0)
trainer = Trainer(model, train_dataset, test_dataset, tconf)

try:
    trainer.train()
except KeyboardInterrupt:
    print('KeyboardInterrupt')

epoch 1 iter 5423: train loss 0.08414. lr 4.908065e-04:  17%|███▎               | 5424/31250 [40:55<3:14:52,  2.21it/s]

KeyboardInterrupt





In [17]:
# torch.save(model, './SavedModels/savedModel.pt')

In [18]:
# add a safe wrapper for numpy math functions
from numpy import *
import numpy as np

def divide(x, y):
  x = np.nan_to_num(x)
  y = np.nan_to_num(y)
  return np.divide(x,y+1e-5)

def sqrt(x):
  x = np.nan_to_num(x)
  return np.sqrt(np.abs(x)) 

# Mean square error
def mse(y, y_hat):
    y_hat = np.reshape(y_hat, [1, -1])[0]
    y_gold = np.reshape(y, [1, -1])[0]
    our_sum = 0
    for i in range(len(y_gold)):
        our_sum += (y_hat[i] - y_gold[i]) ** 2

    return our_sum / len(y_gold)

In [21]:
# alright, let's sample some character-level symbolic GPT
from mingpt.utils import sample
#from gp_model import Genetic_Model
#from mlp_model import MLP_Model
    
loader = torch.utils.data.DataLoader(
                                test_dataset, 
                                shuffle=False, 
                                pin_memory=True,
                                batch_size=1,
                                num_workers=0)

testRange = [3.1,6.0]
numTestPoints = 10
#test = np.linspace(3.1,6.0,numTestPoints)

# gpm = Genetic_Model(n_jobs=-1)
# mlp = MLP_Model()
    
fName = 'res.txt'
resultDict = {}
with open(fName, 'w', encoding="utf-8") as o:
    textTestList = textTest.split('\n')
    modelName = 'SymbolicGPT'
    resultDict[fName] = {modelName:[]}
    
    for i, batch in enumerate(loader):
        if extractAttributes:
            x,y,p,m = batch
        else:
            x,y = batch
        
        print('Test Case {}.'.format(i))
        o.write('Test Case {}/{}.\n'.format(i,len(textTestList)))
        
        t = json.loads(textTestList[i])
        
        if model.pointNetConfig:
            x = x[:,0:1].to(trainer.device)
            p = [e.to(trainer.device) for e in p] if pointsAsList else p.to(trainer.device)
            yHat = sample(model, x, 50, points=p, 
                          temperature=1.0, sample=True, 
                          top_k=10)[0]
        else:
            sos_eq_loc = loc = (x == test_dataset.stoi['E']).nonzero(as_tuple=True) 
            # pass everything (x,y) to the model except the equations
            x = x[:,:loc[1].item()+5].to(trainer.device)
            #x = x[:,0:sos_eq_loc].to(trainer.device)
            yHat = sample(model, x, block_size, points=None, 
                          temperature=1.0, sample=True, 
                          top_k=10)[0]
            
        # filter out predicted
        target = ''.join([train_dataset.itos[int(i)] for i in y[0]])
        predicted = ''.join([train_dataset.itos[int(i)] for i in yHat])
        
        #raise
                
        if extractAttributes:
            target = target.strip(paddingToken).split('"')
            target = target[0] if len(target[0])>1 else target[1]
            target = target.strip('"').strip("'")
            predicted = predicted.strip(paddingToken).split('"')
            predicted = predicted[0] if len(predicted[0])>1 else predicted[1]
            predicted = predicted.strip('"').strip("'")
        else:
            target = target[loc[1].item()+5:].split('"')[0]
            predicted = predicted[loc[1].item()+5+1:].split('"')[0]
            target = target.strip(paddingToken).strip('"').strip("'")
            predicted = predicted.strip(paddingToken).strip('"').strip("'")
            
        

        o.write('{}\n'.format(target))
        
        print('Target:{}\nPredicted:{}'.format(target, predicted))
        
        Ys = [] #t['YT']
        Yhats = []
        for xs in t['XT']:
            try:
                eqTmp = target + '' # copy eq
                eqTmp = eqTmp.replace(' ','')
                eqTmp = eqTmp.replace('\n','')
                for i,x in enumerate(xs):
                    # replace xi with the value in the eq
                    eqTmp = eqTmp.replace('x{}'.format(i+1), str(x))
                    if ',' in eqTmp:
                        assert 'There is a , in the equation!'
                YEval = eval(eqTmp)
                YEval = 0 if np.isnan(YEval) else YEval
                YEval = 100 if np.isinf(YEval) else YEval
            except:
                YEval = 100 #TODO: Maybe I have to punish the model for each wrong template
            Ys.append(YEval)
            try:
                eqTmp = predicted + '' # copy eq
                eqTmp = eqTmp.replace(' ','')
                eqTmp = eqTmp.replace('\n','')
                for i,x in enumerate(xs):
                    # replace xi with the value in the eq
                    eqTmp = eqTmp.replace('x{}'.format(i+1), str(x))
                    if ',' in eqTmp:
                        assert 'There is a , in the equation!'
                Yhat = eval(eqTmp)
                Yhat = 0 if np.isnan(Yhat) else Yhat
                Yhat = 100 if np.isinf(Yhat) else Yhat
            except:
                Yhat = 100
            Yhats.append(Yhat)
        mseErr = mse(Ys,Yhats)
        
        if type(mseErr) is np.complex128:
            mseErr = mseErr.real
        #elif mseErr < 0.00005: # to handle negative infinity, and log 0
        #    mseErr = 0.00005
            
        resultDict[fName][modelName].append(mseErr)
        
        o.write('{}:{}\n{}\n\n'.format(modelName, 
                               mseErr,
                               predicted))
        
        print('MSE:{}\n'.format(mseErr))

Test Case 0.
Target:-sin(1.1*x1+0.64)
Predicted:sin(sin(0.84*x1-0.85))
MSE:0.08790561588615452

Test Case 1.
Target:-1.42*x1+sqrt(x1+0.53)
Predicted:-0.44*x1**2-0.44*x1
MSE:60.18734143076241

Test Case 2.
Target:sqrt(-sin(0.2*x1))
Predicted:sin(0.65*sqrt(x1-0.48))
MSE:0.0061697012426823175

Test Case 3.
Target:sin(sqrt(x1))
Predicted:0.36*sqrt(0.38*x1**2-1)
MSE:0.10851230647995899

Test Case 4.
Target:0.86*x1**2-0.59*x1+1.36
Predicted:sqrt(2)*sqrt(x1-0.93)
MSE:244.49237734977683

Test Case 5.
Target:0.28*sqrt(-x1**2)
Predicted:sin(0.84*sqrt(x1))
MSE:0.173664949632233

Test Case 6.
Target:sin(0.07*x1**2+0.28*x1)
Predicted:sin(0.14*x1+0.08)
MSE:0.6664744552526244

Test Case 7.
Target:sin(x1)
Predicted:sin(0.59*sqrt(0.64*x1-1))
MSE:2.008972497842119

Test Case 8.
Target:0.81*sqrt(-x1**2-0.95*x1+0.12)
Predicted:0.92*sqrt(-x1**2-0.76*x1)
MSE:0.23922213555427688

Test Case 9.
Target:sin(x1+0.37)
Predicted:sqrt(x1+0.66)
MSE:8.642992934204942

Test Case 10.
Target:-0.68*x1**2+1.77*x1
Predicted

Target:x1-0.21
Predicted:x1
MSE:0.04410000000000001

Test Case 88.
Target:0.33*x1-0.98
Predicted:0.94*x1**2+0.45*x1-0.47
MSE:507.07387105466665

Test Case 89.
Target:sqrt(x1-0.1)
Predicted:0.92*sqrt(-x1-0.16)
MSE:0.012869291504733432

Test Case 90.
Target:1.42*x1+0.77
Predicted:0.73*sqrt(-x1**2)
MSE:15.640868499999995

Test Case 91.
Target:sin(0.34*x1)
Predicted:0.51*sqrt(0.95-0.21*x1**2
MSE:9809.52023193843

Test Case 92.
Target:0.83*sqrt(-x1)
Predicted:0.52*(x1+0.7)**(1/4)**(1/4)
MSE:0.008348938194069453

Test Case 93.
Target:sin(x1)+sin(x1-0.38)
Predicted:sqrt(x1)+sin(x1-0.37)
MSE:7.894569517666615

Test Case 94.
Target:0.94*sqrt(-0.65*x1-1)
Predicted:sqrt(x1+0.36)
MSE:0.12088773577129125

Test Case 95.
Target:-2*x1*sin(0.58*x1)+1.0
Predicted:-0.28*x1**2-0.02*x1-sin(0.33*x1)
MSE:53.67588613842156

Test Case 96.
Target:sin(sqrt(x1+0.02))
Predicted:sin(sqrt(0.43*x1-0.9))
MSE:0.04241871801746101

Test Case 97.
Target:x1**(1/4)
Predicted:0.89*(x1+0.58)**(1/4)
MSE:0.014492578464089346

T

KeyboardInterrupt: 

In [None]:
target

In [None]:
predicted

In [None]:
target = target.strip(paddingToken).strip("'").split('"')
target = target[0] if len(target[0])>1 else target[1]
target = target.strip('"')
predicted = predicted.strip("'").strip(paddingToken).split('"')
predicted = predicted[0] if len(predicted[0])>1 else predicted[1]
predicted = predicted.strip('"')

In [None]:
target

In [None]:
predicted

In [None]:
# # load the GP and MLP if it's available
# expPath = 'C:/Users/vpcom/OneDrive - University of Waterloo/Projects/symbolicgpt2/Experiments/OLD/'
# expFile = 'test_1var_simple_mesh_GPT2_XYSorted_1024_88000.out'

# with open(expPath+expFile, 'r') as f:
#     resultDict[fName]['GP'] = []
#     lines = f.readlines()
#     for line in lines:
#         filt = 'GP: '
#         if filt in line:
#             # save the error
#             err = float(line.split(filt)[1].strip('\n'))
#             resultDict[fName]['GP'].append(err)

In [None]:
# plot the error frequency for model comparison
from matplotlib import pyplot as plt
num_eqns = len(resultDict[fName]['SymbolicGPT'])
num_vars = pconf.numberofVars

models = list(resultDict[fName].keys())
lists_of_error_scores = [resultDict[fName][key] for key in models]
linestyles = ["-","dashdot","dotted","--"]

eps = 0.00001
y, x, _ = plt.hist([np.log([x+eps for x in e]) for e in lists_of_error_scores],
                   label=models,
                   cumulative=True, 
                   histtype="step", 
                   bins=2000, 
                   density="true")
y = np.expand_dims(y,0)
plt.figure(figsize=(15, 10))

for idx, m in enumerate(models): 
    plt.plot(x[:-1], 
           y[idx] * 100, 
           linestyle=linestyles[idx], 
           label=m)

plt.legend(loc="upper left")
plt.title("{} equations of {} variables".format(num_eqns, num_vars))
plt.xlabel("Log of Mean Square Error")
plt.ylabel("Normalized Cumulative Frequency")

name = '{}.png'.format('results')
plt.savefig(name)