In [1]:
# Configure the root logger once for the whole notebook: INFO level,
# timestamped records that carry the level and the emitting logger's name.
import logging
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

In [2]:
# make deterministic
from mingpt.utils import set_seed
set_seed(42)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
import math
from torch.utils.data import Dataset

In [5]:
# config -- every tunable of the experiment lives in this cell

# model / point-cloud dimensions
embeddingSize = 504
numPoints = 30
numVars = 1
numYs = 1

# padding token and the id passed to the model config for it
paddingToken = 'P'
padId = 0

# spatial extent of the model for its context
block_size = 60
# larger contexts get a smaller batch
batchSize = 16 if block_size >= 100 else 256

pointsAsList = False
padding = True

# GPT/FirstToken/Summation/Concat/outputConcat/outputSummation
method = 'outputConcat'

# use the pointNET if it is true; the plain 'GPT' method never uses it
extractAttributes = True
if method == 'GPT':
    extractAttributes = False

dataInfo = 'Mesh_XYSorted'

In [6]:
class CharDataset(Dataset):
    """Character-level dataset over a single text blob.

    Two modes are supported:

    * plain mode (``extractAtt=False``): items are ``(inputs, outputs)`` where
      ``outputs`` is ``inputs`` shifted by one character. During training the
      chunk is a sliding window of ``block_size + 1`` characters; with
      ``testTime=True`` the chunk is one newline-delimited record.
    * attribute mode (``extractAtt=True``): every newline-delimited record is
      parsed as JSON with keys ``'X'``, ``'Y'`` and ``'EQ'``; items are
      ``(inputs, outputs, points, mask)``.

    Attribute mode reads the notebook-level globals ``paddingToken``,
    ``numVars``, ``numYs`` and ``numPoints``.

    Args:
        data: the full text.
        files: stored on the instance as ``self.files``; not otherwise used here.
        block_size: context length used for chunking / truncation / padding.
        extractAtt: enable attribute mode.
        chars, stoi, itos: optional pre-built vocabulary (e.g. from the
            training set) to reuse instead of deriving one from ``data``.
        testTime: index by record instead of by sliding window in plain mode.
        pointsAsList: return points as a list of tensors instead of one
            ``(numVars + numYs, numPoints)`` tensor.
        padding: pad short sequences with the padding id; otherwise repeat the
            sequence's own tokens to fill the block.
    """

    def __init__(self, data, files=None, block_size=50, extractAtt=False, 
                 chars=None, stoi=None, itos=None, testTime=False,
                 pointsAsList=False, padding=False):
        # Union (not append) the special tokens: appending would list a token
        # twice whenever it already occurs in `data`, which leaves gaps in the
        # ids assigned below and lets ids exceed vocab_size.
        self.chars = sorted(set(data) | {'T', '<', '>'}) if chars is None else chars
        data_size = len(data)
        print('data has %d characters, %d unique.' % (data_size, len(self.chars)))

        if stoi is None:
            if extractAtt:
                # The padding token always takes id 0; drop it from the rest of
                # the vocabulary so it cannot be assigned a second id.
                vocab = [paddingToken] + [ch for ch in self.chars if ch != paddingToken]
            else:
                vocab = self.chars
            self.stoi = {ch: i for i, ch in enumerate(vocab)}
        else:
            self.stoi = stoi
        self.itos = {i: ch for ch, i in self.stoi.items()} if itos is None else itos
        self.block_size = block_size
        self.vocab_size = len(self.stoi)

        self.files = files
        self.data = data # this is memory expensive

        self.attributes = extractAtt
        self.threshold = [-1000,1000]  # replacements for -inf / +inf in the points
        self.testTime = testTime
        self.pointsAsList = pointsAsList
        self.padding = padding

        if self.attributes or self.testTime:
            self.dataList = self.data.split('\n') #TODO: remove later?

            # (start, end) character offsets of every record inside self.data
            self.blockIdx = []
            start = 0
            for record in self.dataList:
                end = start + len(record)
                self.blockIdx.append((start, end))
                start = end + 1  # skip the '\n' separator

    def __len__(self):
        if self.attributes or self.testTime:
            # the last split element is not counted (the text after the final '\n')
            return len(self.dataList) - 1
        else:
            return len(self.data) - self.block_size

    def __getitem__(self, idx):
        if not self.attributes:
            if self.testTime:
                start, end = self.blockIdx[idx]
                chunk = self.data[start:end]
            else:
                # grab a chunk of (block_size + 1) characters from the data
                chunk = self.data[idx:idx + self.block_size + 1]

            dix = [self.stoi[s] for s in chunk]
            inputs = torch.tensor(dix[:-1], dtype=torch.long).contiguous()
            outputs = torch.tensor(dix[1:], dtype=torch.long).contiguous()
            return inputs, outputs

        # ---- attribute mode: one JSON record per item ----
        import json  # local import: the notebook only imports json in a later cell

        start, end = self.blockIdx[idx]
        dic = json.loads(self.data[start:end])
        padValue = self.stoi[paddingToken]

        if self.pointsAsList:
            points = []
        else:
            points = torch.zeros(numVars+numYs, numPoints)

        # `col` is a separate name on purpose so the `idx` argument is not rebound.
        for col, (xVals, yVals) in enumerate(zip(dic['X'], dic['Y'])):
            x = xVals + [padValue]*max(numVars-len(xVals), 0) # padding
            # a scalar Y (float or int in the JSON) becomes a one-element list
            y = yVals if isinstance(yVals, list) else [yVals]
            y = y + [padValue]*max(numYs-len(y), 0) # padding

            p = torch.tensor(x + y) # because it is only one point

            # replace nan and inf
            p = torch.nan_to_num(p, nan=0.0, 
                                 posinf=self.threshold[1], 
                                 neginf=self.threshold[0])

            if self.pointsAsList:
                points.append(p)
            else:
                points[:, col] = p

        chunk = '<'+dic['EQ']+'>'

        # encode every character to an integer, truncated to block_size
        dix = [self.stoi[s] for s in chunk[:self.block_size]]
        dixInput = dix[:-1]
        dixOutput = dix[1:]

        paddingSize = max(self.block_size-len(dixInput), 0)
        if self.padding:
            paddingList = [padValue]*paddingSize
            dixInput += paddingList # padding
            dixOutput += paddingList # padding
        else:
            # instead of padding use the same tokens repetitive
            dixInput += dixInput * (int(paddingSize/len(dixInput))+1) 
            dixOutput += dixOutput * (int(paddingSize/len(dixOutput))+1) 

        dixInput = dixInput[:self.block_size-1] # make sure the size is correct
        dixOutput = dixOutput[:self.block_size-1] # make sure the size is correct

        outputs = torch.tensor(dixOutput, dtype=torch.long).contiguous()
        inputs = torch.tensor(dixInput, dtype=torch.long).contiguous()

        # 1 for real tokens, 0 for the filled positions; the outer product
        # turns the vector into a (block_size, block_size) matrix.
        mask = [1]*len(dix) + [0]*paddingSize
        mask = torch.tensor(mask[:-1], dtype=torch.long).contiguous()
        mask = mask.unsqueeze(0)
        mask = mask.T @ mask

        assert len(mask) == self.block_size, 'Wrong mask shape: {}'.format(mask.shape)
        assert len(inputs) == self.block_size-1, 'Wrong inputs shape: {}'.format(inputs.shape)
        assert len(outputs) == self.block_size-1, 'Wrong y shape: {}'.format(outputs.shape)

        return inputs, outputs, points, mask

In [7]:
import json
from tqdm import tqdm
import glob
def processDataFiles(files):
    """Read each path in `files` (in order) and return the contents as one string.

    A tqdm progress bar is shown while the files are read.
    """
    merged = ''
    for path in tqdm(files):
        # the context manager closes each handle before the next file is opened
        with open(path, 'r') as handle:
            merged += handle.read()
    return merged

In [8]:
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences (\D, \S, \M, \T, \*). The resulting value is unchanged, but a
# non-raw literal triggers an invalid-escape warning on current Python versions.
path = r'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TrainDatasetFixed\*.json'
files = glob.glob(path)
# larger contexts read fewer files (3 instead of 10)
text = processDataFiles(files[:3 if block_size>60 else 10]) #[files[0]])

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.00s/it]


In [9]:
# avgBlockSize = 0
# upNum = 100
# for i in tqdm(range(0,upNum)):
#     avgBlockSize += len(text.split('\n')[i])
# avgBlockSize /= upNum
# print('avg block size is {}'.format(avgBlockSize))

In [10]:
# Build the training dataset; the vocabulary is derived from `text` itself.
train_dataset = CharDataset(
    text,
    files=files,
    block_size=block_size,
    extractAtt=extractAttributes,
    pointsAsList=pointsAsList,
    padding=padding,
)

data has 2430722170 characters, 46 unique.


In [11]:
# Sanity check: draw one random item (from the first 1000 at most) and print
# both its tensors and the decoded character sequences.
idx = np.random.randint(min(len(train_dataset), 1000))
sample = train_dataset[idx]
batch = sample
if not extractAttributes:
    x,y = batch
    print('X:{}\ny:{}\n'.format(x,y))
else:
    x,y,p,m = batch
    print('X:{}\nM:{}\ny:{}\nPoints:{}'.format(x,m,y,p))
    print(p.shape)

# decode the ids back to characters
xc = ''.join(train_dataset.itos[int(tok)] for tok in x)
yc = ''.join(train_dataset.itos[int(tok)] for tok in y)
print('x:{}\n\ny:{}'.format(xc,yc))

X:tensor([23,  9, 42, 35, 38,  4, 42, 35, 38,  4, 12, 10, 14,  6, 44, 13,  5,  5,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0])
M:tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
y:tensor([ 9, 42, 35, 38,  4, 42, 35, 38,  4, 12, 10, 14,  6, 44, 13,  5,  5, 24,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0])
Points:tensor([[ 3.0000,  2.9000,  2.7900,  2.6900,  2.5900,  2.4800,  2.3800,  2.2800,
          2.1700,  2.0700,  1.9700,  1.8600,  1.7600,  1.6600,  1.5500,  1.4500,
          1.3400,  1.2400,  1.1400,  1.0300,  0.9300,  0.8300,  0.720

In [12]:
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences; the value is unchanged.
path = r'D:\Datasets\Symbolic Dataset\Datasets\Mesh_Simple_GPT2_Sorted\TestDataset\*.json'
files = glob.glob(path)
textTest = processDataFiles([files[0]])
# block_size must be passed by keyword: the second positional parameter of
# CharDataset is `files`, so passing it positionally left the test set on the
# default block_size=50 instead of the configured one.
# The vocabulary is reused from the training set so ids agree with the model.
test_dataset = CharDataset(textTest, block_size=block_size, extractAtt=extractAttributes,
                           testTime=True,
                           chars=train_dataset.chars, stoi=train_dataset.stoi, 
                           itos=train_dataset.itos,
                           pointsAsList=pointsAsList, padding=padding)

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.13it/s]

data has 901208 characters, 46 unique.





In [13]:
print(train_dataset.vocab_size, test_dataset.vocab_size)

47 47


In [14]:
torch.cuda.is_available()

True

In [15]:
from mingpt.model import GPT, GPTConfig, PointNetConfig

# point-cloud encoder settings, taken from the config cell
pconf = PointNetConfig(
    embeddingSize=embeddingSize,
    numberofPoints=numPoints,
    numberofVars=numVars,
    numberofYs=numYs,
    method=method,
)

# In attribute mode the dataset yields block_size-1 tokens per item, so the
# model's block size is reduced by one to match.
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size if not extractAttributes else train_dataset.block_size-1,
    n_layer=8,
    n_head=12,
    n_embd=embeddingSize,
    grad_norm_clip=1.0,
    padToken=paddingToken,
    padId=padId,
)

# the point-net config is only handed to the model when attributes are used
model = GPT(mconf) if not extractAttributes else GPT(mconf, pconf)

05/18/2021 23:28:25 - INFO - mingpt.model -   number of parameters: 2.961353e+07


In [16]:
model

GPT(
  (pointNet): TNet(
    (conv1): Conv1d(2, 504, kernel_size=(1,), stride=(1,))
    (conv2): Conv1d(504, 1008, kernel_size=(1,), stride=(1,))
    (conv3): Conv1d(1008, 2016, kernel_size=(1,), stride=(1,))
    (fc1): Linear(in_features=2016, out_features=1008, bias=True)
    (fc2): Linear(in_features=1008, out_features=504, bias=True)
    (input_batch_norm): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn1): BatchNorm1d(504, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm1d(1008, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn3): BatchNorm1d(2016, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn4): BatchNorm1d(1008, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn5): BatchNorm1d(504, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (tok_emb): Embedding(47, 504)
  (drop): Dropout(p=0.1, inplace=False)
  (blocks): Sequen

In [17]:
from mingpt.trainer import Trainer, TrainerConfig

# Trainer settings: learning-rate decay is enabled, with warmup/final token
# counts expressed in tokens; the checkpoint is written to ckpt_path.
tconf = TrainerConfig(
    max_epochs=150,
    batch_size=batchSize,
    learning_rate=5e-4,
    lr_decay=True,
    warmup_tokens=512*20,
    final_tokens=2*len(train_dataset)*block_size,
    num_workers=0,
    ckpt_path='./SavedModels/bestModel/checkpoint.pt',
)
trainer = Trainer(model, train_dataset, test_dataset, tconf)

# Kick off training; interrupting the kernel stops it without a traceback so
# the following cells can still use `model` and `trainer`.
try:
    trainer.train()
except KeyboardInterrupt:
    print('KeyboardInterrupt')

epoch 1 iter 124: train loss 0.77729. lr 4.999879e-04:   1%|▏                    | 124/19532 [00:58<2:30:43,  2.15it/s]

-->Inputs: tensor([23, 12, 10, 17, 19,  6, 44, 13,  6,  6, 14,  7, 14, 10, 20, 14,  6, 44,
        13,  7, 12, 10, 17, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0], device='cuda:0') 
Logits: tensor([12, 10, 20, 16,  6, 44, 13,  6,  6, 14,  7, 12, 10, 12,  6,  6, 44, 13,
        24, 12, 10, 15, 16, 24,  6, 24, 24, 24, 24, 24, 24, 24, 24, 24, 12, 12,
        16, 24, 24, 24, 24, 24, 24, 16, 24, 24, 24, 24, 24, 24, 24, 16, 24, 16,
        24, 24, 16, 16, 24], device='cuda:0') 
Targets: tensor([12, 10, 17, 19,  6, 44, 13,  6,  6, 14,  7, 14, 10, 20, 14,  6, 44, 13,
         7, 12, 10, 17, 20, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0], device='cuda:0')
Inputs:<0.57*x1**2+2.82*x1+0.58PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
Logits:0.84*x1**2+0.0**x1>0.34>*>>>>>>>>>004>

epoch 1 iter 393: train loss 0.73747. lr 4.998791e-04:   2%|▍                    | 394/19532 [03:06<2:29:18,  2.14it/s]

-->Inputs: tensor([23, 42, 40, 41, 43,  4, 44, 13,  9, 12, 10, 12, 13,  5,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0], device='cuda:0') 
Logits: tensor([42, 35, 41, 43,  4, 44, 13,  5, 12, 10, 21, 15,  5, 24,  6,  6,  6,  6,
        42,  6,  6,  6, 35, 12,  6, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 16, 24, 24, 24, 24, 12, 24, 24, 24, 12, 24, 12, 24, 24,
        24, 24, 24, 24, 24], device='cuda:0') 
Targets: tensor([42, 40, 41, 43,  4, 44, 13,  9, 12, 10, 12, 13,  5, 24,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0], device='cuda:0')
Inputs:<sqrt(x1-0.01)PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
Logits:sirt(x1)0.93)>****s***i0*>>>>>>>>>>>>>

epoch 1 iter 537: train loss 0.76020. lr 4.997743e-04:   3%|▌                    | 538/19532 [04:13<2:29:24,  2.12it/s]

KeyboardInterrupt





In [18]:
#torch.save(model, './SavedModels/savedModel.pt')

In [19]:
# model.load_state_dict(torch.load('./SavedModels/bestModel/checkpoint.pt'))
# model = model.eval().to(trainer.device)

In [20]:
# add a safe wrapper for numpy math functions
from numpy import *
import numpy as np

def divide(x, y):
  """Division wrapper: NaNs in either operand are replaced via np.nan_to_num
  and 1e-5 is added to the denominator before dividing."""
  numerator, denominator = np.nan_to_num(x), np.nan_to_num(y)
  return np.divide(numerator, denominator + 1e-5)

def sqrt(x):
  """Square-root wrapper: NaNs are replaced via np.nan_to_num and the root is
  taken of the absolute value, so negative inputs do not produce NaN."""
  magnitude = np.abs(np.nan_to_num(x))
  return np.sqrt(magnitude)

# Mean square error
def mse(y, y_hat):
    """Return the mean of the squared differences between `y_hat` and `y`.

    Both inputs are flattened first; the number of terms is the flattened
    length of `y`.
    """
    predictions = np.reshape(y_hat, (1, -1))[0]
    gold = np.reshape(y, (1, -1))[0]
    count = len(gold)
    # sequential accumulation, starting from the integer 0
    total = sum((predictions[k] - gold[k]) ** 2 for k in range(count))
    return total / count

In [21]:
# Result-file name: encodes the dataset tag, the method, how short sequences
# were filled, the trainer's epoch attribute and the block size.
fName = '{}_SymbolicGPT_{}_{}_{}Epochs_{}.txt'.format(
    dataInfo,
    method,
    'Padding' if padding else 'Repetitive',
    trainer.epoch,
    block_size,
)

In [None]:
# alright, let's sample some character-level symbolic GPT
from mingpt.utils import sample
#from gp_model import Genetic_Model
#from mlp_model import MLP_Model


def evaluateEquation(equation, xs, penalty=100):
    """Evaluate the equation string `equation` at the single point `xs`.

    Whitespace is stripped and every ``x<k>`` (1-based) is textually replaced
    by ``str(xs[k-1])`` before the expression is evaluated against the
    notebook's global namespace (numpy names plus the safe wrappers defined
    in this notebook).

    Returns 0 for a NaN result and `penalty` for an infinite result or for
    any expression that fails to evaluate.
    """
    try:
        expr = equation.replace(' ', '').replace('\n', '')
        for varIdx, value in enumerate(xs):
            # replace xi with the value in the eq
            expr = expr.replace('x{}'.format(varIdx+1), str(value))
        # NOTE(review): the original code had `assert '<message>'` here when a
        # ',' was present, which can never fail. It is intentionally not turned
        # into a real check: calls such as divide(a, b) legitimately contain
        # commas.
        # SECURITY NOTE: eval() executes arbitrary model-generated text. This
        # is tolerable only for trusted, offline experiments.
        result = eval(expr, globals())
        result = 0 if np.isnan(result) else result
        result = penalty if np.isinf(result) else result
    except Exception:
        # Malformed / non-numeric equations get the penalty value. Unlike a
        # bare `except:`, this lets KeyboardInterrupt stop the evaluation.
        result = penalty #TODO: Maybe I have to punish the model for each wrong template
    return result


loader = torch.utils.data.DataLoader(
                                test_dataset, 
                                shuffle=False, 
                                pin_memory=True,
                                batch_size=1,
                                num_workers=0)

testRange = [3.1,6.0]
numTestPoints = 10
#test = np.linspace(3.1,6.0,numTestPoints)

# gpm = Genetic_Model(n_jobs=-1)
# mlp = MLP_Model()

resultDict = {}
with open(fName, 'w', encoding="utf-8") as o:
    textTestList = textTest.split('\n')
    modelName = 'SymbolicGPT'
    resultDict[fName] = {modelName:[]}
    
    for i, batch in enumerate(loader):
        if extractAttributes:
            x,y,p,m = batch
        else:
            x,y = batch
        
        print('Test Case {}.'.format(i))
        o.write('Test Case {}/{}.\n'.format(i,len(textTestList)))
        
        # the raw JSON record of this test case (provides the 'XT' points)
        t = json.loads(textTestList[i])
        
        if model.pointNetConfig:
            # condition only on the first token plus the points
            x = x[:,0:1].to(trainer.device)
            p = [e.to(trainer.device) for e in p] if pointsAsList else p.to(trainer.device)
            yHat = sample(model, x, block_size, points=p, 
                          temperature=1.0, sample=True, 
                          top_k=10)[0]
        else:
            sos_eq_loc = loc = (x == test_dataset.stoi['E']).nonzero(as_tuple=True) 
            # pass everything (x,y) to the model except the equations
            x = x[:,:loc[1].item()+5].to(trainer.device)
            #x = x[:,0:sos_eq_loc].to(trainer.device)
            yHat = sample(model, x, block_size, points=None, 
                          temperature=1.0, sample=True, 
                          top_k=10)[0]
            
        # decode ids back to characters
        target = ''.join([train_dataset.itos[int(tok)] for tok in y[0]])
        predicted = ''.join([train_dataset.itos[int(tok)] for tok in yHat])
                
        # keep only the equation text
        if extractAttributes:
            target = target.strip(paddingToken).split('>')
            target = target[0] if len(target[0])>1 else target[1]
            target = target.strip('<').strip(">")
            predicted = predicted.strip(paddingToken).split('>')
            predicted = predicted[0] if len(predicted[0])>1 else predicted[1]
            predicted = predicted.strip('<').strip(">")
        else:
            target = target[loc[1].item()+5:].split('>')[0]
            predicted = predicted[loc[1].item()+5+1:].split('>')[0]
            target = target.strip(paddingToken).strip('<').strip(">")
            predicted = predicted.strip(paddingToken).strip('<').strip(">")

        o.write('{}\n'.format(target))
        
        print('Target:{}\nPredicted:{}'.format(target, predicted))
        
        # evaluate both equations on the record's 'XT' points
        Ys = [evaluateEquation(target, xs) for xs in t['XT']]
        Yhats = [evaluateEquation(predicted, xs) for xs in t['XT']]
        mseErr = mse(Ys,Yhats)
        
        # complex results are reduced to their real part
        if isinstance(mseErr, np.complexfloating):
            mseErr = mseErr.real
        #elif mseErr < 0.00005: # to handle negative infinity, and log 0
        #    mseErr = 0.00005
            
        resultDict[fName][modelName].append(mseErr)
        
        o.write('{}:{}\n{}\n\n'.format(modelName, 
                               mseErr,
                               predicted))
        
        print('MSE:{}\n'.format(mseErr))

Test Case 0.
Target:-sin(1.1*x1+0.64)
Predicted:sin(x1)
MSE:1.483690849489537

Test Case 1.


In [None]:
print('Avg MSE:{}'.format(np.mean(resultDict[fName][modelName])))

In [None]:
print(fName)

In [None]:
target

In [None]:
predicted

In [None]:
# Strip padding and quote characters from the last target / prediction and
# keep the first '"'-separated piece (or the second when the first has at
# most one character).
target = target.strip(paddingToken).strip("'").split('"')
if len(target[0]) > 1:
    target = target[0]
else:
    target = target[1]
target = target.strip('"')

predicted = predicted.strip("'").strip(paddingToken).split('"')
if len(predicted[0]) > 1:
    predicted = predicted[0]
else:
    predicted = predicted[1]
predicted = predicted.strip('"')

In [None]:
target

In [None]:
predicted

In [None]:
# # load the GP and MLP if it's available
# expPath = 'C:/Users/vpcom/OneDrive - University of Waterloo/Projects/symbolicgpt2/Experiments/OLD/'
# expFile = 'test_1var_simple_mesh_GPT2_XYSorted_1024_88000.out'

# with open(expPath+expFile, 'r') as f:
#     resultDict[fName]['GP'] = []
#     lines = f.readlines()
#     for line in lines:
#         filt = 'GP: '
#         if filt in line:
#             # save the error
#             err = float(line.split(filt)[1].strip('\n'))
#             resultDict[fName]['GP'].append(err)

In [None]:
# plot the error frequency for model comparison
from matplotlib import pyplot as plt
num_eqns = len(resultDict[fName]['SymbolicGPT'])
num_vars = pconf.numberofVars

models = list(resultDict[fName].keys())
lists_of_error_scores = [resultDict[fName][key] for key in models]
linestyles = ["-","dashdot","dotted","--"]

eps = 0.00001
# Cumulative, normalised histogram of log-errors (errors are clamped to
# [1e-5, 100000] before the log); only the returned counts/edges are reused.
y, x, _ = plt.hist([np.log([max(min(err+eps, 100000),1e-5) for err in e]) for e in lists_of_error_scores],
                   label=models,
                   cumulative=True, 
                   histtype="step", 
                   bins=2000, 
                   density=True)
# plt.hist returns a single array for one dataset and a list of arrays for
# several; atleast_2d gives one row per model in both cases (expand_dims only
# produced the right shape for a single model).
y = np.atleast_2d(y)
plt.figure(figsize=(15, 10))

for idx, m in enumerate(models): 
    plt.plot(x[:-1], 
           y[idx] * 100, 
           linestyle=linestyles[idx % len(linestyles)],  # cycle when there are more models than styles
           label=m)

plt.legend(loc="upper left")
plt.title("{} equations of {} variables".format(num_eqns, num_vars))
plt.xlabel("Log of Mean Square Error")
plt.ylabel("Normalized Cumulative Frequency")

name = '{}.png'.format(fName)
plt.savefig(name)