In [1]:
import os
import json
import ProGED
import random
import numpy as np
from datetime import datetime
from ProGED.generators.grammar_construction import grammar_from_template
from ProGED.generate import generate_models

In [2]:
# GRAMMAR_LIBRARY = {
#     "universal": construct_grammar_universal,
#     "universal-dim": construct_grammar_universal_dim,
#     "rational": construct_grammar_rational,
#     "simplerational": construct_grammar_simplerational,
#     "polytrig": construct_grammar_polytrig,
#     "trigonometric": construct_grammar_trigonometric,
#     "polynomial": construct_grammar_polynomial}

In [3]:
# Config
# --- sampling ---
numSamples = 100            # lower bound on the number of equations collected
numVars = 2                 # variables are named x1 .. x{numVars}
seed = 2021                 # seed shared by all random generators
numPoints = [20,21]         # [low, high) bounds for the number of points per equation
decimals = 4                # rounding precision applied to the sampled points
trainRange = [-1.0,4.0]     # uniform sampling interval for X
testRange = [4.1,8.0]       # uniform sampling interval for XT
constantsRange = [1,1]      # uniform sampling interval for each constant C

# --- record layout: one such dict is written per equation ---
template = {'EQ':'', 'Skeleton':'', 'X':[], 'Y':0.0, 'XT':[], 'YT':0.0,}

# --- output location ---
folder = './Dataset'
os.makedirs(folder, exist_ok=True)

# Timestamp the run so that repeated runs never append to an older dataset.
now = datetime.now()
time = now.strftime("%d%m%Y_%H%M%S")

# The id field is deliberately left as a literal '{}' placeholder: it is filled
# in later with the running file index via dataPath.format(fileID).
fileNamePattern = '/id{fileID}_nv{nv}_np{pts}_trR{trR}_teR{teR}_t{stamp}.json'
dataPath = folder + fileNamePattern.format(fileID='{}',
                                           nv=numVars,
                                           pts=numPoints,
                                           trR=trainRange,
                                           teR=testRange,
                                           stamp=time)
print(dataPath)

./Dataset/id{}_nv2_np[20, 21]_trR[-1.0, 4.0]_teR[4.1, 8.0]_t25052021_230810.json


In [4]:
# add a safe wrapper for numpy math functions
from numpy import *
import numpy as np

def divide(x, y):
    """Divide x by y with NaN inputs neutralised and the denominator shifted.

    Both operands go through np.nan_to_num first, then 1e-5 is added to the
    denominator before np.divide is applied, so a denominator of exactly
    zero does not produce a division by zero.
    """
    numerator = np.nan_to_num(x)
    denominator = np.nan_to_num(y) + 1e-5
    return np.divide(numerator, denominator)

def sqrt(x):
    """Square root of |x|, with NaN inputs first replaced via np.nan_to_num.

    Taking the absolute value keeps the result real for negative inputs.
    """
    return np.sqrt(np.abs(np.nan_to_num(x)))

# Mean square error
def mse(y, y_hat):
    """Return the mean of the squared differences between y_hat and y.

    Both inputs are flattened first. The number of terms is taken from the
    flattened reference `y`; the sum is accumulated element by element.
    """
    predicted = np.reshape(y_hat, -1)
    reference = np.reshape(y, -1)

    total = 0
    for idx, gold in enumerate(reference):
        diff = predicted[idx] - gold
        total += diff ** 2

    return total / len(reference)

# Mean of squared errors, absolute or relative depending on the target value
def relativeErr(y, y_hat):
    """Return the mean of per-element squared errors between y_hat and y.

    Both inputs are flattened first. For each reference value below 1 the
    plain squared difference is used; otherwise the difference is divided
    by the reference value before squaring.
    """
    predicted = np.reshape(y_hat, -1)
    reference = np.reshape(y, -1)

    total = 0
    for idx, gold in enumerate(reference):
        diff = predicted[idx] - gold
        # NOTE(review): the threshold compares the signed value, so large
        # negative targets also take the absolute branch - confirm intended.
        if gold < 1:
            total += diff ** 2
        else:
            total += (diff / gold) ** 2

    return total / len(reference)

In [5]:
# Seed every source of randomness used in this notebook from the same value.
rng = np.random.RandomState(seed)   # dedicated generator for sampling the points
random.seed(seed)                   # Python's `random`, used for the constants
np.random.seed(seed)                # numpy's global generator

In [6]:
# Variable names x1 .. x{numVars}; the grammar settings take them wrapped in
# single quotes, while `symbols` takes the bare names.
variableNames = ['x{}'.format(i+1) for i in range(numVars)]
symbols = {"x": variableNames, "start": "S", "const": "C"}

grammarSettings = {
    "functions": ["sin", "sqrt", "exp", "log"],
    "variables": ["'{}'".format(name) for name in variableNames],
    "p_sum": [0.2, 0.2, 0.6],
    "p_mul": [0.3, 0.1, 0.6],
    "p_rec": [0.2, 0.4, 0.4],
    # every variable is equally likely
    "p_vars": [1/numVars] * numVars,
    # NOTE(review): one more entry than `functions`; presumably the first
    # probability belongs to the function-free production - confirm in ProGED.
    "p_functs": [0.7, 0.1, 0.1, 0.05, 0.05],
}
grammer = grammar_from_template("universal", grammarSettings)

In [7]:
# Generate Equations
# Keep sampling batches until at least numSamples models have been collected.
# A batch may contribute fewer than numSamples models, hence the loop; the
# final list may also end up longer than numSamples.
equations = []
while len(equations) < numSamples:
    # the output is a ModelBox object
    models = generate_models(grammer, symbols, strategy_settings={"N": numSamples})
    equations.extend(models)

In [9]:
# Generate the data
# One JSON record (see `template`) is appended per equation to the current
# output file; a new file is started once the current one grows too large.
MAX_FILE_BYTES = 500000000  # 500 MB


def evaluateEquation(expression, points):
    """Evaluate `expression` at every row of `points`.

    The variables x1 .. x{numVars} are bound by name to the columns of each
    row instead of being pasted into the expression text. Text substitution
    turns `x1**2` with x1 = -0.5 into `-0.5**2`, which Python evaluates as
    -(0.5**2), and would also let 'x1' clobber 'x10' for numVars >= 10.

    Parameters
    ----------
    expression : str
        Expression using x1 .. x{numVars} and the math names available in
        this notebook's global namespace (sin, exp, log, sqrt, ...).
    points : array of shape (numPoints, numVars)
        Sample points, one row per evaluation.

    Returns
    -------
    list of float
        One value per row, rounded to `decimals` places. NaN/inf values
        produced by numpy (e.g. log of a negative number, division by a
        zero sample) are kept as they are.
    """
    names = ['x{}'.format(varId + 1) for varId in range(numVars)]
    values = []
    for point in points:
        bindings = dict(zip(names, point))
        # eval is only ever applied to expressions produced by the grammar
        # above; do not feed it expressions from an untrusted source.
        y = eval(expression, globals(), bindings)
        # round to the configured precision (not to whole numbers) and store
        # a plain float so the record stays JSON-serialisable
        values.append(float(np.round(y, decimals)))
    return values


fileID = 1

for model in equations:
    skeletonEqn = str(model) # convert the object to string

    # for each equation choose the number of points randomly
    chosenPoints = np.random.randint(numPoints[0], numPoints[1])

    # draw one random value within the given boundary for every constant
    # placeholder 'C' found in the generated equation
    constants = [random.uniform(constantsRange[0], constantsRange[1])
                 for ch in skeletonEqn if ch == 'C']
    eq = skeletonEqn.replace('C', '{}').format(*constants) if constants else skeletonEqn

    # training points, sampled uniformly: X has shape (chosenPoints, numVars)
    X = np.round(rng.uniform(low=trainRange[0], high=trainRange[1],
                             size=(chosenPoints, numVars)), decimals)
    # calculate y based on x
    Y = evaluateEquation(eq, X)

    # test points from the test range, same shape as X
    XT = np.round(rng.uniform(low=testRange[0], high=testRange[1],
                              size=(chosenPoints, numVars)), decimals)
    # calculate yt based on xt
    YT = evaluateEquation(eq, XT)

    structure = template.copy() # copy the template

    # hold data in the structure
    structure['X'] = X.tolist()
    structure['Y'] = Y
    structure['Skeleton'] = skeletonEqn
    structure['EQ'] = eq
    structure['XT'] = XT.tolist()
    structure['YT'] = YT

    # write to a file; if the current file is already too large, move on to
    # the next file id *before* writing so this record lands in the new file
    outputPath = dataPath.format(fileID)
    if os.path.exists(outputPath) and os.path.getsize(outputPath) > MAX_FILE_BYTES:
        fileID += 1
        outputPath = dataPath.format(fileID)

    with open(outputPath, "a", encoding="utf-8") as h:
        json.dump(structure, h, ensure_ascii=False)
        h.write('\n')

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


In [10]:
# Temp: the scratch cells below try out each grammar template on a small sample

In [20]:
# Scratch: request N=10 samples from the "universal" template over x1, x2.
symbols = dict(x=['x1', 'x2'], start="S", const="C")

universalSettings = {
    "variables": ["'x1'", "'x2'"],
    "p_vars": [0.5,0.5],
    "functions": ["sin", "cos", "sqrt", "exp"],
    # NOTE(review): five probabilities for four functions; presumably the
    # first one is for the function-free production - confirm in ProGED.
    "p_functs": [0.6, 0.1, 0.1, 0.1, 0.1],
    "p_sum": [0.2, 0.2, 0.6],
    "p_mul": [0.2, 0.2, 0.6],
    "p_rec": [0.2, 0.2, 0.6],
}
grammer = grammar_from_template("universal", universalSettings)

models = generate_models(grammer, symbols, strategy_settings={"N": 10})
print(models)

ModelBox: 8 models
-> C*x2/x1, p = 0.0002592, parse trees = 1, valid = False
-> x2/x1, p = 0.0064800000000000005, parse trees = 1, valid = False
-> x2, p = 0.108, parse trees = 1, valid = False
-> x1/x2, p = 0.0064800000000000005, parse trees = 1, valid = False
-> C + x2, p = 0.00011197440000000002, parse trees = 1, valid = False
-> C*x1 + 2*x1 + x2, p = 2.0155392e-07, parse trees = 1, valid = False
-> C*x2 + C + C/x2 + x1*x2**3/cos(x1*sqrt(x1 + x2) + x2**2) + x1, p = 2.112394198730865e-37, parse trees = 1, valid = False
-> sqrt(x2), p = 4.665600000000001e-05, parse trees = 1, valid = False


In [21]:
# Scratch: request N=10 samples from the "universal-dim" template over x1..x3.
symbols = dict(x=['x1', 'x2', 'x3'], start="S", const="C")

universalDimSettings = {
    "variables": ["'x1'", "'x2'", "'x3'"],
    "functions": ["sin", "cos", "sqrt", "exp"],
    "p_functs": [0.6, 0.1, 0.1, 0.1, 0.1],
    "p_sum": [0.2, 0.2, 0.6],
    "p_mul": [0.2, 0.2, 0.6],
    "p_rec": [0.2, 0.4, 0.4],
    # NOTE(review): four unit vectors for three variables; presumably the last
    # one (index -1 below) describes the target variable - confirm in ProGED.
    "units": [[2,-2,1,0,0], [1,0,0,0,0], [-1,0,0,0,0], [2,-2,1,0,0]],
    "target_variable_unit_index": -1,
    "dimensionless": [0,0,0,0,0],
}
grammer = grammar_from_template("universal-dim", universalDimSettings)

models = generate_models(grammer, symbols, strategy_settings={"N": 10})
print(models)

ModelBox: 10 models
-> C*x1 + x1, p = 1.1796480000000002e-06, parse trees = 1, valid = False
-> C*x1 + C + x1*sin(1), p = 2.264924160000001e-11, parse trees = 1, valid = False
-> C/(x3*(C*x2 + C - x2)), p = 7.514474781081598e-16, parse trees = 1, valid = False
-> C + 2*x1, p = 2.3887871999999995e-05, parse trees = 1, valid = False
-> C*x1 + C*cos(C*x1 + C*x3 + C), p = 6.597069766656e-21, parse trees = 1, valid = False
-> x1, p = 0.144, parse trees = 1, valid = False
-> C*x1, p = 0.00012287999999999996, parse trees = 1, valid = False
-> C + x1, p = 0.000497664, parse trees = 1, valid = False
-> C*x1 + C, p = 0.0009215999999999997, parse trees = 1, valid = False
-> x1/(C + C/x3), p = 9.555148800000004e-08, parse trees = 1, valid = False


In [247]:
# Scratch: request N=10 samples from the "rational" template over x1.
symbols = dict(x=['x1'], start="S", const="C")

rationalSettings = {
    "functions": ["'exp'"],
    "variables": ["'x1'"],
    "p_vars": [1],
    "p_S": [0.4, 0.6],
    "p_T": [0.4, 0.6],
    "p_R": [0.4, 0.6],
    "p_F": [1],
}
grammer = grammar_from_template("rational", rationalSettings)

models = generate_models(grammer, symbols, strategy_settings={"N": 10})
print(models)

ModelBox: 9 models
-> C*exp(C*x1), p = 0.018765595607039998, parse trees = 2, valid = False
-> C/(C*exp(C*x1) + C), p = 0.0001719926784, parse trees = 1, valid = False
-> C*(C*x1**2 + C), p = 0.0007166361600000001, parse trees = 1, valid = False
-> (C*x1 + C*exp(C*x1**2) + C)/(C*exp(C*x1**2) + C), p = 6.340338096537601e-07, parse trees = 1, valid = False
-> exp(C*x1), p = 0.012441599999999997, parse trees = 1, valid = False
-> C*(C*exp(C*x1) + C), p = 0.0002579890176, parse trees = 1, valid = False
-> (C*x1**2 + C*exp(C*x1**2))/(C*exp(C*x1) + C*exp(C*x1**2) + C), p = 1.521681143169024e-07, parse trees = 1, valid = False
-> C*(C*exp(C*x1) + C)*exp(C*x1), p = 0.0010749542399999998, parse trees = 1, valid = False
-> (C*x1 + C*exp(C*x1))/(C*exp(C*x1) + C), p = 3.962711310335999e-06, parse trees = 1, valid = False


In [249]:
# Scratch: request N=10 samples from the "simplerational" template over x1.
symbols = dict(x=['x1'], start="S", const="C")

simpleRationalSettings = {
    "functions": ["'exp'"],
    "variables": ["'x1'"],
    "p_vars": [1],
    "p_F": [1],
    "p_S": [0.2, 0.8],
    "p_P": [0.4, 0.3, 0.3],
    "p_R": [0.4, 0.6],
    "p_M": [0.4, 0.6],
}
grammer = grammar_from_template("simplerational", simpleRationalSettings)

models = generate_models(grammer, symbols, strategy_settings={"N": 10})
print(models)

ModelBox: 9 models
-> C*x1, p = 0.0864, parse trees = 1, valid = False
-> C*x1**2 + C, p = 0.013824000000000001, parse trees = 1, valid = False
-> C*x1**5, p = 0.00221184, parse trees = 1, valid = False
-> C*exp(C*x1) + C, p = 0.000331776, parse trees = 1, valid = False
-> C/x1, p = 0.0216, parse trees = 1, valid = False
-> C*exp(C*x1) + C*exp(C*x1**2), p = 0.00221184, parse trees = 1, valid = False
-> C*exp(C*x1), p = 0.0576, parse trees = 1, valid = False
-> C*x1**2 + C*x1 + C, p = 0.0019906560000000004, parse trees = 1, valid = False
-> C*x1**3, p = 0.013824000000000001, parse trees = 1, valid = False


In [262]:
# Scratch: request N=10 samples from the "polytrig" template over x1..x3.
# NOTE(review): unlike the other cells, the names in symbols["x"] carry inner
# single quotes here - confirm that this is intended.
quotedVariables = ["'x1'", "'x2'", "'x3'"]
symbols = dict(x=list(quotedVariables), start="S", const="C")

polytrigSettings = {
    "variables": list(quotedVariables),
    "p_vars": [0.5,0.3,0.2],
    "p_more_terms": [0.7,0.15,0.15],
    "p_higher_terms": 0.5,
}
grammer = grammar_from_template("polytrig", polytrigSettings)

models = generate_models(grammer, symbols, strategy_settings={"N": 10})
print(models)

ModelBox: 9 models
-> C*x1*x2 + C*x1*x3 + C*x1 + C*x2 + C, p = 1.8087890624999998e-06, parse trees = 1, valid = False
-> C*x1*x2*x3 + C, p = 0.0005625, parse trees = 1, valid = False
-> C*x2*x3 + C*x2 + C*x3 + C, p = 1.7364374999999992e-06, parse trees = 1, valid = False
-> C*x1**2*x2**4*x3 + C*x1**2*x2 + C*x1**2*x3 + C*x1 + C*x3**2 + C*x3 + C, p = 6.262272490561006e-18, parse trees = 1, valid = False
-> C*x1**3*x2 + C*x1**2*x2 + C*x1**2 + C*x1*x3 + C*x1 + C*x2**3 + C*x2**2*x3 + C*x2 + C*x3 + C, p = 1.9029578378699922e-22, parse trees = 1, valid = False
-> C*x3 + C, p = 0.015, parse trees = 1, valid = False
-> C*x2 + C, p = 0.01740375, parse trees = 2, valid = False
-> C*x1**4*x2**2*x3 + C*x1**2*x2*x3 + C*x1**2 + C*x1*x2**3 + C*x1*x2 + C*x1*x3 + C*x1 + C*x2 + C*x3 + C, p = 2.9515572949337384e-31, parse trees = 1, valid = False
-> C*x1**2*x2*x3**2 + C*x2 + C, p = 1.4765624999999999e-06, parse trees = 1, valid = False


In [248]:
# Scratch: request N=10 samples from the "trigonometric" template over x1, x2.
symbols = dict(x=['x1', 'x2'], start="S", const="C")

trigonometricSettings = {
    "variables": ["'x1'", "'x2'"],
    "p_vars": [0.5,0.5],
    "functions": ["'sin'", "'cos'", "'tan'"],
    "probs1": [0.5,0.5],
    # NOTE(review): these three values add up to 0.9, not 1 - confirm how
    # ProGED treats an unnormalised probability list.
    "probs2": [0.3,0.3,0.3],
}
grammer = grammar_from_template("trigonometric", trigonometricSettings)

models = generate_models(grammer, symbols, strategy_settings={"N": 10})
print(models)

ModelBox: 8 models
-> tanh(x1), p = 0.08333333333333334, parse trees = 1, valid = False
-> cosh(x2), p = 0.08333333333333334, parse trees = 1, valid = False
-> tan(x1), p = 0.08333333333333334, parse trees = 1, valid = False
-> sin(x1), p = 0.08333333333333334, parse trees = 1, valid = False
-> tanh(x2), p = 0.08333333333333334, parse trees = 1, valid = False
-> sinh(x2), p = 0.08333333333333334, parse trees = 1, valid = False
-> tan(x2), p = 0.08333333333333334, parse trees = 1, valid = False
-> cos(x1), p = 0.08333333333333334, parse trees = 1, valid = False


In [117]:
# Scratch: request N=10 samples from the "polynomial" template over x1, x2.
symbols = dict(x=['x1','x2'], start="S", const="C")

polynomialSettings = {
    "variables": ["'x1'","'x2'"],
    "p_vars": [0.5, 0.5],
    "functions": ["'1.0*'"],
    "p_S": [0.4, 0.6],
    "p_T": [0.4, 0.6],
    "p_R": [0.6, 0.4],
    # NOTE(review): a single probability of 0.5 for the single function entry;
    # the other cells use [1] here - confirm this is intended.
    "p_F": [0.5],
}
grammer = grammar_from_template("polynomial", polynomialSettings)

models = generate_models(grammer, symbols, strategy_settings={"N": 10})
print(models)

ModelBox: 9 models
-> C*x2 + C, p = 0.0027648, parse trees = 1, valid = False
-> C*x1*x2, p = 0.00864, parse trees = 1, valid = False
-> C*x1**3*x2**2 + C, p = 4.423680000000001e-06, parse trees = 1, valid = False
-> C*x2**2 + C*x2 + C, p = 1.5925248000000003e-05, parse trees = 1, valid = False
-> C*x1 + C, p = 0.004147199999999999, parse trees = 1, valid = False
-> C*x1**2 + C*x1, p = 0.00011059200000000001, parse trees = 1, valid = False
-> C*x1**2*x2**4 + C*x2, p = 3.981312000000001e-07, parse trees = 1, valid = False
-> C*x1*x2 + C, p = 0.00124416, parse trees = 1, valid = False
-> C*x1**2, p = 0.0144, parse trees = 2, valid = False
