In [2]:
import os
import json
import random
import numpy as np
from datetime import datetime

In [377]:
# Config
numSamples = 100
numVars = 2
seed = 2021
numPoints = [20,21]
decimals = 4
trainRange = [-1.0,4.0]
testRange = [4.1,8.0]
constantsRange = [1,1]
template = {'EQ':'', 'Skeleton':'', 'X':[], 'Y':0.0, 'XT':[], 'YT':0.0,}
folder = './Dataset'
os.makedirs(folder, exist_ok=True)
now = datetime.now()
time = now.strftime("%d%m%Y_%H%M%S")
dataPath = folder +'/id{}_nv{}_np{}_trR{}_teR{}_templateBased_t{}.json'.format('{}', numVars, numPoints, 
                                                                 trainRange,
                                                                 testRange, 
                                                                 time)
maxSumTerms = 6
maxMulTerms = 2
maxExponents = 6
pSum = 0.99
pMul = 0.5
print(dataPath)

./Dataset/id{}_nv2_np[20, 21]_trR[-1.0, 4.0]_teR[4.1, 8.0]_templateBased_t26052021_140050.json


In [6]:
np.random.seed(seed)
random.seed(seed)
rng = np.random.RandomState(seed)

In [7]:
# add a safe wrapper for numpy math functions
from numpy import *
import numpy as np

def divide(x, y):
  x = np.nan_to_num(x)
  y = np.nan_to_num(y)
  return np.divide(x,y+1e-5)

def sqrt(x):
  x = np.nan_to_num(x)
  return np.sqrt(np.abs(x)) 

# Mean square error
def mse(y, y_hat):
    y_hat = np.reshape(y_hat, [1, -1])[0]
    y_gold = np.reshape(y, [1, -1])[0]
    our_sum = 0
    for i in range(len(y_gold)):
        our_sum += (y_hat[i] - y_gold[i]) ** 2

    return our_sum / len(y_gold)

# Mean square error
def relativeErr(y, y_hat):
    y_hat = np.reshape(y_hat, [1, -1])[0]
    y_gold = np.reshape(y, [1, -1])[0]
    our_sum = 0
    for i in range(len(y_gold)):
        if y_gold[i] < 1: 
            # use regular MSE
            our_sum += (y_hat[i] - y_gold[i]) ** 2
        else:
            # use relative MSE
            our_sum += ((y_hat[i] - y_gold[i])/y_gold[i]) ** 2

    return our_sum / len(y_gold)

In [10]:
symbols = {"x":['x{}'.format(i+1) for i in range(numVars)], "const":"C"}
# Generate Equations
equations = []
#while len(equations) < numSamples:
    #models = generate_models(grammer, symbols, strategy_settings = {"N":numSamples}) # the output is an ModelBox object
    #equations.extend([eq for eq in models])

In [358]:
templates = {
    'polynomials': '{}*{}**{}'.format(symbols['const'],'{}','{}'), # identity is part of the polynomials
    'exponentials': '{}*exp({})**{}'.format(symbols['const'],'{}','{}'),
    'logarithms': '{}*log({})**{}'.format(symbols['const'],'{}','{}'),
    'cos': '{}*cos({})**{}'.format(symbols['const'],'{}','{}'),
    'sin': '{}*sin({})**{}'.format(symbols['const'],'{}','{}'),
    'sqrt': '{}*sqrt({})**{}'.format(symbols['const'],'{}','{}'),
    'power': '{}*{}**{}'.format(symbols['const'],'{}','{}'),
    'identity': '{}*{}'.format(symbols['const'],'{}'),
}

In [384]:
symbols

{'x': ['x1', 'x2'], 'const': 'C'}

In [359]:
templates

{'polynomials': 'C*{}**{}',
 'exponentials': 'C*exp({})**{}',
 'logarithms': 'C*log({})**{}',
 'cos': 'C*cos({})**{}',
 'sin': 'C*sin({})**{}',
 'sqrt': 'C*sqrt({})**{}',
 'power': 'C*{}**{}',
 'identity': 'C*{}'}

In [735]:
# Generate the data
fileID = 1

numTermsChoices = list(range(maxSumTerms))
keys = list(templates.keys())
for i in range(numSamples):
    numTerms = np.random.choice(numTermsChoices) # choose the number of terms
    eq = '{}'.format(symbols['const']) # add a constant
    for i in range(numTerms): # for each term, generate an expression
        
        # generate an equation
        if np.random.rand() < pSum: # Summation probability
            term = np.random.choice(len(templates)) # choose a term
            if keys[term] != 'power': # except power which is designed for expression like t1^t2, all the others have the same template
                exponent = np.random.choice(maxExponents) # choose an exponent
                if exponent != 0: # if exponent is not zero continue, otherwise ignore this term.
                    # generate the first term
                    if keys[term] == 'identity':
                        t1 = (templates[keys[term]] + '').format(np.random.choice(symbols['x']))
                    else:
                        t1 = (templates[keys[term]] + '').format(np.random.choice(symbols['x']), exponent)
                    if np.random.rand() < pMul: # Multiplication Probability (t1*t2)
                        # create the second term
                        term2 = np.random.choice(len(templates)) # choose a term
                        exponent = np.random.choice(maxExponents) # choose an exponent
                        if exponent != 0: # if exponent is not zero continue, otherwise ignore this term.
                            if keys[term] == 'identity':
                                t2 = (templates[keys[term]] + '').format(np.random.choice(symbols['x'])).strip('C*')
                            else:
                                t2 = (templates[keys[term]] + '').format(np.random.choice(symbols['x']), exponent).strip('C*')
                            t += '*' + t2
                    eq += '+' + t1
            else:
                # for the special t1^t2 case
                term = np.random.choice(len(templates)) # choose a term
                exponent = np.random.choice(maxExponents) # choose an exponent
                if exponent != 0: # if exponent is not zero continue, otherwise ignore this term.
                    # generate the first term
                    if keys[term] == 'identity':
                        t1 = (templates[keys[term]] + '').format(np.random.choice(symbols['x'])).strip('C*')
                    else:
                        t1 = (templates[keys[term]] + '').format(np.random.choice(symbols['x']), exponent).strip('C*')
                term = np.random.choice(len(templates)) # choose a term
                exponent = np.random.choice(maxExponents) # choose an exponent
                if exponent != 0: # if exponent is not zero continue, otherwise ignore this term.
                    # generate the first term
                    if keys[term] == 'identity':
                        t2 = (templates[keys[term]] + '').format(np.random.choice(symbols['x'])).strip('C*')
                    else:
                        t2 = (templates[keys[term]] + '').format(np.random.choice(symbols['x']), exponent).strip('C*')
                t = 'C*' + (templates['power']+'').format(t1, t2).strip('C*')
                eq += '+' + t                
        
        
    # generate data points 
    skeletonEqn = eq + '' # copy an equation
    chosenPoints = np.random.randint(numPoints[0],numPoints[1]) # for each equation choose the number of points randomly

    # find all constants in the generated equation, generate a random number based on the given boundry
    constants = [random.uniform(constantsRange[0], constantsRange[1]) for i,x in enumerate(skeletonEqn) if x=='C']            
    eq = skeletonEqn.replace('C','{}').format(*constants) if len(constants)>0 else skeletonEqn

    # for each variable, generate the same number of points (x: (numPoints, numVars))
    X = np.round(rng.uniform(low=trainRange[0], high=trainRange[1], size=(chosenPoints,numVars)), decimals) # generate random points uniformly

    # calculate y based on x
    Y = []
    for point in X:
        tmpEq = eq + '' # copy the string
        for varId in range(numVars):
            tmpEq = tmpEq.replace('x{}'.format(varId+1),str(np.round(point[varId], decimals)))
        try: 
            y = eval(tmpEq)
            if type(y) is np.complex128 or np.complex:
                y = 0 #abs(err.real)
        except ZeroDivisionError:
            y = 0
        except OverflowError:
            y = 0
        except:
            raise Exception('Err to process this equation: {}, original:{}'.format(tmpEq, skeletonEqn)) 
        Y.append(round(y, decimals))

    # generate xt for the test range
    XT = np.round(rng.uniform(low=testRange[0], high=testRange[1], size=(chosenPoints,numVars)), decimals) # generate random points uniformly

    # calculate yt based on xt
    YT = []
    for point in XT:
        tmpEq = eq + '' # copy the string
        for varId in range(numVars):
            tmpEq = tmpEq.replace('x{}'.format(varId+1),str(point[varId]))
        try: 
            y = eval(tmpEq)
            if type(y) is np.complex128 or np.complex:
                y = 0 #abs(err.real)
        except ZeroDivisionError:
            y = 0
        except OverflowError:
            y = 0
        except:
            raise Exception('Err to process this equation: {}, original:{}'.format(tmpEq, skeletonEqn)) 
        YT.append(round(y, decimals))

    structure = template.copy() # copy the template

    # hold data in the structure
    structure['X'] = X.tolist()
    structure['Y'] = Y
    structure['Skeleton'] = skeletonEqn
    structure['EQ'] = eq
    structure['XT'] = XT.tolist()
    structure['YT'] = YT

    # write to a file
    outputPath = dataPath.format(fileID)
    if os.path.exists(outputPath):
        fileSize = os.path.getsize(outputPath)
        if fileSize > 500000000: # 500 MB
            fileID +=1 

    with open(outputPath, "a", encoding="utf-8") as h:
        json.dump(structure, h, ensure_ascii=False)
        h.write('\n')

    print(structure['Skeleton'])
    break

C+C*exp(x2)**3**sin(x2)**4+C*cos(x2)**2
