# Synthetic Polynomial Data Generator

You <font color='red'>don't</font> have to run this code; we provide it for completeness.  High-dimensional synthetic polynomial data is available at https://github.com/nitishbahadur/book_chapter/tree/master/data/synthetic_polynomial/input

In [1]:
import pandas as pd
import numpy as np
from sympy import Symbol,sympify,symbols,lambdify,expand
import random

In [2]:
# Symbolic variables x, y, z and symbolic coefficients A, B, C.
x, y, z, A, B, C = symbols('x y z A B C')

In [3]:
# Linear combination of x, y, z with the symbolic coefficients A, B, C.
expr = (A*x + B*y + C*z)

In [4]:
# Sum of the expanded first, second, and third powers of expr.
expand(expr) + expand(expr*expr) + expand(expr * expr * expr)

A**3*x**3 + 3*A**2*B*x**2*y + 3*A**2*C*x**2*z + A**2*x**2 + 3*A*B**2*x*y**2 + 6*A*B*C*x*y*z + 2*A*B*x*y + 3*A*C**2*x*z**2 + 2*A*C*x*z + A*x + B**3*y**3 + 3*B**2*C*y**2*z + B**2*y**2 + 3*B*C**2*y*z**2 + 2*B*C*y*z + B*y + C**3*z**3 + C**2*z**2 + C*z

The coefficient of each monomial in $x$, $y$, and $z$ is replaced by a single letter ($M$, $N$, $O$, etc.), and a constant term $J$ is appended.

$Mx^3 + Nx^2y + Ox^2z + Px^2 + Qxy^2 + Rxyz + Sxy + Txz^2 + Uxz + Vx + Ay^3 + By^2z + Cy^2 + Dyz^2 + Eyz + Fy + Gz^3 + Hz^2 + Iz + J$

Generate the coefficients $M, N, O, P, \ldots, I, J$.  Each set of coefficients defines one column and stays constant for all $(x, y, z)$.

In [11]:
def generate_random_number():
    """Return a standard-normal sample restricted to the open interval (-1, 1).

    Samples that fall outside the interval are discarded and redrawn.
    The accepted sample is rounded to two decimal places.
    """
    sample = np.random.normal(loc=0, scale=1.0)
    # Rejection sampling: keep drawing until the value lies strictly in (-1, 1).
    while not ((sample > -1) and (sample < 1)):
        sample = np.random.normal(loc=0, scale=1.0)
    return np.round(sample, 2)

# ---------------------------------------
# Writes the table of polynomial coefficients.
# Each record holds the constants for one column.
# ---------------------------------------
def generate_coefficients():
    """Write X_coefficients.csv with a header and 784 rows of coefficients.

    Every row contains 20 comma-separated values, one per header name
    (M ... J), each obtained from generate_random_number().
    """
    header = "M,N,O,P,Q,R,S,T,U,V,A,B,C,D,E,F,G,H,I,J\n"
    with open(r'X_coefficients.csv', 'w') as out:
        out.write(header)

        for _row in range(784):
            # Draw all 20 coefficients for this record before writing it.
            coefficients = [generate_random_number() for _col in range(20)]
            out.write(",".join(map(str, coefficients)) + "\n")

# -------------------------------------------
# Writes the (x, y, z) samples.
# Each triple is the constant for one row.
# ------------------------------------------
def generate_xyz(std_dev):
    """Write 5000 random (x, y, z) triples to XYZ_<std_dev>.csv.

    x, y and z are drawn from normal distributions with means 0, 0.2 and
    -0.4 respectively, all with standard deviation ``std_dev``. Each value
    is rounded to two decimal places. The file starts with an "X,Y,Z" header.
    """
    means = (0, 0.2, -0.4)  # distribution centres for x, y, z (in draw order)
    with open(r'XYZ_{}.csv'.format(std_dev), 'w') as out:
        out.write("X,Y,Z\n")
        for _ in range(5000):
            x_val, y_val, z_val = (
                np.round(np.random.normal(loc=mean, scale=std_dev), 2)
                for mean in means
            )
            out.write(f"{x_val},{y_val},{z_val}\n")

Now generate the random values that will be used to build the synthetic polynomial data.

In [14]:
# Write X_coefficients.csv, then XYZ_1.0.csv (samples with std_dev = 1.0).
generate_coefficients()
generate_xyz(1.0)

In [5]:
def eval(xyz_row, coeff_):
    """Compute the value of one cell of the m x n dataset.

    ``xyz_row`` supplies the row values under the keys 'X', 'Y', 'Z';
    ``coeff_`` supplies the column coefficients under the keys M..V and A..J.
    Both are generated before this function is called.

    Returns the cubic polynomial in x, y, z built from those coefficients.
    The constant coefficient J is read but is NOT added to the result.
    """
    x, y, z = xyz_row['X'], xyz_row['Y'], xyz_row['Z']
    x_sq, x_cu = x**2, x**3
    y_sq, y_cu = y**2, y**3
    z_sq, z_cu = z**2, z**3

    # M x^3 + N x^2 y + O x^2 z + P x^2 + Q x y^2 + R x y z
    M, N, O, P, Q, R = (coeff_[key] for key in 'MNOPQR')
    first_part = M*x_cu + N*x_sq*y + O*x_sq*z + P*x_sq + Q*x*y_sq + R*x*y*z

    # S x y + T x z^2 + U x z + V x + A y^3 + B y^2 z + C y^2
    S, T, U, V, A, B, C = (coeff_[key] for key in 'STUVABC')
    second_part = S*x*y + T*x*z_sq + U*x*z + V*x + A*y_cu + B*y_sq*z + C*y_sq

    # D y z^2 + E y z + F y + G z^3 + H z^2 + I z
    # NOTE: J is looked up (so it must be present) but the "+ J" term is
    # disabled, so it does not contribute to the returned value.
    D, E, F, G, H, I, J = (coeff_[key] for key in 'DEFGHIJ')
    third_part = D*y*z_sq + E*y*z + F*y + G*z_cu + H*z_sq + I*z

    return first_part + second_part + third_part

Generate `Y_<stddev>.csv` from the $(x, y, z)$ values sampled with a given standard deviation; repeat for each standard deviation of interest.

In [4]:
def generate_Y_stddev(std):
    """Build Y_<std>.csv by evaluating the polynomial for every row/column pair.

    Reads the (x, y, z) rows from XYZ_<std>.csv and the coefficient sets from
    X_coefficients.csv. For each XYZ row, one line is written to Y_<std>.csv
    containing the comma-separated results of ``eval(xyz_row, coeff_row)``
    for every coefficient row, in file order. No header line is written.

    Parameters:
        std: value substituted into the XYZ_{}.csv / Y_{}.csv file names.
    """
    df = pd.read_csv(r'XYZ_{}.csv'.format(std))
    df_coeff = pd.read_csv(r'X_coefficients.csv')

    # The coefficient rows do not change between XYZ rows, so materialise
    # them once instead of re-running df_coeff.iterrows() for every XYZ row.
    coeff_rows = [coeff_row for _, coeff_row in df_coeff.iterrows()]

    with open(r'Y_{}.csv'.format(std), 'w') as file:
        for _, xyz_row in df.iterrows():
            # `eval` is this file's cell-value function, not the builtin.
            values = [eval(xyz_row, coeff_row) for coeff_row in coeff_rows]
            file.write(",".join(str(value) for value in values) + "\n")

In [5]:
# Build Y_1.0.csv from XYZ_1.0.csv and X_coefficients.csv.
generate_Y_stddev(1.0)

In [11]:
# Load the headerless Y_1.0.csv and save its values as a NumPy array in Y_10.npy.
df = pd.read_csv(r'Y_1.0.csv', header=None)
m = df.values
np.save(r'Y_10.npy', m)