# Function Generation for the Training of λ-Nets

## Package installation (uncomment first line to install packages at the beginning)

## Specification of Experiment Settings

In [1]:
#######################################################################################################################################
###################################################### CONFIG FILE ####################################################################
#######################################################################################################################################
# Delay (in minutes) handed to the sleep timer before the notebook does its work.
sleep_time = 0

# Experiment settings, grouped into three sections. Every entry is later
# exported as a notebook-level variable of the same name.
config = dict(
    data=dict(
        d=3,                    # degree of the polynomials
        n=4,                    # number of variables
        sparsity=None,          # overwritten in the config-adjustment cell
        sample_sparsity=None,   # falls back to `sparsity` when left as None
        x_max=1,                # upper end of the variable range
        x_min=0,                # lower end of the variable range
        # x_step=0.01,          (disabled)
        a_max=10,               # upper end of the coefficient range
        a_min=-10,              # lower end of the coefficient range
        # a_step=0.001,         (disabled)
        polynomial_data_size=100,   # number of polynomials to generate
        noise=0,                    # forwarded to gen_regression_symbolic

        # True: every polynomial is generated with the same seed;
        # False: the seed is offset by the polynomial's index.
        same_training_all_lambda_nets=False,
    ),
    lambda_net=dict(
        lambda_dataset_size=1000,   # samples generated per polynomial
    ),
    computation=dict(
        n_jobs=-3,          # forwarded to joblib.Parallel
        use_gpu=False,      # when False, CUDA_VISIBLE_DEVICES is set to ''
        gpu_numbers='0',    # value for CUDA_VISIBLE_DEVICES when use_gpu is True
        RANDOM_SEED=42,
    ),
)





In [2]:
#######################################################################################################################################
########################################### IMPORT GLOBAL VARIABLES FROM CONFIG #######################################################
#######################################################################################################################################
# Expose every config entry (d, n, a_min, n_jobs, ...) as a notebook-level variable.
# Sections are merged in the order data -> lambda_net -> computation, so a key
# present in a later section takes precedence, just as with sequential updates.
globals().update({
    **config['data'],
    **config['lambda_net'],
    **config['computation'],
})

## Imports

In [3]:
from utilities.utility_functions import *

from itertools import product       # forms cartesian products
from more_itertools import random_product 
from tqdm import tqdm_notebook as tqdm
import pickle

import numpy as np

import pandas as pd
from joblib import Parallel, delayed

import random 
from random import sample 

import os

from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler

from sympy import Symbol, sympify

        
import seaborn as sns
        
import random 

import warnings


In [4]:
#######################################################################################################################################
###################################################### SET VARIABLES + DESIGN #########################################################
#######################################################################################################################################
# Variable names: the first n lowercase letters, as a single string.
variables = 'abcdefghijklmnopqrstuvwxyz'[:n]

# Expose the configured GPU ids to CUDA, or hide all devices when the GPU is disabled.
if use_gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_numbers
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''

sns.set_style("darkgrid")

# Seed both the NumPy and the stdlib random generators.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# pandas display: show up to 500 columns, floats with three decimals.
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Silences all warnings from here on.
warnings.filterwarnings('ignore')

In [5]:
#######################################################################################################################################
####################################################### CONFIG ADJUSTMENTS ############################################################
#######################################################################################################################################
# `sparsity` is always recomputed as C(n+d, d), the number of monomials in a
# polynomial with n variables and degree d.
config['data']['sparsity'] = nCr(config['data']['n'] + config['data']['d'], config['data']['d'])

# `sample_sparsity` defaults to the full sparsity when not set explicitly.
# Identity comparison (`is None`) is used for the singleton check.
if config['data']['sample_sparsity'] is None:
    config['data']['sample_sparsity'] = config['data']['sparsity']

#######################################################################################################################################
################################################## UPDATE VARIABLES ###################################################################
#######################################################################################################################################
# Re-export the config so the notebook-level variables pick up the adjusted values.
globals().update(config['data'])
globals().update(config['lambda_net'])
globals().update(config['computation'])

# Hand the adjusted config to the utilities module (helper imported via
# `from utilities.utility_functions import *`).
initialize_utility_functions_config_from_curent_notebook(config)

# Suffix appended to the X/y data file names to distinguish the two seeding modes.
training_string = '_same' if same_training_all_lambda_nets else '_diverse'

#######################################################################################################################################
###################################################### PATH + FOLDER CREATION #########################################################
#######################################################################################################################################
generate_directory_structure()

#######################################################################################################################################
############################################################ SLEEP TIMER ##############################################################
#######################################################################################################################################
sleep_minutes(sleep_time)

<class 'KeyError'>
<class 'KeyError'>


In [6]:
##############DO NOT CHANGE###################
# NOTE(review): RANDOM_SEED is pinned to 42 here, which overrides whatever value
# was imported from config['computation'] for all later cells.
RANDOM_SEED = 42
variables = 'abcdefghijklmnopqrstuvwxyz'[:n]

# Summary of the effective experiment settings.
print('Variables: ', n, ' (', variables, ')', sep='')
print('Degree: ', d, sep='')
print('Sparsity: ', sparsity, sep='')
print('Lambda-Net Dataset Size: ', lambda_dataset_size, sep='')
print('I-Net Dataset Size: ', polynomial_data_size, sep='')

print('Coefficient Range: ', '[', a_min, ', ', a_max, ']', sep='')
print('Variable Range: ', '[', x_min, ', ', x_max, ']', sep='')

Variables: 4 (abcd)
Degree: 3
Sparsity: 35
Lambda-Net Dataset Size: 1000
I-Net Dataset Size: 100
Coefficient Range: [-10, 10]
Variable Range: [0, 1]


# Function Generation

In [7]:
# Build one identifier string per integer in [0, (d+1)**n): the value converted
# by dec_to_base with base d+1 and left-padded with zeros to n characters.
# (Presumably each character is the exponent of one variable; confirm in dec_to_base.)
list_of_monomial_identifiers_extended = [
    dec_to_base(index, base = (d+1)).zfill(n)
    for index in tqdm(range((d+1)**n))
]

print('List length: ', len(list_of_monomial_identifiers_extended), sep='')
print('Number of monomials in a polynomial with ', n, ' variables and degree ', d, ': ', nCr(n+d, d), sep='')
print('Sparsity: ', sparsity, sep='')
print(list_of_monomial_identifiers_extended)

# Keep only the identifiers whose digits sum to at most d.
list_of_monomial_identifiers = [
    identifier
    for identifier in tqdm(list_of_monomial_identifiers_extended)
    if sum(int(digit) for digit in identifier) <= d
]

print('List length: ', len(list_of_monomial_identifiers), sep='')
print('Number of monomials in a polynomial with ', n, ' variables and degree ', d, ': ', nCr(n+d, d), sep='')
print('Sparsity: ', sparsity, sep='')
print(list_of_monomial_identifiers)


  0%|          | 0/256 [00:00<?, ?it/s]

List length: 256
Number of monomials in a polynomial with 4 variables and degree 3: 35
Sparsity: 35
['0000', '0001', '0002', '0003', '0010', '0011', '0012', '0013', '0020', '0021', '0022', '0023', '0030', '0031', '0032', '0033', '0100', '0101', '0102', '0103', '0110', '0111', '0112', '0113', '0120', '0121', '0122', '0123', '0130', '0131', '0132', '0133', '0200', '0201', '0202', '0203', '0210', '0211', '0212', '0213', '0220', '0221', '0222', '0223', '0230', '0231', '0232', '0233', '0300', '0301', '0302', '0303', '0310', '0311', '0312', '0313', '0320', '0321', '0322', '0323', '0330', '0331', '0332', '0333', '1000', '1001', '1002', '1003', '1010', '1011', '1012', '1013', '1020', '1021', '1022', '1023', '1030', '1031', '1032', '1033', '1100', '1101', '1102', '1103', '1110', '1111', '1112', '1113', '1120', '1121', '1122', '1123', '1130', '1131', '1132', '1133', '1200', '1201', '1202', '1203', '1210', '1211', '1212', '1213', '1220', '1221', '1222', '1223', '1230', '1231', '1232', '1233', '13

  0%|          | 0/256 [00:00<?, ?it/s]

List length: 35
Number of monomials in a polynomial with 4 variables and degree 3: 35
Sparsity: 35
['0000', '0001', '0002', '0003', '0010', '0011', '0012', '0020', '0021', '0030', '0100', '0101', '0102', '0110', '0111', '0120', '0200', '0201', '0210', '0300', '1000', '1001', '1002', '1010', '1011', '1020', '1100', '1101', '1110', '1200', '2000', '2001', '2010', '2100', '3000']


In [8]:
# Draw one coefficient vector per polynomial, uniformly from [a_min, a_max).
# The bounds come from the config so the sampled range always matches the
# a_min/a_max values recorded in the output file names.
list_of_polynomials = np.random.uniform(low=a_min, high=a_max, size=(polynomial_data_size, sparsity))

if sample_sparsity < sparsity:
    # Zero out randomly chosen coefficients so each polynomial keeps exactly
    # sample_sparsity sampled entries. Indices are drawn over the actual
    # number of columns (`sparsity`) without replacement.
    for polynomial in tqdm(list_of_polynomials):
        sparsity_indices = np.random.choice(sparsity, sparsity - sample_sparsity, replace=False)
        # `polynomial` is a row view, so this writes into list_of_polynomials.
        polynomial[sparsity_indices] = 0

# One row per polynomial, one column per monomial identifier.
polynomials_list_df = pd.DataFrame(data=list_of_polynomials, columns=list_of_monomial_identifiers)

print(len(list_of_monomial_identifiers))
print(polynomials_list_df.shape)

35
(100, 35)


In [9]:
# Preview the coefficient vectors of the first two generated polynomials.
list_of_polynomials[:2]

array([[-2.50919762,  9.01428613,  4.63987884,  1.97316968, -6.87962719,
        -6.88010959, -8.83832776,  7.32352292,  2.02230023,  4.16145156,
        -9.58831011,  9.39819704,  6.64885282, -5.75321779, -6.36350066,
        -6.3319098 , -3.91515514,  0.49512863, -1.36109963, -4.1754172 ,
         2.23705789, -7.21012279, -4.15710703, -2.67276313, -0.87860032,
         5.70351923, -6.00652436,  0.28468877,  1.84829138, -9.07099175,
         2.15089704, -6.58951753, -8.69896814,  8.97771075,  9.31264066],
       [ 6.16794696, -3.90772462, -8.04655772,  3.68466053, -1.19695013,
        -7.5592353 , -0.0964618 , -9.31222958,  8.18640804, -4.82440037,
         3.25044569, -3.76577848,  0.40136042,  0.93420559, -6.30291089,
         9.39169256,  5.50265647,  8.78997883,  7.89654701,  1.95799958,
         8.4374847 , -8.23014996, -6.08034275, -9.09545422, -3.49339338,
        -2.22645421, -4.57301936,  6.57475018, -2.86493347, -4.38130981,
         0.85392166, -7.1815155 ,  6.04393962, -8.

In [10]:
# Generate one dataset per polynomial in parallel. The only difference between
# the two training modes is the seed: a shared RANDOM_SEED when all lambda-nets
# get the same training setup, otherwise RANDOM_SEED offset by the polynomial index.
parallel = Parallel(n_jobs=n_jobs, verbose=3, backend='multiprocessing')
result_list = parallel(
    delayed(gen_regression_symbolic)(
        polynomial_array=list_of_polynomials[i],
        n_samples=lambda_dataset_size,
        noise=noise,
        noise_dist='normal',
        seed=RANDOM_SEED if same_training_all_lambda_nets else RANDOM_SEED+i,
        sympy_calculation=False,
    )
    for i in range(polynomial_data_size)
)

# Each entry pairs the polynomial's coefficient Series (result[0], indexed by
# monomial identifier) with a DataFrame built from result[1] (X, one column per
# variable) or result[2] (y, single 'result' column).
X_data_list = [
    [
        pd.Series(result[0],  index=list_of_monomial_identifiers),
        pd.DataFrame(result[1], columns=list(variables[:n])),
    ]
    for result in result_list
]
y_data_list = [
    [
        pd.Series(result[0],  index=list_of_monomial_identifiers),
        pd.DataFrame(result[2], columns=['result']),
    ]
    for result in result_list
]



[Parallel(n_jobs=-3)]: Using backend MultiprocessingBackend with 22 concurrent workers.
[Parallel(n_jobs=-3)]: Done  91 out of 100 | elapsed:    1.9s remaining:    0.2s
[Parallel(n_jobs=-3)]: Done 100 out of 100 | elapsed:    2.1s finished


In [11]:
# First five coefficients of the first polynomial, indexed by monomial identifier.
X_data_list[0][0].head()

0000   -2.509
0001    9.014
0002    4.640
0003    1.973
0010   -6.880
dtype: float64

In [12]:
# First five rows of the first polynomial's X frame (one column per variable name).
X_data_list[0][1].head()

Unnamed: 0,a,b,c,d
0,0.375,0.185,0.262,0.673
1,0.951,0.542,0.247,0.797
2,0.732,0.873,0.906,0.25
3,0.599,0.732,0.25,0.625
4,0.156,0.807,0.272,0.572


In [13]:
# The y list carries the coefficient Series too; show its first five entries for the first polynomial.
y_data_list[0][0].head()

0000   -2.509
0001    9.014
0002    4.640
0003    1.973
0010   -6.880
dtype: float64

In [14]:
# First five rows of the first polynomial's y frame (single 'result' column).
y_data_list[0][1].head()

Unnamed: 0,result
0,-0.17
1,-1.444
2,-23.861
3,-9.267
4,-9.417


In [16]:
# Coefficient table: one CSV whose name encodes the generation settings.
path_polynomials = ''.join(map(str, [
    './data/saved_polynomial_lists/polynomials_sample', polynomial_data_size,
    '_variables_', n,
    '_degree_', d,
    '_sparsity_', sample_sparsity,
    '_amin_', a_min,
    '_amax_', a_max,
    '.csv',
]))
polynomials_list_df.to_csv(path_polynomials, index=False)

# X data: pickled list; the name additionally encodes sample count, variable
# range, noise and the training-mode suffix.
path_X_data = ''.join(map(str, [
    './data/saved_polynomial_lists/X_sample', polynomial_data_size,
    '_train_', lambda_dataset_size,
    '_variables_', n,
    '_degree_', d,
    '_sparsity_', sample_sparsity,
    '_amin_', a_min,
    '_amax_', a_max,
    '_xmin_', x_min,
    '_xmax_', x_max,
    '_noise_', noise,
    training_string,
    '.pkl',
]))
with open(path_X_data, 'wb') as handle:
    pickle.dump(X_data_list, handle)

# y data: same naming scheme as the X data, with a 'y_sample' prefix.
path_y_data = ''.join(map(str, [
    './data/saved_polynomial_lists/y_sample', polynomial_data_size,
    '_train_', lambda_dataset_size,
    '_variables_', n,
    '_degree_', d,
    '_sparsity_', sample_sparsity,
    '_amin_', a_min,
    '_amax_', a_max,
    '_xmin_', x_min,
    '_xmax_', x_max,
    '_noise_', noise,
    training_string,
    '.pkl',
]))
with open(path_y_data, 'wb') as handle:
    pickle.dump(y_data_list, handle)


In [17]:
# Show the path the polynomial coefficient CSV was written to.
path_polynomials

'./data/saved_polynomial_lists/polynomials_sample100_variables_4_degree_3_sparsity_35_amin_-10_amax_10.csv'

In [18]:
# Show the path the pickled X data list was written to.
path_X_data

'./data/saved_polynomial_lists/X_sample100_train_1000_variables_4_degree_3_sparsity_35_amin_-10_amax_10_xmin_0_xmax_1_noise_0_diverse.pkl'

In [None]:
# NOTE(review): rebuilds path_X_data from the same components, in the same order,
# as the assignment in the save cell; this cell appears to be a redundant duplicate.
path_X_data = './data/saved_polynomial_lists/X_sample' + str(polynomial_data_size) + '_train_' + str(lambda_dataset_size) + '_variables_' + str(n) + '_degree_' + str(d) + '_sparsity_' + str(sample_sparsity)  + '_amin_' + str(a_min) + '_amax_' + str(a_max) + '_xmin_' + str(x_min) + '_xmax_' + str(x_max) + '_noise_' + str(noise) +  training_string + '.pkl'
