# Function Generation for the Training of λ-Nets

## Package installation (comment out or remove the first line of the cell to install the packages at the beginning)

In [1]:
%%script false --no-raise-error
# This cell is skipped as long as the `%%script false` line above is present. If errors occur during the installation, consider using "sudo pip install".
!pip install numpy
!pip install pandas
!pip install truth-table-generator
!pip install more-itertools
!pip install tqdm
!pip install joblib
!pip install scipy
!pip install PrettyTable
!pip install colored
!pip install scikit-learn
!pip install keras
!pip install ipython
!pip install livelossplot
!pip install matplotlib
!pip install seaborn
!pip install tensorflow
!pip install tensorflow-gpu

## Specification of Experiment Settings

In [2]:
import math
def nCr(n,r):
    """Return the binomial coefficient "n choose r" using exact integer arithmetic.

    Raises ValueError (from math.factorial) if n, r or n - r is negative.
    """
    numerator = math.factorial(n)
    denominator = math.factorial(r) * math.factorial(n - r)
    return numerator // denominator

In [3]:
d = 3  # maximum total degree of the generated polynomials
n = 4  # number of variables
sparsity = nCr(n+d, d)  # coefficients kept non-zero per polynomial; nCr(n+d, d) is the full monomial count, so nothing is zeroed out


# Range and step of the variable values at which the polynomials are evaluated.
x_max = 1 
x_min = -1
x_step = 0.01
# Range and step of the polynomial coefficients.
a_max = 10 
a_min = -10
a_step = 0.1

n_jobs = 5  # number of joblib workers used for the parallel evaluation

lambda_dataset_size = 1000 #specify the number of data points to calculate the function values for (determines the lambda net training size)

# NOTE(review): the outputs saved in this notebook report dataset sizes 10 and 50, so they appear to
# come from a run with smaller values than the two sizes configured here -- re-run to refresh them.
interpretation_dataset_size = 500 #specifies the number of functions generated (specifies the interpretation-net dataset size)


In [4]:
##############DO NOT CHANGE###################
# Settings derived from the configuration above, followed by a printed summary.
RANDOM_SEED = 42

# One letter per variable, e.g. 'abcd' for n = 4.
variables = 'abcdefghijklmnopqrstuvwxyz'[:n]

# Selects the file-name suffix; a shared list of evaluation points is only
# generated further down when this flag is True.
same_training_all_polynomials = True
training_string = '_same' if same_training_all_polynomials else '_diverse'

print('Variables: {} ({})'.format(n, variables))
print('Degree: {}'.format(d))
print('Sparsity: {}'.format(sparsity))
print('Lambda-Net Dataset Size: {}'.format(lambda_dataset_size))
print('I-Net Dataset Size: {}'.format(interpretation_dataset_size))

print('Coefficient Range: [{}, {}]'.format(a_min, a_max))
print('Variable Range: [{}, {}]'.format(x_min, x_max))

Variables: 4 (abcd)
Degree: 3
Sparsity: 35
Lambda-Net Dataset Size: 10
I-Net Dataset Size: 50
Coefficient Range: [-10, 10]
Variable Range: [-1, 1]


## Imports

In [5]:
import ttg
from itertools import product       # forms cartesian products
from more_itertools import random_product 
from tqdm import tqdm_notebook as tqdm
import pickle

import numpy as np

import pandas as pd
from joblib import Parallel, delayed

import random 
from random import sample 
random.seed(42)

import os

# Create the output directory tree under ./data.
# os.makedirs(..., exist_ok=True) replaces the separate exists()/mkdir() calls: it has no
# check-then-create race when several notebooks start at once, and it creates missing parents.
# Unlike the exists() check, it raises FileExistsError if a path exists but is not a directory.
directory_names = ['parameters', 'plotting', 'saved_polynomial_lists', 'results', 'saved_models', 'weights', 'weights_training']
os.makedirs('./data', exist_ok=True)
for directory_name in directory_names:
    path = './data/' + directory_name
    os.makedirs(path, exist_ok=True)
        
import seaborn as sns
sns.set_style("darkgrid")
        
import random 
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)




## Utility functions

In [6]:
#test for exact equality
def arreq_in_list(myarr, list_arrays):
    """Return True if any array in `list_arrays` equals `myarr` exactly (np.array_equal), else False."""
    for candidate in list_arrays:
        if np.array_equal(candidate, myarr):
            return True
    return False

In [7]:
# Digit characters for bases up to 36: index i is the character used for digit value i.
ALPHABET = \
  "0123456789abcdefghijklmnopqrstuvwxyz"

def encode (n):
    """Return the single character that represents digit value `n`.

    Raises:
        ValueError: if `n` is negative or not smaller than len(ALPHABET).
            (ValueError is a subclass of the plain Exception raised previously.)
    """
    # Explicit range check: a negative index would otherwise wrap around silently
    # (ALPHABET[-1] == 'z') instead of being reported as an error.
    if not 0 <= n < len(ALPHABET):
        raise ValueError ("cannot encode: %s" % n)
    return ALPHABET [n]

def dec_to_base (dec = 0, base = 16):
    """Convert the non-negative integer `dec` to its string representation in `base`.

    Args:
        dec: non-negative integer to convert.
        base: target base, at least 2; every resulting digit must be encodable by `encode`.

    Returns:
        The digits of `dec` in `base`, most significant first, without padding.

    Raises:
        ValueError: if `base` is smaller than 2, `dec` is negative, or a digit cannot be encoded.
    """
    if base < 2:
        # base 1 (or less) never shrinks `dec`, so the conversion would not terminate.
        raise ValueError ("base must be at least 2, got: %s" % base)
    if dec < 0:
        raise ValueError ("cannot encode negative number: %s" % dec)

    digits = []
    while True:
        dec, remainder = divmod (dec, base)
        digits.append (encode (remainder))
        if dec == 0:
            break
    # Digits were produced least significant first.
    return ''.join (reversed (digits))

# Function Generation

In [8]:
# Enumerate every exponent combination with each exponent in 0..d. The identifier is the
# combination written as an n-digit number in base d+1, one digit per variable.
list_of_monomial_identifiers_extended = [
    dec_to_base(i, base = (d+1)).zfill(n) for i in tqdm(range((d+1)**n))
]

print('List length: ' + str(len(list_of_monomial_identifiers_extended)))
print('Number of monomials in a polynomial with ' + str(n) + ' variables and degree ' + str(d) + ': ' + str(nCr(n+d, d)))
print('Sparsity: ' + str(sparsity))
print(list_of_monomial_identifiers_extended)

# Keep only the monomials whose total degree does not exceed d.
# Digits are parsed with int(..., 36) because exponents >= 10 are encoded as letters
# ('a' = 10, ...), which plain int() cannot parse.
list_of_monomial_identifiers = [
    monomial_identifier
    for monomial_identifier in tqdm(list_of_monomial_identifiers_extended)
    if sum(int(exponent_digit, 36) for exponent_digit in monomial_identifier) <= d
]

# Invariant: there are exactly C(n+d, d) monomials of total degree <= d in n variables.
assert len(list_of_monomial_identifiers) == nCr(n+d, d)

print('List length: ' + str(len(list_of_monomial_identifiers)))
print('Number of monomials in a polynomial with ' + str(n) + ' variables and degree ' + str(d) + ': ' + str(nCr(n+d, d)))
print('Sparsity: ' + str(sparsity))
print(list_of_monomial_identifiers)


HBox(children=(IntProgress(value=0, max=256), HTML(value='')))


List length: 256
Number of monomials in a polynomial with 4 variables and degree 3: 35
Sparsity: 35
['0000', '0001', '0002', '0003', '0010', '0011', '0012', '0013', '0020', '0021', '0022', '0023', '0030', '0031', '0032', '0033', '0100', '0101', '0102', '0103', '0110', '0111', '0112', '0113', '0120', '0121', '0122', '0123', '0130', '0131', '0132', '0133', '0200', '0201', '0202', '0203', '0210', '0211', '0212', '0213', '0220', '0221', '0222', '0223', '0230', '0231', '0232', '0233', '0300', '0301', '0302', '0303', '0310', '0311', '0312', '0313', '0320', '0321', '0322', '0323', '0330', '0331', '0332', '0333', '1000', '1001', '1002', '1003', '1010', '1011', '1012', '1013', '1020', '1021', '1022', '1023', '1030', '1031', '1032', '1033', '1100', '1101', '1102', '1103', '1110', '1111', '1112', '1113', '1120', '1121', '1122', '1123', '1130', '1131', '1132', '1133', '1200', '1201', '1202', '1203', '1210', '1211', '1212', '1213', '1220', '1221', '1222', '1223', '1230', '1231', '1232', '1233', '1

HBox(children=(IntProgress(value=0, max=256), HTML(value='')))


List length: 35
Number of monomials in a polynomial with 4 variables and degree 3: 35
Sparsity: 35
['0000', '0001', '0002', '0003', '0010', '0011', '0012', '0020', '0021', '0030', '0100', '0101', '0102', '0110', '0111', '0120', '0200', '0201', '0210', '0300', '1000', '1001', '1002', '1010', '1011', '1020', '1100', '1101', '1110', '1200', '2000', '2001', '2010', '2100', '3000']


In [9]:
# Sample `interpretation_dataset_size` distinct coefficient vectors, zero out coefficients
# according to `sparsity`, and collect the result in a DataFrame (one row per polynomial,
# one column per monomial identifier).
list_of_polynomials = []

number_of_monomials = nCr(n+d, d)
# NOTE(review): the decimal count assumes a_step is a power of ten (0.1, 0.01, ...).
a_decimals = int(-np.log10(a_step))
a_scale = 10**a_decimals
# Coefficient grid, built once. Values are rounded to the step precision so that
# artefacts such as 0.30000000000000004 do not end up in the data set.
# The upper bound a_max itself is excluded by range(), as before.
coefficient_values = [round(i*a_step, a_decimals) for i in range(int(a_min*a_scale), int(a_max*a_scale))]

# 10e-4 == 1e-3: if the requested sample is a tiny fraction of all possible polynomials,
# draw at random and reject duplicates; otherwise enumerate everything and sample.
if interpretation_dataset_size/((a_max-a_min)*a_scale)**number_of_monomials <= 10e-4:
    seen_polynomials = set()  # O(1) duplicate check instead of scanning the list
    while len(list_of_polynomials) < interpretation_dataset_size:
        random_polynomial = random_product(coefficient_values, repeat=number_of_monomials)
        if random_polynomial not in seen_polynomials:
            seen_polynomials.add(random_polynomial)
            list_of_polynomials.append(list(random_polynomial))
else:
    all_polynomials_list = list(product(coefficient_values, repeat=number_of_monomials))
    # product() yields tuples; convert to lists so the sparsity step below can assign items.
    list_of_polynomials = [list(all_polynomials_list[i]) for i in np.random.choice(len(all_polynomials_list), interpretation_dataset_size, replace=False)]
    del all_polynomials_list

# Set (number_of_monomials - sparsity) randomly chosen coefficients of each polynomial to zero.
for polynomial in tqdm(list_of_polynomials):
    sparsity_indices = np.random.choice(number_of_monomials, number_of_monomials-sparsity, replace=False)
    for sparsity_index in sparsity_indices:
        polynomial[sparsity_index] = 0

polynomials_list_df = pd.DataFrame(data=list_of_polynomials, columns=list_of_monomial_identifiers)


print(len(list_of_monomial_identifiers))
print(polynomials_list_df.shape)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


35
(50, 35)


In [10]:
# Preview the first rows of the coefficient table: one row per polynomial, one column per monomial identifier.
polynomials_list_df.head()

Unnamed: 0,0000,0001,0002,0003,0010,0011,0012,0020,0021,0030,...,1020,1100,1101,1110,1200,2000,2001,2010,2100,3000
0,6.3,-7.2,-9.4,8.9,-3.0,-3.8,-4.3,-6.5,8.8,-7.4,...,-5.0,8.3,6.6,7.9,3.9,0.7,-4.4,1.4,5.0,-2.9
1,-9.9,9.4,-6.0,7.8,0.8,-1.3,-2.9,-6.1,-4.5,9.5,...,-8.0,4.1,-2.5,6.0,5.8,-0.8,4.7,-5.1,8.0,-8.3
2,-8.9,6.9,-4.2,9.7,-2.6,-8.0,-4.1,-7.5,-0.3,-2.9,...,-5.7,3.6,8.6,-3.8,-5.9,1.8,-0.3,-3.1,6.3,7.6
3,4.2,-4.4,7.5,-1.7,9.6,9.8,-8.6,-4.2,-9.2,-2.0,...,-6.5,-3.7,9.0,4.3,3.7,-3.3,9.1,4.9,0.9,4.9
4,0.2,-0.8,-4.4,-6.5,3.0,2.6,-7.7,9.3,-8.8,-7.2,...,7.4,8.4,-7.1,7.4,3.7,9.2,-3.2,9.6,6.4,-1.3


In [11]:
# Sample `lambda_dataset_size` distinct evaluation points; the same list is used for every polynomial.
# NOTE(review): x_values_list is only defined when same_training_all_polynomials is True.
if same_training_all_polynomials:
    # Loop-invariant work hoisted out of the sampling loop. Rounding the grid once and
    # sampling from it yields the same points as rounding every sample afterwards.
    x_decimals = int(-np.log10(x_step))
    x_grid = np.round(np.arange(x_min, x_max, x_step), x_decimals)

    # Without this guard the rejection loop below would never terminate.
    number_of_distinct_points = len(np.unique(x_grid))**n
    if lambda_dataset_size > number_of_distinct_points:
        raise ValueError('lambda_dataset_size (' + str(lambda_dataset_size) + ') exceeds the number of distinct evaluation points (' + str(number_of_distinct_points) + ')')

    x_values_list = []
    seen_points = set()  # O(1) duplicate check instead of comparing against every stored array
    for i in tqdm(range(lambda_dataset_size)):
        point = random_product(x_grid, repeat=n)
        while point in seen_points:
            point = random_product(x_grid, repeat=n)
        seen_points.add(point)
        x_values_list.append(np.array(point))



HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [12]:
# Inspect the sampled evaluation points.
# NOTE(review): this displays the whole list; for a large lambda_dataset_size consider x_values_list[:5].
x_values_list

[array([0.59, 0.57, 0.72, 0.64]),
 array([ 0.42, -0.94,  0.55,  0.68]),
 array([ 0.77, -0.32, -0.93, -0.54]),
 array([-0.31,  0.79,  0.95, -0.21]),
 array([-0.14, -0.11, -0.99, -0.54]),
 array([-0.64,  0.44,  0.68,  0.02]),
 array([-0.83, -0.64,  0.89,  0.62]),
 array([-0.93, -0.77,  0.91,  0.35]),
 array([-0.45, -0.04,  0.07,  0.16]),
 array([-0.13, -0.6 , -0.06, -0.21])]

In [13]:
# Sanity check: the shape is (number of polynomials, number of monomials).
polynomials_list_df.shape

(50, 35)

In [14]:
from functools import reduce

def calcualate_function_with_data(coefficient_list, variable_values, monomial_identifiers=None):
    """Evaluate one polynomial at one point.

    Args:
        coefficient_list: one coefficient per monomial, in the order of the monomial identifiers.
        variable_values: one value per variable.
        monomial_identifiers: exponent strings, one character per variable (e.g. '0120').
            Defaults to the notebook-level `list_of_monomial_identifiers`.

    Returns:
        A tuple (function value, variable_values); `variable_values` is passed through unchanged.
    """
    if monomial_identifiers is None:
        monomial_identifiers = list_of_monomial_identifiers

    result = 0
    for coefficient_value, coefficient_multipliers in zip(coefficient_list, monomial_identifiers):
        # Start from 1 so that a monomial without variables evaluates to its coefficient
        # instead of failing on an empty product.
        monomial_value = 1
        for coefficient_multiplier, variable_value in zip(coefficient_multipliers, variable_values):
            # Exponent digits are base-36 characters ('a' = 10, ...); plain int() rejects letters.
            monomial_value = monomial_value * variable_value**int(coefficient_multiplier, 36)

        result += coefficient_value * monomial_value

    return result, variable_values
 
def calculate_function_values_from_polynomial(true_value_test, evaluation_dataset):
    """Evaluate one polynomial at every point of `evaluation_dataset`.

    Args:
        true_value_test: coefficients of the polynomial; a DataFrame is replaced by its `.values`.
        evaluation_dataset: iterable of evaluation points (one value per variable each).

    Returns:
        A pair ([coefficients, DataFrame of evaluation points],
                [coefficients, DataFrame of function values]).
    """
    if isinstance(true_value_test, pd.DataFrame):
        true_value_test = true_value_test.values

    # Each evaluation yields (function value, evaluation point).
    evaluations = [calcualate_function_with_data(true_value_test, evaluation) for evaluation in evaluation_dataset]
    evaluation_points = [point for _, point in evaluations]
    function_values = [function_value for function_value, _ in evaluations]

    X_data = [true_value_test, pd.DataFrame(np.array(evaluation_points))]
    y_data = [true_value_test, pd.DataFrame(np.array(function_values))]
    return X_data, y_data


In [15]:
# Evaluate every polynomial at the shared evaluation points in parallel.
# The polynomials are processed in chunks (about 1000 per chunk, at least one chunk).
chunks = max(1, interpretation_dataset_size//1000)

result_list = []
for polynomials_list_df_chunk in tqdm(np.array_split(polynomials_list_df, chunks), total=chunks):
    parallel = Parallel(n_jobs=n_jobs, verbose=3, backend='loky')
    result_list += parallel(
        delayed(calculate_function_values_from_polynomial)(polynomial, x_values_list)
        for row_label, polynomial in polynomials_list_df_chunk.iterrows()
    )
    del parallel

# Split the (X data, y data) pairs into two parallel lists.
polynomials_X_data_list = []
polynomials_y_data_list = []
for X_data, y_data in result_list:
    polynomials_X_data_list.append(X_data)
    polynomials_y_data_list.append(y_data)


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.





[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    1.0s
[Parallel(n_jobs=5)]: Done  50 out of  50 | elapsed:    1.0s finished


In [16]:
# X data of the first polynomial, element 0: its coefficients, indexed by monomial identifier.
polynomials_X_data_list[0][0].head()

0000    6.3
0001   -7.2
0002   -9.4
0003    8.9
0010   -3.0
Name: 0, dtype: float64

In [17]:
# X data of the first polynomial, element 1: the evaluation points (one row per point, one column per variable).
polynomials_X_data_list[0][1].head()

Unnamed: 0,0,1,2,3
0,0.59,0.57,0.72,0.64
1,0.42,-0.94,0.55,0.68
2,0.77,-0.32,-0.93,-0.54
3,-0.31,0.79,0.95,-0.21
4,-0.14,-0.11,-0.99,-0.54


In [18]:
# y data of the first polynomial, element 0: the same coefficients, shown as a plain NumPy array.
polynomials_y_data_list[0][0].values

array([ 6.3, -7.2, -9.4,  8.9, -3. , -3.8, -4.3, -6.5,  8.8, -7.4,  7.3,
        8.9,  3.9, -7.8,  5.1,  0.8, -9.2, -9.3, -7.7, -4.5, -4.1,  2.9,
        5.4, -9.4,  4.3, -5. ,  8.3,  6.6,  7.9,  3.9,  0.7, -4.4,  1.4,
        5. , -2.9])

In [19]:
# y data of the first polynomial, element 0: its coefficients, indexed by monomial identifier.
polynomials_y_data_list[0][0].head()

0000    6.3
0001   -7.2
0002   -9.4
0003    8.9
0010   -3.0
Name: 0, dtype: float64

In [20]:
# y data of the first polynomial, element 1: the function values, one row per evaluation point.
polynomials_y_data_list[0][1].head()

Unnamed: 0,0
0,-6.569992
1,-40.256885
2,1.886288
3,-19.520226
4,2.400554


In [27]:
# Full [coefficients, evaluation points] pair of the first polynomial.
# NOTE(review): this cell's execution count (In [27]) is out of order relative to the cells around it,
# and it displays both objects in full; consider removing it or restricting the output with .head().
polynomials_X_data_list[0]

[0000    6.3
 0001   -7.2
 0002   -9.4
 0003    8.9
 0010   -3.0
 0011   -3.8
 0012   -4.3
 0020   -6.5
 0021    8.8
 0030   -7.4
 0100    7.3
 0101    8.9
 0102    3.9
 0110   -7.8
 0111    5.1
 0120    0.8
 0200   -9.2
 0201   -9.3
 0210   -7.7
 0300   -4.5
 1000   -4.1
 1001    2.9
 1002    5.4
 1010   -9.4
 1011    4.3
 1020   -5.0
 1100    8.3
 1101    6.6
 1110    7.9
 1200    3.9
 2000    0.7
 2001   -4.4
 2010    1.4
 2100    5.0
 3000   -2.9
 Name: 0, dtype: float64,       0     1     2     3
 0  0.59  0.57  0.72  0.64
 1  0.42 -0.94  0.55  0.68
 2  0.77 -0.32 -0.93 -0.54
 3 -0.31  0.79  0.95 -0.21
 4 -0.14 -0.11 -0.99 -0.54
 5 -0.64  0.44  0.68  0.02
 6 -0.83 -0.64  0.89  0.62
 7 -0.93 -0.77  0.91  0.35
 8 -0.45 -0.04  0.07  0.16
 9 -0.13 -0.60 -0.06 -0.21]

In [26]:
# Full [coefficients, function values] pair of the first polynomial.
# NOTE(review): this cell's execution count (In [26]) is out of order relative to the cells around it,
# and it displays both objects in full; consider removing it or restricting the output with .head().
polynomials_y_data_list[0]

[0000    6.3
 0001   -7.2
 0002   -9.4
 0003    8.9
 0010   -3.0
 0011   -3.8
 0012   -4.3
 0020   -6.5
 0021    8.8
 0030   -7.4
 0100    7.3
 0101    8.9
 0102    3.9
 0110   -7.8
 0111    5.1
 0120    0.8
 0200   -9.2
 0201   -9.3
 0210   -7.7
 0300   -4.5
 1000   -4.1
 1001    2.9
 1002    5.4
 1010   -9.4
 1011    4.3
 1020   -5.0
 1100    8.3
 1101    6.6
 1110    7.9
 1200    3.9
 2000    0.7
 2001   -4.4
 2010    1.4
 2100    5.0
 3000   -2.9
 Name: 0, dtype: float64,            0
 0  -6.569992
 1 -40.256885
 2   1.886288
 3 -19.520226
 4   2.400554
 5   2.794980
 6 -10.266045
 7  -1.225909
 8   6.582626
 9   3.081288]

In [21]:
# Persist the coefficient table (CSV) and the X / y data lists (pickle).
# The file names encode the experiment settings; the shared parts are built once.
output_directory = './data/saved_polynomial_lists/'
polynomial_suffix = '_variables_{}_degree_{}_sparsity_{}_astep_{}_amin_{}_amax_{}'.format(n, d, sparsity, a_step, a_min, a_max)
dataset_suffix = polynomial_suffix + '_xstep_{}_xmin_{}_xmax_{}'.format(x_step, x_min, x_max) + training_string
sample_tag = 'sample' + str(interpretation_dataset_size)
train_tag = '_train_' + str(lambda_dataset_size)

path_polynomials = output_directory + 'polynomials_' + sample_tag + polynomial_suffix + '.csv'
polynomials_list_df.to_csv(path_polynomials, index=False)

path_X_data = output_directory + 'X_' + sample_tag + train_tag + dataset_suffix + '.pkl'
path_y_data = output_directory + 'y_' + sample_tag + train_tag + dataset_suffix + '.pkl'

for pickle_path, data_list in ((path_X_data, polynomials_X_data_list), (path_y_data, polynomials_y_data_list)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(data_list, f)#, protocol=2)
