# Imports

In [1]:
import numpy as np
import pandas as pd
import math

import tensorflow as tf
import keras
from tensorflow import keras
from keras import models

from utilities.InterpretationNet import *
from utilities.LambdaNet import *
from utilities.metrics import *
from utilities.utility_functions import *
from utilities.DecisionTree_BASIC import *

import utilities_LR

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, mean_squared_error
import sklearn

import matplotlib.pyplot as plt

# Config

### Logistic Regression

In [2]:
config_LR = {
    'data': {
        'n_datasets': 10_000, # the number of datasets
        
        'n_samples': 4_000, # the number of samples per dataset
        
        'n_features': 10, 
        # The total number of features. 
        # These comprise n_informative informative features, n_redundant redundant features, n_repeated duplicated features and 
        # n_features-n_informative-n_redundant-n_repeated useless features drawn at random.
        
        'n_informative': 8,
        # The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices 
        # of a hypercube in a subspace of dimension n_informative. For each cluster, informative features are drawn independently 
        # from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. The clusters are then 
        # placed on the vertices of the hypercube.
        
        'n_targets': 1,
        # The number of targets (or labels) of the classification problem.
    
        'n_clusters_per_class': 2,
        # The number of clusters per class.
        
        'class_sep': 3.0,
        # class_sepfloat, default=1.0
        # The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task 
        # easier.
        
        'noise': 0,
        # flip_y (fraction of samples whose class is assigned randomly)
        
        'shuffle': True,
        # Shuffle the samples and the features.
        
        'random_state': 42,
        # Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
    },
    'lambda': {
        'data_prep': {
            'train_test_val_split': { # refer to sklearn doc
                'test_size': -1, # currently not used
                'val_size': 0.25,
                'random_state': None,
                'shuffle': True,
                'stratify': None
            }
        },
        'model_compile': {
            'optimizer_lambda': 'adam',
            'loss': 'mae', #tf.keras.losses.get(config['lambda_net']['loss_lambda']),
            'metrics': ['mae']
        },
        'model_fit': { # refer to keras API
            'batch_size': 32,
            'epochs': 100,
            'verbose': 0,
            'callbacks': None,
            'shuffle': True,
            'class_weight': None,
            'sample_weight': None,
            'initial_epoch': 0,
            'steps_per_epoch': None,
            'validation_steps': None,
            'validation_batch_size': None,
            'validation_freq': 1
        }
    },
    'inets': {
        'data_prep': {
            'train_test_val_split': { # refer to sklearn doc
                'test_size': 0.3,
                'val_size': 0.2,
                'random_state': None,
                'shuffle': True,
                'stratify': None
            }
        },
        'model_compile': {
            
        },
        'model_fit': { # refer to keras API
            'batch_size': 256,
            'epochs': 1000,
            'verbose': 'auto',
            'callbacks': None,
            'shuffle': True,
            'class_weight': None,
            'sample_weight': None,
            'initial_epoch': 0,
            'steps_per_epoch': None,
            'validation_steps': None,
            'validation_batch_size': None,
            'validation_freq': 1
        }
    },
    'eval': {
        'n_datasets': 9_000,
        'n_samples_train': 2000,
        'n_samples_queryLambda': 1000, # _forLogRegBaseModel
        'n_samples_comparison': 1000 # compare inet and basemodel
    },
    'computation':{
        'n_jobs': 38,
        'use_gpu': False,
        'gpu_numbers': '31',
        'RANDOM_SEED': 1,   
    }
}

# Settings

In [3]:
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'

In [4]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

os.environ['CUDA_VISIBLE_DEVICES'] = config_LR['computation']['gpu_numbers'] if config_LR['computation']['use_gpu'] else ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' if config_LR['computation']['use_gpu'] else ''

os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/local/cuda-11.4' if config_LR['computation']['use_gpu'] else ''#-10.1' #--xla_gpu_cuda_data_dir=/usr/local/cuda, 
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 ,--tf_xla_enable_xla_devices' if config_LR['computation']['use_gpu'] else ''#'--tf_xla_auto_jit=2' #, --tf_xla_enable_xla_devices

In [5]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num XLA-GPUs Available: ", len(tf.config.experimental.list_physical_devices('XLA_GPU')))

Num GPUs Available:  0
Num XLA-GPUs Available:  0


2022-06-27 09:19:22.975836: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


# Load Models

In [6]:
@tf.function
def custom_loss(y_predictions_index, y_coef_pred):
    
    index = y_predictions_index[:, 0]
    y_true = y_predictions_index[:, 1:]
    
    index = tf.cast(index, tf.int32)
    
    X_feature_data_samples = tf.gather(valid_feature_data, index)
    
    #y_pred = tf.math.sigmoid(tf.linalg.matvec(valid_feature_data_sample, y_coef_pred))
    y_pred = tf.linalg.matvec(X_feature_data_samples, y_coef_pred)

    metric = tf.keras.losses.BinaryCrossentropy(
                                from_logits=True,
                                label_smoothing=0.0,
                                axis=-1,
                                reduction='auto',
                                name='binary_crossentropy')
    loss = metric(y_true, y_pred)
    return loss

In [7]:
def load_LR_inet():
    path = utilities_LR.inet_path_LR(config_LR)
    
    model = keras.models.load_model(path + '/modelKeras', custom_objects={'custom_loss': custom_loss})
    print(path)
    return model

In [8]:
model_LR = load_LR_inet()

data_LR/nda10000_nsa4000_nfe10_nin8_nta1_ncc2_sep3.0_noi0_shuTrue_ran42/tsi-1_vsi0.25_ranNone_shuTrue_strNone_bat32_epo100_shuTrue_claNone_samNone_ini0_steNone_vstNone_vbsNone_vfr1/tsi0.3_vsi0.2_ranNone_shuTrue_strNone_bat256_epo1000_shuTrue_claNone_samNone_ini0_steNone_vstNone_vbsNone_vfr1


In [9]:
model_LR.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4096)              29745152  
                                                                 
 dense_1 (Dense)             (None, 2048)              8390656   
                                                                 
 dense_2 (Dense)             (None, 1024)              2098176   
                                                                 
 dense_3 (Dense)             (None, 512)               524800    
                                                                 
 dense_4 (Dense)             (None, 10)                5130      
                                                                 
Total params: 40,763,914
Trainable params: 40,763,914
Non-trainable params: 0
_________________________________________________________________


# Load Testing Data

In [10]:
def get_y_pred_data():
    directory = utilities_LR.lambda_path_LR(config_LR)
    
    return np.load(directory + '/lambda_preds_list.npy', allow_pickle=True)

In [11]:
y_predictions_from_lambda = get_y_pred_data()

In [12]:
def load_lambda():
    directory = utilities_LR.lambda_path_LR(config_LR)
    
    return np.load(directory + '/lambda_weights_list.npy', allow_pickle=True)

In [13]:
lambda_weights = load_lambda()

In [14]:
directory = utilities_LR.data_path_LR(config_LR)

with open(directory + '/X_datasets_list_dataForLambda.npy', "rb") as f:
    X_datasets_list_LR_test = np.load(f, allow_pickle=True)

# Evaluate Inet for LR

In [15]:
def precision(tp, fp, tn, fn):
    return tp / (tp + fp)

In [16]:
def recall(tp, fp, tn, fn):
    return tp / (tp + fn)

In [17]:
def f1(tp, fp, tn, fn):
    pre = precision(tp, fp, tn, fn)
    rec = recall(tp, fp, tn, fn)
    return 2 * (pre * rec) / (pre + rec) 

# Evaluation on already known data

In [18]:
#X_test_weights = np.zeros([config_LR['eval']['n_datasets'], 8301, ])
#
#X_queryLambda_test = np.zeros([config_LR['eval']['n_datasets'], config_LR['eval']['n_samples_queryLambda'], config_LR['data']['n_features']])
#
#if  config_LR['data']['n_targets'] < 2:
#    y_test_coefs = np.zeros([config_LR['eval']['n_datasets'], config_LR['data']['n_features'], ])
#else:

#    print("#################### NOT YET IMPLEMENTED ######################")

In [19]:
model_LR.summary(expand_nested=True)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4096)              29745152  
                                                                 
 dense_1 (Dense)             (None, 2048)              8390656   
                                                                 
 dense_2 (Dense)             (None, 1024)              2098176   
                                                                 
 dense_3 (Dense)             (None, 512)               524800    
                                                                 
 dense_4 (Dense)             (None, 10)                5130      
                                                                 
Total params: 40,763,914
Trainable params: 40,763,914
Non-trainable params: 0
_________________________________________________________________


In [20]:
def evaluateOnDataset(index, X, y_pred_from_lambda, lambda_weight, inet):
    
    lambda_weight = lambda_weight.reshape([1, 7261])
    
    coef_pred_inet = inet.predict(lambda_weight)
    
    coef_pred_inet = coef_pred_inet.reshape([10])
    
    y_pred = np.dot(X, coef_pred_inet)
    y_pred = keras.activations.sigmoid(y_pred).numpy()
    y_pred = [1 if y>=0.5 else 0 for y in y_pred]
    
    y_true = y_pred_from_lambda
    y_true = [1 if y>=0.5 else 0 for y in y_true]
    
    #print(y_pred, y_true)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    return np.array([index, mean_squared_error(y_true, y_pred), tn, fp, fn, tp, precision(tp, fp, tn, fn), recall(tp, fp, tn, fn), f1(tp, fp, tn, fn)]).reshape([1, 9])

In [22]:
#X = X_datasets_list_LR_test[10]
#y_pred_from_lambda = y_predictions_from_lambda[10]
#lambda_weight = lambda_weights[10]
#
#lambda_weight = lambda_weight.reshape([1, 7261])
#
##print(X.shape)
##print(y_pred_from_lambda.shape)
##print(lambda_weight.shape)
#
#tn, fp, fn, tp = evaluateOnDataset(10, X, y_pred_from_lambda, lambda_weight, model_LR)
#
#print(tn, fp, fn, tp)

In [23]:
print(X_datasets_list_LR_test.shape)
print(y_predictions_from_lambda.shape)
print(lambda_weights.shape)

(10000, 4000, 10)
(10000, 4000)
(10000, 7261)


In [24]:
result = pd.DataFrame(columns=["index_0=aggregated", "mse",  "tp", "fn", "fp", "tn", "precision", "recall", "f1"])

for index, (X, y_pred_from_lambda, lambda_weight) in enumerate(zip(X_datasets_list_LR_test, y_predictions_from_lambda, lambda_weights), start=1):
    newRow = pd.DataFrame(evaluateOnDataset(index, X, y_pred_from_lambda, lambda_weight, model_LR), columns=["index_0=aggregated", "mse",  "tp", "fn", "fp", "tn", "precision", "recall", "f1"])
    result = pd.concat([result, newRow], axis=0, ignore_index=True)
    if index % 100 == 0:
        print("done", index)

done 100
done 200
done 300
done 400
done 500
done 600
done 700
done 800
done 900
done 1000
done 1100
done 1200
done 1300
done 1400
done 1500
done 1600
done 1700
done 1800
done 1900
done 2000
done 2100
done 2200
done 2300
done 2400
done 2500
done 2600
done 2700
done 2800
done 2900
done 3000
done 3100
done 3200
done 3300
done 3400
done 3500
done 3600
done 3700
done 3800
done 3900
done 4000
done 4100
done 4200
done 4300
done 4400
done 4500
done 4600
done 4700
done 4800
done 4900
done 5000
done 5100
done 5200
done 5300
done 5400
done 5500
done 5600
done 5700
done 5800
done 5900
done 6000
done 6100
done 6200
done 6300
done 6400
done 6500
done 6600
done 6700
done 6800
done 6900
done 7000
done 7100
done 7200
done 7300
done 7400
done 7500
done 7600
done 7700
done 7800
done 7900
done 8000
done 8100
done 8200
done 8300
done 8400
done 8500
done 8600
done 8700
done 8800
done 8900
done 9000
done 9100
done 9200
done 9300
done 9400
done 9500
done 9600
done 9700
done 9800
done 9900
done 10000


In [25]:
##parallel = Parallel(n_jobs=config_LR['computation']['n_jobs'], verbose=10, backend='loky') #loky
#parallel = Parallel(n_jobs=1, verbose=10, backend='loky') #loky
#
#results = parallel(delayed(evaluateOnDataset)(X, y_pred_from_lambda, lambda_weight, model_LR) for (X, y_pred_from_lambda, lambda_weight) in zip(X_datasets_list_LR_test, y_predictions_from_lambda, lambda_weights))
#                                  
#del parallel

In [26]:
result.head(200)

Unnamed: 0,index_0=aggregated,mse,tp,fn,fp,tn,precision,recall,f1
0,1.0,0.02775,1927.0,73.0,38.0,1962.0,0.964128,0.981,0.972491
1,2.0,0.00825,1981.0,19.0,14.0,1986.0,0.990524,0.993,0.99176
2,3.0,0.0225,1932.0,68.0,22.0,1978.0,0.966764,0.989,0.977756
3,4.0,0.689,964.0,1036.0,1720.0,280.0,0.212766,0.14,0.168878
4,5.0,0.0035,1987.0,10.0,4.0,1999.0,0.995022,0.998003,0.99651
...,...,...,...,...,...,...,...,...,...
195,196.0,0.0015,1998.0,2.0,4.0,1996.0,0.998999,0.998,0.998499
196,197.0,0.032,1925.0,75.0,53.0,1947.0,0.962908,0.9735,0.968175
197,198.0,0.004,1990.0,10.0,6.0,1994.0,0.99501,0.997,0.996004
198,199.0,0.261,1997.0,3.0,1041.0,959.0,0.996881,0.4795,0.647535


In [27]:
aggragated = pd.DataFrame(result.mean(numeric_only=False)).transpose()

In [28]:
aggragated

Unnamed: 0,index_0=aggregated,mse,tp,fn,fp,tn,precision,recall,f1
0,5000.5,0.116838,1765.2091,234.8057,232.5481,1767.4371,0.884191,0.883725,0.879406


In [29]:
aggragated.at[0, "index_0=aggregated"] = 0
result_aggregated = pd.concat([aggragated, result], axis=0, ignore_index=True)

In [30]:
result_aggregated

Unnamed: 0,index_0=aggregated,mse,tp,fn,fp,tn,precision,recall,f1
0,0.0,0.116838,1765.2091,234.8057,232.5481,1767.4371,0.884191,0.883725,0.879406
1,1.0,0.02775,1927.0,73.0,38.0,1962.0,0.964128,0.981,0.972491
2,2.0,0.00825,1981.0,19.0,14.0,1986.0,0.990524,0.993,0.99176
3,3.0,0.0225,1932.0,68.0,22.0,1978.0,0.966764,0.989,0.977756
4,4.0,0.689,964.0,1036.0,1720.0,280.0,0.212766,0.14,0.168878
...,...,...,...,...,...,...,...,...,...
9996,9996.0,0.02225,1957.0,40.0,49.0,1954.0,0.97994,0.975537,0.977733
9997,9997.0,0.2975,1991.0,6.0,1184.0,819.0,0.992727,0.408887,0.579208
9998,9998.0,0.0,2000.0,0.0,0.0,2000.0,1.0,1.0,1.0
9999,9999.0,0.0,2000.0,0.0,0.0,2000.0,1.0,1.0,1.0


In [31]:
def save_eval_res_valid(df):
    path = utilities_LR.inet_path_LR(config_LR)
    
    model = df.to_csv(path + '/evalRes_valid.csv')
    print(path)

In [32]:
save_eval_res_valid(result_aggregated)

data_LR/nda10000_nsa4000_nfe10_nin8_nta1_ncc2_sep3.0_noi0_shuTrue_ran42/tsi-1_vsi0.25_ranNone_shuTrue_strNone_bat32_epo100_shuTrue_claNone_samNone_ini0_steNone_vstNone_vbsNone_vfr1/tsi0.3_vsi0.2_ranNone_shuTrue_strNone_bat256_epo1000_shuTrue_claNone_samNone_ini0_steNone_vstNone_vbsNone_vfr1


# For Comparison: symbolic regression

In [36]:
def evaluateSymbolicReg(index, X_dataset, y_predictions_from_lambda):
    
    y_true = y_pred_from_lambda
    y_true = [1 if y>=0.5 else 0 for y in y_true]
    
    logreg = LogisticRegression()
    logreg.fit(X_dataset, y_true)
    
    y_pred = logreg.predict(X_dataset)
    y_pred = [1 if y>=0.5 else 0 for y in y_pred]
    
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    return np.array([index, mean_squared_error(y_true, y_pred), tn, fp, fn, tp, precision(tp, fp, tn, fn), recall(tp, fp, tn, fn), f1(tp, fp, tn, fn)]).reshape([1, 9])

In [37]:
result_symreg = pd.DataFrame(columns=["index_0=aggregated", "mse",  "tp", "fn", "fp", "tn", "precision", "recall", "f1"])

for index, (X, y_pred_from_lambda) in enumerate(zip(X_datasets_list_LR_test, y_predictions_from_lambda), start=1):
    newRow = pd.DataFrame(evaluateSymbolicReg(index, X, y_pred_from_lambda), columns=["index_0=aggregated", "mse",  "tp", "fn", "fp", "tn", "precision", "recall", "f1"])
    result_symreg = pd.concat([result_symreg, newRow], axis=0, ignore_index=True)
    if index % 100 == 0:
        print("done", index)

done 100
done 200
done 300
done 400
done 500
done 600
done 700
done 800
done 900
done 1000
done 1100
done 1200
done 1300
done 1400
done 1500
done 1600
done 1700
done 1800
done 1900
done 2000
done 2100
done 2200
done 2300
done 2400
done 2500
done 2600
done 2700
done 2800
done 2900
done 3000
done 3100
done 3200
done 3300
done 3400
done 3500
done 3600
done 3700
done 3800
done 3900
done 4000
done 4100
done 4200
done 4300
done 4400
done 4500
done 4600
done 4700
done 4800
done 4900
done 5000
done 5100
done 5200
done 5300
done 5400
done 5500
done 5600
done 5700
done 5800
done 5900
done 6000
done 6100
done 6200
done 6300
done 6400
done 6500
done 6600
done 6700
done 6800
done 6900
done 7000
done 7100
done 7200
done 7300
done 7400
done 7500
done 7600
done 7700
done 7800
done 7900
done 8000
done 8100
done 8200
done 8300
done 8400
done 8500
done 8600
done 8700
done 8800
done 8900
done 9000
done 9100
done 9200
done 9300
done 9400
done 9500
done 9600
done 9700
done 9800
done 9900
done 10000


In [38]:
result_symreg.head(200)

Unnamed: 0,index_0=aggregated,mse,tp,fn,fp,tn,precision,recall,f1
0,1.0,0.02,1966.0,34.0,46.0,1954.0,0.982897,0.977,0.97994
1,2.0,0.00725,1985.0,15.0,14.0,1986.0,0.992504,0.993,0.992752
2,3.0,0.001,1997.0,3.0,1.0,1999.0,0.998501,0.9995,0.999
3,4.0,0.0,2000.0,0.0,0.0,2000.0,1.0,1.0,1.0
4,5.0,0.00325,1988.0,9.0,4.0,1999.0,0.995518,0.998003,0.996759
...,...,...,...,...,...,...,...,...,...
195,196.0,0.0015,1998.0,2.0,4.0,1996.0,0.998999,0.998,0.998499
196,197.0,0.03425,1919.0,81.0,56.0,1944.0,0.96,0.972,0.965963
197,198.0,0.004,1991.0,9.0,7.0,1993.0,0.995504,0.9965,0.996002
198,199.0,0.00575,1992.0,8.0,15.0,1985.0,0.995986,0.9925,0.99424


In [39]:
aggragated_symreg = pd.DataFrame(result_symreg.mean(numeric_only=False)).transpose()

In [40]:
aggragated_symreg.at[0, "index_0=aggregated"] = 0
result_aggregated_symreg = pd.concat([aggragated_symreg, result_symreg], axis=0, ignore_index=True)

In [43]:
result_aggregated_symreg.head(20)

Unnamed: 0,index_0=aggregated,mse,tp,fn,fp,tn,precision,recall,f1
0,0.0,0.013247,1973.5497,26.4651,26.521,1973.4642,0.98678,0.986738,0.986752
1,1.0,0.02,1966.0,34.0,46.0,1954.0,0.982897,0.977,0.97994
2,2.0,0.00725,1985.0,15.0,14.0,1986.0,0.992504,0.993,0.992752
3,3.0,0.001,1997.0,3.0,1.0,1999.0,0.998501,0.9995,0.999
4,4.0,0.0,2000.0,0.0,0.0,2000.0,1.0,1.0,1.0
5,5.0,0.00325,1988.0,9.0,4.0,1999.0,0.995518,0.998003,0.996759
6,6.0,0.0,2000.0,0.0,0.0,2000.0,1.0,1.0,1.0
7,7.0,0.0,2000.0,0.0,0.0,2000.0,1.0,1.0,1.0
8,8.0,0.00625,1990.0,10.0,15.0,1985.0,0.994987,0.9925,0.993742
9,9.0,0.01575,1970.0,30.0,33.0,1967.0,0.984977,0.9835,0.984238


In [41]:
def save_eval_res_symreg(df):
    path = utilities_LR.inet_path_LR(config_LR)
    
    model = df.to_csv(path + '/evalRes_symreg.csv')
    print(path)

In [42]:
save_eval_res_symreg(result_aggregated_symreg)

data_LR/nda10000_nsa4000_nfe10_nin8_nta1_ncc2_sep3.0_noi0_shuTrue_ran42/tsi-1_vsi0.25_ranNone_shuTrue_strNone_bat32_epo100_shuTrue_claNone_samNone_ini0_steNone_vstNone_vbsNone_vfr1/tsi0.3_vsi0.2_ranNone_shuTrue_strNone_bat256_epo1000_shuTrue_claNone_samNone_ini0_steNone_vstNone_vbsNone_vfr1
