# Config & Imports

In [1]:
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


import os

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization

import tensorflow as tf

import utilities_LR

from joblib import Parallel, delayed

from IPython.display import Image


In [13]:
# Central configuration for this notebook.
# 'data'        - describes the datasets that are loaded below; the whole dict is passed to
#                 utilities_LR.data_path_LR / lambda_path_LR to obtain the input and output directories.
# 'lambda'      - compile/fit settings for the per-dataset networks built in train_nn.
# 'computation' - parallelism and GPU selection.
config = {
    'data': {
        'n_datasets': 10_000, # the number of datasets
        
        'n_samples': 4_000, # the number of samples per dataset
        
        'n_features': 10, 
        # The total number of features (also the input dimension of every network trained here).
        # These comprise n_informative informative features, n_redundant redundant features, n_repeated duplicated features and 
        # n_features-n_informative-n_redundant-n_repeated useless features drawn at random.
        # NOTE(review): this and the following descriptions read like sklearn's make_classification parameter docs;
        # the data generation itself is not part of this notebook - confirm against the data-creation code.
        
        'n_informative': 8,
        # The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices 
        # of a hypercube in a subspace of dimension n_informative. For each cluster, informative features are drawn independently 
        # from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. The clusters are then 
        # placed on the vertices of the hypercube.
        
        'n_targets': 1,
        # The number of targets (or labels) of the classification problem.
        # Also used as the number of output units of every network trained here.
    
        'n_clusters_per_class': 2,
        # The number of clusters per class.
        
        'class_sep': 3.0,
        # class_sep: float (library default is 1.0).
        # The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task 
        # easier.
        
        'noise': 0,
        # flip_y (fraction of samples whose class is assigned randomly)
        
        'shuffle': True,
        # Shuffle the samples and the features.
        
        'random_state': 42,
        # Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
    },
    'lambda': {
        'data_prep': {
            'train_test_val_split': { # refer to sklearn doc
                'test_size': -1, # currently not used
                'val_size': 0.25, # passed to model.fit as validation_split
                # NOTE(review): random_state / shuffle / stratify are not read by any visible cell
                # (the explicit train/test/val split in train_nn is commented out) - confirm they are still needed.
                'random_state': None,
                'shuffle': True,
                'stratify': None
            }
        },
        'model_compile': {
            'optimizer_lambda': 'adam',
            # NOTE(review): 'mae' is used as loss although the network ends in a sigmoid unit on a
            # classification target - confirm this is intended rather than a cross-entropy loss.
            'loss': 'mae',
            'metrics': ['mae']
        },
        'model_fit': { # refer to keras API; every key is forwarded to model.fit under the same name
            'batch_size': 32,
            'epochs': 100,
            'verbose': 0,
            'callbacks': None,
            'shuffle': True,
            'class_weight': None,
            'sample_weight': None,
            'initial_epoch': 0,
            'steps_per_epoch': None,
            'validation_steps': None,
            'validation_batch_size': None,
            'validation_freq': 1
        }
    },
    'computation':{
        'n_jobs': 35, # number of joblib workers used to train the networks in parallel
        'use_gpu': True, # when False, the GPU-related environment variables are set to ''
        'gpu_numbers': '5', # value written to CUDA_VISIBLE_DEVICES when use_gpu is True
        # NOTE(review): RANDOM_SEED is not referenced by any visible cell - confirm where it is consumed.
        'RANDOM_SEED': 1,   
    }
}

## Settings

In [5]:
# Runtime environment for TensorFlow / CUDA / XLA, driven by config['computation'].
# NOTE(review): TensorFlow is imported in the imports cell, i.e. before these assignments run.
# Settings that TensorFlow evaluates at import time may therefore not take effect - confirm,
# or move this cell above the TensorFlow/Keras imports.
use_gpu = config['computation']['use_gpu']

# Hide TensorFlow's C++ INFO and WARNING log messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Number GPUs by PCI bus id so that 'gpu_numbers' matches the ordering reported by nvidia-smi,
# then expose only the configured device(s); an empty value hides all GPUs.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = config['computation']['gpu_numbers'] if use_gpu else ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' if use_gpu else ''

# NOTE(review): the CUDA directory is machine specific - adjust it to the local installation.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/local/cuda-11.4' if use_gpu else ''
# TF_XLA_FLAGS is a whitespace-separated list of flags. The previous value joined the two flags
# with " ,", so the second entry (",--tf_xla_enable_xla_devices") was not a valid flag.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 --tf_xla_enable_xla_devices' if use_gpu else ''


In [6]:
# Report how many physical devices of each kind TensorFlow can see with the environment set above.
for label, device_type in (("Num GPUs Available: ", 'GPU'), ("Num XLA-GPUs Available: ", 'XLA_GPU')):
    print(label, len(tf.config.experimental.list_physical_devices(device_type)))

Num GPUs Available:  1
Num XLA-GPUs Available:  0


In [7]:
import logging
import warnings

# Keep the notebook output readable: switch off the 'tensorflow' Python logger entirely
# and ignore every Python warning from here on.
logging.getLogger('tensorflow').disabled = True
warnings.filterwarnings('ignore')

# Load Data

In [8]:
# Zero-filled placeholders with the shapes described by config['data'].
# X: (n_datasets, n_samples, n_features)
# y: (n_datasets, n_samples) for a single target, otherwise (n_datasets, n_samples, n_targets)
data_cfg = config['data']
dataset_dims = [data_cfg['n_datasets'], data_cfg['n_samples']]

X_datasets_list = np.zeros(dataset_dims + [data_cfg['n_features']])

y_shape = dataset_dims if data_cfg['n_targets'] < 2 else dataset_dims + [data_cfg['n_targets']]
y_datasets_list = np.zeros(y_shape)
# (No placeholder is created for coef_list; that allocation is disabled in this notebook.)

In [9]:
# Directory holding the generated datasets for this configuration.
directory = utilities_LR.data_path_LR(config)

X_file_path = directory + '/X_datasets_list_dataForLambda.npy'
y_file_path = directory + '/y_datasets_list_dataForLambda.npy'

# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects - only load files you created yourself.
with open(X_file_path, "rb") as X_file:
    X_datasets_list = np.load(X_file, allow_pickle=True)

with open(y_file_path, "rb") as y_file:
    y_datasets_list = np.load(y_file, allow_pickle=True)

# The coefficient targets ('/coef_list_targetForInet.npy') are not loaded in this notebook:
#with open(directory + '/coef_list_targetForInet.npy', "rb") as f:
#    coef_list = np.load(f, allow_pickle=True)

# Save Model Weights & Predictions (functions)

In [10]:
def save_models_predictions(weights_list, y_pred_list):
    """Write the trained networks' weights and predictions to the lambda output directory.

    The directory comes from ``utilities_LR.lambda_path_LR(config)`` and is created
    (including parents) if it does not exist yet.

    Parameters
    ----------
    weights_list : array-like
        Saved as ``lambda_weights_list.npy``.
    y_pred_list : array-like
        Saved as ``lambda_preds_list.npy``.
    """
    out_dir = utilities_LR.lambda_path_LR(config)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    artifacts = (
        ('/lambda_weights_list.npy', weights_list),
        ('/lambda_preds_list.npy', y_pred_list),
    )
    for file_suffix, array in artifacts:
        np.save(out_dir + file_suffix, array, allow_pickle=True)

# Train Model

In [11]:
def train_nn(X, y, index):
    """Train one network on a single dataset and return its flattened weights and predictions.

    Architecture: BatchNormalization -> Dense(100, relu) -> Dense(60, relu)
    -> Dense(n_targets, sigmoid). Optimizer, loss and metrics come from
    ``config['lambda']['model_compile']``; the fit arguments come from
    ``config['lambda']['model_fit']``. The validation share is taken from
    ``config['lambda']['data_prep']['train_test_val_split']['val_size']`` and handed to
    Keras as ``validation_split`` (no explicit train/test/val split is performed here).

    Parameters
    ----------
    X : array-like
        Features of one dataset; the input layer expects
        ``config['data']['n_features']`` columns.
    y : array-like
        Targets belonging to ``X``.
    index : int
        Position of the dataset in the dataset list. Not used by the training itself;
        kept so that existing callers can keep passing it.

    Returns
    -------
    lambda_weights : np.ndarray
        1-D concatenation of all arrays returned by ``model.get_weights()``, each
        flattened, in the order Keras returns them.
    y_pred : np.ndarray
        ``model.predict(X)`` on the complete ``X`` (i.e. including the rows Keras held
        out for validation during fitting).
    """
    # A new model is built on every call and one worker process handles many calls in a row.
    # Reset the global Keras state left behind by earlier models so memory use does not
    # grow with the number of datasets processed by this worker.
    keras.backend.clear_session()

    compile_cfg = config['lambda']['model_compile']
    fit_cfg = config['lambda']['model_fit']
    val_fraction = config['lambda']['data_prep']['train_test_val_split']['val_size']

    # Model Def
    model = Sequential()
    model.add(BatchNormalization(input_dim=config['data']['n_features']))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(60, activation='relu'))
    model.add(Dense(config['data']['n_targets'], activation='sigmoid'))

    model.compile(optimizer=compile_cfg['optimizer_lambda'],
                  loss=compile_cfg['loss'],
                  metrics=compile_cfg['metrics'])

    # Model fit (the returned History object is not needed)
    model.fit(x=X,
              y=y,
              batch_size=fit_cfg['batch_size'],
              epochs=fit_cfg['epochs'],
              verbose=fit_cfg['verbose'],
              callbacks=fit_cfg['callbacks'],
              validation_split=val_fraction,
              shuffle=fit_cfg['shuffle'],
              class_weight=fit_cfg['class_weight'],
              sample_weight=fit_cfg['sample_weight'],
              initial_epoch=fit_cfg['initial_epoch'],
              steps_per_epoch=fit_cfg['steps_per_epoch'],
              validation_steps=fit_cfg['validation_steps'],
              validation_batch_size=fit_cfg['validation_batch_size'],
              validation_freq=fit_cfg['validation_freq'])

    # get_weights() returns every weight array of the model; flatten and join them into one vector.
    lambda_weights = np.concatenate([weights.flatten() for weights in model.get_weights()])

    # Use the configured verbosity for prediction as well, so that worker processes do not
    # print a progress bar per dataset when training is configured to be silent.
    y_pred = model.predict(X, verbose=fit_cfg['verbose'])

    return lambda_weights, y_pred

In [14]:
# Train one network per dataset in parallel worker processes (loky backend).
# Each job receives the features, the targets and the position of its dataset;
# the result list holds one (weights, predictions) pair per dataset, in dataset order.
training_jobs = (
    delayed(train_nn)(X_data, y_data, index)
    for index, (X_data, y_data) in enumerate(zip(X_datasets_list, y_datasets_list))
)

parallel = Parallel(n_jobs=config['computation']['n_jobs'], verbose=10, backend='loky')
weights_ypred_list = parallel(training_jobs)

del parallel

[Parallel(n_jobs=35)]: Using backend LokyBackend with 35 concurrent workers.
[Parallel(n_jobs=35)]: Done   2 tasks      | elapsed:  2.8min
[Parallel(n_jobs=35)]: Done  15 tasks      | elapsed:  2.9min
[Parallel(n_jobs=35)]: Done  28 tasks      | elapsed:  3.0min
[Parallel(n_jobs=35)]: Done  43 tasks      | elapsed:  5.3min
[Parallel(n_jobs=35)]: Done  58 tasks      | elapsed:  5.3min
[Parallel(n_jobs=35)]: Done  75 tasks      | elapsed:  7.5min
[Parallel(n_jobs=35)]: Done  92 tasks      | elapsed:  7.6min
[Parallel(n_jobs=35)]: Done 111 tasks      | elapsed:  9.9min
[Parallel(n_jobs=35)]: Done 130 tasks      | elapsed: 10.0min
[Parallel(n_jobs=35)]: Done 151 tasks      | elapsed: 12.3min
[Parallel(n_jobs=35)]: Done 172 tasks      | elapsed: 12.3min
[Parallel(n_jobs=35)]: Done 195 tasks      | elapsed: 14.6min
[Parallel(n_jobs=35)]: Done 218 tasks      | elapsed: 16.9min
[Parallel(n_jobs=35)]: Done 243 tasks      | elapsed: 17.0min
[Parallel(n_jobs=35)]: Done 268 tasks      | elapsed: 1

In [16]:
# Split the (weights, predictions) pairs returned by the workers into two stacked arrays.
weights_list = np.stack([np.array(weights) for weights, _ in weights_ypred_list])

# Predictions are stacked per dataset and then reshaped to (n_datasets, n_samples).
stacked_predictions = np.stack([predictions for _, predictions in weights_ypred_list])
y_pred_list = stacked_predictions.reshape([config['data']['n_datasets'], config['data']['n_samples']])

# Inspect Weights & Predictions

In [17]:
# Shape check: one row per dataset, one column per entry of the flattened network weights.
weights_list.shape

(10000, 7261)

In [18]:
# Shape check: one row per dataset, one predicted value per sample.
y_pred_list.shape

(10000, 4000)

In [19]:
import pandas as pd

In [20]:
# Preview the predictions of the first datasets (rows = datasets, columns = samples).
pd.DataFrame(y_pred_list).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
0,1.0,6.434744e-14,1.307849e-21,7.946125e-13,5.303885e-12,4.21758e-13,1.0,3.680902e-11,2.960233e-19,1.0,...,5.229654e-14,1.0,1.57854e-13,1.0,1.352486e-16,1.032287e-15,1.0,1.0,1.0,4.209724e-15
1,1.113612e-11,1.0,1.0,1.0,2.072294e-16,2.880929e-23,1.0,1.0,1.0,1.0,...,1.215176e-11,1.9791040000000001e-22,1.0,1.0,4.319543e-12,8.373536e-15,1.949261e-14,3.193125e-21,3.155327e-10,1.0
2,7.670487e-15,1.0,6.390081e-13,1.696512e-11,1.0,6.291825e-10,1.0,1.9953190000000002e-22,2.346648e-14,1.0,...,1.8938e-13,1.0,1.0,1.0,4.524408e-16,3.6052060000000004e-23,2.01709e-16,2.521275e-19,1.0,1.423379e-14
3,4.192408e-26,1.0,7.200497e-18,1.8058320000000001e-25,8.48525e-21,2.930819e-17,4.6270720000000004e-17,1.0,4.869994e-13,1.0,...,9.158597999999999e-19,2.627438e-11,6.278842e-21,5.328234e-14,1.584057e-12,1.0,1.0,1.0,2.500075e-15,8.357590999999999e-20
4,3.5379829999999996e-24,1.0,1.0,1.0,1.483066e-15,4.504925e-18,1.640185e-29,1.304689e-15,2.487296e-14,2.444714e-16,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.527134e-11,4.840085e-20
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.227178e-19,2.45579e-12,1.0,1.0,1.0,1.524246e-14,3.68925e-16,3.0334070000000003e-25,4.4829840000000004e-17,1.259786e-15
6,9.104615e-24,1.0,3.170448e-16,1.0,1.774593e-21,1.0,1.0,1.0255e-16,1.117709e-18,1.0,...,1.0,5.43568e-13,1.032251e-15,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,6.880455e-14,6.599462e-18,1.0,1.0,1.0,1.0,1.926338e-15,...,1.0,9.653662e-16,3.092498e-14,1.0,1.770146e-08,2.000253e-11,1.343858e-20,1.208355e-09,1.0,1.742911e-16
8,1.18613e-31,2.3744490000000003e-22,4.630487e-14,1.0,1.0,8.535185e-27,1.0,8.608199e-20,1.0,7.892273e-29,...,1.856042e-15,1.0,1.0,1.169841e-16,0.999999,1.0,2.3446540000000002e-31,1.0,1.0,1.0
9,2.304477e-16,1.0,7.092364e-15,4.923382e-27,1.0,7.886491e-15,1.004017e-15,1.773818e-17,1.0,1.0,...,3.521201e-12,1.0,1.0,1.0,1.0,1.0,1.0,4.4436129999999996e-20,2.145659e-15,1.0


In [21]:
# Preview the flattened weights of the first datasets (rows = datasets, columns = weight entries).
pd.DataFrame(weights_list).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7251,7252,7253,7254,7255,7256,7257,7258,7259,7260
0,1.224533,1.311303,1.069598,1.03223,0.913382,1.305164,0.891319,0.90594,1.186823,1.291981,...,0.283368,0.252753,-0.441036,-0.059673,0.464651,0.466574,0.478461,0.483865,-0.364532,0.098874
1,0.858855,1.057937,1.157303,1.328648,1.160229,1.026177,1.224373,1.127543,1.146065,1.263658,...,-0.419536,-0.264416,0.429057,-0.05503,0.512675,0.452411,-0.218909,-0.597525,0.308038,0.02341
2,1.462836,1.216883,1.126388,1.262367,1.119699,1.186647,0.960948,1.119254,1.043021,1.147554,...,0.344703,-0.224332,-0.009354,-0.29781,0.343289,0.308894,-0.230992,-0.220588,0.480867,-0.009434
3,1.142646,1.13509,1.311475,1.098766,1.135826,1.019746,1.233476,1.097263,0.991391,1.118042,...,0.276372,0.254382,0.191522,-0.376129,0.315664,0.243907,0.336152,-0.384309,-0.274546,0.060623
4,1.173387,1.049877,1.181393,1.137446,1.051264,1.187962,1.23578,1.192659,1.177141,1.210095,...,-0.445464,-0.531127,0.411707,0.305037,0.350934,-0.360666,0.481839,0.438938,0.307473,0.033234
5,1.107387,1.051951,1.167784,1.133991,1.221337,1.078113,1.013483,1.20724,1.152337,1.097755,...,0.323898,-0.413662,0.443275,0.318392,0.283597,-0.435471,-0.307024,0.298594,0.144047,0.041056
6,1.051855,1.135371,1.038824,1.207669,1.290952,0.984603,1.191565,1.166449,0.986744,1.140216,...,0.397898,-0.420655,-0.136681,0.242602,-0.382245,-0.217799,-0.046897,-0.401685,-0.228928,0.004136
7,0.87859,1.086588,1.080111,0.997233,0.925531,1.340595,1.077622,1.207699,1.147755,1.395204,...,-0.399505,-0.391623,-0.337186,-0.549944,-0.366693,-0.07773,-0.384776,0.361813,0.301339,0.006039
8,1.321473,0.995298,1.215532,1.327715,1.080058,1.145172,1.269938,0.797027,1.167891,0.978216,...,0.057234,-0.362935,-0.427748,0.085066,0.409106,-0.288275,-0.320231,0.444427,-0.411386,-0.02359
9,1.062808,1.285533,0.957622,1.070989,1.314091,1.182464,1.160878,1.084304,1.085577,1.285623,...,0.248037,-0.261183,0.177985,0.12917,-0.315921,-0.178998,0.262306,0.357137,-0.396296,0.020522


# Save Models

In [22]:
# Persist the stacked weights and predictions as .npy files in the lambda output directory.
save_models_predictions(weights_list, y_pred_list)