# Config & Imports

In [1]:
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


import os

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping

import tensorflow as tf

import utilities_LR

from joblib import Parallel, delayed

from IPython.display import Image


2022-07-08 19:09:25.925317: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-08 19:09:25.925360: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Central experiment configuration, grouped into three sections:
#   'data'        - size/shape parameters of the datasets (also passed to utilities_LR to locate files on disk)
#   'lambda'      - split, compile and fit settings for the per-dataset networks trained in train_nn
#   'computation' - parallelism and GPU selection
config = {
    'data': {
        'n_datasets': 45_000, # the number of datasets
        
        'n_samples': 5_000, # the number of samples per dataset
        
        'n_features': 20, 
        # The total number of features. 
        # These comprise n_informative informative features, n_redundant redundant features, n_repeated duplicated features and 
        # n_features-n_informative-n_redundant-n_repeated useless features drawn at random.
        
        'n_informative': 'random',
        # The number of informative features (int or 'random'). Each class is composed of a number of gaussian clusters, each
        # located around the vertices of a hypercube in a subspace of dimension n_informative. For each cluster, informative
        # features are drawn independently from N(0, 1) and then randomly linearly combined within each cluster in order to
        # add covariance. The clusters are then placed on the vertices of the hypercube.
        # NOTE(review): 'random' presumably draws the count per dataset (an earlier variant of this entry used
        # random.randint(2, 10)) - confirm in the data-generation code.
        
        'n_targets': 1,
        # The number of targets (or labels) of the classification problem.
    
        'n_clusters_per_class': 1,
        # The number of clusters per class.
        
        'class_sep': 1.0,
        # class_sep: float, default=1.0
        # The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task 
        # easier.
        
        'shuffle': True,
        # Shuffle the samples and the features.
        
        'random_state': 46,
        # Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
    },
    'lambda': {
        'data_prep': {
            'train_test_val_split': { # refer to sklearn doc
                'test_size': 0.1, # fraction held out by train_test_split in train_nn
                'val_size': 0.15, # passed to Keras fit() as validation_split in train_nn
                'random_state': None,
                'shuffle': False, # should always be False
                'stratify': None
            }
        },
        'model_compile': {
            'optimizer_lambda': 'adam',
            'loss': 'mae',# alternative tried: keras.losses.BinaryCrossentropy(from_logits=False)
            'metrics': [], # none tracked; candidates tried: 'mae', keras.losses.BinaryCrossentropy(from_logits=False)
        },
        'model_fit': { # refer to keras API
            'batch_size': 64,
            'epochs': 500, # upper bound; train_nn also installs an EarlyStopping callback
            'verbose': 0,
            'callbacks': None, # train_nn always passes its own EarlyStopping callback to fit()
            'shuffle': True, # usually true
            'class_weight': None,
            'sample_weight': None,
            'initial_epoch': 0,
            'steps_per_epoch': None,
            'validation_steps': None,
            'validation_batch_size': None,
            'validation_freq': 1
        }
    },
    'computation':{
        'n_jobs': 100, # number of joblib workers used for the parallel training run
        'use_gpu': False, # when False, CUDA_VISIBLE_DEVICES is set to '' in the settings cell
        'gpu_numbers': '4', # value written to CUDA_VISIBLE_DEVICES when use_gpu is True
        'RANDOM_SEED': 1,   # NOTE(review): not read directly by any cell shown in this notebook
    }
}

## Settings

In [3]:
# Environment configuration for TensorFlow / CUDA / XLA, driven by config['computation'].
# NOTE(review): tensorflow is already imported in the first cell. TF_CPP_MIN_LOG_LEVEL is normally read when the
# native library is loaded, so setting it here may be too late to silence the C++ log lines - consider setting it
# before the tensorflow import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Enumerate GPUs in PCI bus order so that 'gpu_numbers' refers to stable device ids; hide all GPUs on CPU-only runs.
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = config['computation']['gpu_numbers'] if config['computation']['use_gpu'] else ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' if config['computation']['use_gpu'] else ''

os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/local/cuda-11.4' if config['computation']['use_gpu'] else ''
# TF_XLA_FLAGS is a whitespace-separated list of flags; the previous value contained a stray " ," between the two
# flags, which turns the second one into an unknown flag (",--tf_xla_enable_xla_devices").
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 --tf_xla_enable_xla_devices' if config['computation']['use_gpu'] else ''


In [4]:
# Report how many physical devices of each accelerator type TensorFlow can see.
device_queries = (
    ("Num GPUs Available: ", 'GPU'),
    ("Num XLA-GPUs Available: ", 'XLA_GPU'),
)
for message, device_type in device_queries:
    print(message, len(tf.config.experimental.list_physical_devices(device_type)))

Num GPUs Available:  0
Num XLA-GPUs Available:  0


2022-07-08 19:09:30.121707: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-08 19:09:30.121744: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-08 19:09:30.121775: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dws-11): /proc/driver/nvidia/version does not exist


In [5]:
import logging
import warnings

# Keep the notebook output readable: suppress every Python warning and switch off
# the 'tensorflow' Python logger for the rest of the session.
warnings.filterwarnings('ignore')
logging.getLogger('tensorflow').disabled = True

# Load Data

In [6]:
# Zero-filled placeholders whose shapes follow from the config.
# NOTE(review): both names are rebound by the loading cell that follows, and with the configured sizes X alone
# holds 45_000 * 5_000 * 20 float64 values - this allocation looks removable.
data_config = config['data']
leading_dims = [data_config['n_datasets'], data_config['n_samples']]

X_datasets_list = np.zeros(leading_dims + [data_config['n_features']])

# A single target keeps y two-dimensional; several targets add a trailing axis.
target_dims = [] if data_config['n_targets'] < 2 else [data_config['n_targets']]
y_datasets_list = np.zeros(leading_dims + target_dims)

In [7]:
# Load the stored datasets from the directory that utilities_LR derives from the config.
directory = utilities_LR.data_path_LR(config)

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects - only load files from a trusted source.
with open(os.path.join(directory, 'X.npy'), "rb") as f:
    X_datasets_list = np.load(f, allow_pickle=True)
with open(os.path.join(directory, 'y.npy'), "rb") as f:
    y_datasets_list = np.load(f, allow_pickle=True)

# Fail fast if the files do not match the config: the training run is long, the model is built with
# n_features inputs and the predictions are later reshaped to (n_datasets, n_samples).
expected_n_datasets = config['data']['n_datasets']
expected_dataset_shape = (config['data']['n_samples'], config['data']['n_features'])

if len(X_datasets_list) != expected_n_datasets or len(y_datasets_list) != len(X_datasets_list):
    raise ValueError(
        f"Expected {expected_n_datasets} datasets, found {len(X_datasets_list)} in X.npy "
        f"and {len(y_datasets_list)} in y.npy"
    )
if expected_n_datasets > 0:
    # Checked per dataset (first entry) so that both regular 3-D arrays and object arrays are handled.
    if np.shape(X_datasets_list[0]) != expected_dataset_shape:
        raise ValueError(
            f"Expected each dataset in X.npy to have shape {expected_dataset_shape}, "
            f"found {np.shape(X_datasets_list[0])}"
        )
    if np.shape(y_datasets_list[0])[:1] != expected_dataset_shape[:1]:
        raise ValueError(
            f"Expected {expected_dataset_shape[0]} targets per dataset in y.npy, "
            f"found shape {np.shape(y_datasets_list[0])}"
        )

# Save Weights & Predictions (functions)

In [8]:
def save_models_predictions(weights_list, y_pred_list):
    """Write the stacked network weights and predictions to the lambda output directory.

    The target directory comes from ``utilities_LR.lambda_path_LR(config)`` and is
    created (including parents) if it does not exist yet.

    Parameters
    ----------
    weights_list : array-like
        Saved as ``lambda_weights_list.npy``.
    y_pred_list : array-like
        Saved as ``lambda_preds_list.npy``.
    """
    directory = utilities_LR.lambda_path_LR(config)
    Path(directory).mkdir(parents=True, exist_ok=True)

    outputs = (
        ('/lambda_weights_list.npy', weights_list),
        ('/lambda_preds_list.npy', y_pred_list),
    )
    for file_suffix, array in outputs:
        np.save(directory + file_suffix, array, allow_pickle=True)

# Train Model

In [9]:
def train_nn(X, y, index):
    """Train one small feed-forward network on a single dataset.

    The last ``test_size`` fraction of the data is held out via ``train_test_split``; of the
    remaining rows, Keras holds out ``val_size`` as validation data for early stopping.

    Parameters
    ----------
    X : array-like
        Feature matrix of one dataset; the model is built for ``config['data']['n_features']`` inputs.
    y : array-like
        Targets aligned with the rows of ``X``.
    index : int
        Position of the dataset in the overall list. Currently unused; kept so callers need not change.

    Returns
    -------
    lambda_weights : np.ndarray
        All model weights, flattened and concatenated in the order of ``model.get_weights()``.
    y_pred : np.ndarray
        Model predictions for every row of ``X`` (training and held-out rows alike).

    Notes
    -----
    No random seed is set in this function, so weight initialisation and batch order are not
    fixed by it. NOTE(review): ``index`` and ``config['computation']['RANDOM_SEED']`` look like
    they were meant for per-dataset seeding - confirm before relying on reproducibility.
    """
    # A new model is built on every call and worker processes are reused for many datasets;
    # clearing the Keras session releases the global state left behind by earlier models.
    keras.backend.clear_session()

    split_config = config['lambda']['data_prep']['train_test_val_split']
    compile_config = config['lambda']['model_compile']
    fit_config = config['lambda']['model_fit']

    # Data Prep
    # random_state / stratify are read from the config (both None by default) instead of being hard-coded.
    X_train, _, y_train, _ = train_test_split(X, y,
                                              test_size=split_config['test_size'],
                                              train_size=None,
                                              random_state=split_config['random_state'],
                                              shuffle=split_config['shuffle'],
                                              stratify=split_config['stratify'],
                                              )

    # Model Def
    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=15,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    # Early stopping is always active; callbacks configured under 'model_fit' are appended to it.
    callbacks = [early_stopping] + list(fit_config['callbacks'] or [])

    model = Sequential()
    model.add(BatchNormalization(input_dim=config['data']['n_features']))
    model.add(Dense(100, activation='swish'))
    model.add(Dense(config['data']['n_targets'], activation='sigmoid'))

    model.compile(optimizer=compile_config['optimizer_lambda'],
                  loss=compile_config['loss'],
                  metrics=compile_config['metrics']
                  )

    # Model fit
    # The validation data monitored by early stopping is split off by Keras from the training rows.
    model.fit(x=X_train,
              y=y_train,
              batch_size=fit_config['batch_size'],
              epochs=fit_config['epochs'],
              verbose=fit_config['verbose'],
              callbacks=callbacks,
              validation_split=split_config['val_size'],
              shuffle=fit_config['shuffle'],
              class_weight=fit_config['class_weight'],
              sample_weight=fit_config['sample_weight'],
              initial_epoch=fit_config['initial_epoch'],
              steps_per_epoch=fit_config['steps_per_epoch'],
              validation_steps=fit_config['validation_steps'],
              validation_batch_size=fit_config['validation_batch_size'],
              validation_freq=fit_config['validation_freq'],
              )

    # One flat vector per network: every weight tensor flattened, in get_weights() order.
    lambda_weights = np.concatenate([weight.flatten() for weight in model.get_weights()])

    # Predictions cover the complete dataset, not only the training rows.
    y_pred = model.predict(X, verbose=0)

    return lambda_weights, y_pred

In [10]:
# Train one network per dataset in parallel worker processes (loky backend).
parallel = Parallel(n_jobs=config['computation']['n_jobs'], verbose=10, backend='loky')

# One delayed train_nn call per (X, y) pair; for a quick smoke test, slice both arrays (e.g. [:5]) before zipping.
training_jobs = (
    delayed(train_nn)(X_data, y_data, index)
    for index, (X_data, y_data) in enumerate(zip(X_datasets_list, y_datasets_list))
)
weights_ypred_list = parallel(training_jobs)

del parallel

[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done  21 tasks      | elapsed:  1.5min
[Parallel(n_jobs=100)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=100)]: Done  65 tasks      | elapsed:  2.6min
[Parallel(n_jobs=100)]: Done  88 tasks      | elapsed:  3.2min
[Parallel(n_jobs=100)]: Done 113 tasks      | elapsed:  3.6min
[Parallel(n_jobs=100)]: Done 138 tasks      | elapsed:  4.1min
[Parallel(n_jobs=100)]: Done 165 tasks      | elapsed:  4.5min
[Parallel(n_jobs=100)]: Done 192 tasks      | elapsed:  5.1min
[Parallel(n_jobs=100)]: Done 221 tasks      | elapsed:  5.6min
[Parallel(n_jobs=100)]: Done 250 tasks      | elapsed:  6.1min
[Parallel(n_jobs=100)]: Done 281 tasks      | elapsed:  6.8min
[Parallel(n_jobs=100)]: Done 312 tasks      | elapsed:  7.3min
[Parallel(n_jobs=100)]: Done 345 tasks      | elapsed:  7.9min
[Parallel(n_jobs=100)]: Done 378 tasks      | elapsed:  8.5min
[Parallel(n_jobs=100)]: Done 413 tasks 

In [11]:
# Split the (weights, predictions) pairs returned by the workers into two stacked arrays,
# each with one leading entry per dataset.
weights_list = np.stack([np.array(weights) for weights, _ in weights_ypred_list])
y_pred_list = np.stack([predictions for _, predictions in weights_ypred_list])

In [12]:
# Give the stacked predictions a fixed layout: (n_datasets, n_samples) for a single target, with a trailing
# n_targets axis otherwise (a plain 2-D reshape would fail with a size mismatch for several targets).
prediction_shape = [config['data']['n_datasets'], config['data']['n_samples']]
if config['data']['n_targets'] >= 2:
    prediction_shape.append(config['data']['n_targets'])
y_pred_list = y_pred_list.reshape(prediction_shape)

# Inspect Weights & Predictions

In [13]:
# Shape check: one row per trained network, one column per flattened weight value.
weights_list.shape

(45000, 2281)

In [14]:
# Shape check: one row of predictions per dataset (after the reshape above).
y_pred_list.shape

(45000, 5000)

In [15]:
import pandas as pd

In [16]:
# Preview the predictions of the first datasets (rows = datasets, columns = samples).
pd.DataFrame(y_pred_list).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.9999962,0.9999999,1.0,1.0,1.0,6.138527e-11,1.0,8.691417e-12,0.9999999,8.473344e-14,...,1.473599e-05,1.0,1.0,2.476363e-05,0.8923207,6.114637e-09,3.915742e-09,1.046712e-12,0.99751,0.9999999
1,1.0,0.9999999,1.0,1.0,0.9999998,1.0,0.9999994,2.286794e-10,1.0,8.94285e-09,...,2.430906e-06,0.1617538,1.0,1.0,3.631763e-12,6.95983e-15,2.689273e-12,0.999999,1.0,2.578691e-07
2,2.124277e-05,0.9976035,1.0,0.001017615,1.0,1.7931e-30,4.044016e-18,0.9999999,2.6381890000000002e-23,1.0,...,1.895738e-07,2.164144e-07,1.0,5.00853e-09,1.0,9.608814e-21,4.042871e-12,0.9999999,1.0,1.0
3,0.9999983,0.0002809809,1.0,1.0,0.9619053,1.0,2.76957e-07,0.999996,0.0001479902,5.253218e-10,...,0.9999979,0.9999946,1.0,1.0,0.9999999,6.640541e-09,0.9999997,2.296707e-05,1.0,0.9997872
4,0.9999999,1.0,1.550677e-07,0.9938993,1.0,1.0,3.646212e-05,1.0,0.9999982,0.9999999,...,4.706445e-08,1.0,1.0,1.0,1.282193e-07,3.215503e-08,0.0002011468,0.0003064805,6.523809e-07,0.0001184045
5,1.402515e-17,5.543542e-07,1.0,7.456075e-08,7.482244e-12,1.958541e-10,8.700747e-10,0.9999967,0.0202347,1.0,...,0.9999977,2.42238e-07,6.209589e-13,2.724104e-12,1.0,1.0,1.0,0.9975051,2.286279e-05,3.689187e-12
6,0.9980575,4.346909e-07,2.332441e-07,0.9999734,0.9998739,0.9997934,7.784935e-06,0.0001354322,0.9999295,8.806789e-05,...,0.0004230794,0.9993427,3.445297e-06,0.9999983,0.0008130342,0.002895789,0.9999972,0.9998887,3.69981e-05,0.9999873
7,0.9999235,0.9999724,0.0003998061,1.867596e-09,1.0,0.0008170942,1.526337e-15,2.91181e-08,0.001155214,0.9880936,...,1.0,5.063271e-08,7.934025e-13,0.01255124,0.999999,1.0,0.9999994,6.022833e-10,1.006256e-06,9.189248e-09
8,1.328889e-08,7.525453e-07,1.0,2.695145e-10,4.717558e-07,1.205683e-10,0.9999977,1.278498e-14,5.679117e-18,1.0,...,0.9998423,1.0,1.0,5.265351e-08,1.0,3.029898e-10,0.9999999,0.998575,1.0,0.9999973
9,7.284993e-07,0.9993867,8.589901e-11,0.9999997,1.730399e-14,1.068417e-10,1.433693e-07,6.932717e-09,1.0,8.65296e-06,...,2.043868e-14,1.0,0.9999996,0.9999977,5.957645e-07,0.06663851,1.0,0.9999989,1.292809e-05,0.9999996


In [17]:
# Preview the flattened weight vectors of the first networks (rows = networks, columns = weights).
pd.DataFrame(weights_list).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2271,2272,2273,2274,2275,2276,2277,2278,2279,2280
0,1.273227,1.456176,1.714342,1.167481,1.291423,1.314084,1.478568,1.341347,1.867055,1.218742,...,-0.387057,0.617674,0.485932,-0.735704,-0.275552,0.716748,-0.403787,-0.365568,-0.695516,-0.020497
1,1.634331,1.709461,1.313607,1.399012,1.978154,1.488322,1.506199,1.55869,1.309333,1.548079,...,-0.490861,0.677688,0.405649,0.571978,0.397829,0.622126,0.780143,-0.562525,0.507592,0.00839
2,1.308087,1.54586,1.570379,1.837858,1.582708,1.463777,1.472044,1.886343,1.327492,1.787939,...,-0.362005,-0.610704,0.512364,0.264245,-0.504969,-0.093688,-0.32362,-0.561719,-0.442283,-0.00541
3,1.44857,1.513339,1.374404,1.130764,1.329992,1.798237,1.195872,1.451789,1.149218,1.790473,...,0.521757,-0.382411,0.489943,0.205914,0.575427,0.234203,0.454195,-0.310278,-0.510932,-0.002442
4,1.369443,1.265511,1.327023,1.326311,1.251862,1.635917,1.545733,1.274655,1.463871,1.37549,...,-0.286706,0.415688,-0.305275,0.260771,-0.063645,-0.169689,0.369789,-0.298626,0.32318,-0.010952
5,1.461969,1.720233,1.537704,1.797598,1.412351,1.749615,1.409657,1.762415,1.479785,1.499247,...,-0.611237,-0.896081,0.610535,0.402509,0.417682,0.249992,-0.673877,0.267124,0.228789,-0.029592
6,1.27665,1.013818,1.250898,1.314274,1.151043,1.106136,1.21679,1.246972,1.202147,1.125733,...,-0.2644,0.23252,-0.236806,-0.236197,-0.346603,-0.074671,0.188674,0.371697,0.367823,0.009762
7,1.587411,1.365236,1.99934,1.280902,1.472057,1.595768,1.455997,1.807051,1.417556,1.392983,...,0.766232,0.425392,-0.462283,0.40209,-0.53625,-0.479621,0.458906,-0.424887,0.471068,0.00629
8,1.700307,1.677092,1.44455,1.377154,1.543291,1.429593,1.71455,1.602946,1.690839,1.846873,...,-0.276569,-0.477766,-0.443003,0.528952,-0.840266,-0.47849,-0.05866,-0.643963,-0.801707,-0.000342
9,1.421675,1.486324,1.242986,1.327877,1.194204,1.303395,1.670265,1.255335,1.376114,1.306573,...,-0.28437,0.43877,0.380937,0.027612,0.452854,-0.393547,0.273923,-0.470944,0.172919,-0.036892


# Save Models

In [18]:
# Write the stacked weights and predictions to disk via save_models_predictions.
save_models_predictions(weights_list, y_pred_list)