# Config & Imports

In [1]:
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


import os

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping

import tensorflow as tf

import utilities_LR

from joblib import Parallel, delayed

from IPython.display import Image


2022-07-07 11:07:05.453104: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-07 11:07:05.453147: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Central configuration for this notebook. The 'data' section sizes the arrays and the
# network input/output, the 'lambda' section parameterises the per-dataset network
# (split, compile, fit), and the 'computation' section controls parallelism and devices.
# The dict is also passed to utilities_LR.data_path_LR / lambda_path_LR to locate files.
config = {
     'data': {
        'n_datasets': 45_000, # the number of datasets
        
        'n_samples': 5_000, # the number of samples per dataset
        
        'n_features': 10, 
        # The total number of features (also used as the network's input dimension).
        # These comprise n_informative informative features, n_redundant redundant features, n_repeated duplicated features and 
        # n_features-n_informative-n_redundant-n_repeated useless features drawn at random.
        # NOTE(review): this and the descriptions below read like sklearn's make_classification
        # parameter docs -- presumably the generator used for the data; confirm.
        
        # Earlier variant, kept for reference: 'n_informative': random.randint(2, 10)
        'n_informative': 'random',
        # The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices 
        # of a hypercube in a subspace of dimension n_informative. For each cluster, informative features are drawn independently 
        # from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. The clusters are then 
        # placed on the vertices of the hypercube.
        # Accepted values: an int or the string 'random'.
        
        'n_targets': 1,
        # The number of targets (or labels) of the classification problem (also the number of output units).
    
        'n_clusters_per_class': 1,
        # The number of clusters per class.
        
        'class_sep': 1.0,
        # class_sep: float, default=1.0
        # The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task 
        # easier.
        
        'shuffle': True,
        # Shuffle the samples and the features.
        
        'random_state': 44,
        # Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
    },
    'lambda': {
        'data_prep': {
            'train_test_val_split': { # refer to sklearn doc
                'test_size': 0.1,   # share of samples held out by train_test_split
                'val_size': 0.15,   # passed to model.fit as validation_split
                'random_state': None,
                'shuffle': False, # should be always false
                'stratify': None
            }
        },
        'model_compile': {
            'optimizer_lambda': 'adam',
            'loss': 'mae', # alternative considered: keras.losses.BinaryCrossentropy(from_logits=False)
            'metrics': [] # e.g. ['mae', keras.losses.BinaryCrossentropy(from_logits=False)]
        },
        'model_fit': { # refer to keras API
            'batch_size': 64,
            'epochs': 500,
            'verbose': 0,
            'callbacks': None, # not read by train_nn, which always passes its own EarlyStopping callback
            'shuffle': True, # usually true
            'class_weight': None,
            'sample_weight': None,
            'initial_epoch': 0,
            'steps_per_epoch': None,
            'validation_steps': None,
            'validation_batch_size': None,
            'validation_freq': 1
        }
    },
    'computation':{
        'n_jobs': 50,          # number of joblib workers used for training
        'use_gpu': True,       # when False, the GPU-related environment variables are set to ''
        'gpu_numbers': '2',    # written to CUDA_VISIBLE_DEVICES when use_gpu is True
        'RANDOM_SEED': 1,      # NOTE(review): not referenced in the visible notebook cells -- confirm whether a seed should be applied
    }
}

## Settings

In [3]:
# Runtime environment for TensorFlow / CUDA / XLA, driven by config['computation'].
# NOTE(review): this cell runs after `import tensorflow` in the first cell. The W/I log
# lines still printed by the following cell suggest TF_CPP_MIN_LOG_LEVEL no longer
# affects this kernel; presumably it only reaches processes started afterwards
# (e.g. the joblib workers). Move this cell above the TensorFlow import to silence the kernel too.
use_gpu = config['computation']['use_gpu']

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Number GPUs by PCI bus id so that 'gpu_numbers' refers to a stable physical device.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = config['computation']['gpu_numbers'] if use_gpu else ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' if use_gpu else ''

# NOTE(review): the CUDA directory is machine-specific; adjust it to the local installation.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/local/cuda-11.4' if use_gpu else ''
# Flags in TF_XLA_FLAGS are separated by whitespace. The previous value joined them with
# ' ,', which turned the second flag into the token ',--tf_xla_enable_xla_devices'.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 --tf_xla_enable_xla_devices' if use_gpu else ''


In [4]:
# Report how many devices of each kind TensorFlow can see in this kernel.
for label, device_kind in (("Num GPUs Available: ", 'GPU'),
                           ("Num XLA-GPUs Available: ", 'XLA_GPU')):
    print(label, len(tf.config.experimental.list_physical_devices(device_kind)))

Num GPUs Available:  0
Num XLA-GPUs Available:  0


2022-07-07 11:07:08.666150: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-07 11:07:08.666188: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-07 11:07:08.666212: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dws-02): /proc/driver/nvidia/version does not exist


In [5]:
import logging
import warnings

# Switch off the Python-side 'tensorflow' logger for the rest of the session.
tf_logger = logging.getLogger('tensorflow')
tf_logger.disabled = True

# Suppress every Python warning from here on.
warnings.filterwarnings('ignore')

# Load Data

In [6]:
# Zero-filled placeholders sized from the config: X holds one (n_samples, n_features)
# block per dataset; y holds one label vector per dataset, or one (n_samples, n_targets)
# block per dataset when there are two or more targets.
# NOTE(review): both names are rebound by the np.load calls in the next cell, so these
# buffers are never read -- consider dropping this cell.
data_cfg = config['data']
dataset_dims = [data_cfg['n_datasets'], data_cfg['n_samples']]

X_datasets_list = np.zeros(dataset_dims + [data_cfg['n_features']])

y_dims = dataset_dims if data_cfg['n_targets'] < 2 else dataset_dims + [data_cfg['n_targets']]
y_datasets_list = np.zeros(y_dims)

In [7]:
# Read the feature and label arrays from the directory that utilities_LR derives from the config.
directory = utilities_LR.data_path_LR(config)

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use it on
# files you produced yourself.
features_path = directory + '/X.npy'
with open(features_path, "rb") as features_file:
    X_datasets_list = np.load(features_file, allow_pickle=True)

labels_path = directory + '/y.npy'
with open(labels_path, "rb") as labels_file:
    y_datasets_list = np.load(labels_file, allow_pickle=True)

# Save Model Weights & Predictions (functions)

In [8]:
def save_models_predictions(weights_list, y_pred_list):
    """Write the trained weights and the predictions to disk as .npy files.

    The target directory comes from utilities_LR.lambda_path_LR(config) and is
    created (including parents) if it does not exist yet.

    Parameters
    ----------
    weights_list : array-like
        Saved as 'lambda_weights_list.npy'.
    y_pred_list : array-like
        Saved as 'lambda_preds_list.npy'.
    """
    target_dir = utilities_LR.lambda_path_LR(config)
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    outputs = (
        ('/lambda_weights_list.npy', weights_list),
        ('/lambda_preds_list.npy', y_pred_list),
    )
    for file_suffix, array in outputs:
        np.save(target_dir + file_suffix, array, allow_pickle=True)

# Train Model

In [9]:
def train_nn(X, y, index):
    """Fit one small network on a single dataset and return its weights and predictions.

    A `test_size` share of the samples is held out with train_test_split; the
    network (BatchNormalization -> Dense(50, relu) -> Dense(n_targets, sigmoid))
    is fitted on the remainder with early stopping on the validation loss, where
    the validation data is taken via `validation_split=val_size`. Predictions
    are then computed for every row of X, including the held-out rows.

    Parameters
    ----------
    X : array-like
        Features of one dataset; assumed (n_samples, n_features) -- TODO confirm.
    y : array-like
        Targets of the same dataset.
    index : int
        Position of the dataset in the batch. Currently unused; kept so callers
        do not have to change.

    Returns
    -------
    lambda_weights : np.ndarray
        All arrays from model.get_weights(), flattened and concatenated in order.
    y_pred : np.ndarray
        Output of model.predict(X).
    """
    split_cfg = config['lambda']['data_prep']['train_test_val_split']
    compile_cfg = config['lambda']['model_compile']
    fit_cfg = config['lambda']['model_fit']

    # Every joblib worker builds many models one after another. Keras keeps global
    # state per created model, so reset it up front to stop memory from growing
    # over the lifetime of the worker.
    keras.backend.clear_session()

    # Data Prep
    X_train, _, y_train, _ = train_test_split(
        X, y,
        test_size=split_cfg['test_size'],
        train_size=None,
        random_state=split_cfg['random_state'],  # read from the config like test_size/shuffle
        shuffle=split_cfg['shuffle'],
        stratify=None,
    )

    # Model Def
    early_stopping = EarlyStopping(
        monitor='val_loss',
        min_delta=0.001,
        patience=12,
        verbose=0,
        mode='auto',
        baseline=None,
        restore_best_weights=True,
    )

    model = Sequential()
    model.add(BatchNormalization(input_dim=config['data']['n_features']))
    # 'relu' is the canonical activation identifier (the original used 'ReLU').
    model.add(Dense(50, activation='relu'))
    model.add(Dense(config['data']['n_targets'], activation='sigmoid'))

    model.compile(
        optimizer=compile_cfg['optimizer_lambda'],
        loss=compile_cfg['loss'],
        metrics=compile_cfg['metrics'],
    )

    # Model fit
    # The callbacks entry of the config is intentionally not used here: early
    # stopping is always applied.
    model.fit(
        x=X_train,
        y=y_train,
        batch_size=fit_cfg['batch_size'],
        epochs=fit_cfg['epochs'],
        verbose=fit_cfg['verbose'],
        callbacks=[early_stopping],
        validation_split=split_cfg['val_size'],
        shuffle=fit_cfg['shuffle'],
        class_weight=fit_cfg['class_weight'],
        sample_weight=fit_cfg['sample_weight'],
        initial_epoch=fit_cfg['initial_epoch'],
        steps_per_epoch=fit_cfg['steps_per_epoch'],
        validation_steps=fit_cfg['validation_steps'],
        validation_batch_size=fit_cfg['validation_batch_size'],
        validation_freq=fit_cfg['validation_freq'],
    )

    # One flat vector per network: every weight array flattened, in get_weights() order.
    lambda_weights = np.concatenate([layer_weights.flatten() for layer_weights in model.get_weights()])

    y_pred = model.predict(X, verbose=0)

    return lambda_weights, y_pred

In [10]:
# Train one network per dataset on a loky process pool and collect the
# (weights, predictions) pair returned for each dataset.
parallel = Parallel(n_jobs=config['computation']['n_jobs'], verbose=10, backend='loky')

training_tasks = (
    delayed(train_nn)(X_data, y_data, index)
    for index, (X_data, y_data) in enumerate(zip(X_datasets_list, y_datasets_list))
)
# For a quick smoke test, zip over X_datasets_list[:5] and y_datasets_list[:5] instead.
weights_ypred_list = parallel(training_tasks)

del parallel

[Parallel(n_jobs=50)]: Using backend LokyBackend with 50 concurrent workers.
[Parallel(n_jobs=50)]: Done  13 tasks      | elapsed:   55.4s
[Parallel(n_jobs=50)]: Done  28 tasks      | elapsed:  1.1min
[Parallel(n_jobs=50)]: Done  45 tasks      | elapsed:  1.3min
[Parallel(n_jobs=50)]: Done  62 tasks      | elapsed:  1.6min
[Parallel(n_jobs=50)]: Done  81 tasks      | elapsed:  1.8min
[Parallel(n_jobs=50)]: Done 100 tasks      | elapsed:  2.2min
[Parallel(n_jobs=50)]: Done 121 tasks      | elapsed:  2.5min
[Parallel(n_jobs=50)]: Done 142 tasks      | elapsed:  2.8min
[Parallel(n_jobs=50)]: Done 165 tasks      | elapsed:  3.1min
[Parallel(n_jobs=50)]: Done 188 tasks      | elapsed:  3.5min
[Parallel(n_jobs=50)]: Done 213 tasks      | elapsed:  3.9min
[Parallel(n_jobs=50)]: Done 238 tasks      | elapsed:  4.3min
[Parallel(n_jobs=50)]: Done 265 tasks      | elapsed:  4.7min
[Parallel(n_jobs=50)]: Done 292 tasks      | elapsed:  5.1min
[Parallel(n_jobs=50)]: Done 321 tasks      | elapsed:  

In [11]:
# Split the (weights, predictions) pairs into two stacked arrays, each with one
# leading entry per trained network.
weights_list = np.stack([np.array(net_weights) for net_weights, _ in weights_ypred_list])
y_pred_list = np.stack([net_preds for _, net_preds in weights_ypred_list])

In [12]:
# Flatten the predictions to one row per dataset and one value per sample.
# Only done for a single target, mirroring how the label array is laid out when
# n_targets < 2; with several targets the trailing target axis is kept.
# The row count is taken from the array itself so that a run on a subset of the
# datasets reshapes correctly as well.
if config['data']['n_targets'] < 2:
    y_pred_list = y_pred_list.reshape([y_pred_list.shape[0], config['data']['n_samples']])

# Inspect Weights & Predictions

In [13]:
# Sanity check: one row per trained network, one column per flattened weight value.
weights_list.shape

(45000, 641)

In [14]:
# Sanity check: one row per dataset, one predicted value per sample.
y_pred_list.shape

(45000, 5000)

In [15]:
import pandas as pd

In [16]:
# Preview the first rows of the prediction matrix (row = dataset, column = sample).
pd.DataFrame(y_pred_list).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,4.205672e-05,0.9999943,1.0,0.0004726976,5.805218e-05,0.9981251,0.9999991,1.631108e-08,4.327217e-07,0.9999905,...,0.9999346,3.574395e-08,1.0,2.25946e-06,0.000659498,0.9998621,0.9999989,0.001190422,0.9994459,1.0
1,0.01390169,0.9979529,3.673022e-06,2.003713e-05,1.848896e-06,0.9998658,7.999154e-05,2.10719e-06,0.001319765,1.485519e-05,...,1.738463e-07,1.150213e-06,3.547199e-05,0.0002872142,0.9999992,0.9998686,0.09165456,0.9999762,0.9988837,0.000109433
2,9.689543e-05,2.999646e-10,1.301978e-05,0.999702,1.0,0.9999684,3.945545e-08,1.0,6.121484e-07,1.938206e-06,...,3.320938e-06,3.398682e-06,5.401766e-06,0.9999998,0.0001822156,3.593247e-05,6.02293e-07,7.663682e-09,5.518998e-07,1.0
3,1.0,1.2119559999999999e-20,0.003974197,1.641967e-08,0.0005147309,0.9854988,0.2428273,2.028707e-13,0.0001731283,0.9999956,...,0.9999955,3.6451379999999996e-26,1.412348e-10,5.114396e-11,9.190845e-12,1.872328e-10,1.0,0.9999998,1.0,0.001139395
4,2.730897e-07,1.0,1.0,1.023245e-05,0.9999992,3.882052e-05,8.45669e-12,1.0,8.935854e-10,0.9963199,...,1.0,0.9999654,1.0,1.986376e-18,1.0,7.2844e-11,2.572836e-12,0.9998657,1.0,2.600572e-13
5,0.0002748289,3.905157e-07,1.058003e-05,6.743187e-07,0.9999923,0.9998977,0.9998838,0.9999982,0.9999983,1.235056e-06,...,0.9995495,3.031982e-06,1.23381e-09,0.9999978,0.9999997,2.041932e-06,0.9999169,0.9996908,0.9999972,9.085975e-07
6,0.6435905,1.0,4.924858e-15,1.0,0.0002606284,0.00121488,1.0,1.0,0.9999995,3.451193e-07,...,0.004497204,1.0,1.796831e-09,1.0,1.097852e-10,0.9999303,4.818037e-05,0.0002236084,1.0,1.0
7,4.151213e-08,1.0,1.0,1.0,0.01151642,2.10883e-12,5.227026e-05,6.699341e-05,9.056657e-05,0.001486605,...,0.9995966,8.19452e-12,8.037652e-05,1.800258e-09,1.0,1.0,6.972214e-11,8.941127e-07,0.9999985,1.0
8,1.0,3.071118e-07,1.0,0.9999944,0.9999999,4.193002e-06,0.9999999,3.699674e-06,1.0,5.31244e-08,...,2.046317e-11,2.011017e-08,1.190598e-05,0.9998519,1.0,4.954937e-10,0.9999999,7.639632e-07,1.408444e-06,0.9999998
9,3.308081e-09,1.9957559999999998e-19,2.663339e-21,0.009357464,9.266161e-09,1.0,0.9999954,0.999995,3.469586e-12,1.0,...,1.026828e-14,5.385362e-09,1.0,0.9999216,0.8045511,0.9999996,0.9999958,0.9988156,1.110085e-19,0.9944763


In [17]:
# Preview the first rows of the weight matrix (row = trained network, column = flattened weight).
pd.DataFrame(weights_list).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,631,632,633,634,635,636,637,638,639,640
0,1.355054,1.258699,1.212406,1.113385,1.439091,1.52083,1.3969,1.409739,1.068713,1.309842,...,0.357826,0.100532,-0.180221,-0.412792,0.4376,0.520335,0.640489,-0.424582,0.494775,0.020274
1,1.147983,0.965386,1.220783,1.476104,1.238748,1.163151,1.406884,1.262292,1.173563,1.231986,...,-0.161895,0.34095,-0.504974,0.317095,-0.430866,-0.11582,-0.490671,0.420619,0.336737,0.000318
2,1.364424,1.709586,1.079928,1.279753,1.209944,1.744203,1.197026,1.061418,1.359827,1.387002,...,-0.662387,0.740861,-0.408153,0.503132,-0.700361,-0.296916,-0.468912,-0.293297,-0.328104,0.087038
3,1.910812,1.598887,2.027739,1.516452,1.392765,1.454487,1.867737,1.575727,1.950724,1.89181,...,0.963764,-0.773927,-0.961524,-0.573745,0.70824,1.036717,-0.302263,-1.971713,1.102798,-0.079984
4,1.492469,1.648434,1.413505,1.61859,1.299272,1.720709,1.295839,1.750335,1.117914,1.587039,...,0.804552,1.08473,0.954627,-0.861212,-0.635381,-0.877119,0.675543,-0.370964,-1.527482,0.125211
5,1.509707,1.248541,1.039783,1.286908,1.33623,1.361229,1.372215,1.171927,1.506479,1.184397,...,0.412547,-0.41085,0.273514,0.453569,0.208198,0.280949,-0.486797,-0.425799,0.439879,-0.14157
6,1.984638,1.731068,1.686331,1.585992,1.18534,1.459885,1.2252,1.550375,1.512282,1.677134,...,0.628079,-0.569607,-0.323184,0.868867,-0.348999,-0.335417,1.066389,-0.673192,-0.631434,0.081441
7,1.608833,1.75271,1.716441,1.819558,1.731056,1.213343,1.357826,1.818805,1.274058,1.27526,...,0.742374,0.715313,0.8033,0.817026,0.566485,-0.4931,-0.503522,0.9424,-0.472521,-0.233865
8,1.512649,1.573318,1.431265,1.583191,1.476306,1.324691,1.453362,1.311196,1.532918,1.606914,...,-0.634049,-0.576689,-0.682543,0.749241,0.703365,-0.242877,-0.678376,-0.466662,0.335054,-0.024174
9,1.454038,1.619206,1.293487,1.269309,1.661178,1.718328,1.517605,1.769718,1.42803,1.34819,...,-0.586217,-1.602535,0.366264,0.26571,-0.751554,0.524476,0.610941,-1.788216,-1.334791,0.105862


# Save Models

In [19]:
# Persist the stacked weights and predictions as .npy files in the directory
# returned by utilities_LR.lambda_path_LR(config).
save_models_predictions(weights_list, y_pred_list)