# Config & Imports

In [1]:
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


import os

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization

import utilities_LR

from joblib import Parallel, delayed

from IPython.display import Image


In [2]:
# Central experiment configuration. The cells below read it for array shapes,
# the train/test/validation split, the network definition, Keras compile/fit
# arguments and the parallel/GPU setup.
config = {
    'data': {
        'n_datasets': 9_000, # the number of datasets
        
        'n_samples': 4_000, # the number of samples per dataset
        
        'n_features': 10, 
        # The total number of features. 
        # These comprise n_informative informative features, n_redundant redundant features, n_repeated duplicated features and 
        # n_features-n_informative-n_redundant-n_repeated useless features drawn at random.
        # Also used below as the input dimension of the network.
        
        'n_informative': 8,
        # The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices 
        # of a hypercube in a subspace of dimension n_informative. For each cluster, informative features are drawn independently 
        # from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. The clusters are then 
        # placed on the vertices of the hypercube.
        
        'n_targets': 1,
        # The number of targets (or labels) of the classification problem.
        # Also used below as the number of units in the network's output layer.
    
        'n_clusters_per_class': 2,
        # The number of clusters per class.
        
        'class_sep': 3.0,
        # class_sep: float (library default 1.0).
        # The factor multiplying the hypercube size. Larger values spread out the clusters/classes and make the classification task 
        # easier.
        
        'noise': 0,
        # flip_y (fraction of samples whose class is assigned randomly)
        
        'shuffle': True,
        # Shuffle the samples and the features.
        
        'random_state': 42,
        # Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls.
    },
    'lambda': {
        'data_prep': {
            'train_test_val_split': { # refer to sklearn doc
                # test_size and val_size are fractions of the full dataset:
                # train_test_val_split() first holds out test_size + val_size,
                # then divides that hold-out into the test and validation parts.
                'test_size': 0.3,
                'val_size': 0.1,
                # None means the split is not seeded, so it differs between runs.
                'random_state': None,
                'shuffle': True,
                'stratify': None
            }
        },
        'model_compile': {
            # Values forwarded to model.compile(optimizer=..., loss=..., metrics=...).
            'optimizer_lambda': 'adam',
            'loss': 'mae', # previously: tf.keras.losses.get(config['lambda_net']['loss_lambda'])
            'metrics': ['mae']
        },
        'model_fit': { # refer to keras API
            # Each key is forwarded to model.fit() under the same keyword name.
            'batch_size': 32,
            'epochs': 100,
            'verbose': 0,
            'callbacks': None,
            'shuffle': True,
            'class_weight': None,
            'sample_weight': None,
            'initial_epoch': 0,
            'steps_per_epoch': None,
            'validation_steps': None,
            'validation_batch_size': None,
            'validation_freq': 1
        }
    },
    'computation':{
        'n_jobs': 30, # number of joblib workers used by the Parallel(...) cells
        'use_gpu': True, # when False, the GPU-related environment variables are set to ''
        'gpu_numbers': '1', # written to CUDA_VISIBLE_DEVICES when use_gpu is True
        # NOTE(review): RANDOM_SEED is not read anywhere in the cells shown here;
        # confirm whether seeding was intended.
        'RANDOM_SEED': 1,   
    }
}

## Settings

In [3]:
# Process-wide environment variables for TensorFlow / CUDA / XLA.
# NOTE(review): keras is imported in the first cell, before these variables are
# set - confirm they are still picked up in this process. Worker processes
# started later inherit them.

# Raise TensorFlow's native log threshold ('2' is commonly documented as hiding
# INFO and WARNING messages - confirm for the installed version).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# GPU selection: expose only the configured device(s), or none when use_gpu is False.
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = config['computation']['gpu_numbers'] if config['computation']['use_gpu'] else ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' if config['computation']['use_gpu'] else ''

# XLA settings; the CUDA directory is hard-coded to a machine-specific path.
os.environ['XLA_FLAGS'] = '--xla_gpu_cuda_data_dir=/usr/local/cuda-11.4' if config['computation']['use_gpu'] else ''
# NOTE(review): the flags are separated by " ," here; TensorFlow's XLA docs show
# TF_XLA_FLAGS as space-separated, so the second flag may not be parsed as
# intended - confirm.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 ,--tf_xla_enable_xla_devices' if config['computation']['use_gpu'] else ''


In [4]:
# Disable the Python-level 'tensorflow' logger entirely.
import logging
logging.getLogger('tensorflow').disabled = True

# Suppress every Python warning for the rest of the session. This also hides
# deprecation and convergence warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [5]:
# Zero-filled placeholders sized from the config.
# NOTE: all three names are rebound later in this notebook (X_datasets_list and
# y_datasets_list by the np.load calls in the next cell, coef_list by the result
# of the coefficient-extraction cell), so these arrays are never read.
X_datasets_list = np.zeros([config['data']['n_datasets'], config['data']['n_samples'], config['data']['n_features']])

# A single target gets no trailing target axis; multiple targets add one.
if  config['data']['n_targets'] < 2:
    y_datasets_list = np.zeros([config['data']['n_datasets'], config['data']['n_samples'], ])
    coef_list = np.zeros([config['data']['n_datasets'], config['data']['n_features'], ])
else:
    y_datasets_list = np.zeros([config['data']['n_datasets'], config['data']['n_samples'], config['data']['n_targets']])
    coef_list = np.zeros([config['data']['n_datasets'], config['data']['n_features'], config['data']['n_targets']])

In [6]:
# Resolve the dataset directory for this configuration (the path logic lives in utilities_LR).
directory = utilities_LR.data_path_LR(config)

# Load the feature and label arrays from disk; these assignments replace
# whatever X_datasets_list / y_datasets_list held before.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load files from a trusted source.
with open(directory + '/X_datasets_list_dataForLambda.npy', "rb") as f:
    X_datasets_list = np.load(f, allow_pickle=True)
with open(directory + '/y_datasets_list_dataForLambda.npy', "rb") as f:
    y_datasets_list = np.load(f, allow_pickle=True)
# Loading of the coefficient file is disabled:
#with open(directory + '/coef_list_targetForInet.npy', "rb") as f:
#    coef_list = np.load(f, allow_pickle=True)

# Prepare Data (Functions)

In [7]:
def train_test_val_split(X, y):
    """Split one dataset into train, test and validation partitions.

    All options are read from
    ``config['lambda']['data_prep']['train_test_val_split']``. ``test_size``
    and ``val_size`` are fractions of the *full* dataset: the first split holds
    out ``test_size + val_size`` of the samples, and the second split divides
    that hold-out set into the test and validation parts.

    Args:
        X: Feature array of a single dataset (first axis indexes samples).
        y: Labels aligned with ``X`` along the first axis.

    Returns:
        Tuple ``(X_train, X_test, X_val, y_train, y_test, y_val)``. ``X_test``
        and ``y_test`` refer to the same samples.
    """
    split_cfg = config['lambda']['data_prep']['train_test_val_split']
    holdout_fraction = split_cfg['test_size'] + split_cfg['val_size']

    # Options shared by both splitting stages.
    shared_options = {
        'random_state': split_cfg['random_state'],
        'shuffle': split_cfg['shuffle'],
        'stratify': split_cfg['stratify'],
    }

    # Stage 1: train vs. (test + validation).
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X,
        y,
        test_size=holdout_fraction,
        **shared_options,
    )

    # Stage 2: divide the hold-out set. The original code bound the labels to a
    # misspelled name (``y__test``), so the returned ``y_test`` was the whole
    # hold-out label set and did not match ``X_test``.
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout,
        y_holdout,
        test_size=split_cfg['val_size'] / holdout_fraction,
        **shared_options,
    )

    return X_train, X_test, X_val, y_train, y_test, y_val
    

# Save Model & Metrics (functions)

In [8]:
def save_models(weights_list):
    """Write the trained network weights to ``lambda_weights_list.npy``.

    The target directory is resolved through ``utilities_LR.lambda_path_LR``
    from the global ``config`` and is created if it does not exist yet.

    Args:
        weights_list: Array-like of flattened weight vectors to store.
    """
    target_dir = utilities_LR.lambda_path_LR(config)

    # Create the output directory, including any missing parents.
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    weights_file = target_dir + '/lambda_weights_list.npy'
    with open(weights_file, "wb") as handle:
        np.save(handle, weights_list, allow_pickle=True)

# Train Model

In [9]:
def train_nn(X, y, index):
    """Train one network on a single dataset and return its weights as a flat vector.

    The dataset is split with ``train_test_val_split``; the network is fitted on
    the training part and validated on the validation part. The test part is
    not used here.

    Args:
        X: Feature array of one dataset.
        y: Labels aligned with ``X``.
        index: Position of the dataset in the overall list; accepted but not
            used inside the function.

    Returns:
        1-D numpy array holding every weight tensor of the trained model,
        flattened and concatenated in ``model.get_weights()`` order.
    """
    compile_cfg = config['lambda']['model_compile']
    fit_cfg = config['lambda']['model_fit']

    # Data prep: only the train and validation partitions are consumed.
    X_train, _, X_val, y_train, _, y_val = train_test_val_split(X, y)

    # Model definition: batch-norm input, two ReLU hidden layers, sigmoid output.
    layer_stack = (
        BatchNormalization(input_dim=config['data']['n_features']),
        Dense(100, activation='relu'),
        Dense(60, activation='relu'),
        Dense(config['data']['n_targets'], activation='sigmoid'),
    )
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        optimizer=compile_cfg['optimizer_lambda'],
        loss=compile_cfg['loss'],
        metrics=compile_cfg['metrics'],
    )

    # Model fit: forward exactly these configured options under the same keyword names.
    forwarded_fit_keys = (
        'batch_size',
        'epochs',
        'verbose',
        'callbacks',
        'shuffle',
        'class_weight',
        'sample_weight',
        'initial_epoch',
        'steps_per_epoch',
        'validation_steps',
        'validation_batch_size',
        'validation_freq',
    )
    fit_options = {key: fit_cfg[key] for key in forwarded_fit_keys}
    model.fit(
        x=X_train,
        y=y_train,
        validation_data=(X_val, y_val),
        **fit_options,
    )

    flat_tensors = [tensor.flatten() for tensor in model.get_weights()]
    return np.concatenate(flat_tensors)

In [10]:
# Worker pool for training; the number of workers comes from the config and
# progress is reported through joblib's verbose output.
parallel = Parallel(n_jobs=config['computation']['n_jobs'], verbose=10, backend='loky') #loky

# Train one network per (X, y) dataset pair; each task returns that network's flat weight vector.
weights_list = parallel(delayed(train_nn)(X_data, y_data, index) for index, (X_data, y_data) in enumerate(zip(X_datasets_list, y_datasets_list)))
                                  
# Drop the reference to the Parallel object once the results are collected.
del parallel

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:  1.3min
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:  1.4min
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:  1.4min
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:  2.4min
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:  2.5min
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed:  3.5min
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed:  3.5min
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed:  4.5min
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed:  5.4min
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed:  5.6min
[Parallel(n_jobs=30)]: Done 161 tasks      | elapsed:  6.6min
[Parallel(n_jobs=30)]: Done 182 tasks      | elapsed:  7.5min
[Parallel(n_jobs=30)]: Done 205 tasks      | elapsed:  7.7min
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed:  8.7min
[Parallel(n_jobs=30)]: Done 253 tasks      | elapsed:  

# Stack & Inspect Weights

In [11]:
# Combine the per-dataset weight vectors into one 2-D array (one row per dataset).
weights_list = np.stack([np.array(x) for x in weights_list])

In [12]:
# Sanity check: rows = datasets, columns = flattened network parameters.
weights_list.shape

(9000, 7261)

# Save Models

In [13]:
# Persist the stacked weight matrix to disk.
save_models(weights_list)

# Create Coefficients for Inet y-Data

In [14]:
def generate_base_model(config):
    """Build an uncompiled network with the architecture used for training.

    Args:
        config: Configuration dict; ``config['data']['n_features']`` sets the
            input dimension and ``config['data']['n_targets']`` the number of
            output units.

    Returns:
        A ``Sequential`` model: batch normalisation, Dense(100, relu),
        Dense(60, relu) and a sigmoid output layer.
    """
    data_cfg = config['data']

    layer_stack = (
        BatchNormalization(input_dim=data_cfg['n_features']),
        Dense(100, activation='relu'),
        Dense(60, activation='relu'),
        Dense(data_cfg['n_targets'], activation='sigmoid'),
    )

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    return network
    
def shape_flat_network_parameters(flat_network_parameters, target_network_parameters):
    """Cut a flat parameter vector into arrays shaped like the given templates.

    The flat vector is consumed from left to right: each template takes as many
    consecutive values as it has elements, reshaped to the template's shape.

    Args:
        flat_network_parameters: 1-D sequence/array of parameter values.
        target_network_parameters: Iterable of objects exposing ``.shape``
            (for example the list returned by ``model.get_weights()``); only
            their shapes are used.

    Returns:
        List of arrays, one per template, in template order.

    Raises:
        ValueError: Propagated from ``np.reshape`` when the flat vector holds
            too few values for a template. Surplus trailing values are ignored.
    """
    shaped_network_parameters = []
    start = 0

    for parameters in target_network_parameters:
        target_shape = parameters.shape
        # np.prod(()) is the float 1.0; cast to int so a scalar-shaped template
        # still produces a valid (integer) slice bound.
        size = int(np.prod(target_shape))
        stop = start + size
        shaped_network_parameters.append(
            np.reshape(flat_network_parameters[start:stop], target_shape)
        )
        start = stop

    return shaped_network_parameters

def network_parameters_to_network(network_parameters, config):
    """Rebuild a compiled network from a flat parameter vector.

    Args:
        network_parameters: Flat sequence of weights, in the order produced by
            concatenating the flattened ``model.get_weights()`` tensors.
        config: Configuration dict used to build and compile the model.

    Returns:
        A compiled model whose weights are taken from ``network_parameters``.
    """
    network = generate_base_model(config)

    # The freshly built model's weights act purely as a shape template.
    shape_template = network.get_weights()

    # Shape weights (flat) into the correct model structure and install them.
    network.set_weights(
        shape_flat_network_parameters(network_parameters, shape_template)
    )

    compile_cfg = config['lambda']['model_compile']
    network.compile(
        optimizer=compile_cfg['optimizer_lambda'],
        loss=compile_cfg['loss'],
        metrics=compile_cfg['metrics'],
    )

    return network

In [15]:
def get_LR(X, y):
    """Fit a logistic-regression classifier and return the fitted estimator.

    All hyper-parameters are spelled out explicitly so the exact configuration
    is visible in one place.

    Args:
        X: Feature matrix.
        y: Class labels aligned with the rows of ``X``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    lr_settings = dict(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='lbfgs',
        max_iter=100,
        multi_class='auto',
        verbose=0,
        warm_start=False,
        n_jobs=None,
        l1_ratio=None,
    )

    classifier = LogisticRegression(**lr_settings)
    classifier.fit(X, y)
    return classifier

In [16]:
def save_coefs(coef_list):
    """Write the extracted coefficients to ``lambda_generated_coef_list_target_for_inet.npy``.

    The target directory is resolved through ``utilities_LR.lambda_path_LR``
    from the global ``config`` and is created if it does not exist yet.

    Args:
        coef_list: Array-like of coefficient vectors to store.
    """
    target_dir = utilities_LR.lambda_path_LR(config)

    # Create the output directory, including any missing parents.
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    coef_file = target_dir + '/lambda_generated_coef_list_target_for_inet.npy'
    with open(coef_file, "wb") as handle:
        np.save(handle, coef_list, allow_pickle=True)

In [17]:
def create_coef(X_dataset, weights):
    """Fit a logistic regression to a trained network's own predictions.

    The network is rebuilt from ``weights``, its predictions on ``X_dataset``
    are thresholded at 0.5 into hard 0.0/1.0 labels, and a logistic regression
    is fitted to reproduce those labels.

    Args:
        X_dataset: Feature matrix of one dataset.
        weights: Flat weight vector of the network trained on that dataset.

    Returns:
        The ``coef_`` attribute of the fitted logistic regression. The
        intercept is not returned.
    """
    lambda_model = network_parameters_to_network(weights, config)
    predictions = np.asarray(lambda_model.predict(X_dataset))

    # Vectorised threshold instead of a per-row Python loop that relied on the
    # truthiness of one-element arrays. Flattening assumes the network has a
    # single output unit (n_targets == 1), as the original loop did.
    y_data = (predictions >= 0.5).astype(float).reshape(-1)

    logregmodel = get_LR(X_dataset, y_data)

    return logregmodel.coef_

In [18]:
# Worker pool for coefficient extraction; same worker count as the training step.
parallel = Parallel(n_jobs=config['computation']['n_jobs'], verbose=10, backend='loky') #loky

# Pair each dataset with its trained weight vector and fit one logistic regression per pair.
coef_list = parallel(delayed(create_coef)(X_data, weights) for (X_data, weights) in zip(X_datasets_list, weights_list))
                                  
# Drop the reference to the Parallel object once the results are collected.
del parallel

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    1.6s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    2.2s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    2.6s
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed:    2.9s
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed:    3.2s
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed:    3.8s
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed:    4.2s
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed:    4.6s
[Parallel(n_jobs=30)]: Done 161 tasks      | elapsed:    5.1s
[Parallel(n_jobs=30)]: Done 182 tasks      | elapsed:    5.7s
[Parallel(n_jobs=30)]: Done 205 tasks      | elapsed:    6.3s
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed:    6.9s
[Parallel(n_jobs=30)]: Done 253 tasks      | elapsed:  

In [19]:
# Take the first row of each returned coef_ array and stack them into one 2-D array
# (presumably coef_ has a single row in the binary case - confirm).
coef_list = np.stack([np.array(x[0]) for x in coef_list])

In [20]:
# Sanity check: rows = datasets, columns = coefficients per dataset.
coef_list.shape

(9000, 10)

In [21]:
# Persist the stacked coefficient matrix to disk.
save_coefs(coef_list)