# Config & Imports

In [1]:
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

from joblib import Parallel, delayed

import utilities_LR
import random

import os


In [2]:
# Notebook-wide settings. The 'data' entries parameterise the synthetic
# datasets (they are passed to sklearn.datasets.make_classification by the
# generator function); 'computation' controls the joblib worker pool.
config = {
    'data': dict(
        # Number of datasets to generate.
        n_datasets=45_000,

        # Number of samples in each dataset.
        n_samples=5_000,

        # Total number of features per sample. They are made up of
        # n_informative informative features, n_redundant redundant features,
        # n_repeated duplicated features, and
        # n_features - n_informative - n_redundant - n_repeated useless
        # features drawn at random.
        n_features=10,

        # Number of informative features: an int, or 'random' to have the
        # generator function pick the count separately for every dataset.
        # Each class is composed of a number of gaussian clusters, each
        # located around the vertices of a hypercube in a subspace of
        # dimension n_informative. For each cluster, informative features are
        # drawn independently from N(0, 1) and then randomly linearly combined
        # within each cluster in order to add covariance. The clusters are
        # then placed on the vertices of the hypercube.
        n_informative='random',

        # Number of targets (labels) of the classification problem; the
        # generator function requests n_targets + 1 classes.
        n_targets=1,

        # Number of clusters per class.
        n_clusters_per_class=1,

        # class_sep (float, default=1.0): the factor multiplying the hypercube
        # size. Larger values spread out the clusters/classes and make the
        # classification task easier.
        class_sep=1.0,

        # Shuffle the samples and the features.
        shuffle=True,

        # Base seed for dataset creation; dataset i is generated with
        # random_state + i. Pass an int for reproducible output.
        random_state=44,
    ),
    'computation': dict(
        # Number of parallel joblib workers used for dataset generation.
        n_jobs=30,
    ),
}


# Generate Data

In [3]:
# Deactivate the GPU: with CUDA_VISIBLE_DEVICES empty no CUDA device is
# exposed to this process. The remaining entries configure TensorFlow/XLA
# logging and flags; the XLA flag variables are deliberately left empty.
# (Values tried earlier, per the original comments:
#  XLA_FLAGS '--xla_gpu_cuda_data_dir=/usr/local/cuda',
#  TF_XLA_FLAGS '--tf_xla_auto_jit=2' / '--tf_xla_enable_xla_devices'.)
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '2',
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '',
    'TF_FORCE_GPU_ALLOW_GROWTH': '',
    'XLA_FLAGS': '',
    'TF_XLA_FLAGS': '',
})

In [4]:
# Pre-allocate zero-filled containers with one slot per dataset.
data_cfg = config['data']
features_shape = (data_cfg['n_datasets'], data_cfg['n_samples'], data_cfg['n_features'])

X_datasets_list = np.zeros(features_shape)
# NOTE(review): nothing in the visible cells writes to or reads from this
# array — confirm it is still needed before keeping the allocation.
X_datasets_list_eval = np.zeros(features_shape)

# With fewer than two targets the labels of a dataset form a flat vector;
# otherwise each sample gets one label column per target.
labels_shape = (
    features_shape[:2]
    if data_cfg['n_targets'] < 2
    else features_shape[:2] + (data_cfg['n_targets'],)
)
y_datasets_list = np.zeros(labels_shape)

In [5]:
def create_data(i):
    """Generate the i-th synthetic classification dataset.

    All settings are read from the module-level ``config['data']``. The
    dataset is produced by ``sklearn.datasets.make_classification`` with
    ``random_state = config['data']['random_state'] + i``.

    When ``config['data']['n_informative']`` is ``'random'``, the split of the
    features into informative / redundant / repeated ones is drawn from a
    private ``random.Random`` seeded with that same per-dataset seed. The
    draw is therefore reproducible for a given ``i`` and does not depend on
    which worker process runs the task (the global ``random`` module state is
    neither read nor modified).

    Parameters
    ----------
    i : int
        Index of the dataset; used as an offset on the base random seed.

    Returns
    -------
    X : ndarray
        Generated samples, as returned by ``make_classification``.
    y : ndarray
        Integer class labels of the samples.

    Raises
    ------
    ValueError
        Propagated from ``random.Random.randint`` or ``make_classification``
        when the configured feature/class counts are inconsistent.
    """
    data_cfg = config['data']
    n_features = data_cfg['n_features']
    n_classes = data_cfg['n_targets'] + 1
    n_clusters_per_class = data_cfg['n_clusters_per_class']
    seed = data_cfg['random_state'] + i

    if data_cfg['n_informative'] == 'random':
        # Seeded per dataset so the feature split is reproducible.
        rng = random.Random(seed)
        # make_classification requires
        # n_classes * n_clusters_per_class <= 2 ** n_informative, so never
        # draw fewer informative features than that (and never fewer than 1),
        # even when n_features - 5 would fall below it.
        min_informative = max(1, (n_classes * n_clusters_per_class - 1).bit_length())
        n_informative = rng.randint(max(min_informative, n_features - 5), n_features)
        n_redundant = rng.randint(0, n_features - n_informative)
        # Whatever is left becomes repeated features, so no purely random
        # (useless) features are generated in this mode.
        n_repeated = n_features - n_informative - n_redundant
    else:
        n_informative = data_cfg['n_informative']
        # All non-informative features are redundant ones in this mode.
        n_redundant = n_features - n_informative
        n_repeated = 0

    X, y = sklearn.datasets.make_classification(
        n_samples=data_cfg['n_samples'],
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        n_repeated=n_repeated,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        weights=None,
        flip_y=0,  # no label noise here; per the original note, noise is set in "2a"
        class_sep=data_cfg['class_sep'],
        shuffle=data_cfg['shuffle'],
        random_state=seed,
    )
    return X, y

In [6]:
# Generate every dataset in parallel worker processes (loky backend); the
# outputs are collected in `results`, one (X, y) pair per dataset index.
dataset_indices = range(config['data']['n_datasets'])
generation_tasks = (delayed(create_data)(i) for i in dataset_indices)

results = Parallel(
    n_jobs=config['computation']['n_jobs'],
    verbose=3,
    backend='loky',
)(generation_tasks)

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed:   16.2s
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed:   18.1s
[Parallel(n_jobs=30)]: Done 452 tasks      | elapsed:   19.7s
[Parallel(n_jobs=30)]: Done 740 tasks      | elapsed:   22.3s
[Parallel(n_jobs=30)]: Done 1092 tasks      | elapsed:   26.0s
[Parallel(n_jobs=30)]: Done 1508 tasks      | elapsed:   30.3s
[Parallel(n_jobs=30)]: Done 1988 tasks      | elapsed:   34.0s
[Parallel(n_jobs=30)]: Done 2532 tasks      | elapsed:   39.2s
[Parallel(n_jobs=30)]: Done 3140 tasks      | elapsed:   45.8s
[Parallel(n_jobs=30)]: Done 3812 tasks      | elapsed:   52.5s
[Parallel(n_jobs=30)]: Done 4548 tasks      | elapsed:   58.4s
[Parallel(n_jobs=30)]: Done 5348 tasks      | elapsed:  1.1min
[Parallel(n_jobs=30)]: Done 6212 tasks      | elapsed:  1.2min
[Parallel(n_jobs=30)]: Done 7140 tasks      | elapsed:  1.3min
[Parallel(n_jobs=30)]: Done 8132 tasks      |

In [7]:
# Copy each generated (X, y) pair into its slot of the pre-allocated arrays.
for dataset_idx in range(config['data']['n_datasets']):
    features, labels = results[dataset_idx]
    X_datasets_list[dataset_idx] = features
    y_datasets_list[dataset_idx] = labels

# Save Data

In [8]:
# Resolve the output folder for this configuration and create it if needed.
directory = utilities_LR.data_path_LR(config)
Path(directory).mkdir(parents=True, exist_ok=True)

# Write the feature and label arrays next to each other in NumPy .npy format.
for file_suffix, array in (('/X.npy', X_datasets_list), ('/y.npy', y_datasets_list)):
    with open(directory + file_suffix, "wb") as f:
        np.save(f, array)