# Config & Imports

In [8]:
config = {
    'data': {
        # Number of independent datasets to generate.
        'n_datasets': 5_000,

        # Number of samples drawn for each dataset.
        'n_samples': 5_000,

        # Total number of features per sample. In make_classification these are
        # split into n_informative informative features, n_redundant redundant
        # features, n_repeated duplicated features, and
        # n_features - n_informative - n_redundant - n_repeated useless features
        # drawn at random. create_training_data passes
        # n_redundant = n_features - n_informative and n_repeated = 0, so no
        # useless features are generated.
        'n_features': 20,

        # Number of informative features. Each class is composed of a number of
        # Gaussian clusters, each located around a vertex of a hypercube in a
        # subspace of dimension n_informative. Within each cluster the
        # informative features are drawn independently from N(0, 1) and then
        # randomly linearly combined to add covariance; the clusters are then
        # placed on the hypercube vertices.
        'n_informative': 10,

        # Number of targets of the classification problem.
        # create_training_data requests n_targets + 1 classes.
        'n_targets': 1,

        # Number of clusters per class.
        'n_clusters_per_class': 2,

        # class_sep (float, sklearn default 1.0): factor multiplying the
        # hypercube size. Larger values spread out the clusters/classes and make
        # the classification task easier.
        'class_sep': 1.0,

        # Passed to make_classification as flip_y: the fraction of samples whose
        # class is assigned randomly.
        'noise': 0.01,

        # Shuffle the samples and the features.
        'shuffle': True,

        # Controls random number generation for dataset creation. An int gives
        # reproducible output across multiple function calls; None uses the
        # global NumPy random state.
        'random_state': None,
    },
    'computation': {
        # Number of concurrent joblib workers used to generate the datasets.
        'n_jobs': 100,
    },
}


In [9]:
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
from pathlib import Path

from joblib import Parallel, delayed

import utilities_LR

# Generate Data

In [10]:
# Pre-allocate the containers that the per-dataset results are copied into.
# create_training_data requests n_targets + 1 classes from make_classification.
n_classes = config['data']['n_targets'] + 1

# One (n_samples, n_features) feature matrix per dataset.
X_datasets_list = np.zeros([config['data']['n_datasets'], config['data']['n_samples'], config['data']['n_features']])

# make_classification always returns a 1-D label vector of length n_samples,
# regardless of the number of classes, so the label container is always 2-D.
y_datasets_list = np.zeros([config['data']['n_datasets'], config['data']['n_samples']])

if n_classes <= 2:
    # Binary problem: LogisticRegression.coef_ has shape (1, n_features), which
    # NumPy assigns into a (n_features,) slot without complaint.
    coef_list = np.zeros([config['data']['n_datasets'], config['data']['n_features']])
else:
    # Multiclass problem: LogisticRegression.coef_ has shape
    # (n_classes, n_features), so each slot must have exactly that shape.
    # NOTE(review): the previous (n_features, n_targets) layout could not hold
    # coef_; confirm that downstream readers of coef_list expect this layout.
    coef_list = np.zeros([config['data']['n_datasets'], n_classes, config['data']['n_features']])

In [11]:
def create_training_data(random_state=None):
    """Generate one synthetic classification dataset and fit a logistic regression to it.

    The dataset is drawn with ``sklearn.datasets.make_classification`` using the
    settings in ``config['data']``. Every non-informative feature is generated as
    a redundant feature (``n_redundant = n_features - n_informative``,
    ``n_repeated = 0``) and ``n_targets + 1`` classes are requested.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Seed for this particular dataset. When omitted (``None``), the value of
        ``config['data']['random_state']`` is used, which is the previous
        behaviour. Note that if the config value is a fixed int, every call
        made without an explicit seed draws the *same* dataset; pass a distinct
        seed per call (e.g. base seed + dataset index) to obtain reproducible
        but different datasets.

    Returns
    -------
    tuple
        ``(X, y, coef)`` where ``X`` and ``y`` are the feature matrix and label
        vector returned by ``make_classification`` and ``coef`` is the
        ``coef_`` attribute of a default ``LogisticRegression`` fitted on them.
    """
    data_cfg = config['data']

    # Fall back to the globally configured seed when no per-call seed is given.
    if random_state is None:
        random_state = data_cfg['random_state']

    X, y = sklearn.datasets.make_classification(
        n_samples=data_cfg['n_samples'],
        n_features=data_cfg['n_features'],
        n_informative=data_cfg['n_informative'],
        # All remaining features are linear combinations of the informative ones.
        n_redundant=data_cfg['n_features'] - data_cfg['n_informative'],
        n_repeated=0,
        n_classes=data_cfg['n_targets'] + 1,
        n_clusters_per_class=data_cfg['n_clusters_per_class'],
        weights=None,
        # 'noise' is the fraction of samples whose class is assigned randomly.
        flip_y=data_cfg['noise'],
        class_sep=data_cfg['class_sep'],
        shuffle=data_cfg['shuffle'],
        random_state=random_state,
    )

    # The fitted coefficients are the regression target stored per dataset.
    model_train = LogisticRegression()
    model_train.fit(X, y)
    return X, y, model_train.coef_

In [12]:
# Generate every dataset (and its fitted coefficients) in worker processes.
# Each entry of `results` is the (X, y, coef) tuple returned by
# create_training_data; joblib returns them in submission order.
# The executor is used as a temporary, so no reference to it remains afterwards.
results = Parallel(
    backend='loky',
    n_jobs=config['computation']['n_jobs'],
    verbose=3,
)(
    delayed(create_training_data)()
    for _ in range(config['data']['n_datasets'])
)

[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done  89 tasks      | elapsed:   18.0s
[Parallel(n_jobs=100)]: Done 313 tasks      | elapsed:   18.9s
[Parallel(n_jobs=100)]: Done 601 tasks      | elapsed:   19.6s
[Parallel(n_jobs=100)]: Done 953 tasks      | elapsed:   20.3s
[Parallel(n_jobs=100)]: Done 1369 tasks      | elapsed:   21.2s
[Parallel(n_jobs=100)]: Done 1849 tasks      | elapsed:   22.1s
[Parallel(n_jobs=100)]: Done 2393 tasks      | elapsed:   23.3s
[Parallel(n_jobs=100)]: Done 3001 tasks      | elapsed:   24.5s
[Parallel(n_jobs=100)]: Done 3673 tasks      | elapsed:   25.9s
[Parallel(n_jobs=100)]: Done 4409 tasks      | elapsed:   27.3s
[Parallel(n_jobs=100)]: Done 5000 out of 5000 | elapsed:   28.4s finished


In [13]:
# Copy each worker result into the pre-allocated arrays, one dataset per slot.
for i in range(config['data']['n_datasets']):
    features_i, labels_i, coef_i = results[i]
    X_datasets_list[i] = features_i
    y_datasets_list[i] = labels_i
    # coef_i is LogisticRegression.coef_; NumPy broadcasts it into the slot.
    coef_list[i] = coef_i

# Save Data

In [14]:
# Resolve the output directory for this configuration and make sure it exists.
directory = utilities_LR.data_path_LR(config)
Path(directory).mkdir(parents=True, exist_ok=True)

# Features and labels are written through explicitly opened binary handles.
for file_name, array in (
    ('/X_datasets_list_dataForLambda.npy', X_datasets_list),
    ('/y_datasets_list_dataForLambda.npy', y_datasets_list),
):
    with open(directory + file_name, "wb") as f:
        np.save(f, array)

# The fitted coefficients are written directly by path.
np.save(directory + '/coef_list_targetForInet.npy', coef_list, allow_pickle=True)