# Config & Imports

In [13]:
# Central notebook configuration.
# The 'data' entries are forwarded to sklearn.datasets.make_classification by
# the data-generation cell; the whole dict is also handed to
# utilities_LR.data_path_LR to derive the output directory.
config = dict(
    data=dict(
        # Number of independent datasets to generate.
        n_datasets=10_000,

        # Number of samples drawn for each dataset.
        n_samples=5_000,

        # Total number of features per sample. Per the scikit-learn docs these
        # comprise n_informative informative features, n_redundant redundant
        # features, n_repeated duplicated features, and
        # n_features - n_informative - n_redundant - n_repeated useless
        # features drawn at random. The generation cell does not pass
        # n_redundant / n_repeated, so make_classification's defaults apply.
        n_features=20,

        # Number of informative features. Each class is composed of a number
        # of gaussian clusters, each located around the vertices of a
        # hypercube in a subspace of dimension n_informative. For each
        # cluster, informative features are drawn independently from N(0, 1)
        # and then randomly linearly combined within the cluster to add
        # covariance. The clusters are then placed on the hypercube vertices.
        n_informative=10,

        # Number of classes (labels) of the classification problem; passed to
        # make_classification as n_classes.
        n_targets=2,

        # Number of clusters per class.
        n_clusters_per_class=2,

        # Factor multiplying the hypercube size (float, sklearn default 1.0).
        # Larger values spread out the clusters/classes and make the
        # classification task easier.
        class_sep=1.0,

        # Passed to make_classification as flip_y: the fraction of samples
        # whose class is assigned randomly.
        noise=0.01,

        # Shuffle the samples and the features.
        shuffle=True,

        # Seed for dataset creation. None means a fresh, non-reproducible draw
        # on every run; pass an int for reproducible output.
        random_state=None,
    ),
    # NOTE(review): none of the 'computation' entries are read by the cells in
    # this notebook; presumably they are consumed by utilities_LR or by
    # downstream notebooks -- confirm before removing.
    computation=dict(
        n_jobs=10,
        use_gpu=True,
        gpu_numbers='4',
        RANDOM_SEED=1,
    ),
)


In [14]:
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
from pathlib import Path

import utilities_LR

# Generate Data

In [15]:
# Pre-allocate the arrays that the generation loop fills in place, one slot
# per dataset along the first axis.
data_cfg = config['data']
n_datasets = data_cfg['n_datasets']
n_samples = data_cfg['n_samples']
n_features = data_cfg['n_features']
n_targets = data_cfg['n_targets']

# Feature matrices: (n_datasets, n_samples, n_features).
X_datasets_list = np.zeros([n_datasets, n_samples, n_features])

# make_classification returns one integer class label per sample, i.e. an
# array of shape (n_samples,), whatever the number of classes is. The label
# store is therefore always 2-D; a trailing n_targets axis would make the
# per-dataset assignment in the loop fail to broadcast.
y_datasets_list = np.zeros([n_datasets, n_samples])

if n_targets < 3:
    # Binary problem: LogisticRegression.coef_ has shape (1, n_features), which
    # is stored as a flat (n_features,) vector per dataset.
    coef_list = np.zeros([n_datasets, n_features])
else:
    # Multi-class problem: LogisticRegression.coef_ has shape
    # (n_classes, n_features), so the per-dataset slot must use that layout.
    coef_list = np.zeros([n_datasets, n_targets, n_features])

In [16]:
# Generate every synthetic dataset, fit a logistic regression on it and store
# the fitted coefficients as the per-dataset target.
data_cfg = config['data']

# Build ONE random stream for the whole loop. Passing the same int seed to each
# make_classification call would regenerate the identical dataset n_datasets
# times; sharing a RandomState instance instead gives distinct datasets that
# are still reproducible. None keeps the unseeded behaviour (global NumPy RNG).
seed = data_cfg['random_state']
if seed is None or isinstance(seed, np.random.RandomState):
    rng = seed
else:
    rng = np.random.RandomState(seed)

for i in range(data_cfg['n_datasets']):
    # Features and labels are written straight into the pre-allocated slots.
    X_datasets_list[i], y_datasets_list[i] = sklearn.datasets.make_classification(
        n_samples=data_cfg['n_samples'],
        n_features=data_cfg['n_features'],
        n_informative=data_cfg['n_informative'],
        n_classes=data_cfg['n_targets'],
        n_clusters_per_class=data_cfg['n_clusters_per_class'],
        weights=None,
        flip_y=data_cfg['noise'],  # 'noise' is the label-flip fraction
        class_sep=data_cfg['class_sep'],
        shuffle=data_cfg['shuffle'],
        random_state=rng,
    )

    model = LogisticRegression()
    model.fit(X_datasets_list[i], y_datasets_list[i])

    # coef_ is (1, n_features) for a binary problem and (n_classes, n_features)
    # otherwise; drop the singleton class axis explicitly instead of relying on
    # NumPy silently stripping it during assignment.
    # Only the weights are kept; the fitted intercept_ is not stored.
    coefs = model.coef_
    coef_list[i] = coefs[0] if coefs.shape[0] == 1 else coefs

In [17]:
# Display the coefficient vector stored for the first dataset.
coef_list[0]

array([ 0.00495198, -0.01854766, -0.03919359,  0.04854344,  0.0091744 ,
       -0.07450365, -0.04318035,  0.03160309, -0.11410224, -0.01132964,
        0.00093653,  0.15812977, -0.24470446, -0.2362686 ,  0.00529975,
       -0.03122651, -0.50247647, -0.02148568,  0.04833143, -0.04019827])

# Save Data

In [18]:
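# NOTE: older version of the save cell below (same three output files); kept commented out for reference only.
#directory = utilities_LR.data_path_LR(config)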
#
#Path(directory).mkdir(parents=True, exist_ok=True)
#
#with open(directory + '/X_datasets_list_dataForLambda.npy', "wb") as f:
#    np.save(f, X_datasets_list)
#with open(directory + '/y_datasets_list_dataForLambda.npy', "wb") as f:
#    np.save(f, y_datasets_list)
#with open(directory + '/coef_list_targetForInet.npy', "wb") as f:
#    np.save(f, coef_list)

In [19]:
# Persist the generated arrays as .npy files in the directory that
# utilities_LR.data_path_LR derives from the configuration.
directory = utilities_LR.data_path_LR(config)

# Create the target directory (and any missing parents); a no-op if it exists.
Path(directory).mkdir(parents=True, exist_ok=True)

# Feature and label arrays are written through explicitly opened binary handles.
handle_outputs = (
    ('/X_datasets_list_dataForLambda.npy', X_datasets_list),
    ('/y_datasets_list_dataForLambda.npy', y_datasets_list),
)
for suffix, array in handle_outputs:
    with open(directory + suffix, "wb") as handle:
        np.save(handle, array)

# The coefficient targets are saved by path rather than by file handle.
np.save(directory + '/coef_list_targetForInet.npy', coef_list, allow_pickle=True)