# Config & Imports

In [1]:
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

from joblib import Parallel, delayed

import utilities_LR
import random

In [2]:
# Central configuration for data generation, evaluation splits and parallelism.
config = {
    'data': dict(
        # How many datasets are generated.
        n_datasets=9_000,

        # Samples kept per dataset for the training portion; create_data
        # generates these plus the three 'eval' sample counts and splits them.
        n_samples=4_000,

        # Total feature count. create_data sets n_repeated=0 and
        # n_redundant = n_features - n_informative, so every feature is either
        # informative or a redundant combination of informative ones.
        n_features=10,

        # Number of informative features (make_classification places Gaussian
        # clusters on the vertices of a hypercube of this dimension).
        # A random choice, random.randint(2, 10), was tried here previously.
        n_informative=8,

        # Number of targets; create_data passes n_classes = n_targets + 1.
        n_targets=1,

        # Gaussian clusters per class.
        n_clusters_per_class=2,

        # Factor multiplying the hypercube size (sklearn default: 1.0).
        # Larger values spread the classes apart and make the task easier.
        class_sep=3.0,

        # Passed to make_classification as flip_y: the fraction of samples
        # whose class is assigned at random.
        noise=0,

        # Shuffle the samples and the features.
        shuffle=True,

        # Seed forwarded to make_classification; an int gives reproducible
        # output across calls.
        random_state=42,
    ),
    'eval': dict(
        n_datasets=9_000,
        n_samples_train=2000,
        n_samples_queryLambda=1000,  # _forLogRegBaseModel
        n_samples_comparison=1000,   # compare inet and basemodel
    ),
    'computation': dict(
        # Number of joblib worker processes.
        n_jobs=100,
    ),
}


In [3]:
# Echo the configured number of informative features as the cell output.
config['data']['n_informative']

8

# Generate Data

In [4]:
# Pre-allocate one slot per generated dataset. The arrays are filled from the
# per-dataset results after generation has finished.
n_datasets = config['data']['n_datasets']
n_train_samples = config['data']['n_samples']
n_features = config['data']['n_features']
n_classes = config['data']['n_targets'] + 1  # same n_classes that create_data requests

# Size of the held-out part of each dataset: train + query + comparison samples.
n_valid_samples = (config['eval']['n_samples_train']
                   + config['eval']['n_samples_queryLambda']
                   + config['eval']['n_samples_comparison'])

X_datasets_list = np.zeros([n_datasets, n_train_samples, n_features])
# NOTE(review): no visible cell reads or writes X_datasets_list_eval after this
# point; at the configured sizes it is a ~2.9 GB float64 array. Kept because
# cells outside this view may rely on it -- confirm and remove if unused.
X_datasets_list_eval = np.zeros([n_datasets, n_train_samples, n_features])

# make_classification returns a 1-D label vector per dataset whatever the
# number of classes, so the label store never needs a trailing target axis.
y_datasets_list = np.zeros([n_datasets, n_train_samples])

# LogisticRegression.coef_ has shape (1, n_features) for a binary problem and
# (n_classes, n_features) otherwise. The (1, n_features) row is assigned into a
# flat (n_features,) slot, which keeps the binary layout used so far.
if n_classes <= 2:
    coef_list = np.zeros([n_datasets, n_features])
else:
    coef_list = np.zeros([n_datasets, n_classes, n_features])

X_valid = np.zeros([n_datasets, n_valid_samples, n_features])
y_valid = np.zeros([n_datasets, n_valid_samples])

In [5]:
def create_data(dataset_index=None):
    """Generate one synthetic classification dataset and fit a baseline model.

    Draws ``n_samples`` plus the three ``config['eval']`` sample counts from
    ``sklearn.datasets.make_classification``, splits off the evaluation part,
    and fits a default ``LogisticRegression`` on the remaining samples.

    Parameters
    ----------
    dataset_index : int, optional
        Index of the dataset being generated. When given and
        ``config['data']['random_state']`` is an integer, the seed used for
        generation and for the split is ``random_state + dataset_index``, so
        every index yields a different but reproducible dataset.
        When omitted (the default) the configured ``random_state`` is passed
        through unchanged and the split is unseeded; repeated calls then draw
        the same ``X, y`` from ``make_classification`` and differ only in how
        the samples are split.

    Returns
    -------
    tuple
        ``(X_train, y_train, coef, X_holdout, y_holdout)`` where ``coef`` is
        the fitted model's ``coef_`` attribute.
    """
    data_cfg = config['data']
    eval_cfg = config['eval']

    # Samples held out of the fit: train + query + comparison evaluation parts.
    n_holdout = (eval_cfg['n_samples_train']
                 + eval_cfg['n_samples_queryLambda']
                 + eval_cfg['n_samples_comparison'])

    base_seed = data_cfg['random_state']
    if dataset_index is not None and isinstance(base_seed, (int, np.integer)):
        # Offset the base seed so each dataset is distinct yet reproducible.
        generation_seed = int(base_seed) + int(dataset_index)
        split_seed = generation_seed
    else:
        # Original behaviour: configured seed for generation, unseeded split.
        generation_seed = base_seed
        split_seed = None

    X, y = sklearn.datasets.make_classification(
        n_samples=data_cfg['n_samples'] + n_holdout,
        n_features=data_cfg['n_features'],
        n_informative=data_cfg['n_informative'],
        # Every non-informative feature is a redundant linear combination.
        n_redundant=data_cfg['n_features'] - data_cfg['n_informative'],
        n_repeated=0,
        n_classes=data_cfg['n_targets'] + 1,
        n_clusters_per_class=data_cfg['n_clusters_per_class'],
        weights=None,
        flip_y=data_cfg['noise'],
        class_sep=data_cfg['class_sep'],
        shuffle=data_cfg['shuffle'],
        random_state=generation_seed,
    )

    # Local names deliberately differ from the module-level result arrays.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=n_holdout, random_state=split_seed)

    model_train = LogisticRegression()
    model_train.fit(X_train, y_train)

    return X_train, y_train, model_train.coef_, X_holdout, y_holdout

In [None]:
# Fan the dataset generation out over loky worker processes; verbose=3 makes
# joblib print progress lines while tasks complete.
n_generation_jobs = config['data']['n_datasets']
parallel = Parallel(
    n_jobs=config['computation']['n_jobs'],
    verbose=3,
    backend='loky',
)

# NOTE(review): every task calls create_data() with no arguments, and
# create_data forwards config['data']['random_state'] to make_classification,
# so each task generates from the same seed -- confirm this is intended.
results = parallel(delayed(create_data)() for _ in range(n_generation_jobs))

# Drop the reference to the Parallel object once the results are collected.
del parallel

[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done  88 tasks      | elapsed:    5.2s
[Parallel(n_jobs=100)]: Done 312 tasks      | elapsed:    5.9s
[Parallel(n_jobs=100)]: Done 600 tasks      | elapsed:    6.6s
[Parallel(n_jobs=100)]: Done 952 tasks      | elapsed:    7.5s
[Parallel(n_jobs=100)]: Done 1368 tasks      | elapsed:    8.5s
[Parallel(n_jobs=100)]: Done 1848 tasks      | elapsed:    9.7s
[Parallel(n_jobs=100)]: Done 2392 tasks      | elapsed:   10.9s


In [None]:
# Copy each worker result (a 5-tuple returned by create_data) into the
# pre-allocated arrays, one dataset slot at a time.
n_generated = config['data']['n_datasets']
for i in range(n_generated):
    X_train_i, y_train_i, coef_i, X_valid_i, y_valid_i = results[i]
    X_datasets_list[i] = X_train_i
    y_datasets_list[i] = y_train_i
    coef_list[i] = coef_i
    X_valid[i] = X_valid_i
    y_valid[i] = y_valid_i

In [None]:
# Sanity check: allocated as (n_datasets, n_samples, n_features).
X_datasets_list.shape

In [None]:
# Sanity check: (n_datasets, n_samples) for the configured n_targets of 1.
y_datasets_list.shape

In [None]:
# Sanity check: (n_datasets, n_features) for the configured n_targets of 1.
coef_list.shape

In [None]:
# Sanity check: (n_datasets, train + queryLambda + comparison samples, n_features).
X_valid.shape

In [None]:
# Sanity check: (n_datasets, train + queryLambda + comparison samples).
y_valid.shape

# Save Data

In [None]:
# Resolve the output directory from the config (path layout is decided by
# utilities_LR.data_path_LR) and create it, including parents, if missing.
directory = utilities_LR.data_path_LR(config)
Path(directory).mkdir(parents=True, exist_ok=True)

# Training-side features and labels: written through explicitly opened
# binary file handles.
for file_name, array in (
    ('/X_datasets_list_dataForLambda.npy', X_datasets_list),
    ('/y_datasets_list_dataForLambda.npy', y_datasets_list),
):
    with open(directory + file_name, "wb") as f:
        np.save(f, array)

# Fitted coefficients and the held-out split: np.save opens these paths itself.
for file_name, array in (
    ('/coef_list_targetForInet.npy', coef_list),
    ('/X_datasets_valid.npy', X_valid),
    ('/y_datasets_valid.npy', y_valid),
):
    np.save(directory + file_name, array, allow_pickle=True)