In [None]:
import imputation as imp
import generation as gen
import prediction
import utils
import files
import os
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import datasets

In [None]:
# Datasets processed by the experiment loop, in execution order.
datasets_name = [
    'meps_19',
    'bio',
    'concrete',
    'bike',
]

In [None]:
# Sample counts per dataset. The experiment loop reads 'train' and 'cal' as the
# training / calibration sizes and 'test_pattern' as nb_sample_pattern.
# Every dataset shares the same sizes except 'concrete', which is overridden
# right after the mapping is built (its position in the dict is unchanged).
datasets_sizes = {
    dataset: {'train': 1000, 'cal': 500, 'test_pattern': 100}
    for dataset in ('meps_19', 'meps_20', 'meps_21', 'bio', 'concrete', 'bike')
}
datasets_sizes['concrete'] = {'train': 630, 'cal': 200, 'test_pattern': 100}

In [None]:
# n_rep is forwarded as `seed_max` to the data generator and as `seed` to the
# file-name helpers (presumably the number of repetitions — confirm in gen/files).
# alpha is forwarded to prediction.run_experiments (presumably the target
# miscoverage level — confirm in prediction).
n_rep, alpha = 100, 0.1

In [None]:
# Missingness probability stored in params_missing['prob_missing'] and handed to
# the gen.* helpers and file-name builders by the experiment loop
# (presumably a per-entry MCAR probability — confirm in gen).
prob_missing = 0.2

In [None]:
# Base-model settings passed to prediction.run_experiments.
cores = 1
params_basemodel = dict(cores=cores)

# Imputation strategy name. The experiment loop special-cases 'complete'
# (uses the stored X directly); any other value is handed to imp.impute.
imputation = 'iterative_ridge'

# Experiment grid forwarded to prediction.run_experiments.
methods = [
    'QR',
    'QR_TrainCal',
    'CQR',
    'CQR_Masking_Cal',
]
basemodels, masks, protections = ['NNet'], ['Yes'], ['No']
subsets = [False, True]

In [None]:
# Directory passed to datasets.GetDataset as the base path of the raw datasets;
# it is a relative path, so it resolves against the current working directory.
dataset_base_path = "./data/cqr_datasets/"

In [None]:
# For each dataset: build (or reload from 'data/') the generated data with
# missing values, impute it (or reload the cached imputation), run the
# prediction experiments, and write one result file per method under 'results/'.
for dataset_name in tqdm(datasets_name):

    df, target, var_missing = datasets.GetDataset(dataset_name, dataset_base_path)

    # Missing-data settings shared by the generator and the file-name helpers.
    params_missing = {'var_missing': var_missing, 'prob_missing': prob_missing}

    # Column count minus one (presumably the number of features once the
    # target column is excluded — confirm against datasets.GetDataset).
    d = df.shape[1] - 1

    # 'concrete' gets an iid test size of 200; every other dataset uses 2000.
    nb_sample_pattern = datasets_sizes[dataset_name]['test_pattern']
    params_test = {
        'iid': {'test_size': 200 if dataset_name == 'concrete' else 2000},
        'fixed_nb_sample_pattern': {'nb_sample_pattern': nb_sample_pattern},
    }
    params_test = gen.process_test(params_test, d=d, params_missing=params_missing)

    max_test_size = np.max(params_test['test_size'])

    train_size = datasets_sizes[dataset_name]['train']
    cal_size = datasets_sizes[dataset_name]['cal']

    # --- Generated data: reuse the cached file when it exists. ---
    name = files.get_name_data(
        train_size, cal_size, params_test,
        dataset=dataset_name, params_missing=params_missing, seed=n_rep,
    )

    if not os.path.isfile('data/'+name+'.xz'):
        print('data not found')
        X, X_missing, M, Y = gen.generate_multiple_real_data_MCAR(
            df, target,
            train_size=train_size, cal_size=cal_size,
            params_test=params_test, params_missing=params_missing,
            seed_max=n_rep,
        )
        data = dict(X=X, X_missing=X_missing, M=M, Y=Y)
        files.write_file('data', name, 'xz', data)
    else:
        print('data found')
        data = files.load_file('data', name, 'xz')

    # --- Imputed covariates: reuse the cached file when it exists. ---
    name_imputed = files.get_name_data_imputed(
        train_size, cal_size, params_test,
        imputation=imputation, dataset=dataset_name,
        params_missing=params_missing, seed=n_rep,
    )

    if not os.path.isfile('data/'+name_imputed+'.pkl'):
        print('imputation not found')
        # 'complete' skips imputation and uses the stored X as-is.
        X_imp = data['X'] if imputation == 'complete' else imp.impute(data, imputation)
        files.write_file('data', name_imputed, 'pkl', X_imp)
    else:
        print('imputation found')
        X_imp = files.load_file('data', name_imputed, 'pkl')

    data_imputed = dict(
        X=data['X'],
        X_missing=data['X_missing'],
        X_imp=X_imp,
        M=data['M'],
        Y=data['Y'],
    )

    # --- Run every configured method and persist its results. ---
    results, methods_ran = prediction.run_experiments(
        data_imputed,
        alpha=alpha,
        methods=methods,
        basemodels=basemodels,
        params_basemodel=params_basemodel,
        masks=masks,
        protections=protections,
        subsets=subsets,
        imputation=imputation,
    )

    for method in methods_ran:
        name_dir, name_method = files.get_name_results(
            method, train_size, cal_size, n_rep,
            dataset=dataset_name, imputation=imputation,
            params_missing=params_missing,
        )

        results_method = results[method]
        files.write_file('results/'+name_dir, name_method, 'xz', results_method)