# Load Data

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut, StratifiedKFold
import pickle
import os
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

## Load raw data

In [2]:
# Row of the configuration table that drives every lookup in this notebook.
train_id = 0

# The configuration table is read from a CSV file; later cells index its
# columns with `train_id`.
config_file_path = 'data/config.csv'
df_config = pd.read_csv(config_file_path)

In [3]:
# Every development table is read from the folder configured for this run.
processed_folder = df_config['processed_file_folder'][train_id]

dev_social = pd.read_csv(os.path.join(processed_folder, 'dev_social.csv'))
dev_personal = pd.read_csv(os.path.join(processed_folder, 'dev_personal.csv'))
dev_clinical = pd.read_csv(os.path.join(processed_folder, 'dev_clinical.csv'))
dev_pos_test = pd.read_csv(os.path.join(processed_folder, 'dev_pos_test.csv'))
dev_target = pd.read_csv(os.path.join(processed_folder, 'dev_target.csv'))

In [4]:
# Report the (rows, columns) shape of each development table loaded above.
for label, frame in [
    ('Dev. Social Data:', dev_social),
    ('Dev. Personal Data:', dev_personal),
    ('Dev. Clinical Data:', dev_clinical),
    ('Dev. Pos-test Data:', dev_pos_test),
    ('Dev. Target Data:', dev_target),
]:
    print(label, frame.shape)

Dev. Social Data: (54, 13)
Dev. Personal Data: (54, 10)
Dev. Clinical Data: (54, 16)
Dev. Pos-test Data: (54, 4)
Dev. Target Data: (54, 1)


## Create one pipeline for each dataset

The approach is very simple:

 1. Split the dataset using different strategies: Leave-One-Out and k-fold (5 folds).

 2. Create one pipeline per fold for each strategy.

 3. Save each pipeline under a specific name.

In [5]:
# Declare each dataset once as a (name, frame) pair so the two parallel lists
# consumed by the following cells cannot drift out of alignment.
dataset_registry = [
    ('social', dev_social),
    ('clinical', dev_clinical),
    ('pos_test', dev_pos_test),
    ('personal', dev_personal),
]
datasets_name = [entry[0] for entry in dataset_registry]
datasets_data = [entry[1] for entry in dataset_registry]

In [6]:
def _build_pipeline(dataset_name, imputer_strat):
    """Build the unfitted preprocessing pipeline for one dataset.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset. ``'personal'`` gets a column-wise preprocessor;
        every other name gets a single imputer applied to the whole table.
    imputer_strat : str
        ``SimpleImputer`` strategy read from the run configuration.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline.
    """
    if dataset_name != 'personal':
        return Pipeline(steps=[("imputer", SimpleImputer(missing_values=np.nan, strategy=imputer_strat))])

    # 'idade' is the only column treated as numeric: mean-imputed, then standardised.
    numeric_features = ['idade']
    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="mean")),
               ("scaler", StandardScaler())]
    )
    # The remaining columns are only imputed, using the configured strategy.
    categorical_features = ['sexo', 'est civil', 'estuda',
                            'escolaridade', 'emprego','hcw',
                            'renda', 'renda ant', 'inst chef']
    categorical_transformer = Pipeline(steps=[("imputer", SimpleImputer(missing_values=np.nan,
                                                                        strategy=imputer_strat))])
    preprocessor = ColumnTransformer(
        transformers=[("num", numeric_transformer, numeric_features),
                      ("cat", categorical_transformer, categorical_features),
                     ])
    return Pipeline(steps=[("preprocessor", preprocessor)])


# Run-level settings are the same for every dataset and fold: read them once
# instead of on every iteration.
imputer_strat = df_config['imputer_strat'][train_id]
n_folds = df_config['cv_folds'][train_id]
cv_path = df_config['cv_path'][train_id]
pipe_path = df_config['pipeline_path'][train_id]
hash_id = df_config['hash_id'][train_id]
cv_alg = df_config['cv_alg'][train_id]

for idataset, dataset_name in enumerate(datasets_name):
    print('Processing %s'%(dataset_name))
    data = datasets_data[idataset]
    print('Dataset shape: %i, %i'%(data.shape[0],data.shape[1]))

    for ifold in range(n_folds):
        # Load the train/validation indexes stored for this fold.
        # SECURITY: pickle.load can execute arbitrary code; only read index
        # files produced by this project.
        cv_name = '%s_%s_CV_fold_%i_of_%i_cv_indexes.pkl'%(hash_id, cv_alg, ifold, n_folds)
        with open(os.path.join(cv_path,cv_name),'rb') as file_handler:
            # val_idx is not used here: the pipeline is fitted on training rows only.
            trn_idx, val_idx = pickle.load(file_handler)

        pipe = _build_pipeline(dataset_name, imputer_strat)

        # NOTE(review): .loc selects by index label; this matches positional CV
        # indexes only while `data` keeps its default index - confirm how the
        # stored indexes were generated.
        pipe.fit(data.loc[trn_idx,:])

        # One file per (fold, dataset) pair.
        pipe_name = '%s_%s_CV_fold_%i_of_%i_pipe_%s.jbl'%(hash_id, cv_alg,
                                                          ifold, n_folds,
                                                          dataset_name)
        with open(os.path.join(pipe_path,pipe_name),'wb') as file_handler:
            joblib.dump(pipe, file_handler)
        

Processing social
Dataset shape: 54, 13
Processing clinical
Dataset shape: 54, 16
Processing pos_test
Dataset shape: 54, 4
Processing personal
Dataset shape: 54, 10


In [8]:
# Sanity check: reload one saved pipeline and display it.
train_id = 0
ifold = 1
idataset = 3  # position within datasets_name
print("Dataset: %s"%(datasets_name[idataset]))

n_folds = df_config['cv_folds'][train_id]
pipe_path = df_config['pipeline_path'][train_id]

# Rebuild the file name with the same template used when the pipeline was saved.
name_fields = (
    df_config['hash_id'][train_id],
    df_config['cv_alg'][train_id],
    ifold,
    n_folds,
    datasets_name[idataset],
)
pipe_name = '%s_%s_CV_fold_%i_of_%i_pipe_%s.jbl' % name_fields
pipe_file = os.path.join(pipe_path, pipe_name)

with open(pipe_file, 'rb') as file_handler:
    pipe = joblib.load(file_handler)
pipe

Dataset: personal
