# Load Data

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut, StratifiedKFold
import pickle
import os
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

## Load raw data

In [2]:
# Row of the configuration table that the cells below read their settings from.
train_id = 0

# Experiment configuration table; the path is relative to the notebook's working directory.
config_file_path = 'data/config.csv'
df_config = pd.read_csv(filepath_or_buffer=config_file_path)

In [3]:
# Look up the two CSV locations stored in the selected config row, then read them.
# `trgt` is presumably the target/label table that accompanies `df_data` — TODO confirm.
train_data_path = df_config.at[train_id, 'train_data_path']
train_trgt_path = df_config.at[train_id, 'train_trgt_path']

df_data = pd.read_csv(train_data_path)
df_trgt = pd.read_csv(train_trgt_path)

# Create, fit, and save a preprocessing pipeline for each CV fold

In [4]:
# Fit one preprocessing pipeline per cross-validation fold and persist it to disk.
# Inputs : the selected config row (`df_config`, `train_id`), `df_data`, and the
#          per-fold index pickles found under `cv_path`.
# Outputs: one '<hash_id>_<cv_alg>_CV_fold_<i>_of_<n>_pipe.pkl' file per fold in `pipe_path`.
# Raises : ValueError when the configured scaler is not supported.

# Everything below is constant across folds, so read it from the config row once.
n_folds = df_config['cv_folds'][train_id]
cv_path = df_config['cv_path'][train_id]
pipe_path = df_config['pipeline_path'][train_id]
hash_id = df_config['hash_id'][train_id]
cv_alg = df_config['cv_alg'][train_id]
scaler_alg = df_config['scaler_alg'][train_id]

# Fail fast on an unsupported scaler. Previously `pipe` was only assigned inside the
# `if`, so any other value raised a NameError at `pipe.fit` or, on a kernel with
# leftover state, silently refit and re-saved a stale `pipe` object.
if scaler_alg != 'StandardScaler':
    raise ValueError(
        "Unsupported scaler_alg %r; only 'StandardScaler' is implemented" % (scaler_alg,)
    )

# Create the output directory up front so the dump below works on a fresh checkout.
if pipe_path:
    os.makedirs(pipe_path, exist_ok=True)

for ifold in range(n_folds):
    # Common file-name stem shared by the index pickle (input) and the pipeline (output).
    fold_tag = '%s_%s_CV_fold_%i_of_%i' % (hash_id, cv_alg, ifold, n_folds)

    cv_name = fold_tag + '_cv_indexes.pkl'
    # NOTE(review): pickle.load can execute arbitrary code; only load index files
    # produced by this project.
    with open(os.path.join(cv_path, cv_name), 'rb') as file_handler:
        [trn_idx, val_idx] = pickle.load(file_handler)

    # Build a fresh, unfitted pipeline for every fold so no state carries over.
    pipe = Pipeline(steps=[("scaler", StandardScaler())])

    # Fit on the training rows of this fold only; `val_idx` is not used here.
    # `.loc` selects by index label — assumes `trn_idx` holds labels that match
    # df_data's index (true for positional indices on a default index) — TODO confirm.
    pipe.fit(df_data.loc[trn_idx, :])

    pipe_name = fold_tag + '_pipe.pkl'
    with open(os.path.join(pipe_path, pipe_name), 'wb') as file_handler:
        joblib.dump(pipe, file_handler)
