In [1]:
# import dependencies
import pandas as pd
from itertools import chain
from preprocessing.fileprocessing import read_and_clean
from preprocessing.transformers import Z_Score, MaxFeatureIndex, MaxFeatureAbsMeanDiff, CustomNormalizer
from preprocessing.pipetools import PipelineBuilder, JooblePipe

In [2]:
# set constants
# Locations of the train and test inputs handed to read_and_clean.
# Despite the *_DIR suffix, each value is the path of a single .tsv file,
# not a directory.
TRAIN_DATA_DIR = "data/train.tsv"
TEST_DATA_DIR = "data/test.tsv"

In [3]:
# load data
# Run the same loader over both splits: train first, then test.
train_data_dict, test_data_dict = (
    read_and_clean(tsv_path) for tsv_path in (TRAIN_DATA_DIR, TEST_DATA_DIR)
)
# Report the container type and the keys it holds.
print(type(train_data_dict), train_data_dict.keys())

<class 'dict'> dict_keys([2])


In [4]:
# Preview the first rows of the train frame stored under key 2.
train_data_dict[2].head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,247,248,249,250,251,252,253,254,255,256
id_job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1864791934054678713,9835,9999,9941,9945,9386,9899,9421,9954,9952,9884,...,8818,9954,9925,9934,8689,9958,9086,9114,9950,9875
-7413918695841089440,9082,9999,9700,9669,9981,9729,9822,9667,9526,9469,...,9979,9752,9695,9676,9974,9788,9955,9907,9747,9824
-9223271545392256405,9064,9999,9730,9585,9890,9740,9751,9538,9590,9771,...,9930,9705,9645,9652,9954,9582,9947,9876,9722,9791
-9223240803898726824,9402,9999,9711,9742,9975,9728,9924,9692,9514,9435,...,9959,9737,9611,9661,9912,9768,9963,9971,9767,9809
-9223102057156184105,9655,9996,9751,9722,9925,9736,9987,9733,9475,9877,...,9908,8208,9631,9571,9971,9407,9964,9990,9783,9907


In [5]:
# build pipeline
# Steps are registered in the listed order; each transformer class is
# instantiated right before it is handed to the builder.
builder = PipelineBuilder()
for step_name, transformer_cls in (
    ('z_score', Z_Score),
    ('mfi', MaxFeatureIndex),
    ('mfamd', MaxFeatureAbsMeanDiff),
):
    builder.add_step(step_name, transformer_cls())
builded_pipe = builder.build_transformer()

Pipeline steps:  [('z_score', Z_Score())]
Pipeline steps:  [('z_score', Z_Score()), ('mfi', MaxFeatureIndex())]
Pipeline steps:  [('z_score', Z_Score()), ('mfi', MaxFeatureIndex()), ('mfamd', MaxFeatureAbsMeanDiff())]


In [6]:
# apply pipeline
# For every feature type: fit a pipeline on the train frame, persist its state,
# transform the test frame, give the result readable column names, and finally
# write all transformed test frames (joined column-wise) to a TSV file.

result = []

# each feature type must be processed with its own transformers and states
for each_key in train_data_dict.keys():

    # keep the test index so it can be restored after the transformation
    test_idx = test_data_dict[each_key].index

    # pick the train and test frames of this feature type
    train_set = train_data_dict[each_key]
    test_set = test_data_dict[each_key]

    # for each feature type's train and test sets - build its own pipeline
    pipeline = JooblePipe()

    # train pipeline from builder object
    # NOTE(review): the same `builded_pipe` object is passed on every iteration;
    # if `train` fits it in place instead of copying it, fitted state would be
    # shared between feature types - confirm against JooblePipe.train.
    pipeline.train(builded_pipe, train_set)

    # save state (if needed)
    pipeline.save_transformer_state('states/feature_{i}_transformer.pkl'.format(i = each_key))

    # apply the trained pipeline to the test set and restore the original index
    # (reassignment instead of inplace=True keeps the step free of hidden mutation)
    test_set_transformed = pipeline.transform_test(test_set)
    test_set_transformed = test_set_transformed.set_index(test_idx)

    # generate column names
    # The two trailing columns are the max-feature index and the abs-mean-diff;
    # every column before them is a standardized feature. Deriving the count from
    # the frame (256 for the current data) avoids a hard-coded feature width.
    n_standardized = test_set_transformed.shape[1] - 2
    standarterized_col_names = ['feature_{i}_stand_{i2}'.format(i = each_key, i2 = feature_index) for feature_index in range(n_standardized)]
    max_feature_index_colname = ['max_feature_{i}_index'.format(i = each_key)]
    max_feat_abs_diff_colname = ["max_feature_{i}_abs_mean_diff".format(i = each_key)]
    final_cols = list(chain(standarterized_col_names, max_feature_index_colname, max_feat_abs_diff_colname))

    test_set_transformed.columns = final_cols
    result.append(test_set_transformed)

# stack the per-feature-type results side by side (column-wise) and save them
main_df = pd.concat(result, axis = 1)
main_df.to_csv('output/test_proc_multiple_ftypes.tsv', sep='\t')

In [7]:
main_df.head()

Unnamed: 0_level_0,feature_2_stand_0,feature_2_stand_1,feature_2_stand_2,feature_2_stand_3,feature_2_stand_4,feature_2_stand_5,feature_2_stand_6,feature_2_stand_7,feature_2_stand_8,feature_2_stand_9,...,feature_2_stand_248,feature_2_stand_249,feature_2_stand_250,feature_2_stand_251,feature_2_stand_252,feature_2_stand_253,feature_2_stand_254,feature_2_stand_255,max_feature_2_index,max_feature_2_abs_mean_diff
id_job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9168029089769934451,0.695184,0.265061,0.708352,0.68052,-0.534121,0.634674,0.196027,0.593331,0.574582,0.56969,...,0.605324,0.634955,0.179064,0.795506,-0.790513,0.175298,0.758256,0.317085,161.0,154.17
-9167993139315005259,-0.596739,0.265061,-0.324361,-0.04477,0.843588,-0.662418,0.248979,0.009313,-0.430596,-2.187248,...,-0.258077,-0.063723,0.619837,-0.026486,0.600701,0.49582,-0.085143,0.100702,161.0,154.17
-9167993136660569470,-0.602276,0.265061,-0.657494,-0.266144,0.647868,-0.374175,0.217208,-0.211667,-0.360014,-3.429385,...,-0.400768,-0.384614,0.537192,-0.339626,0.626601,0.31152,-0.370292,-0.03146,161.0,154.17
-9167993126042826314,-0.792373,0.263512,-1.157194,-0.834142,0.797536,-0.903821,0.492557,-0.671666,-0.754297,-0.517403,...,-0.855444,-0.737367,0.769385,-1.713962,0.693202,0.527872,-1.012882,-0.058669,161.0,154.17
-9167914043308884846,-1.115354,-0.115987,-1.612476,-1.530304,0.214216,-1.224491,0.601991,-1.505978,-1.377362,-0.269688,...,-0.97395,-1.72735,0.596224,-2.357638,0.397199,0.554582,-1.526954,0.085154,203.0,21.32
