In [6]:
%matplotlib inline
from sklearn import preprocessing
import os, sys, time
import pandas as pd
import numpy as np
from os.path import join

sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils.bosch_functions as bosch_functions

data_path = '/home/ymm/bosch/'

train_num_file   = 'train_numeric.csv'
train_cat_file   = 'train_categorical.csv'
train_date_file  = 'train_date.csv'
test_num_file    = 'test_numeric.csv'
test_cat_file    = 'test_categorical.csv'
test_date_file   = 'test_date.csv'

sample_submission_file   = 'sample_submission.csv'


start_time_column_name = 'L0_S0_D1'
id_column_name = 'Id'
dep_var_name = 'Response'
nan_fill = -2.

In [4]:
bin_num = 1 ## number of bins to separate data by start_time
tmp_train, tmp_test, bins, bin_names = bosch_functions.create_grouped_index_df(bin_num)

data loading takes  66.6  seconds.


In [7]:
## create the skipped row numbers
#LR_selected_features = ['L3_S31_F3834', 'L1_S25_F1855', 'L0_S8_F144', 'L0_S8_F146', 'L3_S32_F3850']
LR_selected_features = ['L0_S0_F0', 'L3_S29_F3315', 'L0_S1_F28', 'L0_S1_F24', 
                        'L1_S25_F1855', 'L0_S0_F16', 'L0_S12_F330', 'L0_S10_F219',
                        'L0_S9_F155', 'L0_S11_F282', 'L2_S28_F3222', 'L3_S31_F3834',
                        'L1_S24_F1118', 'L0_S0_F4', 'L0_S8_F144', 'L3_S32_F3850','L0_S8_F146']

none_selected_window_num = [np.NaN]
skipped_test_row_num = tmp_test.loc[tmp_test['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()
skipped_train_row_num = tmp_train.loc[tmp_train['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()

train_cat_cols  = pd.read_csv(join(data_path, train_cat_file), index_col=id_column_name, nrows=0)
train_date_cols = pd.read_csv(join(data_path, train_date_file), index_col=id_column_name, nrows=0)
train_num_cols  = pd.read_csv(join(data_path, train_num_file), index_col=id_column_name, nrows=0)

regular_data_path = '/home/ymm/kaggle/bosch/data_1_bins_cross_fit_xgb_models/data_bin_0_models'

def collect_feature_names(data_path, fea_name='feature', thres_name = None, thres = 10):
    csv_files = [f for f in os.listdir(data_path) if '.csv' in f]
    feature_names = set()
    for file_name in csv_files:
        data = pd.read_csv(join(data_path, file_name), index_col=0)
        if thres_name is None:
            feature_names = feature_names.union(data[fea_name])
        else:
            feature_names = feature_names.union(data.loc[data[thres_name] > thres, fea_name])

    return feature_names

## collect feature names based on the fscore
bin_regular_selected_col_name = collect_feature_names(regular_data_path, 'feature', 'fscore', 10)

## based on the selected features from xgboost to create column list
selected_num_col_names =  train_num_cols.columns[train_num_cols.columns.isin(bin_regular_selected_col_name)].tolist()

## add LR important features into the column set
for feature_name in LR_selected_features:
    if feature_name not in selected_num_col_names:
        selected_num_col_names.append(feature_name)

selected_cat_col_names =  train_cat_cols.columns[train_cat_cols.columns.isin(bin_regular_selected_col_name)].tolist()
selected_dat_col_names =  train_date_cols.columns[train_date_cols.columns.isin(bin_regular_selected_col_name)].tolist()
test_num_col_names     =  selected_num_col_names[:]
        
selected_dat_col_names.extend([id_column_name, start_time_column_name])
selected_cat_col_names.extend([id_column_name])
selected_num_col_names.extend([id_column_name, dep_var_name])
test_num_col_names.extend([id_column_name])

In [8]:
print len(test_num_col_names), len(selected_num_col_names)

296 297


In [9]:
start_time = time.time()
train_cat  = pd.read_csv(join(data_path, train_cat_file),   index_col='Id', skiprows=skipped_train_row_num, usecols=selected_cat_col_names)
test_cat   = pd.read_csv(join(data_path, test_cat_file),    index_col='Id', skiprows=skipped_test_row_num,  usecols=selected_cat_col_names)
train_dat  = pd.read_csv(join(data_path, train_date_file),  index_col='Id', skiprows=skipped_train_row_num, usecols=selected_dat_col_names)
test_dat   = pd.read_csv(join(data_path, test_date_file),   index_col='Id', skiprows=skipped_test_row_num,  usecols=selected_dat_col_names)
train_num  = pd.read_csv(join(data_path, train_num_file),   index_col='Id', skiprows=skipped_train_row_num, usecols=selected_num_col_names)
test_num   = pd.read_csv(join(data_path, test_num_file),    index_col='Id', skiprows=skipped_test_row_num,  usecols=test_num_col_names)

print 'finish reading data by columns selected using xgboost feature importance, using {} seconds.'.format(round(time.time() - start_time, 2))

finish reading data by columns selected using xgboost feature importance, using 231.54 seconds.


In [10]:
print train_cat.shape, train_num.shape, train_dat.shape

(673861, 12) (673861, 296) (673861, 67)


In [11]:
print test_cat.shape, test_num.shape, test_dat.shape

(674503, 12) (674503, 295) (674503, 67)


### process the date features

In [13]:
tmp_train_date = train_dat.copy()
tmp_test_date = test_dat.copy()

In [14]:
## normalized the date feature with start_time
start_time = time.time()
tmp_train_date['start_time'] = tmp_train_date[start_time_column_name]
for column in tmp_train_date.columns:
    if column != 'start_time':
        tmp_train_date[column] = tmp_train_date[column] - tmp_train_date['start_time']
        
        
tmp_test_date['start_time'] = tmp_test_date[start_time_column_name]
for column in tmp_train_date.columns:
    if column != 'start_time':
        tmp_test_date[column] = tmp_test_date[column] - tmp_test_date['start_time']
        
print 'finish substract start_time using {} seconds'.format(round(time.time() - start_time, 1))

finish substract start_time using 0.8 seconds


#### feature engineering works on the date featues

In [15]:
start_time = time.time()
combined_date = pd.concat([tmp_train_date, tmp_test_date])
train_date_index = tmp_train_date.index
test_date_index  = tmp_test_date.index

def date_columns_feature_engineering(df):
    tmp_df = df.loc[:, df.columns != 'start_time']
    new_fea_df = pd.DataFrame()
    new_fea_df['time_sum'] = tmp_df.sum(axis=1)
    new_fea_df['time_max'] = tmp_df.max(axis=1)
    new_fea_df['date_nan_col_count'] = tmp_df.isnull().sum(axis=1)
    new_fea_df['date_reg_col_count'] = tmp_df.shape[1] - tmp_df.isnull().sum(axis=1)
    idmax = tmp_df.idxmax(axis=1)
    new_fea_df = pd.merge(new_fea_df, pd.get_dummies(idmax, prefix='oneHot'), how='left', left_index=True, right_index=True)
    return new_fea_df

combined_date_new_fea = date_columns_feature_engineering(combined_date)
print 'finish creating new date features using {} seconds'.format(round(time.time() - start_time, 2))                                                        

finish creating new date features using 13.86 seconds


In [16]:
print combined_date_new_fea.shape
combined_date_new_fea.head()

(1348364, 18)


Unnamed: 0_level_0,time_sum,time_max,date_nan_col_count,date_reg_col_count,oneHot_L2_S26_D3037,oneHot_L2_S27_D3130,oneHot_L3_S31_D3836,oneHot_L3_S32_D3852,oneHot_L3_S33_D3856,oneHot_L3_S34_D3875,oneHot_L3_S35_D3886,oneHot_L3_S36_D3919,oneHot_L3_S37_D3942,oneHot_L3_S38_D3953,oneHot_L3_S47_D4140,oneHot_L3_S49_D4208,oneHot_L3_S50_D4242,oneHot_L3_S51_D4255
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4,121.02,5.05,32,35,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,143.04,5.72,30,37,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,113.74,4.96,33,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11,77.66,3.38,32,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13,185.32,8.07,32,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
tmp_train_date = pd.merge(tmp_train_date, combined_date_new_fea.ix[train_date_index], how='left', left_index=True, right_index=True)
tmp_test_date = pd.merge(tmp_test_date, combined_date_new_fea.ix[test_date_index], how='left', left_index=True, right_index=True)

In [18]:
print tmp_test_date.shape
tmp_test_date.head()

(674503, 86)


Unnamed: 0_level_0,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S3_D74,L0_S4_D106,L0_S4_D111,L0_S5_D115,L0_S5_D117,L0_S6_D120,...,oneHot_L3_S33_D3856,oneHot_L3_S34_D3875,oneHot_L3_S35_D3886,oneHot_L3_S36_D3919,oneHot_L3_S37_D3942,oneHot_L3_S38_D3953,oneHot_L3_S47_D4140,oneHot_L3_S49_D4208,oneHot_L3_S50_D4242,oneHot_L3_S51_D4255
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.01,,,0.03,0.03,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,,,,,0.03,0.03,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,,0.0,0.0,0.02,0.02,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15,0.0,0.0,,0.0,0.0,,,0.01,0.01,0.02,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.0,0.0,0.01,,,0.02,0.02,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
print tmp_train_date.shape
tmp_train_date.head()

(673861, 86)


Unnamed: 0_level_0,L0_S0_D1,L0_S1_D26,L0_S2_D34,L0_S3_D70,L0_S3_D74,L0_S4_D106,L0_S4_D111,L0_S5_D115,L0_S5_D117,L0_S6_D120,...,oneHot_L3_S33_D3856,oneHot_L3_S34_D3875,oneHot_L3_S35_D3886,oneHot_L3_S36_D3919,oneHot_L3_S37_D3942,oneHot_L3_S38_D3953,oneHot_L3_S47_D4140,oneHot_L3_S49_D4208,oneHot_L3_S50_D4242,oneHot_L3_S51_D4255
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,,,0.02,0.02,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,,,,,0.02,0.02,0.02,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.01,,,0.02,0.02,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,,0.0,0.0,0.02,0.02,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,,0.01,0.01,0.02,0.02,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
print tmp_test_date.min().min(), tmp_train_date.max().max()
print tmp_train_date.min().min(), tmp_train_date.max().max()

0.0 18621.81
0.0 18621.81


In [21]:
tmp_train_date = tmp_train_date.fillna(nan_fill)
tmp_test_date = tmp_test_date.fillna(nan_fill)

In [22]:
tmp_train_date.dtypes.value_counts()

float64    84
int64       2
dtype: int64

### encode the numerical data

In [23]:
tmp_num_train = train_num.copy()
tmp_num_test = test_num.copy()

In [24]:
print tmp_num_train.shape
tmp_num_train.head()

(673861, 296)


Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,...,L3_S47_F4153,L3_S47_F4158,L3_S47_F4163,L3_S48_F4196,L3_S48_F4198,L3_S49_F4211,L3_S49_F4226,L3_S50_F4243,L3_S50_F4253,Response
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,0.083,...,,,,,,,,,,0
7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,-0.147,...,,,,,,,,,,0
9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,-0.046,...,,,,,,,,,,0
11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,-0.099,...,,,,,,,,,,0
13,0.003,0.019,0.294,0.312,0.031,0.161,0.022,0.088,-0.005,-0.003,...,,,,,,,,,,0


### feature engineering on the numerical features

In [26]:
def num_columns_feature_engineering(df, col_ignore = ['Response']):
    #tmp_df = df.loc[:, df.columns != 'start_time']
    tmp_df = df.loc[:, ~df.columns.isin(col_ignore)]
    new_fea_df = pd.DataFrame()
    encoder = preprocessing.LabelEncoder()
    new_fea_df['num_sum'] = tmp_df.sum(axis=1)
    new_fea_df['num_max'] = tmp_df.max(axis=1)
    new_fea_df['num_min'] = tmp_df.min(axis=1)
    new_fea_df['mun_max_min_ratio'] = new_fea_df['num_max'] / new_fea_df['num_min']
    new_fea_df['num_nan_col_count'] = tmp_df.isnull().sum(axis=1)
    new_fea_df['num_reg_col_count'] = tmp_df.shape[1] - tmp_df.isnull().sum(axis=1)
    new_fea_df['idxmax'] = tmp_df.idxmax(axis=1)
    new_fea_df['idxmax'] = encoder.fit_transform(new_fea_df['idxmax'])
    new_fea_df['idxmin'] = tmp_df.idxmin(axis=1)
    new_fea_df['idxmin'] = encoder.fit_transform(new_fea_df['idxmin'])
    #new_fea_df = pd.merge(new_fea_df, pd.get_dummies(idmax, prefix='oneHot'), how='left', left_index=True, right_index=True)
    return new_fea_df


## generic function to encode categorical features
def sweep_up_categorical_encode_by_dep_var(df, fea_name, test_df = None, dep_var_name='Response', count_thres = 10, nan_fill = -1., const_scale = 1.):
    tmp_df = df[[fea_name, dep_var_name]]
    tmp_df = tmp_df.fillna(nan_fill)
    value_counts = tmp_df[fea_name].value_counts()
    minor_keys = []
    key_dep_var_map = {}
    
    ## training sweep-up
    for counts, key in zip(value_counts.values, value_counts.index):
        if counts > count_thres:
            mean_dep_var = const_scale * tmp_df.loc[tmp_df[fea_name] == key, dep_var_name].mean()
            key_dep_var_map[key] = mean_dep_var
        else:
            minor_keys.append(key)
            
    ## mean value of dep_var for all the minor levels
    if len(minor_keys) > 0:
        minor_key_dep_var_mean = const_scale * tmp_df.loc[tmp_df[fea_name].isin(minor_keys), dep_var_name].mean()
        ## update the key_dep_var_map with minor key
        for key in minor_keys:
            key_dep_var_map[key] = minor_key_dep_var_mean
    
    encoded_train = tmp_df[fea_name].replace(key_dep_var_map)
    overall_mean_dep_var = tmp_df[dep_var_name].mean()
    
    ## sweep up the test column
    if test_df is not None:
        test_value_counts = test_df[fea_name].value_counts()
        test_minor_keys = []
        test_key_dep_var_map = key_dep_var_map.copy()
    
        for counts, key in zip(test_value_counts.values, test_value_counts.index):
            if key not in test_key_dep_var_map:
                print 'new level {} with counts {} found in test data'.format(key, counts)
                if counts > count_thres:
                    print 'warning! new level {} is found in test data!'.format(key)
                else:
                    test_minor_keys.append(key)
        
        if len(test_minor_keys) > 0:
            for key in test_minor_keys:
                test_key_dep_var_map[key] = const_scale * overall_mean_dep_var
        
        encoded_test = test_df[fea_name].replace(test_key_dep_var_map)
        return encoded_train, encoded_test
    
    else:
        return encoded_train

In [27]:
#tmp_num_dep_var = tmp_num_train[dep_var_name]
combined_num = pd.concat([tmp_num_train, tmp_num_test])
train_num_index = tmp_num_train.index
test_num_index  = tmp_num_test.index

In [28]:
## feature engineering on the numerical features without filling NaN
start_time = time.time()
new_fea_combined_num = num_columns_feature_engineering(combined_num)
print 'finish creating new numerical features using {} seconds'.format(round(time.time() - start_time, 0))                                                        

finish creating new numerical features using 155.0 seconds


In [29]:
tmp_num_train = pd.merge(tmp_num_train, new_fea_combined_num.ix[train_date_index], how='left', left_index=True, right_index=True)
tmp_num_test = pd.merge(tmp_num_test, new_fea_combined_num.ix[test_date_index], how='left', left_index=True, right_index=True)

In [30]:
print tmp_num_train.shape, tmp_num_test.shape

(673861, 304) (674503, 303)


In [None]:
#print tmp_num_train.min().min(), tmp_num_train.max().max()
#print tmp_num_test.min().min(), tmp_num_test.max().max()

#### multiple important features are identified from a LR model trained on the important numerical features


In [31]:
categorical_like_columns = []
level_num_thres = 10
for column in tmp_num_train.columns:
    level_num = len(tmp_num_train[column].unique())
    if level_num < level_num_thres:
        if column != dep_var_name:
            categorical_like_columns.append(column)
            print 'column {} has #levels {} if in LR features: {}'.format(column, level_num, column in LR_selected_features)
        #print combined_num[column].value_counts()

column L0_S2_F32 has #levels 7 if in LR features: False
column L0_S2_F56 has #levels 8 if in LR features: False
column L0_S3_F68 has #levels 8 if in LR features: False
column L0_S3_F92 has #levels 6 if in LR features: False
column L0_S6_F118 has #levels 4 if in LR features: False
column L0_S7_F136 has #levels 4 if in LR features: False
column L0_S8_F144 has #levels 4 if in LR features: True
column L0_S12_F340 has #levels 7 if in LR features: False
column L0_S15_F412 has #levels 4 if in LR features: False
column L3_S29_F3360 has #levels 5 if in LR features: False
column L3_S29_F3398 has #levels 5 if in LR features: False
column L3_S29_F3464 has #levels 3 if in LR features: False
column L3_S50_F4253 has #levels 4 if in LR features: False


In [32]:
count_thres = 50
const_scale = 100.

for fea_name in categorical_like_columns:
    new_fea_name = '{}_{}'.format('dep_var_encoded', fea_name)
    tmp_num_train[new_fea_name], tmp_num_test[new_fea_name] = sweep_up_categorical_encode_by_dep_var(tmp_num_train, 
                                                                                                     fea_name, 
                                                                                                     test_df=tmp_num_test,
                                                                                                     count_thres=count_thres,
                                                                                                     nan_fill = nan_fill,
                                                                                                     const_scale = const_scale)
    

new level 0.387 with counts 1 found in test data
new level 0.409 with counts 1 found in test data


In [33]:
tmp_num_train.head()

Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,...,dep_var_encoded_L0_S3_F92,dep_var_encoded_L0_S6_F118,dep_var_encoded_L0_S7_F136,dep_var_encoded_L0_S8_F144,dep_var_encoded_L0_S12_F340,dep_var_encoded_L0_S15_F412,dep_var_encoded_L3_S29_F3360,dep_var_encoded_L3_S29_F3398,dep_var_encoded_L3_S29_F3464,dep_var_encoded_L3_S50_F4253
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,0.083,...,0.535063,0.537803,0.590763,0.619574,0.534721,0.53516,0.537707,0.537707,0.625934,0.53687
7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,-0.147,...,0.535063,0.575852,0.533018,0.619574,0.534721,0.53516,0.538792,0.538792,0.471441,0.53687
9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,-0.046,...,0.535063,0.537803,0.590763,0.35118,0.534721,0.53516,0.537707,0.537707,0.471441,0.53687
11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,-0.099,...,0.527795,0.537803,0.590763,0.619574,0.534721,0.53516,0.537707,0.537707,0.625934,0.53687
13,0.003,0.019,0.294,0.312,0.031,0.161,0.022,0.088,-0.005,-0.003,...,0.536443,0.537803,0.590763,0.35118,0.534721,0.53516,0.538792,0.538792,0.471441,0.53687


In [34]:
print tmp_num_train.shape, tmp_num_test.shape, dep_var_name in tmp_num_train.columns

(673861, 317) (674503, 316) True


In [35]:
tmp_num_train.head()

Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,...,dep_var_encoded_L0_S3_F92,dep_var_encoded_L0_S6_F118,dep_var_encoded_L0_S7_F136,dep_var_encoded_L0_S8_F144,dep_var_encoded_L0_S12_F340,dep_var_encoded_L0_S15_F412,dep_var_encoded_L3_S29_F3360,dep_var_encoded_L3_S29_F3398,dep_var_encoded_L3_S29_F3464,dep_var_encoded_L3_S50_F4253
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,0.083,...,0.535063,0.537803,0.590763,0.619574,0.534721,0.53516,0.537707,0.537707,0.625934,0.53687
7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,-0.147,...,0.535063,0.575852,0.533018,0.619574,0.534721,0.53516,0.538792,0.538792,0.471441,0.53687
9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,-0.046,...,0.535063,0.537803,0.590763,0.35118,0.534721,0.53516,0.537707,0.537707,0.471441,0.53687
11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,-0.099,...,0.527795,0.537803,0.590763,0.619574,0.534721,0.53516,0.537707,0.537707,0.625934,0.53687
13,0.003,0.019,0.294,0.312,0.031,0.161,0.022,0.088,-0.005,-0.003,...,0.536443,0.537803,0.590763,0.35118,0.534721,0.53516,0.538792,0.538792,0.471441,0.53687


In [36]:
tmp_num_train = tmp_num_train.fillna(nan_fill)
tmp_num_test = tmp_num_test.fillna(nan_fill)

In [37]:
tmp_num_train.head()

Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,...,dep_var_encoded_L0_S3_F92,dep_var_encoded_L0_S6_F118,dep_var_encoded_L0_S7_F136,dep_var_encoded_L0_S8_F144,dep_var_encoded_L0_S12_F340,dep_var_encoded_L0_S15_F412,dep_var_encoded_L3_S29_F3360,dep_var_encoded_L3_S29_F3398,dep_var_encoded_L3_S29_F3464,dep_var_encoded_L3_S50_F4253
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,0.083,...,0.535063,0.537803,0.590763,0.619574,0.534721,0.53516,0.537707,0.537707,0.625934,0.53687
7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,-0.147,...,0.535063,0.575852,0.533018,0.619574,0.534721,0.53516,0.538792,0.538792,0.471441,0.53687
9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,-0.046,...,0.535063,0.537803,0.590763,0.35118,0.534721,0.53516,0.537707,0.537707,0.471441,0.53687
11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,-0.099,...,0.527795,0.537803,0.590763,0.619574,0.534721,0.53516,0.537707,0.537707,0.625934,0.53687
13,0.003,0.019,0.294,0.312,0.031,0.161,0.022,0.088,-0.005,-0.003,...,0.536443,0.537803,0.590763,0.35118,0.534721,0.53516,0.538792,0.538792,0.471441,0.53687


### process the categorical features

In [38]:
start_time = time.time()
combined_cat = pd.concat([train_cat, test_cat])
## convert to string so that column is categorical
combined_cat = combined_cat.astype(str)
## One-Hot encode all the categorical columns
oneHot_combined_cat = pd.get_dummies(combined_cat, dummy_na=True)

train_index = train_cat.index
test_index  = test_cat.index
oneHot_train_cat = oneHot_combined_cat.ix[train_index]
oneHot_test_cat  = oneHot_combined_cat.ix[test_index]
print 'finish OneHot encoding the categorical columns, using {} seconds'.format(round(time.time() - start_time, 2))

finish OneHot encoding the categorical columns, using 13.57 seconds


In [39]:
print oneHot_train_cat.shape
oneHot_train_cat.head()

(673861, 77)


Unnamed: 0_level_0,L2_S26_F3038_1.0,L2_S26_F3038_nan,L2_S26_F3038_nan,L2_S27_F3131_1.0,L2_S27_F3131_nan,L2_S27_F3131_nan,L3_S29_F3317_1.0,L3_S29_F3317_nan,L3_S29_F3317_nan,L3_S29_F3475_1.0,...,L3_S35_F3912_nan,L3_S35_F3912_nan,L3_S49_F4217_1.0,L3_S49_F4217_nan,L3_S49_F4217_nan,L3_S49_F4220_16.0,L3_S49_F4220_2.0,L3_S49_F4220_4.0,L3_S49_F4220_nan,L3_S49_F4220_nan
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
11,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
13,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
combined_train = pd.concat([oneHot_train_cat, tmp_num_train, tmp_train_date], axis=1)
combined_test  = pd.concat([oneHot_test_cat,  tmp_num_test,  tmp_test_date],  axis=1)

In [41]:
print combined_train.shape, combined_test.shape

(673861, 480) (674503, 479)


In [43]:
print combined_test.isnull().sum().sum(), '\n \n', combined_train.isnull().sum().sum()

0 
 
0


In [44]:
print combined_test.dtypes.value_counts(), '\n \n',  combined_train.dtypes.value_counts()

float64    473
int64        6
dtype: int64 
 
float64    473
int64        7
dtype: int64


In [45]:
dep_var_name in combined_train.columns, dep_var_name in combined_test

(True, False)

In [None]:
start_time = time.time()
combined_train.to_csv('bosch_processed_regular_filled_FE_thres_10_train_data.csv')
combined_test.to_csv('bosch_processed_regular_filled_FE_thres_10_test_data.csv')
print 'saving data using {} seconds'.format(round(time.time() - start_time, 2))