In [1]:
import os, sys, time
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
import yaml
import cPickle as pickle

data_path = '/home/ymm/bosch/'

train_num_file   = 'train_numeric.csv'
train_cat_file   = 'train_categorical.csv'
train_date_file  = 'train_date.csv'
test_num_file    = 'test_numeric.csv'
test_cat_file    = 'test_categorical.csv'
test_date_file   = 'test_date.csv'

sample_submission_file   = 'sample_submission.csv'

In [2]:
'''
This section loads a fraction of categorical data and save the columns
names into a pickle file named by 'pickle_column_names_file'.
So that the following categorical data loading can directly use explicitly types
'''
def create_categorical_column_name_pickle(train_cat_file, pickle_column_names_file):

    tmp_train_cat = pd.read_csv(data_path + train_cat_file, index_col='Id', nrows=1000)

    #for col, dtype in zip(tmp_train_cat.columns, tmp_train_cat.dtypes):
    #    print len(train_cat[col].unique()), dtype

    ## save the column names to pickle file
    col_names = tmp_train_cat.columns.tolist()
    with open(pickle_column_names_file, 'wb') as pickle_file:
        pickle.dump(col_names, pickle_file)


pickle_column_names_file = data_path + 'cat_col_names.pkl'
start_time_column_name = 'L0_S0_D1'
id_column_name = 'Id'
dep_var_name = 'Response'
bin_num = 15 ## number of bins to separate data by start_time


In [3]:
## load the labels and start_time column for train and test data
start_time = time.time()
train_labels = pd.read_csv(data_path + train_num_file, index_col='Id', usecols=['Id', dep_var_name])
train_date_start_columm = pd.read_csv(data_path + train_date_file, index_col='Id', usecols=['Id', start_time_column_name])
test_date_start_columm = pd.read_csv(data_path + test_date_file, index_col='Id', usecols=['Id', start_time_column_name])
end_time = time.time()
print 'data loading takes ', round((end_time - start_time), 1), ' seconds.'

## join the start_time with labels, then drop the NaN in start_time
labeled_start_time = pd.merge(train_labels, train_date_start_columm, how='left', left_index=True, right_index=True)
## this labeled_start_time dataFrame doesn't contain the NaN
## can be directly used for calculating the mquantiles
labeled_start_time = labeled_start_time[~labeled_start_time[start_time_column_name].isnull()]



data loading takes  57.5  seconds.


In [4]:
'''
section to subset the data by start_time
'''
from scipy.stats.mstats import mquantiles

prob_list = [1.*i/bin_num for i in range(1, bin_num)]
quantile_values = mquantiles(labeled_start_time[start_time_column_name], prob=prob_list)

bins = [labeled_start_time[start_time_column_name].min()]
bins.extend(quantile_values)
bins.append(labeled_start_time[start_time_column_name].max())
bin_names = [str(i) for i in range(len(bins)-1)]

## cut the entire dataframe into different time_windows by start_time
tmp_train = train_date_start_columm.copy()
tmp_test = test_date_start_columm.copy()

tmp_train['time_window_num'] = pd.cut(tmp_train[start_time_column_name], bins, labels=bin_names)
tmp_test['time_window_num'] = pd.cut(tmp_test[start_time_column_name], bins, labels=bin_names)
## create a row number column, start index is 1
tmp_train['row_num'] = range(1, (tmp_train.shape[0] + 1))
tmp_test['row_num'] = range(1, (tmp_test.shape[0] + 1))


In [77]:
def encode_categorical_data(train, test, fill_missing = False):
    '''
    encoding is an extemely slow process
    So only use the training data to trian the encoder
    '''
    le = LabelEncoder()

    ## new dataFrame is created from here
    if fill_missing:
        train = train.fillna(value='missing')
        test = test.fillna(value='missing')

    ## idealy combine the train and test
    #combined = pd.concat([train, test], axis=0)
    counter = 0
    start_time = time.time()
    for col in train.columns:
        combined_df = pd.concat([train[col], test[col]], axis=0)
        le.fit(combined_df)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
        #print train[col].unique()
        #print test[col].unique()
        counter += 1
        if counter % 20 == 0:
            print '{} out of {} is process...'.format(str(counter), str(train.shape[1]))

    for col in train.columns:
        print col, train[col].unique()
        
    end_time = time.time()
    print 'encoding process takes ', round((end_time - start_time)), 'seconds'

    return train, test
    
    
def process_date_data(train_date, test_date, start_time_column_name):
    print 'raw date data dimension: ', train_date.shape, test_date.shape
    train_date['start_time'] = train_date[start_time_column_name]
    test_date['start_time'] = test_date[start_time_column_name]
    single_value_column_names = []

    for column in train_date.columns:
        if column != 'start_time':
            train_date[column] = train_date[column] - train_date['start_time']
            test_date[column] = test_date[column] - test_date['start_time']
        if len(train_date[column].unique()) == 1:
            single_value_column_names.append(column)

    ## drop single-valued columns
    train_date.drop(single_value_column_names, axis=1, inplace=True)
    test_date.drop(single_value_column_names, axis=1, inplace=True)
    print 'processed date data dimension: ', train_date.shape, test_date.shape



def remove_single_value_categorical_columns(train, test):
    print 'raw categorical data dimension: ', train.shape, test.shape
    single_value_column_names = []
    for col in train.columns:
        if len(train[col].unique()) == 1:
            single_value_column_names.append(col)

    train.drop(single_value_column_names, axis=1, inplace=True)
    test.drop(single_value_column_names, axis=1, inplace=True)
    print 'processed categorical data dimension: ', train.shape, test.shape


In [7]:
selected_bin_name = '0'
none_selected_window_num = bin_names[:]
none_selected_window_num.append(np.NaN)
none_selected_window_num.remove(selected_bin_name)
skipped_test_row_num = tmp_test.loc[tmp_test['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()
skipped_train_row_num = tmp_train.loc[tmp_train['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()
    

In [87]:
start_time = time.time()

##### section to load column names for categorical data  ##########
if not os.path.isfile(pickle_column_names_file):
    print 'create new column name pickle file ...'
    create_categorical_column_name_pickle(train_cat_file, pickle_column_names_file)

with open(pickle_column_names_file, 'rb') as pickle_file:
    cat_column_names = pickle.load(pickle_file)

column_types = [np.object] * len(cat_column_names)
column_types_dict = dict(zip(cat_column_names, column_types))
################

train_date = pd.read_csv(data_path + train_date_file, index_col='Id', skiprows=skipped_train_row_num)
train_num = pd.read_csv(data_path + train_num_file, index_col='Id', skiprows=skipped_train_row_num)
train_cat = pd.read_csv(data_path + train_cat_file, index_col='Id', skiprows=skipped_train_row_num)

test_date = pd.read_csv(data_path + test_date_file, index_col='Id', skiprows=skipped_test_row_num)
test_num = pd.read_csv(data_path + test_num_file, index_col='Id', skiprows=skipped_test_row_num)
test_cat = pd.read_csv(data_path + test_cat_file, index_col='Id', skiprows=skipped_test_row_num)

end_time = time.time()
print 'data loading takes ', round((end_time - start_time), 2), 'seconds'
        

data loading takes  88.94 seconds


In [91]:
sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils

In [93]:
from utils import abc

In [94]:
dir(abc)

['__builtins__', '__doc__', '__file__', '__name__', '__package__', 'aa', 'bb']

In [88]:
le = LabelEncoderEncoderelEncoder()

In [20]:
## make a copy of the original data
tmp_train_date = train_date.copy()
tmp_train_num = train_num.copy()
tmp_train_cat = train_cat.copy()

tmp_test_date = test_date.copy()
tmp_test_num = test_num.copy()
tmp_test_cat = test_cat.copy()

In [63]:
## process the date data
process_date_data(tmp_train_date, tmp_test_date, start_time_column_name)
print 'finish processing date data ...'

raw date data dimension:  (44923, 1156) (45340, 1156)
processed date data dimension:  (44923, 546) (45340, 546)
finish processing date data ...


In [None]:
## data quality check:
## expect all the numerical columns

In [26]:
#tmp_train_num.dtypes[tmp_train_num.dtypes != 'float64']

Response    int64
dtype: object

In [16]:
all(tmp_test_date.dtypes == 'float64')

True

In [27]:
## process categorical data
remove_single_value_categorical_columns(tmp_train_cat, tmp_test_cat)
encode_categorical_data(tmp_train_cat, tmp_test_cat, True)
print 'finish processing categorical data ...'

## combine the data and save into csv files
#combined_train = pd.concat([train_cat, train_num, train_date], axis=1)
#combined_test = pd.concat([test_cat, test_num, test_date], axis=1)


raw categorical data dimension:  (44923, 2140) (45340, 2140)
processed categorical data dimension:  (44923, 343) (45340, 343)
20 out of 343 is process...
40 out of 343 is process...
60 out of 343 is process...
80 out of 343 is process...
100 out of 343 is process...
120 out of 343 is process...
140 out of 343 is process...
160 out of 343 is process...
180 out of 343 is process...
200 out of 343 is process...
220 out of 343 is process...
240 out of 343 is process...
260 out of 343 is process...
280 out of 343 is process...
300 out of 343 is process...
320 out of 343 is process...
340 out of 343 is process...
encoding process takes  61.0 seconds
finish processing categorical data ...


In [78]:
encode_categorical_data(tmp_train_cat, tmp_test_cat, True)

20 out of 343 is process...
40 out of 343 is process...
60 out of 343 is process...
80 out of 343 is process...
100 out of 343 is process...
120 out of 343 is process...
140 out of 343 is process...
160 out of 343 is process...
180 out of 343 is process...
200 out of 343 is process...
220 out of 343 is process...
240 out of 343 is process...
260 out of 343 is process...
280 out of 343 is process...
300 out of 343 is process...
320 out of 343 is process...
340 out of 343 is process...
L0_S2_F33 [1 0]
L0_S2_F35 [3 1 2]
L0_S2_F37 [1 0]
L0_S2_F39 [3 1 2]
L0_S2_F41 [1 0]
L0_S2_F43 [3 1 2]
L0_S2_F45 [1 0]
L0_S2_F47 [3 1 2]
L0_S2_F49 [1 0]
L0_S2_F51 [3 1 2]
L0_S2_F53 [1 0]
L0_S2_F55 [3 1 2]
L0_S2_F57 [1 0]
L0_S2_F59 [3 1 2]
L0_S2_F61 [1 0]
L0_S2_F63 [3 1 2]
L0_S2_F65 [1 0]
L0_S2_F67 [3 1 2]
L0_S6_F119 [1 0]
L0_S6_F121 [1 0]
L0_S6_F123 [1 0]
L0_S6_F125 [1 0]
L0_S6_F126 [1 0]
L0_S6_F128 [1 0]
L0_S6_F129 [1 0]
L0_S6_F131 [1 0]
L0_S6_F133 [1 0]
L0_S6_F135 [1 0]
L0_S9_F151 [1 0]
L0_S9_F153 [1 0]
L

In [81]:
tmp_train_cat['L3_S31_F3841'].unique()

array([nan, 'T128'], dtype=object)

In [80]:
for col in tmp_train_cat.columns:
    print tmp_train_cat[col].unique()

[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T32' 'T96']
[nan 'T1']
[nan 'T16']
[nan 'T1']
[nan 'T16']
[nan 'T1']
[nan 'T16']
[nan 'T1']
[nan 'T16']
[nan 'T1']
[nan 'T16']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T65536' 'T6553' 'T96' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T65536' 'T6553' 'T96' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T96' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T65536' 'T6553' 'T96' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']
[nan 'T48576' 'T8' 'T16777557' 'T16777232']
[nan 'T1']
[nan 'T1']