In [3]:
import os, sys, time, random
import pandas as pd
import numpy as np
from os.path import join
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import preprocessing
sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils.bosch_functions as bosch_functions
from utils.wrapped_xgboost import xgboost_classifier
from utils.validation_tools import score_MCC, MCC, create_validation_index
from utils.models import CombinedModel
from utils.data_munge import remove_single_value_columns
from utils.feature_engineering import NumericalFeatureEngineering, getRelativeTimeColumns, BasicDate_FeatureEngineering
from utils.feature_engineering import getTimeChangeColumns, getTimeSteps, build_IndexFeatures

# Directory containing the competition CSV files; it is joined with the file
# names below when the frames are loaded.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = '/home/ymm/bosch/'

# File names of the numeric / categorical / date tables for train and test.
train_num_file   = 'train_numeric.csv'
train_cat_file   = 'train_categorical.csv'
train_date_file  = 'train_date.csv'
test_num_file    = 'test_numeric.csv'
test_cat_file    = 'test_categorical.csv'
test_date_file   = 'test_date.csv'

sample_submission_file   = 'sample_submission.csv'

# Column-name constants. Only `dep_var_name` is referenced by the cells visible
# in this notebook (validation split and model fitting); the loaders spell the
# index column 'Id' out literally.
start_time_column_name = 'L0_S0_D1'
id_column_name = 'Id'
dep_var_name = 'Response'

In [4]:
# Pick a reproducible random subset of `num_rows` training rows to keep, then
# derive the complementary list of 1-based file line numbers handed to
# `read_csv(skiprows=...)`. Line 0 (the CSV header) is never in the list.
# `tot_row_num` is presumably the number of data rows in the training CSVs.
tot_row_num = 1183747
num_rows = 50000
sample_seed = 999

try:
    _lazy_range = xrange  # Python 2: iterate without building a 1.18M-element list
except NameError:
    _lazy_range = range   # Python 3

# A dedicated generator makes the sample repeatable across kernel restarts
# without reseeding the global `random` state.
_row_sampler = random.Random(sample_seed)
# Sampling the few rows to keep is much cheaper than sampling the ~1.13M rows
# to drop, and yields the same uniform random subset.
_kept_rows = set(_row_sampler.sample(_lazy_range(1, tot_row_num + 1), num_rows))
# Built in ascending order, so no separate sort is required.
skip = [row for row in _lazy_range(1, tot_row_num + 1) if row not in _kept_rows]

In [5]:
## randomly select certain rows
# Start the timer here: the message at the end of this cell reads `start_time`,
# which no earlier cell defines, so a fresh kernel raised NameError.
start_time = time.time()

# The same `skip` list is passed for all three training files, so each file
# contributes the same line numbers.
train_num = pd.read_csv(join(data_path, train_num_file),    index_col='Id', skiprows=skip, nrows=num_rows)
train_dat = pd.read_csv(join(data_path, train_date_file),   index_col='Id', skiprows=skip, nrows=num_rows)
train_cat = pd.read_csv(join(data_path, train_cat_file),    index_col='Id', skiprows=skip, nrows=num_rows)

# The test files are not sampled: only their first `num_rows` rows are read.
test_num = pd.read_csv(join(data_path, test_num_file),      index_col='Id', nrows=num_rows)
test_dat = pd.read_csv(join(data_path, test_date_file),     index_col='Id', nrows=num_rows)
test_cat = pd.read_csv(join(data_path, test_cat_file),      index_col='Id', nrows=num_rows)

print('finish loading date using {} seconds'.format(round(time.time() - start_time, 0)))

KeyboardInterrupt: 

In [4]:
# Run the project helper (utils.data_munge) on each train/test pair. Its return
# value is discarded and the recorded output shows smaller "processed"
# dimensions, so it presumably drops columns from the frames in place — confirm
# against the helper's source. The numeric call also passes the label name
# 'Response' as its second positional argument.
remove_single_value_columns(train_num, 'Response', test=test_num)
remove_single_value_columns(train_dat, test=test_dat)
remove_single_value_columns(train_cat, test=test_cat)

raw train data dimension:  (50000, 969)
raw test data dimension:  (50000, 968)
processed train data dimension:  (50000, 969)
processed test data dimension:  (50000, 968)
raw train data dimension:  (50000, 1156)
raw test data dimension:  (50000, 1156)
processed train data dimension:  (50000, 1150)
processed test data dimension:  (50000, 1150)
raw train data dimension:  (50000, 2140)
raw test data dimension:  (50000, 2140)
processed train data dimension:  (50000, 1082)
processed test data dimension:  (50000, 1082)


In [55]:
# Column names of the date and numeric training frames. This cell used to read
# `tmp_train_dat` / `tmp_train_num`: the latter is never defined in this
# notebook and the former is only assigned in a later cell (as a copy of the
# TimeDiff features), so a top-to-bottom run failed with NameError.
dat_columns = train_dat.columns.tolist()
# Leave the label out so it can never form a feature group of its own.
num_columns = [col for col in train_num.columns.tolist() if col != dep_var_name]



def build_column_dict(columns):
    """Group column names by the first two '_'-separated tokens of each name.

    Returns a dict mapping every prefix (e.g. 'L0_S0' for 'L0_S0_D1') to the
    list of names sharing it, in the order they occur in `columns`. A name with
    fewer than two tokens is used whole as its own key.
    """
    grouped = {}
    for name in columns:
        prefix = '_'.join(name.split('_')[:2])
        grouped.setdefault(prefix, []).append(name)
    return grouped

# Prefix -> member-columns mappings for the date and numeric tables; these are
# the `col_dict` arguments later passed to build_station_features.
dat_col_dict = build_column_dict(dat_columns)
num_col_dict = build_column_dict(num_columns)
        

In [38]:
# Inspect the date frame itself; `tmp_train_dat` is not assigned until a later
# cell, so the original expression failed on a fresh kernel.
train_dat.shape

(50000, 1150)

In [56]:
# Timer read by the "finish feature engineering date station" message printed
# at the end of this cell.
start_time = time.time()


def build_station_features(df, col_dict, prefix='dat'):
    """Compute row-wise summary statistics for every group of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in `col_dict`.
    col_dict : dict
        Maps a group key to the list of `df` columns in that group (the shape
        produced by `build_column_dict`).
    prefix : str
        Leading tag of each generated column name.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, with four columns per key named
        '<prefix>_<key>_<stat>' for stat in mean, max, min, var (in that
        order). An empty `col_dict` yields an empty DataFrame.
    """
    stat_names = ('mean', 'max', 'min', 'var')
    names = []
    columns = []
    for key, value in col_dict.items():
        # Select the group's columns once instead of once per statistic.
        group = df[value]
        for stat in stat_names:
            names.append('{}_{}_{}'.format(prefix, key, stat))
            columns.append(getattr(group, stat)(axis=1))

    if not columns:
        return pd.DataFrame()
    # Assemble the frame in a single concat rather than inserting hundreds of
    # columns one at a time into a growing DataFrame.
    return pd.concat(columns, axis=1, keys=names)
    
# Build the per-group statistics from the loaded frames. The previous inputs
# `tmp_train_dat` / `tmp_train_num` only existed as leftover kernel state:
# `tmp_train_num` is never defined in this notebook and `tmp_train_dat` is only
# assigned in a later cell.
dat_features = build_station_features(train_dat, dat_col_dict, 'dat')
num_features = build_station_features(train_num, num_col_dict, 'num')
print('finish feature engineering date station using {} minutes'.format(round((time.time() - start_time)/60, 2)))

finish feature engineering date station using 0.13 minutes


In [72]:
#print features.shape
#features.head()

In [6]:
#### numerical feature engineering work
# NumericalFeatureEngineering is a project helper (utils.feature_engineering);
# the next cells treat its result as a DataFrame (fillna, column-wise concat).
# NOTE(review): train_num still holds the 'Response' column at this point (the
# recorded shapes are 969 vs 968 columns) — confirm the helper does not fold
# the label into the features it derives.
train_num_Basics = NumericalFeatureEngineering(train_num)
test_num_Basics = NumericalFeatureEngineering(test_num)

In [7]:
# Sentinel written into every missing cell of the numeric frames, both the
# derived "Basics" features and the raw tables. Each frame is filled in place.
missing_value = -1.5
for numeric_frame in (train_num_Basics, test_num_Basics, train_num, test_num):
    numeric_frame.fillna(missing_value, inplace=True)

In [8]:
# Append the derived numeric features to the raw numeric columns, side by side
# (axis=1). The recorded shapes differ by one column between train and test,
# matching the extra 'Response' column in train_num.
combined_train_num = pd.concat([train_num, train_num_Basics], axis=1)
combined_test_num  = pd.concat([test_num, test_num_Basics], axis=1)
print 'combined train numerical feature shape: {}, combined test numerical features shape: {}'.format(combined_train_num.shape, combined_test_num.shape)

combined train numerical feature shape: (50000, 978), combined test numerical features shape: (50000, 977)


In [9]:
#print combined_test_num.isnull().sum().sum(), combined_train_num.isnull().sum().sum()

In [10]:
#train_num.min().hist(bins=50)

In [11]:
#train_num.max().hist(bins=50)

In [12]:
### section of date features

## basic features from the date frames (train_dat / test_dat)
# BasicDate_FeatureEngineering is a project helper (utils.feature_engineering);
# a later cell looks for a 'start_time' column in its result.
train_dat_Basics = BasicDate_FeatureEngineering(train_dat)
test_dat_Basics  = BasicDate_FeatureEngineering(test_dat)

In [13]:
## normalized date columns
# getRelativeTimeColumns (project helper) is applied to each row (axis=1).
# Presumably it re-expresses every timestamp relative to the row's own start
# time — confirm in utils.feature_engineering.
train_dat_Norm = train_dat.apply(getRelativeTimeColumns, axis=1)
test_dat_Norm  = test_dat.apply(getRelativeTimeColumns, axis=1)
## remove single-valued columns
# The recorded output shows the column count dropping from 1150 to 952.
remove_single_value_columns(train_dat_Norm, test=test_dat_Norm)

raw train data dimension:  (50000, 1150)
raw test data dimension:  (50000, 1150)
processed train data dimension:  (50000, 952)
processed test data dimension:  (50000, 952)


In [14]:
# Label encoder for features whose values are date-column names. Its vocabulary
# is every column of train_dat plus the literal 'NaN' placeholder that later
# cells substitute for missing names.
column_names = train_dat.columns.tolist() + ['NaN']
encoder = preprocessing.LabelEncoder()
encoder.fit(column_names)

LabelEncoder()

In [15]:
## TimeDiff features
# getTimeChangeColumns (project helper) is applied per row; its seven outputs
# are given the names below.
train_dat_TimeDiff = train_dat.apply(getTimeChangeColumns, axis=1)
test_dat_TimeDiff  = test_dat.apply(getTimeChangeColumns, axis=1)
TimeDiff_ColumnNames = ['time_diff_start_col', 'time_diff_end_col', 'time_diff_value',
                        'time_ratio_value', 'first_time_value', 'last_time_value', 'first_date_value']
train_dat_TimeDiff.columns = TimeDiff_ColumnNames
test_dat_TimeDiff.columns = TimeDiff_ColumnNames

# Turn the two column-name features into integer codes. Missing names become
# the 'NaN' placeholder the encoder was fitted with. The filled column is
# assigned back explicitly: `frame[col].fillna(..., inplace=True)` mutates a
# column selection and is not guaranteed to reach the parent frame.
for timediff_frame in (train_dat_TimeDiff, test_dat_TimeDiff):
    for column in ['time_diff_start_col', 'time_diff_end_col']:
        timediff_frame[column] = encoder.transform(timediff_frame[column].fillna('NaN'))


In [16]:
start_time = time.time()

## section to create timeStep features

# Four features are named per step; `first_column_step_<i>` holds a column
# name and is label-encoded below.
unique_value_counts = 6
timeStep_columnNames = []
column_name_columns = []
for i in range(unique_value_counts):
    timeStep_columnNames.extend(['time_diff_step_{}'.format(i), 'column_counts_step_{}'.format(i),
                                 'time_cost_step_{}'.format(i), 'first_column_step_{}'.format(i)])
    column_name_columns.append('first_column_step_{}'.format(i))

# getTimeSteps (project helper) is applied per row of the normalized dates.
train_dat_TimeStep = train_dat_Norm.apply(getTimeSteps, axis=1)
test_dat_TimeStep  = test_dat_Norm.apply(getTimeSteps, axis=1)
train_dat_TimeStep.columns = timeStep_columnNames
test_dat_TimeStep.columns  = timeStep_columnNames

# Missing names become the 'NaN' placeholder before encoding. The filled column
# is assigned back explicitly: `frame[col].fillna(..., inplace=True)` mutates a
# column selection and is not guaranteed to reach the parent frame.
for timestep_frame in (train_dat_TimeStep, test_dat_TimeStep):
    for column in column_name_columns:
        timestep_frame[column] = encoder.transform(timestep_frame[column].fillna('NaN'))


print('finish generating TimeStep features using {} seconds'.format(round(time.time() - start_time, 0)))


finish generating TimeStep features using 672.0 seconds


In [17]:
#print train_dat_TimeDiff.min()
#print train_dat_TimeDiff.isnull().sum()

In [18]:
#train_dat_Basics.head()

In [19]:
# Fill missing normalized timestamps with -1, in place.
train_dat_Norm.fillna(-1., inplace=True)
test_dat_Norm.fillna(-1., inplace=True)

In [20]:
# Fill missing basic date features with -1, in place.
train_dat_Basics.fillna(-1., inplace=True)
test_dat_Basics.fillna(-1., inplace=True)

In [21]:
# Fill the remaining missing TimeStep values with 0, in place (the name
# columns were already label-encoded in an earlier cell).
train_dat_TimeStep.fillna(0, inplace=True)
test_dat_TimeStep.fillna(0, inplace=True)

In [22]:
# Fill the remaining missing TimeDiff values with 0, in place.
train_dat_TimeDiff.fillna(0, inplace=True)
test_dat_TimeDiff.fillna(0, inplace=True)

In [23]:
#expected_columns = ['first_time_value', 'last_time_value', 'time_ratio_value', 'first_date_value']

In [24]:
# Work on copies so the TimeDiff frames themselves are not altered when the
# extra column is added.
tmp_train_dat = train_dat_TimeDiff.copy()
tmp_test_dat = test_dat_TimeDiff.copy()
# `in` on a DataFrame tests column labels: carry 'start_time' over only when
# the basic date features provide it.
if 'start_time' in train_dat_Basics:
    tmp_train_dat['start_time'] = train_dat_Basics['start_time']
    tmp_test_dat['start_time']  = test_dat_Basics['start_time']
    
# build_IndexFeatures is a project helper taking both frames; its result is
# later indexed by both train and test Ids, so it presumably covers the union
# of the two — confirm in utils.feature_engineering.
train_test_datIndex_features = build_IndexFeatures(tmp_train_dat, tmp_test_dat)

In [25]:
# Join all date-derived feature frames side by side (axis=1) for train and test.
start_time = time.time()
combined_train_dat = pd.concat([train_dat_Norm, train_dat_Basics, train_dat_TimeDiff, train_dat_TimeStep], axis=1)
combined_test_dat  = pd.concat([test_dat_Norm, test_dat_Basics, test_dat_TimeDiff, test_dat_TimeStep], axis=1)
print 'finish feature engineering date using {} minutes'.format(round((time.time() - start_time)/60, 2))

finish feature engineering date using 0.01 minutes


In [26]:
# Sanity check: train and test date features should have the same column count.
print combined_train_dat.shape, combined_test_dat.shape
combined_train_dat.head()

(50000, 993) (50000, 993)


Unnamed: 0_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,time_cost_step_3,first_column_step_3,time_diff_step_4,column_counts_step_4,time_cost_step_4,first_column_step_4,time_diff_step_5,column_counts_step_5,time_cost_step_5,first_column_step_5
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.2575,1013,5.04,15,0.336,1018,5.05,14,0.360714,1033
6,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,1150,0.0,0,0.0,1150,0.0,0,0.0,1150
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.083971,945,5.72,29,0.197241,1018,0.0,0,0.0,1150
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0725,945,4.94,10,0.494,1018,4.95,5,0.99,1028
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.224667,1018,3.38,14,0.241429,1041,0.0,0,0.0,1150


In [27]:
# Final feature matrices: numeric features, date features and the index-based
# features for the matching Ids, joined column-wise.
# `.loc` replaces the deprecated `.ix` (removed in pandas 1.0). On the integer
# `Id` index `.ix` was already label-based, so the selected rows are the same.
combined_train = pd.concat([combined_train_num, combined_train_dat, train_test_datIndex_features.loc[combined_train_num.index, :]], axis=1)
combined_test  = pd.concat([combined_test_num,  combined_test_dat,  train_test_datIndex_features.loc[combined_test_num.index, :]], axis=1)

#combined_train.to_csv('bosch_FE_raw_train_num_dat_data.csv') 

In [28]:
# Inspect the assembled test matrix (shape and first rows).
print combined_test.shape
combined_test.head()

(50000, 1993)


Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,...,first_date_value_index_ratio_1,first_date_value_index_ratio_2,time_ratio_value_index_diff_0,time_ratio_value_index_diff_1,first_time_value_index_diff_0,first_time_value_index_diff_1,last_time_value_index_diff_0,last_time_value_index_diff_1,first_date_value_index_diff_0,first_date_value_index_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,...,71856.0,41239.0,-60087,-3232,-18352,-3232,-76879,-3232,9999999,-1
2,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,...,34967.5,9818.5,-67274,-62612,-72870,-5943,-46601,-61348,1,-1
3,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,...,8798.0,21954.666667,-81490,-24124,-31865,-17769,-97025,-20967,1,-3
5,-0.016,-0.026,-0.033,-0.016,0.205,-0.157,0.0,0.008,0.087,0.077,...,13784.6,2666.8,-88702,-95353,-74259,-13304,-77946,-4697,-74259,-33948
8,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,...,2229.875,8185.0,-90231,-31470,-55325,-4312,-74511,-4312,2,-6


In [29]:
# Per-column count of missing values left in the training matrix.
combined_train.isnull().sum()

L0_S0_F0                          0
L0_S0_F2                          0
L0_S0_F4                          0
L0_S0_F6                          0
L0_S0_F8                          0
L0_S0_F10                         0
L0_S0_F12                         0
L0_S0_F14                         0
L0_S0_F16                         0
L0_S0_F18                         0
L0_S0_F20                         0
L0_S0_F22                         0
L0_S1_F24                         0
L0_S1_F28                         0
L0_S2_F32                         0
L0_S2_F36                         0
L0_S2_F40                         0
L0_S2_F44                         0
L0_S2_F48                         0
L0_S2_F52                         0
L0_S2_F56                         0
L0_S2_F60                         0
L0_S2_F64                         0
L0_S3_F68                         0
L0_S3_F72                         0
L0_S3_F76                         0
L0_S3_F80                         0
L0_S3_F84                   

In [30]:
# Inspect the assembled training matrix; it has one more column than the test
# matrix in the recorded output (1994 vs 1993).
print combined_train.shape
combined_train.head()

(50000, 1994)


Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,...,first_date_value_index_ratio_1,first_date_value_index_ratio_2,time_ratio_value_index_diff_0,time_ratio_value_index_diff_1,first_time_value_index_diff_0,first_time_value_index_diff_1,last_time_value_index_diff_0,last_time_value_index_diff_1,first_date_value_index_diff_0,first_date_value_index_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.03,-0.034,-0.197,-0.179,0.118,0.116,-0.015,-0.032,0.02,0.083,...,10158.75,14870.0,-68076,-67669,-92464,-47255,-89751,-3482,-93208,-47255
6,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,-1.5,...,10528.333333,14795.666667,-93062,-79568,-60941,-67530,-46827,-58861,3,-2
7,0.088,0.086,0.003,-0.052,0.161,0.025,-0.015,-0.072,-0.225,-0.147,...,14052.285714,5805.0,-67143,-38624,-90452,-26876,-64873,-13283,-71530,-26876
9,-0.036,-0.064,0.294,0.33,0.074,0.161,0.022,0.128,-0.026,-0.046,...,4686.777778,3931.0,-61853,-3564,-98445,-3564,-83397,-3564,-98445,-3564
11,-0.055,-0.086,0.294,0.33,0.118,0.025,0.03,0.168,-0.169,-0.099,...,7406.181818,3694.0,-61721,-13963,-32921,-1,-62044,-13963,-21032,-1


In [31]:
############## Section of regular validation #######################
# create_validation_index is a project helper; 0.3 is presumably the hold-out
# fraction — confirm in utils.validation_tools.
train_index, valid_index = create_validation_index(combined_train, 0.3, dep_var_name, True)
# `.loc` replaces the deprecated `.ix` (removed in pandas 1.0). On the integer
# `Id` index `.ix` was already label-based, so the selected rows are the same.
valid_data = combined_train.loc[valid_index]
tmp_train  = combined_train.loc[train_index]

# Split label and features for the training part...
y = tmp_train[dep_var_name].values
X = tmp_train.drop(dep_var_name, axis=1)

# ...and for the hold-out part.
valid_y = valid_data[dep_var_name].values
valid_X = valid_data.drop(dep_var_name, axis=1)

In [32]:
# Settings handed to the project's xgboost wrapper in the next statement group.
params = {
    "eta": 0.0075,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "num_round": 251,
    "max_depth": 5,
    "gamma": 0,
    "metrics": 'auc',
    'eval_metric': 'auc',
    "seed": 999,
    'verbose_eval': 50,
    ## whether to use weights
    'use_base_score': True,
    'use_weights': True,
    #'use_scale_pos_weight': True,
    "val": False,
}

# Project wrapper around xgboost (utils.wrapped_xgboost). It is fitted on the
# training part including the label column, whose name is passed separately.
# The recorded log reports the model being saved into `model_file`.
model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='bosch_xgb_model')
model.fit(tmp_train, dep_var_name)

scale_pos_weight: 183.215789474
a base_score 0.00542841633096 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.906669
[50]	train-auc:0.973255
[100]	train-auc:0.977964
[150]	train-auc:0.981893
[200]	train-auc:0.984569
[250]	train-auc:0.986377
the xgboost fit is finished by using 185.292252064 seconds, saved into bosch_xgb_model


<utils.wrapped_xgboost.xgboost_classifier at 0x7fc54ccda050>

In [33]:
# Score the xgboost model on the hold-out features.
pred = model.predict(valid_X)

# Two project scoring helpers. Per the recorded output, score_MCC reports the
# threshold it used and a single score, and CombinedModel.mcc_eval_func returns
# a pair of numbers — presumably (best MCC, threshold); confirm in utils.
print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
print '\n \n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)

in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.005400360024
threshold for preds: 0.141219810168
0.366951681538

 

result from using flexsible threshold: (0.3922567473950953, 0.1621287763118744)


In [34]:
# Settings for the two tree ensembles fitted on the training part (X, y).
rf_params = {'random_state' : 9999, 'n_estimators' : 2000, 'max_depth' : 7, 'criterion' : 'gini', 'n_jobs' : -1}
et_params = {'random_state' : 9999, 'n_estimators' : 200, 'max_depth' : 18, 'criterion' : 'gini', 'n_jobs' : -1}
rf_clf = RandomForestClassifier(**rf_params)
rf_clf = rf_clf.fit(X, y)

# The extra-trees model must use ExtraTreesClassifier (imported at the top of
# the notebook); this line previously built a second RandomForestClassifier
# from `et_params`.
et_clf = ExtraTreesClassifier(**et_params)
et_clf = et_clf.fit(X, y)

In [35]:
# Total number of missing values in the hold-out features (0 in the recorded run).
valid_X.isnull().sum().sum()

0

In [35]:
#rf_pred = rf_clf.predict_proba(valid_X)[:, 1]
et_pred = et_clf.predict_proba(valid_X)[:, 1]

print 'result from using constant fraction: \n', score_MCC(valid_y, rf_pred), score_MCC(valid_y, et_pred)
print '\n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, rf_pred), CombinedModel.mcc_eval_func(valid_y, et_pred)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').