In [1]:
import os, sys, time, random
import pandas as pd
import numpy as np
from os.path import join
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold

sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils.bosch_functions as bosch_functions
from utils.wrapped_xgboost import xgboost_classifier
from utils.validation_tools import score_MCC, MCC, create_validation_index
from utils.combined_model import CombinedModel
from utils.data_munge import remove_single_value_columns
from utils.feature_engineering import NumericalFeatureEngineering, getRelativeTimeColumns, BasicDate_FeatureEngineering
from utils.feature_engineering import getTimeChangeColumns, getTimeSteps, build_IndexFeatures, build_sortedData_indexDiff
from utils.feature_engineering import build_sortedData_rowDiff

# ---- Locations and file names of the raw competition CSVs ------------------
# NOTE: data_path is re-assigned in the loading cell below.
data_path = '/home/ymm/bosch/'

train_num_file, test_num_file = 'train_numeric.csv', 'test_numeric.csv'
train_cat_file, test_cat_file = 'train_categorical.csv', 'test_categorical.csv'
train_date_file, test_date_file = 'train_date.csv', 'test_date.csv'

sample_submission_file = 'sample_submission.csv'

# ---- Column names -----------------------------------------------------------
id_column_name = 'Id'                  # index column of every table
dep_var_name = 'Response'              # label column
start_time_column_name = 'L0_S0_D1'    # date column stored later as 'start_time'

# ---- Sampling ---------------------------------------------------------------
# Row cap passed as `nrows` to every pd.read_csv call.
data_nrows = 50000

In [2]:
# Load the processed train/test feature tables and, from the raw date files,
# the single start-time column for the same number of rows.
start_time = time.time()

# The processed tables and the raw competition files live in different
# directories; data_path is re-pointed at the processed directory here.
data_path = '/home/ymm/kaggle/bosch_data/bosch_processed_data'
raw_data_path = '/home/ymm/bosch/'

train_file_name = 'selected_combined_train.csv'
test_file_name = 'selected_combined_test.csv'

# train_date_file, test_date_file, id_column_name, start_time_column_name and
# data_nrows come from the configuration cell at the top of the notebook, so
# they are not re-declared here.  The index column uses id_column_name
# everywhere instead of mixing it with a hard-coded 'Id'.
train = pd.read_csv(join(data_path, train_file_name), index_col=id_column_name, nrows=data_nrows)
test = pd.read_csv(join(data_path, test_file_name), index_col=id_column_name, nrows=data_nrows)

# Only the Id and the start-time column are read from the raw date files.
date_columns = [id_column_name, start_time_column_name]
train_date = pd.read_csv(join(raw_data_path, train_date_file), index_col=id_column_name,
                         usecols=date_columns, nrows=data_nrows)
test_date = pd.read_csv(join(raw_data_path, test_date_file), index_col=id_column_name,
                        usecols=date_columns, nrows=data_nrows)

elapsed_minutes = round((time.time() - start_time) / 60, 2)
# Single parenthesised argument: prints the same text on Python 2 and 3.
print('it takes {} minutes to load train {} and test {}'.format(elapsed_minutes, train.shape, test.shape))

it takes 0.07 minutes to load train (50000, 161) and test (50000, 160)


In [3]:
# Show every column of the training table whose name contains 'time'.
time_like_columns = [name for name in train.columns if 'time' in name]
for name in time_like_columns:
    print(name)

time_sum
time_mean
max_time
min_time
time_diff_end_col
time_diff_value
time_ratio_value
first_time_value
last_time_value
time_ratio_value_index_diff_1
first_time_value_index_diff_0
first_time_value_index_diff_1
last_time_value_index_diff_0
last_time_value_index_diff_1


In [4]:
# Stack the train and test start-time frames row-wise (pd.concat's default axis).
train_test_date = pd.concat([train_date, test_date])

In [5]:
# Hand-picked groups of feature names (the selection criteria are not shown
# in this notebook).
single_important_features = ['L3_S30_F3754', 'L3_S30_F3759', 'L3_S33_F3857', 'L3_S33_F3859',
                             'L3_S30_F3744', 'L3_S30_F3749', 'L3_S30_F3704', 'L3_S33_F3865']

time_important_features = ['time_sum', 'time_mean', 'max_time', 'min_time',
                           'time_diff_value', 'time_ratio_value', 'first_time_value', 'last_time_value']

top_important_features = ['num_L0_mean', 'L3_S30_F3754', 'num_L3_S33_var', 'num_L3_S29_var',
                          'L3_S30_F3759', 'num_L0_S1_mean', 'num_L0_S0_mean', 'dat_L3_S36_mean',
                          'num_max_min_ratio', 'num_L3_mean', 'dat_L3_var', 'L3_S33_F3857',
                          'dat_L0_mean', 'L3_S33_F3859', 'dat_L3_S33_mean', 'L3_S30_F3744']

# Union of the three groups (top, then single, then time), de-duplicated while
# keeping first-seen order.  Going through set() leaves the order up to string
# hashing, so anything derived from iterating the list (e.g. the column order
# of generated features) could differ between runs; this form is deterministic.
combined_important_features = []
_seen_features = set()
for _feature in top_important_features + single_important_features + time_important_features:
    if _feature not in _seen_features:
        _seen_features.add(_feature)
        combined_important_features.append(_feature)

In [6]:
# Shapes of the two loaded tables on one line, then a preview of the training rows.
print('{} {}'.format(train.shape, test.shape))
train.head()

(50000, 161) (50000, 160)


Unnamed: 0_level_0,L3_S32_F3854,cat_sum,cat_mean,cat_max,cat_max_min_diff,L3_S30_D3566,L3_S33_D3856,L3_S33_D3858,time_sum,time_mean,...,num_L3_S30_mean_index_diff_0,num_L3_S30_mean_index_diff_1,num_L3_S34_mean_index_diff_0,num_L3_S34_var_index_diff_0,num_L3_S38_mean_index_diff_1,num_L3_S38_var_index_diff_1,num_L0_S8_mean_index_diff_1,num_L2_mean_index_diff_0,num_L0_var_index_diff_0,num_L0_S1_var_index_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,,,,,,,5.04,5.04,15385.01,85.949777,...,-2348093,-14866,1,1,-1,-1,-1,-403328,-2237756,-98151
6,,,,,,,,,274861.02,1315.124498,...,-2343022,-182785,1,1,-1,-1,-2,2,-1093355,-2
7,,366.756,5.821524,5.852,0.032,5.71,5.72,5.72,335984.81,1623.115024,...,-2356401,-12245,1,1,-1,-1,-3,1,-1176607,-151
9,,366.756,5.821524,5.852,0.032,4.93,4.94,4.94,238673.76,1153.013333,...,-2338730,-5371,1,1,-1,-1,-4,2,-903180,-6392
11,,366.756,5.821524,5.852,0.032,3.36,3.37,3.37,125285.21,605.24256,...,-2367438,-1,1,1,-2,-2,-1,1,-1862355,-1


#### Build difference features between adjacent instances

In [7]:
# Stack the time-based feature columns of train and test into a single frame.
# (Use combined_important_features here instead to work on the larger set.)
train_test = pd.concat([frame[time_important_features] for frame in (train, test)], axis=0)

# Add the raw start time (column assignment aligns on the Id index) and
# expose the Id itself as a regular column named 'index'.
train_test['start_time'] = train_test_date[start_time_column_name]
train_test['index'] = train_test.index

In [8]:
# Empty frame, indexed like train_test, that the helper calls below fill with
# the generated features.
new_fea = pd.DataFrame(index=train_test.index)

# Alternative generators, currently disabled:
#build_sortedData_indexDiff(train_test, new_fea, combined_important_features, index_col_name='index')
#build_sortedData_indexDiff(train_test, new_fea, time_important_features, index_col_name='index')

# Iterate over the columns train_test was actually built from.  The frame only
# holds time_important_features (plus 'start_time' and 'index'), so looping
# over combined_important_features would request columns that are not there.
# NOTE(review): build_sortedData_rowDiff lives in utils.feature_engineering;
# presumably it sorts by the given key columns and writes row-to-row
# differences of the target column into new_fea -- confirm against the helper.
for col_name in time_important_features:
    build_sortedData_rowDiff(train_test, new_fea, ['start_time'], col_name)
    build_sortedData_rowDiff(train_test, new_fea, ['start_time', 'index'], col_name)
    build_sortedData_rowDiff(train_test, new_fea, ['start_time', col_name, 'index'], 'index')

In [9]:
# Size of the generated feature table, followed by a preview of its first rows.
print(new_fea.shape)
new_fea.head()

(100000, 32)


Unnamed: 0_level_0,start_time_sorted_time_sum_diff_0,start_time_sorted_time_sum_diff_1,start_time-index_sorted_time_sum_diff_0,start_time-index_sorted_time_sum_diff_1,start_time_sorted_time_mean_diff_0,start_time_sorted_time_mean_diff_1,start_time-index_sorted_time_mean_diff_0,start_time-index_sorted_time_mean_diff_1,start_time_sorted_max_time_diff_0,start_time_sorted_max_time_diff_1,...,start_time-index_sorted_time_ratio_value_diff_0,start_time-index_sorted_time_ratio_value_diff_1,start_time_sorted_first_time_value_diff_0,start_time_sorted_first_time_value_diff_1,start_time-index_sorted_first_time_value_diff_0,start_time-index_sorted_first_time_value_diff_1,start_time_sorted_last_time_value_diff_0,start_time_sorted_last_time_value_diff_1,start_time-index_sorted_last_time_value_diff_0,start_time-index_sorted_last_time_value_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,262,3,262,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,-80631,-72852,100466,84999,-402,-348,614,552,-402,-348,...,0,0,-400,-349,641,569,-402,-348,604,545
7,-14,-6,0,-6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,-94,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,-1869,9,-1869,0,0,0,0,0,-17,...,0,0,0,0,0,0,0,-17,0,-17


#### section to combine new features with the current ones

In [23]:
# Work on copies of the loaded tables.  In this configuration the generated
# features in new_fea are NOT merged in; the disabled alternative is:
#combined_train = pd.concat([train, new_fea.ix[train.index]], axis=1)
#combined_test  = pd.concat([test,  new_fea.ix[test.index]],  axis=1)
combined_train, combined_test = train.copy(), test.copy()

In [18]:
# Shapes of the combined training table and of the generated features, then a preview.
print('{} {}'.format(combined_train.shape, new_fea.shape))
combined_train.head()

(50000, 193) (100000, 32)


Unnamed: 0_level_0,L3_S32_F3854,cat_sum,cat_mean,cat_max,cat_max_min_diff,L3_S30_D3566,L3_S33_D3856,L3_S33_D3858,time_sum,time_mean,...,start_time-index_sorted_time_ratio_value_diff_0,start_time-index_sorted_time_ratio_value_diff_1,start_time_sorted_first_time_value_diff_0,start_time_sorted_first_time_value_diff_1,start_time-index_sorted_first_time_value_diff_0,start_time-index_sorted_first_time_value_diff_1,start_time_sorted_last_time_value_diff_0,start_time_sorted_last_time_value_diff_1,start_time-index_sorted_last_time_value_diff_0,start_time-index_sorted_last_time_value_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,,,,,,,5.04,5.04,15385.01,85.949777,...,0,0,0,0,0,0,0,0,0,0
6,,,,,,,,,274861.02,1315.124498,...,0,0,-400,-349,641,569,-402,-348,604,545
7,,366.756,5.821524,5.852,0.032,5.71,5.72,5.72,335984.81,1623.115024,...,0,0,0,0,0,0,0,0,0,0
9,,366.756,5.821524,5.852,0.032,4.93,4.94,4.94,238673.76,1153.013333,...,0,0,0,0,0,0,0,0,0,0
11,,366.756,5.821524,5.852,0.032,3.36,3.37,3.37,125285.21,605.24256,...,0,0,0,0,0,0,0,-17,0,-17


In [19]:
# Shape of the combined test table, then a preview of its first rows.
print(combined_test.shape)
combined_test.head()

(50000, 192)


Unnamed: 0_level_0,L3_S32_F3854,cat_sum,cat_mean,cat_max,cat_max_min_diff,L3_S30_D3566,L3_S33_D3856,L3_S33_D3858,time_sum,time_mean,...,start_time-index_sorted_time_ratio_value_diff_0,start_time-index_sorted_time_ratio_value_diff_1,start_time_sorted_first_time_value_diff_0,start_time_sorted_first_time_value_diff_1,start_time-index_sorted_first_time_value_diff_0,start_time-index_sorted_first_time_value_diff_1,start_time_sorted_last_time_value_diff_0,start_time_sorted_last_time_value_diff_1,start_time-index_sorted_last_time_value_diff_0,start_time-index_sorted_last_time_value_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,474.324,8.03939,9.158,2.448,,,,196310.26,798.009187,...,0,0,-607,-232,-921,-232,-607,-259,-917,-259
2,,809.167,5.99383,6.809,1.478,,,,274750.19,1048.664847,...,0,0,232,353,232,353,259,348,259,348
3,,494.339,8.378627,9.653,2.186,,,,174394.09,700.377871,...,0,0,-353,-71,-353,-641,-348,-59,-348,-604
5,,567.323,6.303589,7.467,1.647,,0.82,0.82,51450.81,255.974179,...,0,0,0,0,0,0,0,0,0,0
8,,493.541,8.365102,9.653,2.186,,,,189861.42,762.495663,...,0,0,71,-667,-569,-919,59,-641,-545,-893


In [24]:
############## Section of regular validation #######################
# Draw the hold-out split in this cell so that it runs on a fresh kernel.
# With the call disabled, train_index / valid_index only existed as leftovers
# of an earlier execution, and the cell failed with a NameError otherwise.
# NOTE(review): the arguments are those of the previously commented-out call;
# 0.3 is presumably the validation fraction and the returned values are
# presumably Id labels -- confirm in utils.validation_tools.
train_index, valid_index = create_validation_index(combined_train, 0.3, dep_var_name, True)

# Label-based row selection.  `.loc` matches what `.ix` did on this integer Id
# index and still exists in pandas versions where `.ix` has been removed.
valid_data = combined_train.loc[valid_index]
tmp_train = combined_train.loc[train_index]

# Training part: label vector and feature frame.
y = tmp_train[dep_var_name].values
X = tmp_train.drop(dep_var_name, axis=1)

# Validation part: label vector and feature frame.
valid_y = valid_data[dep_var_name].values
valid_X = valid_data.drop(dep_var_name, axis=1)

In [25]:
params = {}
params["eta"]                      = 0.0075
params["subsample"]                = 0.8
params["colsample_bytree"]         = 0.5
params["num_round"]                = 401
params["max_depth"]                = 5
params["gamma"]                    = 0
params["metrics"]                  = 'auc'
params['eval_metric']              = 'auc'
params["seed"]                     = 999
params['verbose_eval']             = 50
## whether to use weights
params['use_base_score']           = True
params['use_weights']              = True
#params['use_scale_pos_weight']     = True
params["val"]                      = False

#'''
model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='test_bosch_xgb_model')
model.fit(tmp_train, dep_var_name)
pred = model.predict(valid_X)

print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
print '\n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)
#'''

scale_pos_weight: 183.215789474
a base_score 0.00542841633096 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.919468
[50]	train-auc:0.970486
[100]	train-auc:0.976002
[150]	train-auc:0.981673
[200]	train-auc:0.984947
[250]	train-auc:0.987242
[300]	train-auc:0.989457
[350]	train-auc:0.991416
[400]	train-auc:0.993004
the xgboost fit is finished by using 2.8 minutes, saved into test_bosch_xgb_model
result from using constant fraction: 
mean of groud truth: 0.005400360024
threshold for preds: 0.0973999342252
0.379364393665


result from using flexsible threshold: (0.4866438520377654, 0.1605479121208191)


In [15]:
# Read the feature-importance table for 'test_bosch_xgb_model'
# (presumably written by the wrapper during fit -- confirm in utils.wrapped_xgboost).
fea_imp_file = join('/home/ymm/Notebooks/ipyhon_notebooks', 'test_bosch_xgb_model_feature_importance.csv')
fea_imp = pd.read_csv(fea_imp_file, index_col=0)

In [16]:
# Display the last 50 rows of the feature-importance table.
fea_imp.tail(n=50)

Unnamed: 0,feature,fscore,norm_fscore
109,cat_max_min_diff,118,0.008538
110,num_L3_S33_min,122,0.008828
111,dat_L3_S34_mean_index_diff_1,124,0.008973
112,dat_L3_S35_mean_index_diff_0,125,0.009045
113,last_time_value_index_diff_1,125,0.009045
114,L3_S30_F3829,125,0.009045
115,L3_S33_D3856,126,0.009117
116,L3_S29_F3351,126,0.009117
117,dat_L3_S29_mean_index_diff_0,132,0.009551
118,num_L3_S33_max,133,0.009624


#### Cross-validate the XGBoost model

In [None]:
skf = StratifiedKFold(combined_train[dep_var_name], 4)

counter = 0
for train_index, valid_index in skf:
    valid_data = combined_train.iloc[valid_index]
    tmp_train  = combined_train.iloc[train_index]

    y = tmp_train[dep_var_name].values
    X = tmp_train.drop(dep_var_name, axis=1)

    valid_y = valid_data[dep_var_name].values
    valid_X = valid_data.drop(dep_var_name, axis=1)
    
    model_name = 'test_bosch_xgb_model_{}'.format(counter)
    counter += 1
    model = xgboost_classifier(label_name = dep_var_name, params = params, model_file = model_name)
    model.fit(tmp_train, dep_var_name)
    
    pred = model.predict(valid_X)
    print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
    print '\n'
    print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)

In [None]:
# Re-read the feature-importance table of 'test_bosch_xgb_model'.  The file
# name has no fold suffix, unlike the model files named in the loop above.
fea_imp_file = join('/home/ymm/Notebooks/ipyhon_notebooks', 'test_bosch_xgb_model_feature_importance.csv')
fea_imp = pd.read_csv(fea_imp_file, index_col=0)