In [4]:
%matplotlib inline
import random
import os, sys, time
import pandas as pd
import numpy as np
from os.path import join
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import preprocessing
sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils.bosch_functions as bosch_functions
from utils.wrapped_xgboost import xgboost_classifier
from utils.validation_tools import score_MCC, MCC, create_validation_index
from utils.models import CombinedModel
from utils.data_munge import remove_single_value_columns

data_path = '/home/ymm/bosch/'

train_num_file   = 'train_numeric.csv'
train_cat_file   = 'train_categorical.csv'
train_date_file  = 'train_date.csv'
test_num_file    = 'test_numeric.csv'
test_cat_file    = 'test_categorical.csv'
test_date_file   = 'test_date.csv'

sample_submission_file   = 'sample_submission.csv'


start_time_column_name = 'L0_S0_D1'
id_column_name = 'Id'
dep_var_name = 'Response'
nan_fill_value = -2.


In [2]:
bin_num = 1 ## number of bins to separate data by start_time
tmp_train, tmp_test, bins, bin_names = bosch_functions.create_grouped_index_df(bin_num)

data loading takes  57.7  seconds.


In [5]:
nrows = 50000
sampled_index = random.sample(tmp_train.index.tolist(), nrows)
skipped_train_row_num = tmp_train.loc[~tmp_train.index.isin(sampled_index), 'row_num'].tolist()
print tmp_train.shape[0] - len(skipped_train_row_num)

50000


In [6]:
start_time = time.time()
train_num = pd.read_csv(join(data_path, train_num_file),  skiprows=skipped_train_row_num,  index_col='Id')
train_dat = pd.read_csv(join(data_path, train_date_file), skiprows=skipped_train_row_num,  index_col='Id')
print 'finish loading date using {} seconds'.format(round(time.time() - start_time, 0))
print train_num.shape, train_dat.shape

finish loading date using 26.0 seconds
(50000, 969) (50000, 1156)


In [7]:
tmp_train_num = train_num.copy()
tmp_train_dat = train_dat.copy()

In [39]:
tmp_train_num.head()

Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70,0.062,0.071,-0.179,-0.216,0.161,0.025,-0.022,-0.112,-0.174,-0.094,...,,,,,,0.0,0.0,0.0,0.0,0
87,0.016,0.041,0.33,0.312,0.031,0.116,0.015,0.088,0.214,0.179,...,,,,,,,,,,0
107,,,,,,,,,,,...,,,,,,,,,,0
127,,,,,,,,,,,...,,,,,,,,,,0
137,,,,,,,,,,,...,,,,,,,,,,0


In [38]:
tmp_train_dat.head()

Unnamed: 0_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70,187.57,187.57,187.57,187.57,187.57,187.57,187.57,187.57,187.57,187.57,...,,,,,,188.97,188.97,188.97,188.97,188.97
87,1213.34,1213.34,1213.34,1213.34,1213.34,1213.34,1213.34,1213.34,1213.34,1213.34,...,,,,,,,,,,
107,,,,,,,,,,,...,,,,,,,,,,
127,,,,,,,,,,,...,,,,,,,,,,
137,,,,,,,,,,,...,,,,,,,,,,


In [16]:
def BasicDate_FeatureEngineering(tmp_train_dat):
    ## feature engineering on the date features
    encoder = preprocessing.LabelEncoder()
    column_names = tmp_train_dat.columns.tolist()
    column_names.append('NaN')
    encoder.fit(column_names)
    dat_new_fea = pd.DataFrame()
    
    if 'L0_S0_D1' in tmp_train_dat.columns:
        dat_new_fea['start_time'] = tmp_train_dat['L0_S0_D1']
        
    dat_new_fea['time_sum'] = tmp_train_dat.sum(axis=1)
    dat_new_fea['time_mean'] = tmp_train_dat.mean(axis=1)
    dat_new_fea['dat_nan_count'] = tmp_train_dat.isnull().sum(axis=1)
    dat_new_fea['max_time'] = tmp_train_dat.max(axis=1)
    dat_new_fea['min_time'] = tmp_train_dat.min(axis=1)
    dat_new_fea['dat_max_min_diff'] = dat_new_fea['max_time'] - dat_new_fea['min_time']
    dat_new_fea['dat_max_min_ratio'] = dat_new_fea['min_time'] / dat_new_fea['max_time']

    dat_new_fea['dat_idxmax'] = tmp_train_dat.idxmax(axis=1)
    dat_new_fea['dat_idxmax'].fillna('NaN', inplace=True)
    dat_new_fea['dat_idxmax'] = encoder.transform(dat_new_fea['dat_idxmax'])
    dat_new_fea['dat_idxmin'] = tmp_train_dat.idxmin(axis=1)
    dat_new_fea['dat_idxmin'].fillna('NaN', inplace=True)
    dat_new_fea['dat_idxmin'] = encoder.transform(dat_new_fea['dat_idxmin'])
    return dat_new_fea


def getRelativeTimeColumns(series):
    '''
    normalize the time features by
    the start_time, the first none-NaN
    value
    '''
    if series[0] == np.NaN:
        start_time = series.dropna().index[0]
    else:
        start_time = series[0]
    new_series = series - start_time
    return new_series
   


def getTimeSteps(series, unique_value_counts = 10):
    '''
    in each row/series, use the sorted value_count
    to find the time steps and use the value, counts
    and column_index as features
    '''
    value_counts = series.value_counts()
    value_counts.sort_index(inplace=True)

    if 0. in value_counts.index:
        value_counts = value_counts[value_counts.index != 0.]
        
    available_counts = value_counts.shape[0]
    feature_array = []
    for i in xrange(unique_value_counts):
        if i < available_counts:
            date_value = value_counts.index[i]
            counts = value_counts[date_value]
            first_index = series[series == date_value].index[0]
            avg_time_cost = date_value / counts
            feature = [date_value, counts, avg_time_cost, first_index]
        else:
            feature = [np.NaN, 0, -1., 'NaN']
        feature_array.extend(feature)

    return pd.Series(feature_array)



def getTimeChangeColumns(series):
    start_time = series[0]
    tmp_series = series.dropna()
    if start_time == np.NaN:
        first_index     = tmp_series.index[0]
        last_index      = tmp_series.index[-1]
        first_id_value  = tmp_series[first_index]
        last_id_value   = tmp_series[last_index]
        first_num_value = first_id_value
        time_diff       = last_id_value - first_id_value
        time_ratio      = last_id_value / first_id_value
        return pd.Series([first_index, last_index, time_diff, time_ratio, 
                          first_id_value, last_id_value, first_num_value])
    else:
        first_num_value = start_time
        if np.sum(tmp_series != start_time) == 0:
            return pd.Series(['NaN', 'NaN', np.NaN, np.NaN, np.NaN, np.NaN, first_num_value])
        else:
            first_index     = tmp_series.index[tmp_series != start_time][0]
            last_index      = tmp_series.index[tmp_series != start_time][-1]
            first_id_value  = series[first_index]
            last_id_value   = series[last_index]
            first_id_value  = tmp_series[first_index]
            last_id_value   = tmp_series[last_index]
            time_diff       = last_id_value - first_id_value
            time_ratio      = last_id_value / first_id_value

            return pd.Series([first_index, last_index, time_diff, time_ratio,
                              first_id_value, last_id_value, first_num_value])


   

def NumericalFeatureEngineering(df, col_ignore = ['Response']):
    tmp_df = df.loc[:, ~df.columns.isin(col_ignore)]
    new_fea_df = pd.DataFrame()
    encoder = preprocessing.LabelEncoder()
    column_names = tmp_df.columns.tolist()
    column_names.append('NaN')
    encoder.fit(column_names)
    
    new_fea_df['num_mean'] = tmp_df.mean(axis=1)
    
    new_fea_df['num_sum'] = tmp_df.sum(axis=1)
    #num_sum_max = new_fea_df['num_sum'].max()
    #new_fea_df['num_sum'].fillna(1.*int(num_sum_max - 5), inplace=True)
    new_fea_df['num_max'] = tmp_df.max(axis=1)
    #num_max_max = new_fea_df['num_max'].max()
    #new_fea_df['num_max'].fillna(1.*int(num_max_max + 1), inplace=True)
    new_fea_df['num_min'] = tmp_df.min(axis=1)
    #num_min_min = new_fea_df['num_min'].min()
    #new_fea_df['num_min'].fillna(1.*int(num_min_min - 1.), inplace=True)
    new_fea_df['num_max_min_ratio'] = new_fea_df['num_min'] / new_fea_df['num_max']
    new_fea_df['num_max_min_ratio'] = new_fea_df['num_max_min_ratio'].replace([np.inf, -np.inf], np.NaN)
    new_fea_df['num_nan_col_count'] = tmp_df.isnull().sum(axis=1)
    new_fea_df['num_reg_col_count'] = tmp_df.shape[1] - tmp_df.isnull().sum(axis=1)
    new_fea_df['num_idxmax'] = tmp_df.idxmax(axis=1)
    new_fea_df['num_idxmax'].fillna('NaN', inplace=True)
    new_fea_df['num_idxmax'] = encoder.transform(new_fea_df['num_idxmax'])
    new_fea_df['num_idxmin'] = tmp_df.idxmin(axis=1)
    new_fea_df['num_idxmin'].fillna('NaN', inplace=True)
    new_fea_df['num_idxmin'] = encoder.transform(new_fea_df['num_idxmin'])
    return new_fea_df


In [9]:
tmp_train_num_Basics = NumericalFeatureEngineering(tmp_train_num)

In [10]:
combined_train_num = pd.concat([tmp_train_num, tmp_train_num_Basics], axis=1)

In [17]:
start_time = time.time()

## normalized date columns
tmp_train_dat_Norm = tmp_train_dat.apply(getRelativeTimeColumns, axis=1)
## basic features from tmp_train_dat
tmp_train_dat_Basics = BasicDate_FeatureEngineering(tmp_train_dat)

encoder = preprocessing.LabelEncoder()
column_names = tmp_train_dat.columns.tolist()
column_names.append('NaN')
encoder.fit(column_names)
    
#'''
tmp_train_dat_TimeDiff = tmp_train_dat.apply(getTimeChangeColumns, axis=1)
tmp_train_dat_TimeDiff.columns = ['time_diff_start_col', 'time_diff_end_col', 'time_diff_value', 
                                  'time_ratio_value', 'first_time_value', 'last_time_value', 'first_date_value']
                   
for column in ['time_diff_start_col', 'time_diff_end_col']:
    tmp_train_dat_TimeDiff[column].fillna('NaN', inplace=True)
    tmp_train_dat_TimeDiff[column] = encoder.transform(tmp_train_dat_TimeDiff[column])  
    
#'''


## section to create timeStep features
unique_value_counts = 10
timeStep_columnNames = []
column_name_columns = []
for i in xrange(unique_value_counts):
    timeStep_columnNames.extend(['time_diff_step_{}'.format(i), 'column_counts_step_{}'.format(i), 
                                 'time_cost_step_{}'.format(i), 'first_column_step_{}'.format(i)])
    column_name_columns.append('first_column_step_{}'.format(i))

tmp_train_dat_TimeStep = tmp_train_dat_Norm.apply(getTimeSteps, axis=1)
tmp_train_dat_TimeStep.columns = timeStep_columnNames
for column in column_name_columns:
    tmp_train_dat_TimeStep[column].fillna('NaN', inplace=True)
    tmp_train_dat_TimeStep[column] = encoder.transform(tmp_train_dat_TimeStep[column])
    

print 'finish feature engineering date using {} minutes'.format(round((time.time() - start_time)/60, 2))

finish feature engineering date using 3.89 minutes


In [41]:
combined_train_dat = pd.concat([tmp_train_dat, tmp_train_dat_Norm, tmp_train_dat_Basics, tmp_train_dat_TimeDiff, tmp_train_dat_TimeStep], axis=1)

In [24]:
def build_IndexFeatures(combined_train_dat):
    dat_new_fea = pd.DataFrame()
    dat_new_fea['first_time_index']  = combined_train_dat['first_time_value'].argsort() + 1
    dat_new_fea['last_time_index']   = combined_train_dat['last_time_value'].argsort() + 1
    dat_new_fea['index_ratio']       = dat_new_fea['first_time_index'] / dat_new_fea['last_time_index']
    dat_new_fea['index']             = combined_train_dat.index
    dat_new_fea['sequence']          = range(dat_new_fea.shape[0])

    if 'start_time' in combined_train_dat.columns:
        dat_new_fea['start_time_diff']          = combined_train_dat['start_time'].diff()
        dat_new_fea['start_time_index']         = combined_train_dat['start_time'].argsort() + 1
        dat_new_fea['start_time_index_ratio_1'] = dat_new_fea['first_time_index'] / dat_new_fea['index']
        dat_new_fea['start_time_index_ratio_2'] = dat_new_fea['last_time_index'] / dat_new_fea['index']
    
    dat_new_fea['time_ratio_value_index']    = combined_train_dat['time_ratio_value'].argsort() + 1
    dat_new_fea['first_time_value_index']    = combined_train_dat['first_time_value'].argsort() + 1
    dat_new_fea['first_date_value_index']    = combined_train_dat['first_date_value'].argsort() + 1
    dat_new_fea['first_date_value_index_ratio_1'] = dat_new_fea['first_time_index'] / dat_new_fea['index']
    dat_new_fea['first_date_value_index_ratio_2'] = dat_new_fea['last_time_index'] / dat_new_fea['index']
    return dat_new_fea

In [25]:
dat_index_features = build_IndexFeatures(combined_train_dat)

In [26]:
dat_index_features.head()

Unnamed: 0_level_0,first_time_index,last_time_index,index_ratio,index,sequence,start_time_diff,start_time_index,start_time_index_ratio_1,start_time_index_ratio_2,time_ratio_value_index,first_time_value_index,first_date_value_index,first_date_value_index_ratio_1,first_date_value_index_ratio_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
70,5059,62,81.596774,70,0,,2961,72.271429,0.885714,8727,5059,2961,72.271429,0.885714
87,8114,49284,0.164638,87,1,1025.77,4726,93.264368,566.482759,9956,8114,4726,93.264368,566.482759
107,17402,11125,1.564225,107,2,,0,162.635514,103.971963,11306,17402,0,162.635514,103.971963
127,7073,6841,1.033913,127,3,,0,55.692913,53.866142,17088,7073,0,55.692913,53.866142
137,18951,12569,1.507757,137,4,,0,138.328467,91.744526,33556,18951,0,138.328467,91.744526


In [42]:
remove_single_value_columns(combined_train_num)
remove_single_value_columns(combined_train_dat)

raw train data dimension:  (50000, 978)
processed train data dimension:  (50000, 978)
raw train data dimension:  (50000, 2369)


AttributeError: 'DataFrame' object has no attribute 'unique'

In [43]:
combined_train = pd.concat([combined_train_num, combined_train_dat, dat_index_features], axis=1)

In [44]:
############## Section of regular validation #######################
train_index, valid_index = create_validation_index(combined_train, 0.3, dep_var_name, True)
valid_data = combined_train.ix[valid_index]
tmp_train  = combined_train.ix[train_index]

y = tmp_train[dep_var_name].values
X = tmp_train.drop(dep_var_name, axis=1)

valid_y = valid_data[dep_var_name].values
valid_X = valid_data.drop(dep_var_name, axis=1)

In [45]:
params = {}
params["eta"]                      = 0.0075
params["subsample"]                = 0.8
params["colsample_bytree"]         = 0.8
params["num_round"]                = 601
params["max_depth"]                = 5
params["gamma"]                    = 0
params["metrics"]                  = 'auc'
params['eval_metric']              = 'auc'
params["seed"]                     = 999
params['verbose_eval']             = 50
## whether to use weights
params['use_base_score']           = True
params['use_weights']              = True
#params['use_scale_pos_weight']     = True
params["val"]                      = False

model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='bosch_xgb_model')
model.fit(tmp_train, dep_var_name)

scale_pos_weight: 193.45
a base_score 0.00514271020828 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.752936
[50]	train-auc:0.90694
[100]	train-auc:0.932577
[150]	train-auc:0.959672
[200]	train-auc:0.974456
[250]	train-auc:0.981354
[300]	train-auc:0.9861
[350]	train-auc:0.990073
[400]	train-auc:0.992706
[450]	train-auc:0.994208
[500]	train-auc:0.9957
[550]	train-auc:0.996612
[600]	train-auc:0.997366
the xgboost fit is finished by using 201.829978943 seconds, saved into bosch_xgb_model


<utils.wrapped_xgboost.xgboost_classifier at 0x7fb5439ab750>

In [46]:
pred = model.predict(valid_X)

print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
print '\n \n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)

in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00506700446696
threshold for preds: 0.0253322910408
0.180055865725

 

result from using flexsible threshold: (0.20865397305709307, 0.11240047961473465)


In [47]:
fea_imp = pd.read_csv('/home/ymm/Notebooks/ipyhon_notebooks/bosch_xgb_model_feature_importance.csv', index_col=0)

In [48]:
fea_imp[fea_imp['feature']=='start_time']

Unnamed: 0,feature,fscore,norm_fscore


In [49]:
fea_imp[fea_imp['feature']=='sequence']

Unnamed: 0,feature,fscore,norm_fscore
302,sequence,11,0.001315


In [50]:
fea_imp.sort('fscore').tail(200)

  if __name__ == '__main__':


Unnamed: 0,feature,fscore,norm_fscore
286,L3_S29_F3424,10,0.001195
305,first_date_value_index_ratio_1,11,0.001315
304,L2_S26_F3069,11,0.001315
303,L0_S23_F619,11,0.001315
302,sequence,11,0.001315
301,L0_S11_F282,11,0.001315
295,L0_S12_F330,11,0.001315
299,L3_S29_D3477,11,0.001315
298,L2_S27_F3155,11,0.001315
297,L2_S26_F3047,11,0.001315
