In [1]:
import os, sys, time, random
import pandas as pd
import numpy as np
from os.path import join
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold

sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils.bosch_functions as bosch_functions
from utils.wrapped_xgboost import xgboost_classifier
from utils.validation_tools import score_MCC, MCC, create_validation_index
from utils.models import CombinedModel
from utils.data_munge import remove_single_value_columns
from utils.feature_engineering import NumericalFeatureEngineering, getRelativeTimeColumns, BasicDate_FeatureEngineering
from utils.feature_engineering import getTimeChangeColumns, getTimeSteps, build_IndexFeatures, build_sortedData_indexDiff

data_path = '/home/ymm/bosch/'

train_num_file   = 'train_numeric.csv'
train_cat_file   = 'train_categorical.csv'
train_date_file  = 'train_date.csv'
test_num_file    = 'test_numeric.csv'
test_cat_file    = 'test_categorical.csv'
test_date_file   = 'test_date.csv'

sample_submission_file   = 'sample_submission.csv'

start_time_column_name = 'L0_S0_D1'
id_column_name = 'Id'
dep_var_name = 'Response'

In [2]:
tot_row_num = 1183747
num_rows = 50000
skip = sorted(random.sample(xrange(1,tot_row_num + 1),tot_row_num - num_rows))

In [3]:
#'''
start_time = time.time()
## randomly select certain rows
train_num = pd.read_csv(join(data_path, train_num_file),    index_col='Id', skiprows=skip, nrows=num_rows)
train_dat = pd.read_csv(join(data_path, train_date_file),   index_col='Id', skiprows=skip, nrows=num_rows)
train_cat = pd.read_csv(join(data_path, train_cat_file),    index_col='Id', skiprows=skip, nrows=num_rows)

test_num = pd.read_csv(join(data_path, test_num_file),      index_col='Id', nrows=num_rows)
test_dat = pd.read_csv(join(data_path, test_date_file),     index_col='Id', nrows=num_rows)
test_cat = pd.read_csv(join(data_path, test_cat_file),      index_col='Id', nrows=num_rows)

print 'finish loading date using {} seconds'.format(round(time.time() - start_time, 0))
#'''

finish loading date using 81.0 seconds


In [4]:
remove_single_value_columns(train_num, 'Response', test=test_num)
remove_single_value_columns(train_dat, test=test_dat)
remove_single_value_columns(train_cat, test=test_cat)

raw train data dimension:  (50000, 969)
raw test data dimension:  (50000, 968)
processed train data dimension:  (50000, 969)
processed test data dimension:  (50000, 968)
raw train data dimension:  (50000, 1156)
raw test data dimension:  (50000, 1156)
processed train data dimension:  (50000, 1146)
processed test data dimension:  (50000, 1146)
raw train data dimension:  (50000, 2140)
raw test data dimension:  (50000, 2140)
processed train data dimension:  (50000, 1204)
processed test data dimension:  (50000, 1204)


In [5]:
dat_columns = train_dat.columns.tolist()
num_columns = train_num.columns.tolist()
num_columns.remove(dep_var_name)

def build_column_dict(columns):
    station_dict = {}
    line_dict = {}
    for col in columns:
        stationList = col.split('_')[0:2]
        stationKey = ('_').join(stationList)
        lineKey = col.split('_')[0]
        
        if lineKey not in line_dict:
            line_dict[lineKey] = [col]
        else:
            line_dict[lineKey].append(col)
                    
        if stationKey not in station_dict:
            station_dict[stationKey] = [col]
        else:
            station_dict[stationKey].append(col)
            
    return station_dict, line_dict


def build_station_features(df, col_dict, prefix='dat'):
    features = pd.DataFrame()
    for key, value in col_dict.items():
        features['{}_{}_{}'.format(prefix, key, 'mean')] = df[value].mean(axis=1)
        features['{}_{}_{}'.format(prefix, key, 'max')] = df[value].max(axis=1)
        features['{}_{}_{}'.format(prefix, key, 'min')] = df[value].min(axis=1)
        features['{}_{}_{}'.format(prefix, key, 'var')] = df[value].var(axis=1)
    return features


def build_station_index_features(train, test = None):
    selected_columns = []
    for col in train.columns:
        if 'mean' in col or 'var' in col:
            selected_columns.append(col)
            
    if test is not None:
        train_test = pd.concat([train[selected_columns], test[selected_columns]], axis=0)
    else:
        train_test = train[selected_columns]
        
    train_test['index'] = train_test.index
    new_fea = pd.DataFrame()
    ## function to build index based on the given columns
    build_sortedData_indexDiff(train_test, new_fea, selected_columns)
    
    return new_fea


dat_col_dict, dat_line_dict = build_column_dict(dat_columns)
num_col_dict, num_line_dict = build_column_dict(num_columns)

dat_col_dict.update(dat_line_dict)
num_col_dict.update(num_line_dict)

In [6]:
start_time = time.time()

train_dat_stations = build_station_features(train_dat, dat_col_dict, 'dat')
test_dat_stations = build_station_features(test_dat, dat_col_dict, 'dat')

train_num_stations = build_station_features(train_num, num_col_dict, 'num')
test_num_stations = build_station_features(test_num, num_col_dict, 'num')

num_station_index = build_station_index_features(train_num_stations, test_num_stations)
dat_station_index = build_station_index_features(train_dat_stations, test_dat_stations)

print 'finish feature engineering date station using {} minutes'.format(round((time.time() - start_time)/60, 2))

finish feature engineering date station using 0.46 minutes


In [8]:
combined_train_station_num = pd.concat([train_num_stations, num_station_index.ix[train_num_stations.index]], axis=1)
combined_train_station_dat = pd.concat([train_dat_stations, dat_station_index.ix[train_dat_stations.index]], axis=1)

combined_test_station_num = pd.concat([test_num_stations, num_station_index.ix[train_num_stations.index]], axis=1)
combined_test_station_dat = pd.concat([test_dat_stations, dat_station_index.ix[train_dat_stations.index]], axis=1)


In [16]:
remove_single_value_columns(combined_train_station_num, test=combined_test_station_num)
remove_single_value_columns(combined_train_station_dat, test=combined_test_station_dat)

raw train data dimension:  (50000, 432)
raw test data dimension:  (100000, 432)
processed train data dimension:  (50000, 431)
processed test data dimension:  (100000, 431)
raw train data dimension:  (50000, 432)
raw test data dimension:  (100000, 432)
processed train data dimension:  (50000, 431)
processed test data dimension:  (100000, 431)


In [30]:
station_fillna_value = 9999999
combined_train_station_num.fillna(station_fillna_value, inplace=True)
combined_test_station_num.fillna(station_fillna_value, inplace=True)
combined_train_station_dat.fillna(station_fillna_value, inplace=True)
combined_test_station_dat.fillna(station_fillna_value, inplace=True)

In [17]:
combined_train_station_num.head()

Unnamed: 0_level_0,num_L2_S28_mean,num_L2_S28_max,num_L2_S28_min,num_L2_S28_var,num_L3_S31_mean,num_L3_S31_max,num_L3_S31_min,num_L3_S31_var,num_L2_S26_mean,num_L2_S26_max,...,num_L1_S25_var_index_diff_0,num_L1_S25_var_index_diff_1,num_L1_S24_mean_index_diff_0,num_L1_S24_mean_index_diff_1,num_L1_S24_var_index_diff_0,num_L1_S24_var_index_diff_1,num_L3_S39_mean_index_diff_0,num_L3_S39_mean_index_diff_1,num_L3_S39_var_index_diff_0,num_L3_S39_var_index_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,,,,,,,,,,,...,1,-1,4,-1,4,-1,1,-1,1,-1
136,,,,,,,,,,,...,2,-3,2,-2,2,-2,-2367003,-2083,-2326492,-10902
229,,,,,,,,,,,...,1,-2,1,-1,1,-1,1,-1,1,-1
248,,,,,,,,,,,...,5,-3,5,-3,5,-3,5,-3,5,-3
258,,,,,,,,,-0.005571,0.244,...,-22744,-632996,5,-2,5,-2,5,-2,5,-2


In [18]:
combined_train_station_num.min(axis=0).min()

-2367003.0

In [19]:
combined_train_station_num.max(axis=0).max()

9999999.0

In [21]:
combined_train_station_dat.min(axis=0).min()

-2367235.0

In [31]:
#for col in combined_train_station_dat.columns:
    #print col, len(combined_train_station_dat[col].value_counts())

In [32]:
combined_train_station_num.isnull().sum(axis=1)

Id
9          0
136        0
229        0
248        0
258        0
269        0
293        0
421        0
464        0
471        0
505        0
558        0
565        0
744        0
754        0
755        0
767        0
955        0
970        0
1019       0
1050       0
1112       0
1191       0
1219       0
1222       0
1252       0
1304       0
1325       0
1351       0
1387       0
          ..
2366182    0
2366215    0
2366260    0
2366297    0
2366330    0
2366349    0
2366374    0
2366510    0
2366546    0
2366569    0
2366580    0
2366638    0
2366816    0
2366851    0
2366915    0
2366972    0
2366991    0
2367013    0
2367069    0
2367139    0
2367145    0
2367174    0
2367196    0
2367228    0
2367244    0
2367247    0
2367302    0
2367357    0
2367384    0
2367400    0
dtype: int64

In [35]:
print combined_train_station_num.isnull().sum().sum(), combined_train_station_dat.isnull().sum().sum()

(0, 0)

#### build categorical features

In [50]:
def BasicCat_FeatureEngineering(train_cat):
    ## feature engineering on the date features
    encoder = preprocessing.LabelEncoder()
    column_names = train_cat.columns.tolist()
    column_names.append('NaN')
    encoder.fit(column_names)
    dat_new_fea = pd.DataFrame()
    dat_new_fea['cat_sum'] = train_cat.sum(axis=1)
    dat_new_fea['cat_mean'] = train_cat.mean(axis=1)
    dat_new_fea['cat_nan_count'] = train_cat.isnull().sum(axis=1)
    dat_new_fea['cat_max'] = train_cat.max(axis=1)
    dat_new_fea['cat_min'] = train_cat.min(axis=1)
    dat_new_fea['cat_max_min_diff'] = dat_new_fea['cat_max'] - dat_new_fea['cat_min']
    dat_new_fea['cat_max_min_ratio'] = dat_new_fea['cat_min'] / dat_new_fea['cat_max']

    dat_new_fea['cat_idxmax'] = train_cat.idxmax(axis=1)
    dat_new_fea['cat_idxmax'].fillna('NaN', inplace=True)
    dat_new_fea['cat_idxmax'] = encoder.transform(dat_new_fea['cat_idxmax'])
    dat_new_fea['cat_idxmin'] = train_cat.idxmin(axis=1)
    dat_new_fea['cat_idxmin'].fillna('NaN', inplace=True)
    dat_new_fea['cat_idxmin'] = encoder.transform(dat_new_fea['cat_idxmin'])
    return dat_new_fea



def encode_categorical_by_dep_var(train, test, dep_var_column='Response'):
    for col_name in train.columns:
        if col_name == dep_var_column:
            continue
        
        train[col_name] = train[col_name].astype(str)
        test[col_name] = test[col_name].astype(str)
        dep_var_mean = train[[col_name, dep_var_column]].groupby(col_name).mean()
    
        dep_var_dict = {}
        for level in dep_var_mean.index.tolist():
            dep_var_dict[level] = dep_var_mean.ix[level, dep_var_column]
    
        train[col_name] = train[col_name].replace(dep_var_dict)  
        test[col_name] = test[col_name].replace(dep_var_dict)  

In [51]:
start_time = time.time()

train_cat['Response'] = train_num['Response']
encode_categorical_by_dep_var(train_cat, test_cat)
train_cat.drop('Response', axis=1, inplace=True)

print 'finish encoding categorical features using {} seconds'.format(round(time.time() - start_time, 0))

train_cat_Basics = BasicCat_FeatureEngineering(train_cat)
test_cat_Basics  = BasicCat_FeatureEngineering(train_cat)

print 'finish generating all categorical features using {} seconds'.format(round(time.time() - start_time, 0))


finish generating categorical features using 914.0 seconds


In [56]:
train_cat.dtypes.value_counts()

float64    1204
dtype: int64

In [57]:
print train_cat.shape, test_cat.shape
train_cat.head()

(50000, 1204) (50000, 1204)


Unnamed: 0_level_0,L0_S2_F33,L0_S2_F35,L0_S2_F37,L0_S2_F39,L0_S2_F41,L0_S2_F43,L0_S2_F45,L0_S2_F47,L0_S2_F49,L0_S2_F51,...,L3_S49_F4225,L3_S49_F4227,L3_S49_F4229,L3_S49_F4230,L3_S49_F4232,L3_S49_F4234,L3_S49_F4235,L3_S49_F4237,L3_S49_F4239,L3_S49_F4240
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,...,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624
136,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,...,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624
229,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,...,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624
248,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,...,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624
258,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,0.006241,...,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624,0.00624


In [60]:
train_cat_Basics.isnull().sum(axis=0).sum()

0

In [82]:
start_time = time.time()
combined_train_cat = pd.concat([train_cat, train_cat_Basics], axis=1)
combined_test_cat  = pd.concat([test_cat, test_cat_Basics], axis=1)                                                                                                                                                 
print 'finish feature engineering date using {} seconds'.format(round((time.time() - start_time), 2))

finish feature engineering date using 2.01 seconds


In [65]:
start_time = time.time()

#### numerical feature engineering work
train_num_Basics = NumericalFeatureEngineering(train_num)
test_num_Basics = NumericalFeatureEngineering(test_num)

missing_value= -1.5
train_num_Basics.fillna(missing_value, inplace=True)
test_num_Basics.fillna(missing_value, inplace=True)

train_num.fillna(missing_value, inplace=True)
test_num.fillna(missing_value, inplace=True)

combined_train_num = pd.concat([train_num, train_num_Basics], axis=1)
combined_test_num  = pd.concat([test_num, test_num_Basics], axis=1)
print 'finish feature engineering numercical using {} seconds'.format(round((time.time() - start_time), 2))
print 'combined train numerical feature shape: {}, combined test numerical features shape: {}'.format(combined_train_num.shape, combined_test_num.shape)

finish feature engineering numercical using 12.59 seconds
combined train numerical feature shape: (50000, 978), combined test numerical features shape: (50000, 977)


In [67]:
print combined_train_num.isnull().sum().sum(), combined_test_num.isnull().sum().sum()

0 0


In [66]:
### section of date features

## basic features from tmp_train_dat
train_dat_Basics = BasicDate_FeatureEngineering(train_dat)
test_dat_Basics  = BasicDate_FeatureEngineering(test_dat)

In [68]:
## normalized date columns
train_dat_Norm = train_dat.apply(getRelativeTimeColumns, axis=1)
test_dat_Norm  = test_dat.apply(getRelativeTimeColumns, axis=1)
## remove single-valued columns
remove_single_value_columns(train_dat_Norm, test=test_dat_Norm)

raw train data dimension:  (50000, 1146)
raw test data dimension:  (50000, 1146)
processed train data dimension:  (50000, 960)
processed test data dimension:  (50000, 960)


In [69]:
encoder = preprocessing.LabelEncoder()
column_names = train_dat.columns.tolist()
column_names.append('NaN')
encoder.fit(column_names)

LabelEncoder()

In [70]:
## TimeDiff features
train_dat_TimeDiff = train_dat.apply(getTimeChangeColumns, axis=1)
test_dat_TimeDiff  = test_dat.apply(getTimeChangeColumns, axis=1)
TimeDiff_ColumnNames = ['time_diff_start_col', 'time_diff_end_col', 'time_diff_value',
                        'time_ratio_value', 'first_time_value', 'last_time_value', 'first_date_value']

train_dat_TimeDiff.columns = TimeDiff_ColumnNames
test_dat_TimeDiff.columns = TimeDiff_ColumnNames

for column in ['time_diff_start_col', 'time_diff_end_col']:
    train_dat_TimeDiff[column].fillna('NaN', inplace=True)
    train_dat_TimeDiff[column] = encoder.transform(train_dat_TimeDiff[column])
    
    test_dat_TimeDiff[column].fillna('NaN', inplace=True)
    test_dat_TimeDiff[column] = encoder.transform(test_dat_TimeDiff[column])


In [71]:
start_time = time.time()

## section to create timeStep features

unique_value_counts = 6
timeStep_columnNames = []
column_name_columns = []
for i in xrange(unique_value_counts):
    timeStep_columnNames.extend(['time_diff_step_{}'.format(i), 'column_counts_step_{}'.format(i),
                                 'time_cost_step_{}'.format(i), 'first_column_step_{}'.format(i)])
    column_name_columns.append('first_column_step_{}'.format(i))

train_dat_TimeStep = train_dat_Norm.apply(getTimeSteps, axis=1)
test_dat_TimeStep  = test_dat_Norm.apply(getTimeSteps, axis=1)
train_dat_TimeStep.columns = timeStep_columnNames
test_dat_TimeStep.columns  = timeStep_columnNames

for column in column_name_columns:
    train_dat_TimeStep[column].fillna('NaN', inplace=True)
    test_dat_TimeStep[column].fillna('NaN', inplace=True)
    train_dat_TimeStep[column] = encoder.transform(train_dat_TimeStep[column])
    test_dat_TimeStep[column] = encoder.transform(test_dat_TimeStep[column])


print 'finish generating TimeStep features using {} seconds'.format(round(time.time() - start_time, 0))


finish generating TimeStep features using 616.0 seconds


In [72]:
print train_dat_Norm.isnull().sum().sum(), train_dat_Basics.isnull().sum().sum(), train_dat_TimeStep.isnull().sum().sum(), train_dat_TimeDiff.isnull().sum().sum()

42369176 21633 144517 21587


In [73]:
train_dat_Norm.head()

Unnamed: 0_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.38,2.38,2.38,2.38,2.38,2.38,2.38,2.38,2.38,2.38
229,,,,,,,,,,,...,,,,,,,,,,
248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
258,,,,,,,,,,,...,,,,,,,,,,


##### fill up missing values

In [98]:
train_dat_Norm.fillna(-1., inplace=True)
test_dat_Norm.fillna(-1., inplace=True)

train_dat_Basics.fillna(-1., inplace=True)
test_dat_Basics.fillna(-1., inplace=True)

train_dat_TimeStep.fillna(0, inplace=True)
test_dat_TimeStep.fillna(0, inplace=True)

train_dat_TimeDiff.fillna(0, inplace=True)
test_dat_TimeDiff.fillna(0, inplace=True)

In [74]:
'''
tmp_train_dat = train_dat_TimeDiff.copy()
tmp_test_dat = test_dat_TimeDiff.copy()

if 'start_time' in train_dat_Basics:
    tmp_train_dat['start_time'] = train_dat_Basics['start_time']
    tmp_test_dat['start_time']  = test_dat_Basics['start_time']
'''


In [99]:
start_time = time.time()
combined_train_dat = pd.concat([train_dat_Norm, train_dat_Basics, train_dat_TimeDiff, train_dat_TimeStep], axis=1)
combined_test_dat  = pd.concat([test_dat_Norm, test_dat_Basics, test_dat_TimeDiff, test_dat_TimeStep], axis=1)                                                                                                                                                 
print 'finish feature engineering date using {} minutes'.format(round((time.time() - start_time)/60, 2))

finish feature engineering date using 0.02 minutes


In [100]:
start_time = time.time()
train_test_datIndex_features = build_IndexFeatures(combined_train_dat, combined_test_dat)
print 'finish feature engineering date index using {} minutes'.format(round((time.time() - start_time)/60, 2))

finish feature engineering date index using 0.02 minutes


In [101]:
print combined_train_dat.shape, combined_test_dat.shape
combined_train_dat.head()

(50000, 1001) (50000, 1001)


Unnamed: 0_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,time_cost_step_3,first_column_step_3,time_diff_step_4,column_counts_step_4,time_cost_step_4,first_column_step_4,time_diff_step_5,column_counts_step_5,time_cost_step_5,first_column_step_5
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0725,945,4.94,10,0.494,1018,4.95,5,0.99,1028
136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.069412,1062,2.37,5,0.474,1104,2.38,30,0.079333,1109
229,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,1146,0.0,0,0.0,1146,0.0,0,0.0,1146
248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.579333,1018,8.7,14,0.621429,1033,0.0,0,0.0,1146
258,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,1146,0.0,0,0.0,1146,0.0,0,0.0,1146


#### load the feature importances from xgboost models

In [50]:
xgb_feature_imp = pd.read_csv('/home/ymm/full_data_xgb_feature_importance.csv', index_col='feature')

In [51]:
xgb_feature_imp.head()

Unnamed: 0_level_0,fscore_0,norm_fscore_0,fscore_1,norm_fscore_1,fscore_2,norm_fscore_2,fscore_3,norm_fscore_3,fscore_4,norm_fscore_4,...,fscore_8,norm_fscore_8,fscore_9,norm_fscore_9,fscore_10,norm_fscore_10,fscore_11,norm_fscore_11,fscore_sum,norm_fscore_sum
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
first_time_value_index_diff_1,1357.0,0.013342,1480.0,0.014495,773.0,0.032962,1701.0,0.011999,1198.0,0.01881,...,1723.0,0.012308,747.0,0.031607,1671.0,0.011886,1152.0,0.018246,15033.0,0.228153
L3_S33_F3857,1317.0,0.012948,1226.0,0.012007,316.0,0.013475,1797.0,0.012676,648.0,0.010175,...,1745.0,0.012465,391.0,0.016544,1874.0,0.01333,708.0,0.011214,12276.0,0.149472
last_time_value_index_diff_1,1015.0,0.009979,1059.0,0.010372,410.0,0.017483,1310.0,0.009241,840.0,0.013189,...,1488.0,0.010629,382.0,0.016163,1329.0,0.009453,743.0,0.011768,10737.0,0.145426
L3_S33_F3859,1223.0,0.012024,1161.0,0.011371,264.0,0.011258,1722.0,0.012147,744.0,0.011682,...,1556.0,0.011115,375.0,0.015867,1649.0,0.011729,786.0,0.012449,11676.0,0.144343
last_time_value_index_diff_0,1020.0,0.010028,1087.0,0.010646,350.0,0.014925,1351.0,0.00953,707.0,0.011101,...,1391.0,0.009937,310.0,0.013117,1320.0,0.009389,612.0,0.009693,10258.0,0.132143


In [97]:
## sort by the norm_fscore_sum
sorted_combined_imp = xgb_feature_imp.sort_values(by=['norm_fscore_sum'], ascending=False)
#imp_feature = sorted_combined_imp.index[sorted_combined_imp['norm_fscore_sum'] >= 0.005].tolist()
imp_feature = sorted_combined_imp.index.tolist()

In [98]:
def select_important_features(df, imp_feature, test_df = None, dep_var_name = 'Response'):
    imp_col_names = [col for col in df.columns if col in imp_feature]
    print 'total {} columns in original DataFrame, select {} columns'.format(df.shape[1], len(imp_col_names))
    train_col_names = imp_col_names[:]
    test_col_names = imp_col_names[:]
    if dep_var_name in df.columns:    
        train_col_names.append(dep_var_name)
    if test_df is None:
        return df[train_col_names]
    else:
        return df[train_col_names], test_df[test_col_names]


In [99]:
combined_train_num, combined_test_num  = select_important_features(combined_train_num, imp_feature, combined_test_num)
combined_train_dat, combined_test_dat  = select_important_features(combined_train_dat, imp_feature, combined_test_dat)
train_test_datIndex_features = select_important_features(train_test_datIndex_features, imp_feature)

total 978 columns in original DataFrame, select 691 columns
total 1013 columns in original DataFrame, select 108 columns
total 23 columns in original DataFrame, select 22 columns


#### combine all the features together

In [48]:
## combined data with station features
#combined_train = pd.concat([train_dat_stations, train_num_stations, combined_train_num, combined_train_dat, train_test_datIndex_features.ix[combined_train_num.index, :]], axis=1)
#combined_test  = pd.concat([test_dat_stations, test_num_stations, combined_test_num,  combined_test_dat,  train_test_datIndex_features.ix[combined_test_num.index, :]], axis=1)

In [78]:
#combined_train = pd.concat([combined_train_num, combined_train_dat, train_test_datIndex_features.ix[combined_train_num.index, :]], axis=1)
#combined_test  = pd.concat([combined_test_num,  combined_test_dat,  train_test_datIndex_features.ix[combined_test_num.index, :]], axis=1)

In [84]:
## combined data with categorical features
#combined_train = pd.concat([combined_train_cat, combined_train_num, combined_train_dat, train_test_datIndex_features.ix[combined_train_num.index, :]], axis=1)
#combined_test  = pd.concat([combined_test_cat,  combined_test_num,  combined_test_dat,  train_test_datIndex_features.ix[combined_test_num.index, :]], axis=1)

In [102]:
## combined data with all features
combined_train = pd.concat([combined_train_station_dat, combined_train_station_num, combined_train_cat, combined_train_num, combined_train_dat, train_test_datIndex_features.ix[combined_train_num.index, :]], axis=1)
combined_test  = pd.concat([combined_test_station_dat, combined_test_station_num, combined_test_cat,  combined_test_num,  combined_test_dat,  train_test_datIndex_features.ix[combined_test_num.index, :]], axis=1)

In [107]:
print combined_train.isnull().sum().sum(), combined_test.isnull().sum().sum()

0 160700000


In [103]:
print combined_test.shape
combined_test.head()

(100000, 4076)


Unnamed: 0_level_0,dat_L2_S28_mean,dat_L2_S28_max,dat_L2_S28_min,dat_L2_S28_var,dat_L3_S31_mean,dat_L3_S31_max,dat_L3_S31_min,dat_L3_S31_var,dat_L2_S26_mean,dat_L2_S26_max,...,first_date_value_index_ratio_1,first_date_value_index_ratio_2,time_ratio_value_index_diff_0,time_ratio_value_index_diff_1,first_time_value_index_diff_0,first_time_value_index_diff_1,last_time_value_index_diff_0,last_time_value_index_diff_1,first_date_value_index_diff_0,first_date_value_index_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,...,23747.0,99531.0,-1932587.0,-3232.0,-2294181.0,-3232.0,-68013.0,-3232.0,9999999.0,-1.0
2,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,...,14590.0,49170.5,-1686473.0,-62612.0,-3722.0,-5943.0,-2059413.0,-61348.0,1.0,-1.0
3,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,704.11,704.11,...,12082.333333,4708.333333,-814062.0,-24124.0,-1903401.0,-17769.0,-2180310.0,-24124.0,1.0,-5.0
5,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,255.5,255.5,...,6793.6,2256.8,-88702.0,-97262.0,-74259.0,-147131.0,-74259.0,-61577.0,-147131.0,-63560.0
8,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,760.93,760.93,...,3465.875,4005.625,-69633.0,-1471526.0,-58459.0,-1766232.0,-2177226.0,-4313.0,5.0,-9.0


In [104]:
print combined_train.shape
combined_train.head()

(50000, 4077)


Unnamed: 0_level_0,dat_L2_S28_mean,dat_L2_S28_max,dat_L2_S28_min,dat_L2_S28_var,dat_L3_S31_mean,dat_L3_S31_max,dat_L3_S31_min,dat_L3_S31_var,dat_L2_S26_mean,dat_L2_S26_max,...,first_date_value_index_ratio_1,first_date_value_index_ratio_2,time_ratio_value_index_diff_0,time_ratio_value_index_diff_1,first_time_value_index_diff_0,first_time_value_index_diff_1,last_time_value_index_diff_0,last_time_value_index_diff_1,first_date_value_index_diff_0,first_date_value_index_diff_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,...,10721.555556,1792.888889,-308115,-3564,-98445,-3564,-83397,-3564,-98445,-3564
136,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,...,476.088235,488.529412,-894094,-62447,-581605,-13642,-64460,-78493,-581605,-15560
229,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,...,177.545852,182.960699,-77797,-57389,-23224,-57389,-258424,-3436,1,-1
248,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,...,395.524194,163.943548,-1775834,-1835979,-75565,-91574,-86189,-75565,-86189,-75565
258,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,9999999.0,1013.16,1013.16,...,27.693798,366.527132,-93147,-94044,-70815,-11669,-2212578,-3552,15,-2


#### KFold cross-validation

In [79]:
params = {}
params["eta"]                      = 0.0075
params["subsample"]                = 0.8
params["colsample_bytree"]         = 0.8
params["num_round"]                = 501
params["max_depth"]                = 5
params["gamma"]                    = 0
params["metrics"]                  = 'auc'
params['eval_metric']              = 'auc'
params["seed"]                     = 999
params['verbose_eval']             = 50
## whether to use weights
params['use_base_score']           = True
params['use_weights']              = True
#params['use_scale_pos_weight']     = True
params["val"]                      = False


In [89]:

skf = StratifiedKFold(combined_train[dep_var_name], 4)

for train_index, valid_index in skf:
    valid_data = combined_train.iloc[valid_index]
    tmp_train  = combined_train.iloc[train_index]

    y = tmp_train[dep_var_name].values
    X = tmp_train.drop(dep_var_name, axis=1)

    valid_y = valid_data[dep_var_name].values
    valid_X = valid_data.drop(dep_var_name, axis=1)
    
    model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='test_bosch_xgb_model')
    model.fit(tmp_train, dep_var_name)
    
    pred = model.predict(valid_X)
    print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
    print '\n'
    print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)

scale_pos_weight: 159.256410256
a base_score 0.00624 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.714139
[50]	train-auc:0.929012
[100]	train-auc:0.95725
[150]	train-auc:0.973002
[200]	train-auc:0.985635
[250]	train-auc:0.99175
[300]	train-auc:0.995396
[350]	train-auc:0.997078
[400]	train-auc:0.998003
[450]	train-auc:0.998593
[500]	train-auc:0.999007
the xgboost fit is finished by using 276.0 seconds, saved into test_bosch_xgb_model
in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00624
threshold for preds: 0.0305829042032
0.148532999765


result from using flexsible threshold: (0.24959160803234479, 0.2047889679670334)
scale_pos_weight: 159.256410256
a base_score 0.00624 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.743321
[50]	train-auc:0.893038
[100]	train-auc:0.930638
[150]	train-auc:0.95189
[200]	train-auc:0.969382
[250]	train-auc:0.981204
[300]	train-auc:0.988611
[350]	train-auc:0.992939
[400]	train-auc:0.995426
[450]	train-auc:0.997187
[500]	train-auc:0.998183
the xgboost fit is finished by using 271.0 seconds, saved into test_bosch_xgb_model
in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00624
threshold for preds: 0.0304269296962
0.174335030075


result from using flexsible threshold: (0.19066125578191645, 0.046489156782627106)
scale_pos_weight: 159.256410256
a base_score 0.00624 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.762604
[50]	train-auc:0.924373
[100]	train-auc:0.946609
[150]	train-auc:0.962632
[200]	train-auc:0.980306
[250]	train-auc:0.98836
[300]	train-auc:0.992974
[350]	train-auc:0.995513
[400]	train-auc:0.99719
[450]	train-auc:0.998258
[500]	train-auc:0.998896
the xgboost fit is finished by using 275.0 seconds, saved into test_bosch_xgb_model
in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00624
threshold for preds: 0.0214032711911
0.148532999765


result from using flexsible threshold: (0.2285969636038559, 0.11651329696178436)
scale_pos_weight: 159.256410256
a base_score 0.00624 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.686714
[50]	train-auc:0.903945
[100]	train-auc:0.938321
[150]	train-auc:0.959428
[200]	train-auc:0.978452
[250]	train-auc:0.988739
[300]	train-auc:0.99346
[350]	train-auc:0.996336
[400]	train-auc:0.997937
[450]	train-auc:0.998603
[500]	train-auc:0.99906
the xgboost fit is finished by using 274.0 seconds, saved into test_bosch_xgb_model
in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00624
threshold for preds: 0.028971457653
0.16143401492


result from using flexsible threshold: (0.21370497474412756, 0.09888489544391632)


In [108]:
############## Section of regular validation #######################
train_index, valid_index = create_validation_index(combined_train, 0.3, dep_var_name, True)
valid_data = combined_train.ix[valid_index]
tmp_train  = combined_train.ix[train_index]

y = tmp_train[dep_var_name].values
X = tmp_train.drop(dep_var_name, axis=1)

valid_y = valid_data[dep_var_name].values
valid_X = valid_data.drop(dep_var_name, axis=1)


#### regular models

In [97]:
#print tmp_train['index_ratio'].min(), tmp_train['index_ratio'].max()

4.4927666457e-05 14205.5


In [116]:
rf_params = {'random_state' : 9999, 'n_estimators' : 2000, 'max_depth' : 5, 'criterion' : 'gini', 'n_jobs' : -1}
et_params = {'random_state' : 9999, 'n_estimators' : 200, 'max_depth' : 5, 'criterion' : 'gini', 'n_jobs' : -1}
rf_clf = RandomForestClassifier(**rf_params)
rf_clf = rf_clf.fit(X, y)

et_clf = RandomForestClassifier(**et_params)
et_clf = et_clf.fit(X, y)

In [115]:
rf_pred = rf_clf.predict_proba(valid_X)[:, 1]
et_pred = et_clf.predict_proba(valid_X)[:, 1]

print 'result from using constant fraction: \n'
print score_MCC(valid_y, rf_pred)
print score_MCC(valid_y, et_pred)
print '\n'
print 'result from using flexsible threshold:'
print CombinedModel.mcc_eval_func(valid_y, rf_pred)
print CombinedModel.mcc_eval_func(valid_y, et_pred)


result from using constant fraction: 

mean of groud truth: 0.00620041336089
threshold for preds: 0.208108795254
0.123598204663
mean of groud truth: 0.00620041336089
threshold for preds: 0.144437788891
0.112778429412


result from using flexsible threshold:
(0.1689863037893953, 0.2573276077009751)
(0.19700913316658514, 0.1844954264950454)


In [117]:
############## Section of regular validation #######################
train_index, valid_index = create_validation_index(combined_train, 0.3, dep_var_name, True)
valid_data = combined_train.ix[valid_index]
tmp_train  = combined_train.ix[train_index]

y = tmp_train[dep_var_name].values
X = tmp_train.drop(dep_var_name, axis=1)

valid_y = valid_data[dep_var_name].values
valid_X = valid_data.drop(dep_var_name, axis=1)

model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='test_bosch_xgb_model')
model.fit(tmp_train, dep_var_name)
pred = model.predict(valid_X)

print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
print '\n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)

scale_pos_weight: 158.821917808
a base_score 0.00625696408674 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.73591
[50]	train-auc:0.927667
[100]	train-auc:0.94932
[150]	train-auc:0.967596
[200]	train-auc:0.983819
[250]	train-auc:0.992125
[300]	train-auc:0.995802
[350]	train-auc:0.997635
[400]	train-auc:0.998412
[450]	train-auc:0.998901
[500]	train-auc:0.999253
the xgboost fit is finished by using 259.0 seconds, saved into test_bosch_xgb_model
in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00620041336089
threshold for preds: 0.0542272486146
0.156057530416


result from using flexsible threshold: (0.22856920443307793, 0.2713233530521393)
