In [1]:
import pandas as pd
import numpy as np
import os, sys, time, random
from os.path import join
import yaml
import cPickle as pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import StratifiedKFold

sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
from utils.bosch_functions import load_processed_bosch_data
from utils.wrapped_xgboost import xgboost_classifier

from utils.models import CombinedModel
from utils.validation_tools import score_MCC, MCC, create_validation_index, cross_validate_model
from utils.validation_tools import get_combinedFeaImp_fromProj
from utils.feature_engineering import encode_categorical_by_dep_var

dep_var_name = 'Response'
project_name = 'processed_subsample_250k_data_xgb'
                                                                                                                                               

feature_data_path = '/home/ymm/kaggle/xgboost_hyperopt/scripts/xgb_model_features_0'
data_path = '/home/ymm/kaggle/bosch_data/bosch_processed_data'                                                                                 
train_file_name = 'bosch_combined_train_data.csv'

In [2]:
start_time = time.time()

## subsample training data                                                                                                                     
tot_row_num = 1183747
num_rows = 50000                                                                                                                              
skip = sorted(random.sample(xrange(1,tot_row_num + 1),tot_row_num - num_rows))
train = pd.read_csv(join(data_path, train_file_name), index_col='Id', skiprows=skip)                                                           
print 'shape of training data is {}, using {} minutes'.format(train.shape, round((time.time() - start_time)/60, 2))

shape of training data is (50000, 1558), using 0.49 minutes


In [3]:
with open(os.path.join(data_path, 'bosch_feature_dict.yml'), 'r') as yml_stream:
    models_dict = yaml.load(yml_stream)

In [4]:
combined_feature_importance = get_combinedFeaImp_fromProj(feature_data_path)
selected_features = combined_feature_importance.index.tolist()

print models_dict.keys()

(1152, 2)
(1209, 2)
(1216, 2)
(1168, 2)
(1204, 2)
['date', 'station', 'num', 'cat']


In [5]:
def get_selected_features(selected_features, feature_set, train_features):
    selected_subset_feature = [col for col in feature_set if col in selected_features]
    for col in selected_subset_feature:
        if col not in train_features:
            print 'column {} does not exist in train'.format(col)
    return selected_subset_feature

In [6]:
station_features = get_selected_features(selected_features, models_dict['station'], train.columns.tolist())
num_features = get_selected_features(selected_features, models_dict['num'], train.columns.tolist())
cat_features = get_selected_features(selected_features, models_dict['cat'], train.columns.tolist())
dat_features = get_selected_features(selected_features, models_dict['date'], train.columns.tolist())

In [7]:
train_stat = train[station_features]
train_cat = train[cat_features]
train_num = train[num_features]
train_dat = train[dat_features]

In [8]:
print train_stat.shape, train_cat.shape, train_num.shape, train_dat.shape

(50000, 774) (50000, 16) (50000, 624) (50000, 143)


In [9]:
#print train_stat.max().max(), train_stat.min().min()

In [10]:
print train_dat.isnull().sum().sum(), train_num.isnull().sum().sum(), train_cat.isnull().sum().sum(), train_stat.isnull().sum().sum()

3111968 23131423 375874 12818619


In [11]:
num_missing_value= -1.5
dat_missing_value = -1.
station_fillna_value = 9999999

train_stat = train_stat.fillna(station_fillna_value)
train_num = train_num.fillna(num_missing_value)
train_dat = train_dat.fillna(dat_missing_value)

In [12]:
start_time = time.time()

train_cat['Response'] = train['Response']
encode_columns_dict = encode_categorical_by_dep_var(train_cat, dep_var_column='Response', fill_missing=True, fill_missing_value = 9999999)
train_cat.drop('Response', axis=1, inplace=True)

print 'finish encoding categorical features using {} seconds'.format(round(time.time() - start_time, 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train[col_name] = train[col_name].replace(dep_var_dict)


finish encoding categorical features using 1.0 seconds


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
print train_dat.isnull().sum().sum(), train_num.isnull().sum().sum(), train_cat.isnull().sum().sum(), train_stat.isnull().sum().sum()

0 0 0 0


In [14]:
combined_train = pd.concat([train_dat, train_num, train_cat, train_stat], axis=1)
combined_train[dep_var_name] = train[dep_var_name]

In [15]:
print combined_train.shape, train.shape

(50000, 1558) (50000, 1558)


In [16]:
############## Section of regular validation #######################
train_index, valid_index = create_validation_index(combined_train, 0.3, dep_var_name, True)
valid_data = combined_train.ix[valid_index]
tmp_train  = combined_train.ix[train_index]

y = tmp_train[dep_var_name].values
X = tmp_train.drop(dep_var_name, axis=1)

valid_y = valid_data[dep_var_name].values
valid_X = valid_data.drop(dep_var_name, axis=1)

In [17]:
params = {}
params["eta"]                      = 0.0075
params["subsample"]                = 0.8
params["colsample_bytree"]         = 0.8
params["num_round"]                = 501
params["max_depth"]                = 5
params["gamma"]                    = 0
params["metrics"]                  = 'auc'
params['eval_metric']              = 'auc'
params["seed"]                     = 999
params['verbose_eval']             = 50
## whether to use weights
params['use_base_score']           = True
params['use_weights']              = True
#params['use_scale_pos_weight']     = True
params["val"]                      = False

In [18]:
model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='bosch_xgb_model')
model.fit(tmp_train, dep_var_name)

pred = model.predict(valid_X)

print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
print '\n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)

scale_pos_weight: 167.274038462
a base_score 0.00594268735179 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.895148
[50]	train-auc:0.961153
[100]	train-auc:0.969305
[150]	train-auc:0.97737
[200]	train-auc:0.984385
[250]	train-auc:0.988474
[300]	train-auc:0.991049
[350]	train-auc:0.99259
[400]	train-auc:0.994035
[450]	train-auc:0.995422
[500]	train-auc:0.996536
the xgboost fit is finished by using 8.7 minutes, saved into bosch_xgb_model
in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00586705780385
threshold for preds: 0.111308638462
0.405603550765


result from using flexsible threshold: (0.4284298364314721, 0.177175372838974)


In [26]:
rf_params = {'random_state' : 9999, 'n_estimators' : 2000, 'max_depth' : 7, 'criterion' : 'gini', 'n_jobs' : -1}
et_params = {'random_state' : 9999, 'n_estimators' : 200, 'max_depth' : 18, 'criterion' : 'gini', 'n_jobs' : -1}
rf_clf = RandomForestClassifier(**rf_params)
rf_clf = rf_clf.fit(X, y)

et_clf = RandomForestClassifier(**et_params)
et_clf = et_clf.fit(X, y)

In [20]:
rf_pred = rf_clf.predict_proba(valid_X)[:, 1]
et_pred = et_clf.predict_proba(valid_X)[:, 1]

print 'result from using constant fraction: \n'
print score_MCC(valid_y, rf_pred)
print score_MCC(valid_y, et_pred)
print '\n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, rf_pred), CombinedModel.mcc_eval_func(valid_y, et_pred)

result from using constant fraction: 
mean of groud truth: 0.00586705780385
threshold for preds: 0.134773146856
0.337019345084 mean of groud truth: 0.00586705780385
threshold for preds: 0.300734622164
0.302727242243


result from using flexsible threshold: (0.39552983420609317, 0.20865927876708792) (0.38484222456887607, 0.42587428133674543)


In [25]:
et_params = {'random_state' : 9999, 'n_estimators' : 200, 'max_depth' : 20, 'criterion' : 'gini', 'n_jobs' : -1}

et_clf = RandomForestClassifier(**et_params)
et_clf = et_clf.fit(X, y)

et_pred = et_clf.predict_proba(valid_X)[:, 1]
print  CombinedModel.mcc_eval_func(valid_y, et_pred)

(0.36064602607579765, 0.4279625149738292)


In [None]:
skf = StratifiedKFold(combined_train[dep_var_name], 3)

for train_index, valid_index in skf:
    valid_data = combined_train.iloc[valid_index]
    tmp_train  = combined_train.iloc[train_index]

    y = tmp_train[dep_var_name].values
    X = tmp_train.drop(dep_var_name, axis=1)

    valid_y = valid_data[dep_var_name].values
    valid_X = valid_data.drop(dep_var_name, axis=1)
    
    model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='test_bosch_xgb_model')
    model.fit(tmp_train, dep_var_name)
    
    pred = model.predict(valid_X)
    print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
    print '\n'
    print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)