In [1]:
%matplotlib inline
import os, sys, time
import pandas as pd
import numpy as np
from os.path import join
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import preprocessing
sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils.bosch_functions as bosch_functions
from utils.wrapped_xgboost import xgboost_classifier
from utils.validation_tools import score_MCC, MCC, create_validation_index
from utils.models import CombinedModel
from utils.data_munge import remove_single_value_columns

data_path = '/home/ymm/bosch/'

train_num_file   = 'train_numeric.csv'
train_cat_file   = 'train_categorical.csv'
train_date_file  = 'train_date.csv'
test_num_file    = 'test_numeric.csv'
test_cat_file    = 'test_categorical.csv'
test_date_file   = 'test_date.csv'

sample_submission_file   = 'sample_submission.csv'


start_time_column_name = 'L0_S0_D1'
id_column_name = 'Id'
dep_var_name = 'Response'
nan_fill_value = -2.


### section to select data (rows-wise) based on the start_time

In [2]:
bin_num = 1 ## number of bins to separate data by start_time
tmp_train, tmp_test, bins, bin_names = bosch_functions.create_grouped_index_df(bin_num)

data loading takes  57.7  seconds.


In [3]:
def creat_non_selected_window_num(bin_num, select_bin = '0'):
    none_selected_window_num = [np.NaN]
    for i in range(bin_num):
        if str(i) != select_bin:
            none_selected_window_num.append(str(i))
    return none_selected_window_num
    
none_selected_window_num = creat_non_selected_window_num(bin_num, '0')
## select NaN data only
none_selected_window_num = ['0']
skipped_test_row_num = tmp_test.loc[tmp_test['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()
skipped_train_row_num = tmp_train.loc[tmp_train['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()

In [4]:
print tmp_train.shape, tmp_test.shape, len(skipped_train_row_num), len(skipped_test_row_num)

(1183747, 3) (1183748, 3) 673861 674503


In [5]:
nrows = 100000
## select all the numerical columns and try to use LR
start_time = time.time()
train_num = pd.read_csv(join(data_path, train_num_file),  skiprows=skipped_train_row_num,  index_col='Id', nrows = nrows)
train_dat = pd.read_csv(join(data_path, train_date_file), skiprows=skipped_train_row_num,  index_col='Id', nrows = nrows)
train_cat = pd.read_csv(join(data_path, train_cat_file),  skiprows=skipped_train_row_num,  index_col='Id', nrows = nrows)
print 'finish loading date using {} seconds'.format(round(time.time() - start_time, 0))

finish loading date using 49.0 seconds


#### load the regular data

In [6]:
'''
nrows = 50000
## select all the numerical columns and try to use LR
start_time = time.time()
train_num = pd.read_csv(join(data_path, train_num_file),    index_col='Id', nrows = nrows)
train_dat = pd.read_csv(join(data_path, train_date_file),   index_col='Id', nrows = nrows)
train_cat = pd.read_csv(join(data_path, train_cat_file),    index_col='Id', nrows = nrows)
print 'finish loading date using {} seconds'.format(round(time.time() - start_time, 0))
'''

"\nnrows = 50000\n## select all the numerical columns and try to use LR\nstart_time = time.time()\ntrain_num = pd.read_csv(join(data_path, train_num_file),    index_col='Id', nrows = nrows)\ntrain_dat = pd.read_csv(join(data_path, train_date_file),   index_col='Id', nrows = nrows)\ntrain_cat = pd.read_csv(join(data_path, train_cat_file),    index_col='Id', nrows = nrows)\nprint 'finish loading date using {} seconds'.format(round(time.time() - start_time, 0))\n"

In [6]:
print train_dat.shape, train_num.shape, train_cat.shape

(100000, 1156) (100000, 969) (100000, 2140)


In [7]:
tmp_train_num = train_num
tmp_train_dat = train_dat
tmp_train_cat = train_cat
#tmp_train_num = train_num.copy()
#tmp_train_dat = train_dat.copy()
#tmp_train_cat = train_cat.copy()

### stat processing categorical features

In [8]:

def BasicCat_FeatureEngineering(train_cat):
    ## feature engineering on the date features
    encoder = preprocessing.LabelEncoder()
    column_names = train_cat.columns.tolist()
    column_names.append('NaN')
    encoder.fit(column_names)
    dat_new_fea = pd.DataFrame()
    dat_new_fea['cat_sum'] = train_cat.sum(axis=1)
    dat_new_fea['cat_mean'] = train_cat.mean(axis=1)
    dat_new_fea['cat_nan_count'] = train_cat.isnull().sum(axis=1)
    dat_new_fea['cat_max'] = train_cat.max(axis=1)
    dat_new_fea['cat_min'] = train_cat.min(axis=1)
    dat_new_fea['cat_max_min_diff'] = dat_new_fea['cat_max'] - dat_new_fea['cat_min']
    dat_new_fea['cat_max_min_ratio'] = dat_new_fea['cat_min'] / dat_new_fea['cat_max']

    dat_new_fea['cat_idxmax'] = train_cat.idxmax(axis=1)
    dat_new_fea['cag_idxmax'].fillna('NaN', inplace=True)
    dat_new_fea['cat_idxmax'] = encoder.transform(dat_new_fea['cat_idxmax'])
    dat_new_fea['cat_idxmin'] = train_cat.idxmin(axis=1)
    dat_new_fea['cat_idxmin'].fillna('NaN', inplace=True)
    dat_new_fea['cat_idxmin'] = encoder.transform(dat_new_fea['cat_idxmin'])
    return dat_new_fea

In [9]:
remove_single_value_columns(tmp_train_cat)
remove_single_value_columns(tmp_train_num)
remove_single_value_columns(tmp_train_dat)

raw train data dimension:  (100000, 2140)
processed train data dimension:  (100000, 1333)
raw train data dimension:  (100000, 969)
processed train data dimension:  (100000, 957)
raw train data dimension:  (100000, 1156)
processed train data dimension:  (100000, 1142)


#### collect all the unique levels from categorical features and transform data by each level

In [10]:
unique_value_set = set()
for col in tmp_train_cat.columns:
    uniques = tmp_train_cat[col].dropna().unique()
    unique_value_set |= set(uniques)
    
level_mapping_dict = {}
for i, value in enumerate(sorted(list(unique_value_set))):
    level_mapping_dict[value] = i + 1

In [11]:
sorted_level_list = list(unique_value_set)
sorted_level_list = sorted(sorted_level_list)

In [12]:
## generic function to encode categorical features
def sweep_up_categorical_encode_by_dep_var(df, fea_name, test_df = None, dep_var_name='Response', count_thres = 10, nan_fill = -1., const_scale = 1.):
    tmp_df = df[[fea_name, dep_var_name]]
    tmp_df = tmp_df.fillna(nan_fill)
    value_counts = tmp_df[fea_name].value_counts()
    minor_keys = []
    key_dep_var_map = {}
    
    ## training sweep-up
    for counts, key in zip(value_counts.values, value_counts.index):
        if counts > count_thres:
            mean_dep_var = const_scale * tmp_df.loc[tmp_df[fea_name] == key, dep_var_name].mean()
            key_dep_var_map[key] = mean_dep_var
        else:
            minor_keys.append(key)
            
    ## mean value of dep_var for all the minor levels
    if len(minor_keys) > 0:
        minor_key_dep_var_mean = const_scale * tmp_df.loc[tmp_df[fea_name].isin(minor_keys), dep_var_name].mean()
        ## update the key_dep_var_map with minor key
        for key in minor_keys:
            key_dep_var_map[key] = minor_key_dep_var_mean
    
    encoded_train = tmp_df[fea_name].replace(key_dep_var_map)
    overall_mean_dep_var = tmp_df[dep_var_name].mean()
    
    ## sweep up the test column
    if test_df is not None:
        test_value_counts = test_df[fea_name].value_counts()
        test_minor_keys = []
        test_key_dep_var_map = key_dep_var_map.copy()
    
        for counts, key in zip(test_value_counts.values, test_value_counts.index):
            if key not in test_key_dep_var_map:
                print 'new level {} with counts {} found in test data'.format(key, counts)
                if counts > count_thres:
                    print 'warning! new level {} is found in test data!'.format(key)
                else:
                    test_minor_keys.append(key)
        
        if len(test_minor_keys) > 0:
            for key in test_minor_keys:
                test_key_dep_var_map[key] = const_scale * overall_mean_dep_var
        
        encoded_test = test_df[fea_name].replace(test_key_dep_var_map)
        return encoded_train, encoded_test
    
    else:
        return encoded_train
    
    
    

def getCat_LevelFeatures(series):
    feature_array = []
    tmp_series = series.dropna()
    for level in sorted_level_list:
        if sum(tmp_series.isin([level])):
            level_index = tmp_series[tmp_series == level].index
            feature_array.extend([level_index[0], level_index[-1], len(level_index)])
        else:
            feature_array.extend(['NaN', 'NaN', 0])
    return pd.Series(feature_array)       
    #print series.value_counts()       
    value_counts = series.value_counts()

    
start_time = time.time()

levelFeatures = tmp_train_cat.apply(getCat_LevelFeatures, axis=1)

print 'finish feature engineering date using {} minutes'.format(round((time.time() - start_time)/60., 2))

finish feature engineering date using 26.16 minutes


In [13]:
#sorted_level_list
column_name_features = []
cat_fea_names = []
for level in sorted_level_list:
    cat_fea_names.extend(['level_{}_start_column'.format(level), 
                          'level_{}_end_column'.format(level), 
                          'level_{}_column_counts'.format(level)])
    
    column_name_features.extend(['level_{}_start_column'.format(level), 
                                 'level_{}_end_column'.format(level)])

In [14]:
levelFeatures.columns = cat_fea_names

encoder = preprocessing.LabelEncoder()
column_names = train_cat.columns.tolist()
column_names.append('NaN')
encoder.fit(column_names)

for col in column_name_features:
    levelFeatures[col] = encoder.transform(levelFeatures[col]) 

In [16]:
## One-Hot encode important categorical columns
print tmp_train_cat.shape
imp_cat_fea = ['L3_S29_F3317', 'L3_S35_F3907', 'L3_S49_F4217', 'L3_S35_F3902', 'L3_S32_F3851', 'L3_S32_F3854']
cat_fea = []
for col in imp_cat_fea:
    if col in tmp_train_cat.columns:
        cat_fea.append(col)

imp_cat_df = tmp_train_cat[cat_fea].astype(str)
oneHot_combined_cat = pd.get_dummies(imp_cat_df, dummy_na=True)
tmp_train_cat = tmp_train_cat.ix[:, ~tmp_train_cat.columns.isin(imp_cat_fea)]
print tmp_train_cat.shape

(100000, 1333)
(100000, 1327)


In [17]:
print oneHot_combined_cat.shape

(100000, 44)


In [18]:
'''
print levelFeatures.shape
levelFeatures.head()

print tmp_train_cat.shape
tmp_train_cat.head()
'''

'\nprint levelFeatures.shape\nlevelFeatures.head()\n\nprint tmp_train_cat.shape\ntmp_train_cat.head()\n'

In [18]:
combined_train_cat = pd.concat([tmp_train_cat, oneHot_combined_cat, levelFeatures], axis=1)

In [19]:
print combined_train_cat.shape

(100000, 1533)


### functions to process numerical and date features

In [49]:


def BasicDate_FeatureEngineering(tmp_train_dat):
    ## feature engineering on the date features
    encoder = preprocessing.LabelEncoder()
    column_names = tmp_train_dat.columns.tolist()
    column_names.append('NaN')
    encoder.fit(column_names)
    dat_new_fea = pd.DataFrame()
    
    if 'L0_S0_D1' in tmp_train_dat.columns:
        dat_new_fea['start_time'] = tmp_train_dat['L0_S0_D1']
        
    dat_new_fea['time_sum'] = tmp_train_dat.sum(axis=1)
    dat_new_fea['time_mean'] = tmp_train_dat.mean(axis=1)
    dat_new_fea['dat_nan_count'] = tmp_train_dat.isnull().sum(axis=1)
    dat_new_fea['max_time'] = tmp_train_dat.max(axis=1)
    dat_new_fea['min_time'] = tmp_train_dat.min(axis=1)
    dat_new_fea['dat_max_min_diff'] = dat_new_fea['max_time'] - dat_new_fea['min_time']
    dat_new_fea['dat_max_min_ratio'] = dat_new_fea['min_time'] / dat_new_fea['max_time']

    dat_new_fea['dat_idxmax'] = tmp_train_dat.idxmax(axis=1)
    dat_new_fea['dat_idxmax'].fillna('NaN', inplace=True)
    dat_new_fea['dat_idxmax'] = encoder.transform(dat_new_fea['dat_idxmax'])
    dat_new_fea['dat_idxmin'] = tmp_train_dat.idxmin(axis=1)
    dat_new_fea['dat_idxmin'].fillna('NaN', inplace=True)
    dat_new_fea['dat_idxmin'] = encoder.transform(dat_new_fea['dat_idxmin'])
    return dat_new_fea


def getRelativeTimeColumns(series):
    '''
    normalize the time features by
    the start_time, the first none-NaN
    value
    '''
    if series[0] == np.NaN:
        start_time = series.dropna().index[0]
    else:
        start_time = series[0]
    new_series = series - start_time
    return new_series
   


def getTimeSteps(series, unique_value_counts = 10):
    '''
    in each row/series, use the sorted value_count
    to find the time steps and use the value, counts
    and column_index as features
    '''
    value_counts = series.value_counts()
    value_counts.sort_index(inplace=True)

    if 0. in value_counts.index:
        value_counts = value_counts[value_counts.index != 0.]
        
    available_counts = value_counts.shape[0]
    feature_array = []
    for i in xrange(unique_value_counts):
        if i < available_counts:
            date_value = value_counts.index[i]
            counts = value_counts[date_value]
            first_index = series[series == date_value].index[0]
            avg_time_cost = date_value / counts
            feature = [date_value, counts, avg_time_cost, first_index]
        else:
            feature = [np.NaN, 0, 0, 'NaN']
        feature_array.extend(feature)

    return pd.Series(feature_array)



def getTimeChangeColumns(series):
    start_time = series[0]
    tmp_series = series.dropna()
    if start_time == np.NaN:
        first_index     = tmp_series.index[0]
        last_index      = tmp_series.index[-1]
        first_id_value  = tmp_series[first_index]
        last_id_value   = tmp_series[last_index]
        first_num_value = first_id_value
        time_diff       = last_id_value - first_id_value
        time_ratio      = last_id_value / first_id_value
        return pd.Series([first_index, last_index, time_diff, time_ratio, 
                          first_id_value, last_id_value, first_num_value])
    else:
        first_num_value = start_time
        if np.sum(tmp_series != start_time) == 0:
            return pd.Series(['NaN', 'NaN', np.NaN, np.NaN, np.NaN, np.NaN, first_num_value])
        else:
            first_index     = tmp_series.index[tmp_series != start_time][0]
            last_index      = tmp_series.index[tmp_series != start_time][-1]
            first_id_value  = series[first_index]
            last_id_value   = series[last_index]
            first_id_value  = tmp_series[first_index]
            last_id_value   = tmp_series[last_index]
            time_diff       = last_id_value - first_id_value
            time_ratio      = last_id_value / first_id_value

            return pd.Series([first_index, last_index, time_diff, time_ratio,
                              first_id_value, last_id_value, first_num_value])


        
def getNonNaN_ColumIndex(series):
    if series.notnull().sum() == 0:
        return pd.Series([np.NaN, np.NaN, np.NaN])
    else:
        first_id = series[series.notnull()].index[0]
        last_id = series[series.notnull()].index[-1]
        time_diff = series[last_id] - series[first_id]
        return pd.Series([first_id, last_id, time_diff, first_time_value, last_time_value])



    
    
    
    
def DateFeatureEngineering(series):
    '''
    combination of time difference features and none-NaN features    
    this method is faster than separating two functions
    '''
    ## time features
    start_time = series[0]
    tmp_series = series.dropna()
    if start_time == np.NaN:
        first_index = tmp_series.index[0]
        last_index  = tmp_series.index[-1]
        time_diff   = tmp_series[last_index] - tmp_series[first_index]
        time_fea    = np.array([first_index, last_index, time_diff])
    else:
        if np.sum(tmp_series != start_time) == 0:
            time_fea = np.array([np.NaN, np.NaN, np.NaN])
        else:
            first_index = tmp_series.index[tmp_series != start_time][0]
            last_index  = tmp_series.index[tmp_series != start_time][-1]
            time_diff   = tmp_series[last_index] - tmp_series[first_index]
            time_fea = np.array([first_index, last_index, time_diff])
    ## none-NaN features
    if series.notnull().sum() == 0:
        nan_fea = np.array([np.NaN, np.NaN, np.NaN])
    else:
        first_id  = tmp_series.index[0]
        last_id   = tmp_series.index[-1]
        time_diff = tmp_series[last_id] - tmp_series[first_id]
        nan_fea = np.array([first_id, last_id, time_diff])
         
    new_fea_row = np.concatenate([time_fea, nan_fea])
    return pd.Series(new_fea_row)

### start processing numerical features

In [21]:


def NumericalFeatureEngineering(df, col_ignore = ['Response']):
    tmp_df = df.loc[:, ~df.columns.isin(col_ignore)]
    new_fea_df = pd.DataFrame()
    encoder = preprocessing.LabelEncoder()
    column_names = tmp_df.columns.tolist()
    column_names.append('NaN')
    encoder.fit(column_names)
    
    new_fea_df['num_mean'] = tmp_df.mean(axis=1)
    
    new_fea_df['num_sum'] = tmp_df.sum(axis=1)
    #num_sum_max = new_fea_df['num_sum'].max()
    #new_fea_df['num_sum'].fillna(1.*int(num_sum_max - 5), inplace=True)
    new_fea_df['num_max'] = tmp_df.max(axis=1)
    #num_max_max = new_fea_df['num_max'].max()
    #new_fea_df['num_max'].fillna(1.*int(num_max_max + 1), inplace=True)
    new_fea_df['num_min'] = tmp_df.min(axis=1)
    #num_min_min = new_fea_df['num_min'].min()
    #new_fea_df['num_min'].fillna(1.*int(num_min_min - 1.), inplace=True)
    new_fea_df['num_max_min_ratio'] = new_fea_df['num_min'] / new_fea_df['num_max']
    new_fea_df['num_max_min_ratio'] = new_fea_df['num_max_min_ratio'].replace([np.inf, -np.inf], np.NaN)
    new_fea_df['num_nan_col_count'] = tmp_df.isnull().sum(axis=1)
    new_fea_df['num_reg_col_count'] = tmp_df.shape[1] - tmp_df.isnull().sum(axis=1)
    new_fea_df['num_idxmax'] = tmp_df.idxmax(axis=1)
    new_fea_df['num_idxmax'].fillna('NaN', inplace=True)
    new_fea_df['num_idxmax'] = encoder.transform(new_fea_df['num_idxmax'])
    new_fea_df['num_idxmin'] = tmp_df.idxmin(axis=1)
    new_fea_df['num_idxmin'].fillna('NaN', inplace=True)
    new_fea_df['num_idxmin'] = encoder.transform(new_fea_df['num_idxmin'])
    return new_fea_df


In [22]:
tmp_train_num_Basics = NumericalFeatureEngineering(tmp_train_num)

In [24]:
#print tmp_train_num_Basics.min().min(), tmp_train_num_Basics.max().max()

#### fill up the NaN in numerical features with nan_fill_value

In [25]:
#print tmp_train_num.max().max(), '\n', tmp_train_num.min().min()

In [26]:
#tmp_train_num.fillna(nan_fill_value, inplace=True)

In [23]:
combined_train_num = pd.concat([tmp_train_num, tmp_train_num_Basics], axis=1)

In [28]:
#print combined_train_num.isnull().sum().sum()

### start processing date features

In [50]:
start_time = time.time()

## normalized date columns
tmp_train_dat_Norm = tmp_train_dat.apply(getRelativeTimeColumns, axis=1)
## basic features from tmp_train_dat
tmp_train_dat_Basics = BasicDate_FeatureEngineering(tmp_train_dat)

encoder = preprocessing.LabelEncoder()
column_names = tmp_train_dat.columns.tolist()
column_names.append('NaN')
encoder.fit(column_names)
    
#'''
tmp_train_dat_TimeDiff = tmp_train_dat.apply(getTimeChangeColumns, axis=1)
tmp_train_dat_TimeDiff.columns = ['time_diff_start_col', 'time_diff_end_col', 'time_diff_value', 
                                  'time_ratio_value', 'first_time_value', 'last_time_value', 'first_date_value']
                   
for column in ['time_diff_start_col', 'time_diff_end_col']:
    tmp_train_dat_TimeDiff[column].fillna('NaN', inplace=True)
    tmp_train_dat_TimeDiff[column] = encoder.transform(tmp_train_dat_TimeDiff[column])  
    
#'''


## section to create timeStep features
unique_value_counts = 10
timeStep_columnNames = []
column_name_columns = []
for i in xrange(unique_value_counts):
    timeStep_columnNames.extend(['time_diff_step_{}'.format(i), 'column_counts_step_{}'.format(i), 
                                 'time_cost_step_{}'.format(i), 'first_column_step_{}'.format(i)])
    column_name_columns.append('first_column_step_{}'.format(i))

tmp_train_dat_TimeStep = tmp_train_dat_Norm.apply(getTimeSteps, axis=1)
tmp_train_dat_TimeStep.columns = timeStep_columnNames
for column in column_name_columns:
    tmp_train_dat_TimeStep[column].fillna('NaN', inplace=True)
    tmp_train_dat_TimeStep[column] = encoder.transform(tmp_train_dat_TimeStep[column])
    

print 'finish feature engineering date using {} minutes'.format(round((time.time() - start_time)/60, 2))

finish feature engineering date using 4.27 minutes


In [51]:
tmp_train_dat_TimeDiff.head()

Unnamed: 0_level_0,time_diff_start_col,time_diff_end_col,time_diff_value,time_ratio_value,first_time_value,last_time_value,first_date_value
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,26,1042,2.63,1.002003,1313.12,1315.75,
14,26,1042,1.41,1.000848,1662.63,1664.04,
16,294,1042,13.14,1.016607,791.22,804.36,
23,26,1042,1.62,1.010367,156.27,157.89,
41,26,1042,0.51,1.001071,476.06,476.57,


In [31]:
'''
print tmp_train_dat_Norm.shape
tmp_train_dat_Norm.head()

print tmp_train_dat_TimeStep.shape
tmp_train_dat_TimeStep.head()

print tmp_train_dat_TimeDiff.shape
tmp_train_dat_TimeDiff.head()

print tmp_train_dat_Basics.shape
tmp_train_dat_Basics.head()
'''

'\nprint tmp_train_dat_Norm.shape\ntmp_train_dat_Norm.head()\n\nprint tmp_train_dat_TimeStep.shape\ntmp_train_dat_TimeStep.head()\n\nprint tmp_train_dat_TimeDiff.shape\ntmp_train_dat_TimeDiff.head()\n\nprint tmp_train_dat_Basics.shape\ntmp_train_dat_Basics.head()\n'

In [52]:
combined_train_dat = pd.concat([tmp_train_dat_Norm, tmp_train_dat_Basics, tmp_train_dat_TimeDiff, tmp_train_dat_TimeStep], axis=1)

In [33]:
#print combined_train_dat.max().max(), combined_train_dat.min().min()

#### fill up NaN in the date features

In [34]:
#combined_train_dat.fillna(nan_fill_value, inplace=True)

In [35]:
#print combined_train_dat.isnull().sum().sum()

In [36]:
#'start_time' in combined_train_dat

#### create features from index columns

In [53]:
combined_train_dat['first_time_index']         = combined_train_dat['first_time_value'].argsort()
combined_train_dat['last_time_index']          = combined_train_dat['last_time_value'].argsort()
combined_train_dat['index_ratio']              = combined_train_dat['first_time_index'] / combined_train_dat['last_time_index']

if 'start_time' in combined_train_dat.columns:
    combined_train_dat['start_time_index']         = combined_train_dat['start_time'].argsort()
    combined_train_dat['start_time_index_ratio_1'] = combined_train_dat['first_time_index'] / combined_train_dat['start_time_index']
    combined_train_dat['start_time_index_ratio_2'] = combined_train_dat['last_time_index'] / combined_train_dat['start_time_index']
    
    
combined_train_dat['time_ratio_value_index']    = combined_train_dat['time_ratio_value'].argsort()
combined_train_dat['first_time_value_index']    = combined_train_dat['first_time_value'].argsort()
combined_train_dat['first_date_value_index']    = combined_train_dat['first_date_value'].argsort()
combined_train_dat['first_date_value_index_ratio_1'] = combined_train_dat['first_time_index'] / combined_train_dat['first_date_value_index']
combined_train_dat['first_date_value_index_ratio_2'] = combined_train_dat['last_time_index'] / combined_train_dat['first_date_value_index']

#combined_train_dat['index']                    = combined_train_cat.index

In [33]:
combined_train_num['top_2_ratio'] = combined_train_num['L3_S33_F3859'] / combined_train_num['L3_S33_F3857']

'''
to_index_column_names = ['L0_S0_F20', 'L3_S33_F3859', 'L0_S1_F28', 'L3_S30_F3754', 'L3_S33_F3857',
                         'L0_S1_F24', 'L0_S2_F44', 'L3_S30_F3754', 'L3_S30_F3744']
for col in to_index_column_names:
    col_name = '{}_index'.format(col)
    combined_train_num[col_name] = combined_train_num[col].argsort()
    if col != to_index_column_names[0]:
        ratio_col_name = '{}_index_ratio'.format(col)
        bench_col_name = col_name = '{}_index'.format(to_index_column_names[0])
        combined_train_num[ratio_col_name] = combined_train_num[col] / combined_train_num[bench_col_name]
'''

"\nto_index_column_names = ['L0_S0_F20', 'L3_S33_F3859', 'L0_S1_F28', 'L3_S30_F3754', 'L3_S33_F3857',\n                         'L0_S1_F24', 'L0_S2_F44', 'L3_S30_F3754', 'L3_S30_F3744']\nfor col in to_index_column_names:\n    col_name = '{}_index'.format(col)\n    combined_train_num[col_name] = combined_train_num[col].argsort()\n    if col != to_index_column_names[0]:\n        ratio_col_name = '{}_index_ratio'.format(col)\n        bench_col_name = col_name = '{}_index'.format(to_index_column_names[0])\n        combined_train_num[ratio_col_name] = combined_train_num[col] / combined_train_num[bench_col_name]\n"

In [55]:
#remove_single_value_columns(combined_train_cat)
remove_single_value_columns(combined_train_num)
remove_single_value_columns(combined_train_dat)

raw train data dimension:  (100000, 967)
processed train data dimension:  (100000, 967)
raw train data dimension:  (100000, 1206)
processed train data dimension:  (100000, 475)


### combine all the data together

In [56]:
combined_train = pd.concat([combined_train_num, combined_train_dat, combined_train_cat], axis=1)

In [57]:
'start_time' in combined_train.columns

False

In [58]:
print combined_train.shape
combined_train.head()

(100000, 2975)


Unnamed: 0_level_0,L0_S1_F24,L0_S1_F28,L0_S2_F32,L0_S2_F36,L0_S2_F40,L0_S2_F44,L0_S2_F48,L0_S2_F52,L0_S2_F56,L0_S2_F60,...,level_16777216.0_column_counts,level_16777232.0_start_column,level_16777232.0_end_column,level_16777232.0_column_counts,level_16777557.0_start_column,level_16777557.0_end_column,level_16777557.0_column_counts,level_33554432.0_start_column,level_33554432.0_end_column,level_33554432.0_column_counts
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,1333,1333,,1333,1333,,1333,1333,
14,,,,,,,,,,,...,,1333,1333,,1333,1333,,1333,1333,
16,,,,,,,,,,,...,,1333,1333,,1333,1333,,1333,1333,
23,,,,,,,,,,,...,,1333,1333,,1333,1333,,1333,1333,
41,,,,,,,,,,,...,,1333,1333,,1333,1333,,1333,1333,


In [239]:
#combined_train = combined_train_dat
#combined_train[dep_var_name] = combined_train_num[dep_var_name]

In [59]:
############## Section of regular validation #######################
train_index, valid_index = create_validation_index(combined_train, 0.3, dep_var_name, True)
valid_data = combined_train.ix[valid_index]
tmp_train  = combined_train.ix[train_index]

In [60]:
y = tmp_train[dep_var_name].values
X = tmp_train.drop(dep_var_name, axis=1)

valid_y = valid_data[dep_var_name].values
valid_X = valid_data.drop(dep_var_name, axis=1)

#### tree-base models

In [None]:
rf_params = {'random_state' : 9999, 'n_estimators' : 2000, 'max_depth' : 7, 'criterion' : 'gini', 'n_jobs' : -1}
et_params = {'random_state' : 9999, 'n_estimators' : 200, 'max_depth' : 18, 'criterion' : 'gini', 'n_jobs' : -1}
rf_clf = RandomForestClassifier(**rf_params)
rf_clf = rf_clf.fit(X, y)

et_clf = RandomForestClassifier(**et_params)
et_clf = et_clf.fit(X, y)

#### Logistic Regression model

In [247]:
C = 0.1
start_time = time.time()
LR_clf = LogisticRegression(C = C, class_weight='balanced', n_jobs = -1, penalty='l2')
LR_clf.fit(X, y)
print 'finish training LR model using {} seconds'.format(round(time.time() - start_time, 0))

finish training LR model using 242.0 seconds


#### xgboost model

In [None]:
params = {}
params["eta"]                      = 0.0075
params["subsample"]                = 0.8
params["colsample_bytree"]         = 0.8
params["num_round"]                = 1001
params["max_depth"]                = 5
params["gamma"]                    = 0
params["metrics"]                  = 'auc'
params['eval_metric']              = 'auc'
params["seed"]                     = 999
params['verbose_eval']             = 50
## whether to use weights
params['use_base_score']           = True
params['use_weights']              = True
#params['use_scale_pos_weight']     = True
params["val"]                      = False

model = xgboost_classifier(label_name = dep_var_name, params = params, model_file='bosch_xgb_model')
model.fit(tmp_train, dep_var_name)

scale_pos_weight: 159.185354691
a base_score 0.00624276796046 is used in the xgboost model...



####################
 train the xgboost without early stopping
####################


[0]	train-auc:0.710775
[50]	train-auc:0.817682


In [43]:
#pred = rf_clf.predict_proba(valid_X)[:, 1]
#pred = et_clf.predict_proba(valid_X)[:, 1]
#pred = LR_clf.predict_proba(valid_X)[:, 1]
pred = model.predict(valid_X)

print 'result from using constant fraction: \n', score_MCC(valid_y, pred)
print '\n \n'
print 'result from using flexsible threshold:', CombinedModel.mcc_eval_func(valid_y, pred)

in the prediction step, dep_var_name is not provided....
result from using constant fraction: 
mean of groud truth: 0.00620020667356
threshold for preds: 0.0333485260902
0.210156570941

 

result from using flexsible threshold: (0.2876271064191786, 0.3474218547344208)
