In [2]:
%matplotlib inline
from sklearn import preprocessing
import os, sys, time
import pandas as pd
import numpy as np
from os.path import join

sys.path.append('/home/ymm/kaggle/xgboost_hyperopt')
import utils.bosch_functions as bosch_functions

data_path = '/home/ymm/bosch/'

train_num_file   = 'train_numeric.csv'
train_cat_file   = 'train_categorical.csv'
train_date_file  = 'train_date.csv'
test_num_file    = 'test_numeric.csv'
test_cat_file    = 'test_categorical.csv'
test_date_file   = 'test_date.csv'

sample_submission_file   = 'sample_submission.csv'


start_time_column_name = 'L0_S0_D1'
id_column_name = 'Id'
dep_var_name = 'Response'

In [2]:
bin_num = 1 ## number of bins to separate data by start_time
tmp_train, tmp_test, bins, bin_names = bosch_functions.create_grouped_index_df(bin_num)

data loading takes  75.8  seconds.


In [3]:
## create the skipped row numbers

## select 19 features based on the LR model trained with NaN data, a threshold of 0.25 
## is used to select 19 features as listed below
LR_selected_features = ['L3_S38_F3952', 'L0_S23_F619',  'L1_S25_F1855', 'L1_S25_F2799',
                        'L3_S29_F3379', 'L1_S24_F1808', 'L1_S24_F679',  'L1_S25_F2498',
                        'L1_S24_F1118', 'L3_S49_F4206', 'L0_S22_F546',  'L3_S31_F3834',
                        'L3_S29_F3464', 'L3_S50_F4243', 'L2_S28_F3222', 'L1_S25_F2231',
                        'L1_S24_F1581', 'L1_S24_F1672', 'L3_S32_F3850']

none_selected_window_num = ['0']
skipped_test_row_num = tmp_test.loc[tmp_test['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()
skipped_train_row_num = tmp_train.loc[tmp_train['time_window_num'].isin(none_selected_window_num), 'row_num'].tolist()

train_cat_cols  = pd.read_csv(join(data_path, train_cat_file), index_col=id_column_name, nrows=0)
train_date_cols = pd.read_csv(join(data_path, train_date_file), index_col=id_column_name, nrows=0)
train_num_cols  = pd.read_csv(join(data_path, train_num_file), index_col=id_column_name, nrows=0)

bin_nan_data_path = '/home/ymm/kaggle/bosch/data_2_bins_xgb_combined_models/data_bin_NaN_models'

def collect_feature_names(data_path, fea_name='feature', thres_name = None, thres = 10):
    csv_files = [f for f in os.listdir(data_path) if '.csv' in f]
    feature_names = set()
    for file_name in csv_files:
        data = pd.read_csv(join(data_path, file_name), index_col=0)
        if thres_name is None:
            feature_names = feature_names.union(data[fea_name])
        else:
            feature_names = feature_names.union(data.loc[data[thres_name] > thres, fea_name])
    return feature_names

## collect feature names based on the fscore
bin_nan_selected_col_name = collect_feature_names(bin_nan_data_path, 'feature', 'fscore', 10)

## based on the selected features from xgboost to create column list
selected_num_col_names =  train_num_cols.columns[train_num_cols.columns.isin(bin_nan_selected_col_name)].tolist()

## add LR important features into the NUMERICAL column set
for feature_name in LR_selected_features:
    if feature_name not in selected_num_col_names:
        selected_num_col_names.append(feature_name)

selected_cat_col_names =  train_cat_cols.columns[train_cat_cols.columns.isin(bin_nan_selected_col_name)].tolist()
selected_dat_col_names =  train_date_cols.columns[train_date_cols.columns.isin(bin_nan_selected_col_name)].tolist()
test_num_col_names     =  selected_num_col_names[:]
        
selected_cat_col_names.extend([id_column_name])
selected_num_col_names.extend([id_column_name, dep_var_name])
test_num_col_names.extend([id_column_name])

In [4]:
print len(test_num_col_names), len(selected_num_col_names), len(selected_dat_col_names), len(selected_cat_col_names)

559 560 0 15


In [5]:
start_time = time.time()
train_cat  = pd.read_csv(join(data_path, train_cat_file),   index_col='Id', skiprows=skipped_train_row_num, usecols=selected_cat_col_names)
test_cat   = pd.read_csv(join(data_path, test_cat_file),    index_col='Id', skiprows=skipped_test_row_num,  usecols=selected_cat_col_names)
train_num  = pd.read_csv(join(data_path, train_num_file),   index_col='Id', skiprows=skipped_train_row_num, usecols=selected_num_col_names)
test_num   = pd.read_csv(join(data_path, test_num_file),    index_col='Id', skiprows=skipped_test_row_num,  usecols=test_num_col_names)

print 'finish reading data by columns selected using xgboost feature importance, using {} seconds.'.format(round(time.time() - start_time, 2))

finish reading data by columns selected using xgboost feature importance, using 129.81 seconds.


In [6]:
print train_cat.shape, train_num.shape

(509886, 14) (509886, 559)


In [7]:
print test_cat.shape, test_num.shape

(509245, 14) (509245, 558)


### encode the numerical data

In [8]:
tmp_num_train = train_num.copy()
tmp_num_test = test_num.copy()

In [9]:
print train_num.shape, test_num.shape
print tmp_num_train.shape, tmp_num_test.shape
tmp_num_train.head()

(509886, 559) (509245, 558)
(509886, 559) (509245, 558)


Unnamed: 0_level_0,L0_S8_F144,L0_S12_F330,L0_S12_F332,L0_S12_F334,L0_S12_F336,L0_S12_F338,L0_S12_F340,L0_S12_F342,L0_S12_F344,L0_S12_F346,...,L3_S47_F4138,L3_S47_F4143,L3_S47_F4153,L3_S47_F4158,L3_S47_F4163,L3_S48_F4196,L3_S48_F4198,L3_S49_F4206,L3_S50_F4243,Response
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0.096,0.076,-0.065,-0.136,0.169,0.231,-0.014,-0.072,-0.039,...,,,,,,,,,,0
14,,-0.051,-0.082,-0.181,-0.481,0.113,0.154,-0.01,-0.072,-0.054,...,,,,,,,,,,0
16,,,,,,,,,,,...,,,,,,,,,,0
23,,-0.008,-0.003,-0.065,-0.136,-0.053,-0.154,-0.006,-0.041,-0.049,...,,,,,,,,,,0
41,,0.041,0.062,0.081,0.14,0.002,0.0,0.002,-0.01,0.223,...,,,,,,,,,,0


### feature engineering on the numerical features

In [10]:
def num_columns_feature_engineering(df, col_ignore = ['Response']):
    #tmp_df = df.loc[:, df.columns != 'start_time']
    tmp_df = df.loc[:, ~df.columns.isin(col_ignore)]
    new_fea_df = pd.DataFrame()
    encoder = preprocessing.LabelEncoder()
    new_fea_df['num_sum'] = tmp_df.sum(axis=1)
    new_fea_df['num_max'] = tmp_df.max(axis=1)
    new_fea_df['num_min'] = tmp_df.min(axis=1)
    new_fea_df['mun_max_min_ratio'] = new_fea_df['num_max'] / new_fea_df['num_min']
    new_fea_df['num_nan_col_count'] = tmp_df.isnull().sum(axis=1)
    new_fea_df['num_reg_col_count'] = tmp_df.shape[1] - tmp_df.isnull().sum(axis=1)
    new_fea_df['idxmax'] = tmp_df.idxmax(axis=1)
    new_fea_df['idxmax'] = encoder.fit_transform(new_fea_df['idxmax'])
    new_fea_df['idxmin'] = tmp_df.idxmin(axis=1)
    new_fea_df['idxmin'] = encoder.fit_transform(new_fea_df['idxmin'])
    #new_fea_df = pd.merge(new_fea_df, pd.get_dummies(idmax, prefix='oneHot'), how='left', left_index=True, right_index=True)
    return new_fea_df


## generic function to encode categorical features
def sweep_up_categorical_encode_by_dep_var(df, fea_name, test_df = None, dep_var_name='Response', count_thres = 10, nan_fill = -1., const_scale = 1.):
    tmp_df = df[[fea_name, dep_var_name]]
    tmp_df = tmp_df.fillna(nan_fill)
    value_counts = tmp_df[fea_name].value_counts()
    minor_keys = []
    key_dep_var_map = {}
    
    ## training sweep-up
    for counts, key in zip(value_counts.values, value_counts.index):
        if counts > count_thres:
            mean_dep_var = const_scale * tmp_df.loc[tmp_df[fea_name] == key, dep_var_name].mean()
            key_dep_var_map[key] = mean_dep_var
        else:
            minor_keys.append(key)
            
    ## mean value of dep_var for all the minor levels
    if len(minor_keys) > 0:
        minor_key_dep_var_mean = const_scale * tmp_df.loc[tmp_df[fea_name].isin(minor_keys), dep_var_name].mean()
        ## update the key_dep_var_map with minor key
        for key in minor_keys:
            key_dep_var_map[key] = minor_key_dep_var_mean
    
    encoded_train = tmp_df[fea_name].replace(key_dep_var_map)
    overall_mean_dep_var = tmp_df[dep_var_name].mean()
    
    ## sweep up the test column
    if test_df is not None:
        test_value_counts = test_df[fea_name].value_counts()
        test_minor_keys = []
        test_key_dep_var_map = key_dep_var_map.copy()
    
        for counts, key in zip(test_value_counts.values, test_value_counts.index):
            if key not in test_key_dep_var_map:
                print 'new level {} with counts {} found in test data'.format(key, counts)
                if counts > count_thres:
                    print 'warning! new level {} is found in test data!'.format(key)
                else:
                    test_minor_keys.append(key)
        
        if len(test_minor_keys) > 0:
            for key in test_minor_keys:
                test_key_dep_var_map[key] = const_scale * overall_mean_dep_var
        
        encoded_test = test_df[fea_name].replace(test_key_dep_var_map)
        return encoded_train, encoded_test
    
    else:
        return encoded_train

In [11]:
#tmp_num_dep_var = tmp_num_train[dep_var_name]
combined_num = pd.concat([tmp_num_train, tmp_num_test])
train_num_index = tmp_num_train.index
test_num_index  = tmp_num_test.index

In [12]:
## feature engineering on the numerical features without filling NaN
start_time = time.time()
## not fill up NaN with fixed value so that min and max values are correct
new_fea_combined_num = num_columns_feature_engineering(combined_num)
print 'finish creating new numerical features using {} seconds'.format(round(time.time() - start_time, 0))    

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


finish creating new numerical features using 146.0 seconds


In [13]:
tmp_num_train = pd.merge(tmp_num_train, new_fea_combined_num.ix[train_num_index], how='left', left_index=True, right_index=True)
tmp_num_test  = pd.merge(tmp_num_test,  new_fea_combined_num.ix[test_num_index],  how='left', left_index=True, right_index=True)

In [14]:
print tmp_num_train.shape, tmp_num_test.shape

(509886, 567) (509245, 566)


In [15]:
categorical_like_columns = []
level_num_thres = 10
for column in tmp_num_train.columns:
    level_num = len(tmp_num_train[column].unique())
    if level_num < level_num_thres:
        if column != dep_var_name:
            categorical_like_columns.append(column)
            print 'column {} has #levels {} if in LR features: {}'.format(column, level_num, column in LR_selected_features)
        #print combined_num[column].value_counts()

column L0_S8_F144 has #levels 3 if in LR features: False
column L0_S14_F358 has #levels 7 if in LR features: False
column L0_S15_F394 has #levels 7 if in LR features: False
column L3_S29_F3360 has #levels 5 if in LR features: False
column L3_S29_F3464 has #levels 3 if in LR features: True
column L3_S29_F3470 has #levels 3 if in LR features: False
column L3_S35_F3894 has #levels 9 if in LR features: False
column L3_S38_F3952 has #levels 9 if in LR features: True
column L3_S49_F4206 has #levels 4 if in LR features: True


In [16]:
count_thres = 50
const_scale = 100.
nan_fill = -2.
for fea_name in categorical_like_columns:
    new_fea_name = '{}_{}'.format('dep_var_encoded', fea_name)
    tmp_num_train[new_fea_name], tmp_num_test[new_fea_name] = sweep_up_categorical_encode_by_dep_var(tmp_num_train, 
                                                                                                     fea_name, 
                                                                                                     test_df=tmp_num_test,
                                                                                                     count_thres=count_thres,
                                                                                                     nan_fill = nan_fill,
                                                                                                     const_scale = const_scale)
    

new level -0.657 with counts 2 found in test data
new level 0.333 with counts 1 found in test data
new level 0.433 with counts 1 found in test data
new level -0.188 with counts 1 found in test data
new level -0.021 with counts 1 found in test data


In [17]:
print tmp_num_train.shape, tmp_num_test.shape

(509886, 576) (509245, 575)


In [18]:
tmp_num_train.head()

Unnamed: 0_level_0,L0_S8_F144,L0_S12_F330,L0_S12_F332,L0_S12_F334,L0_S12_F336,L0_S12_F338,L0_S12_F340,L0_S12_F342,L0_S12_F344,L0_S12_F346,...,idxmin,dep_var_encoded_L0_S8_F144,dep_var_encoded_L0_S14_F358,dep_var_encoded_L0_S15_F394,dep_var_encoded_L3_S29_F3360,dep_var_encoded_L3_S29_F3464,dep_var_encoded_L3_S29_F3470,dep_var_encoded_L3_S35_F3894,dep_var_encoded_L3_S38_F3952,dep_var_encoded_L3_S49_F4206
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,0.096,0.076,-0.065,-0.136,0.169,0.231,-0.014,-0.072,-0.039,...,309,0.642201,0.519173,0.677625,0.629636,0.415067,0.415067,0.532753,0.637246,0.643968
14,,-0.051,-0.082,-0.181,-0.481,0.113,0.154,-0.01,-0.072,-0.054,...,4,0.642201,0.665993,0.548182,0.653692,0.415067,0.415067,0.532753,0.637246,0.643968
16,,,,,,,,,,,...,143,0.642201,0.665993,0.677625,0.653692,0.840757,0.840757,0.83054,0.637246,0.643968
23,,-0.008,-0.003,-0.065,-0.136,-0.053,-0.154,-0.006,-0.041,-0.049,...,305,0.642201,0.633613,0.677625,0.629636,0.840757,0.840757,0.532753,0.637246,0.643968
41,,0.041,0.062,0.081,0.14,0.002,0.0,0.002,-0.01,0.223,...,27,0.642201,0.519173,0.677625,0.653692,0.840757,0.840757,0.645993,0.637246,0.643968


In [19]:
#print tmp_num_train.min().min(), tmp_num_train.max().max()
#rint tmp_num_test.min().min(), tmp_num_test.max().max()

In [20]:
'''
## check the value counts for every features
for feature in LR_selected_features:
    print tmp_num_train[feature].value_counts()
    
## check the distribution
fea_name = LR_selected_features[10]
tmp_df = tmp_num_train[[fea_name, dep_var_name]]
tmp_df[fea_name].hist(bins=50)

## way to quantify the continuous distribution
from scipy.stats.mstats import mquantiles
bin_num = 32
prob_list = [1.*i/bin_num for i in range(1, bin_num)]
quantile_values = mquantiles(tmp_df[fea_name][tmp_df[fea_name].notnull()], prob=prob_list)
'''

'\n## check the value counts for every features\nfor feature in LR_selected_features:\n    print tmp_num_train[feature].value_counts()\n    \n## check the distribution\nfea_name = LR_selected_features[10]\ntmp_df = tmp_num_train[[fea_name, dep_var_name]]\ntmp_df[fea_name].hist(bins=50)\n\n## way to quantify the continuous distribution\nfrom scipy.stats.mstats import mquantiles\nbin_num = 32\nprob_list = [1.*i/bin_num for i in range(1, bin_num)]\nquantile_values = mquantiles(tmp_df[fea_name][tmp_df[fea_name].notnull()], prob=prob_list)\n'

In [21]:
tmp_num_train = tmp_num_train.fillna(nan_fill)
tmp_num_test = tmp_num_test.fillna(nan_fill)

In [22]:
print tmp_num_train.shape, tmp_num_test.shape, dep_var_name in tmp_num_train.columns

(509886, 576) (509245, 575) True


### process the categorical features

In [23]:
start_time = time.time()
combined_cat = pd.concat([train_cat, test_cat])
print combined_cat.shape
## convert to string so that column is categorical
combined_cat = combined_cat.astype(str)
## One-Hot encode all the categorical columns
oneHot_combined_cat = pd.get_dummies(combined_cat, dummy_na=True)
print 'shape after OneHot encoding: ', oneHot_combined_cat.shape
train_index = train_cat.index
test_index  = test_cat.index
oneHot_train_cat = oneHot_combined_cat.ix[train_index]
oneHot_test_cat  = oneHot_combined_cat.ix[test_index]
print 'finish OneHot encoding the categorical columns, using {} seconds'.format(round(time.time() - start_time, 2))

(1019131, 14)
shape after OneHot encoding:  (1019131, 103)
finish OneHot encoding the categorical columns, using 10.34 seconds


In [24]:
print oneHot_train_cat.shape
oneHot_train_cat.head()

(509886, 103)


Unnamed: 0_level_0,L1_S24_F675_1.0,L1_S24_F675_2.0,L1_S24_F675_3.0,L1_S24_F675_4.0,L1_S24_F675_5.0,L1_S24_F675_nan,L1_S24_F675_nan,L1_S24_F1510_1.0,L1_S24_F1510_2.0,L1_S24_F1510_3.0,...,L3_S32_F3854_36992.0,L3_S32_F3854_4.0,L3_S32_F3854_48.0,L3_S32_F3854_492.0,L3_S32_F3854_512.0,L3_S32_F3854_55424.0,L3_S32_F3854_63616.0,L3_S32_F3854_8.0,L3_S32_F3854_nan,L3_S32_F3854_nan
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
41,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
combined_train = pd.concat([oneHot_train_cat, tmp_num_train], axis=1)
combined_test  = pd.concat([oneHot_test_cat,  tmp_num_test],  axis=1)

In [26]:
print combined_train.shape, combined_test.shape

(509886, 679) (509245, 678)


In [27]:
dep_var_name in combined_train.columns

True

In [28]:
print combined_test.isnull().sum().sum(), '\n \n', combined_train.isnull().sum().sum()

0 
 
0


In [29]:
print combined_test.dtypes.value_counts(), '\n \n',  combined_train.dtypes.value_counts()

float64    674
int64        4
dtype: int64 
 
float64    674
int64        5
dtype: int64


In [30]:
dep_var_name in combined_train.columns, dep_var_name in combined_test

(True, False)

#### add ID to mock up the time label

In [31]:
combined_train['sequence_id'] = np.arange(combined_train.shape[0])
combined_train['given_id'] = combined_train.index

In [32]:
combined_test['sequence_id'] = np.arange(combined_test.shape[0])
combined_test['given_id'] = combined_test.index

In [34]:
print combined_test.isnull().sum().sum(), '\n \n', combined_train.isnull().sum().sum()

0 
 
0


In [35]:
print combined_test.dtypes.value_counts(), '\n \n',  combined_train.dtypes.value_counts()

float64    674
int64        6
dtype: int64 
 
float64    674
int64        7
dtype: int64


In [36]:
print combined_train.shape, combined_test.shape

(509886, 681) (509245, 680)


In [33]:
start_time = time.time()
combined_train.to_csv('bosch_processed_nan_filled_FE_thres_10_train_data_with_id.csv')
combined_test.to_csv('bosch_processed_nan_filled_FE_thres_10_test_data_with_id.csv')
print 'saving data using {} seconds'.format(round(time.time() - start_time, 2))

saving data using 842.83 seconds


In [3]:
data_path = '/mnt/home/ymm/kaggle/bosch_data/bosch_nan_data_FE_xgb_feature_10_thres_with_id'
train_file_name = 'bosch_processed_nan_filled_FE_thres_10_train_data_with_id.csv'
test_file_name = 'bosch_processed_nan_filled_FE_thres_10_test_data_with_id.csv'

new_train = pd.read_csv(join(data_path, train_file_name), index_col='Id')
#test = pd.read_csv(join(data_path, test_file_name), index_col='Id')

In [4]:
print new_train.shape

(509886, 681)


In [9]:
new_train.dtypes.value_counts()

float64    674
int64        7
dtype: int64

In [7]:
data_path = '/home/ymm/kaggle/bosch_data/bosch_regular_data_FE_xgb_feature_10_thres'
train_file_name = 'bosch_processed_regular_filled_FE_thres_10_train_data.csv'

old_train = pd.read_csv(join(data_path, train_file_name), index_col='Id')

In [22]:
old_train.max()

L2_S26_F3038_1.0                  1.00
L2_S26_F3038_nan                  1.00
L2_S26_F3038_nan.1                0.00
L2_S27_F3131_1.0                  1.00
L2_S27_F3131_nan                  1.00
L2_S27_F3131_nan.1                0.00
L3_S29_F3317_1.0                  1.00
L3_S29_F3317_nan                  1.00
L3_S29_F3317_nan.1                0.00
L3_S29_F3475_1.0                  1.00
L3_S29_F3475_nan                  1.00
L3_S29_F3475_nan.1                0.00
L3_S32_F3851_1.0                  1.00
L3_S32_F3851_nan                  1.00
L3_S32_F3851_nan.1                0.00
L3_S32_F3854_-2147481664.0        1.00
L3_S32_F3854_-21474819.0          1.00
L3_S32_F3854_-2147482176.0        1.00
L3_S32_F3854_-2147482432.0        1.00
L3_S32_F3854_-21474825.0          1.00
L3_S32_F3854_-2147482688.0        1.00
L3_S32_F3854_-2147482816.0        1.00
L3_S32_F3854_-2147482944.0        1.00
L3_S32_F3854_-2147483648.0        1.00
L3_S32_F3854_-21474872.0          1.00
L3_S32_F3854_1.0         

In [29]:
print new_train.shape, old_train.shape

(509886, 681) (673861, 480)


In [17]:
column_names = new_train.columns[~new_train.columns.isin(old_train.columns)]

In [26]:
tmp_data = new_train[column_names]

In [28]:
tmp_data.shape

(509886, 426)

In [66]:
new_train.max().max()

inf

In [65]:
new_train.ix[:, 665].max()

inf

In [63]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(new_train.ix[:, 664:665], old_train[dep_var_name][:tmp_data.shape[0]])

KeyboardInterrupt: 

In [21]:
new_train[column_names].max()

L1_S24_F675_1.0                 1.000000e+00
L1_S24_F675_2.0                 1.000000e+00
L1_S24_F675_3.0                 1.000000e+00
L1_S24_F675_4.0                 1.000000e+00
L1_S24_F675_5.0                 1.000000e+00
L1_S24_F675_nan                 1.000000e+00
L1_S24_F675_nan.1               0.000000e+00
L1_S24_F1510_1.0                1.000000e+00
L1_S24_F1510_2.0                1.000000e+00
L1_S24_F1510_3.0                1.000000e+00
L1_S24_F1510_4.0                1.000000e+00
L1_S24_F1510_5.0                1.000000e+00
L1_S24_F1510_nan                1.000000e+00
L1_S24_F1510_nan.1              0.000000e+00
L1_S24_F1582_1.0                1.000000e+00
L1_S24_F1582_nan                1.000000e+00
L1_S24_F1582_nan.1              0.000000e+00
L1_S24_F1584_1.0                1.000000e+00
L1_S24_F1584_nan                1.000000e+00
L1_S24_F1584_nan.1              0.000000e+00
L1_S24_F1585_1372.0             1.000000e+00
L1_S24_F1585_16.0               1.000000e+00
L1_S24_F15

In [25]:
np.finfo(np.float32).max

3.4028235e+38

In [40]:
print train.shape, test.shape

(509886, 681) (509245, 680)


In [38]:
print test.isnull().sum().sum(), '\n \n', train.isnull().sum().sum()

0 
 
0


In [39]:
print test.dtypes.value_counts(), '\n \n',  train.dtypes.value_counts()

float64    674
int64        6
dtype: int64 
 
float64    674
int64        7
dtype: int64
