In [1]:
from keras.layers import Dense, Dropout, Embedding, Flatten, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.models import Model

from time import time
import datetime
from itertools import combinations
import pickle

import numpy as np
import pandas as pd
from scipy import sparse, mod
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [2]:
train = pd.read_csv('./data/train.csv')
train_label = train['target']
train_id = train['id']

del train['target'], train['id']

test = pd.read_csv('./data/test.csv')
test_id = test['id']
del test['id']

# 기초 통계 변수 생성

In [42]:
def proj_num_on_cat(train_df, test_df, target_column, group_column) :
    
    train_df['row_id'] = range(train_df.shape[0])
    test_df['row_id'] = range(test_df.shape[0])
    train_df['train'] = 1; test_df['train'] = 0
    
    #train + test
    all_df = train_df[['row_id','train',target_column, group_column]].append(test_df[['row_id','train',target_column, group_column]])
    grouped = all_df[[target_column, group_column]].groupby(group_column)
    
    #size, mean, median, max, min
    the_size = pd.DataFrame(grouped.size()).reset_index()
    the_size.columns = [group_column, '%s_size' % target_column]
    the_mean = pd.DataFrame(grouped.mean()).reset_index()
    the_mean.columns = [group_column, '%s_mean' % target_column]
    the_std = pd.DataFrame(grouped.std()).reset_index()
    the_std.columns = [group_column, '%s_std' % target_column]
    the_median = pd.DataFrame(grouped.median()).reset_index()
    the_median.columns = [group_column, '%s_median' % target_column]
    the_max = pd.DataFrame(grouped.max()).reset_index()
    the_max.columns = [group_column, '%s_max' % target_column]
    the_min = pd.DataFrame(grouped.min()).reset_index()
    the_min.columns = [group_column, '%s_min' % target_column]
    
    #통게 기반 파생 변수
    the_stats = pd.merge(the_size, the_mean)
    the_stats = pd.merge(the_stats, the_std)
    the_stats = pd.merge(the_stats, the_median)
    the_stats = pd.merge(the_stats, the_max)
    the_stats = pd.merge(the_stats, the_min)
    
    all_df = pd.merge(all_df, the_stats, how = 'left')
    
    #split train ,test
    selected_train = all_df[all_df.train == 1]
    selected_test = all_df[all_df.train == 0]
    
    selected_train.sort_values('row_id', inplace = True)
    selected_test.sort_values('row_id', inplace = True)
    
    selected_train.drop([target_column, group_column,'row_id','train'], axis = 1, inplace = True)
    selected_test.drop([target_column, group_column,'row_id','train'], axis = 1, inplace = True)
    
    selected_train, selected_test = np.array(selected_train), np.array(selected_test)
    
    return selected_train, selected_test

# 변수 간의 다양한 상호 작용 파생 변수

In [4]:
def interaction_features(train ,test, fea1, fea2, prefix) :
    train['inter_{}*'.format(prefix)] = train[fea1] * train[fea2]
    train['inter_{}/'.format(prefix)] = train[fea1] / train[fea2]
    
    test['inter_{}*'.format(prefix)] = test[fea1] * test[fea2]
    test['inter_{}/'.format(prefix)] = test[fea1] / test[fea2]
    
    return train, test

# 변수 생성

In [5]:
cat_fea = [x for x in list(train) if 'cat' in x]
bin_fea = [x for x in list(train) if 'bin' in x]

In [6]:
#the number of -1(NA)
train['missing'] = (train==-1).sum(axis = 1).astype(float)
test['missing'] = (test==-1).sum(axis = 1).astype(float)

In [7]:
#combination : 전체 리스트 n 개에서 2개씩 짝 지은 nC2의 모든 경우를 모두 보여줌
for e,(x,y) in enumerate(combinations(['ps_car_13','ps_ind_03','ps_reg_03','ps_ind_15','ps_reg_01','ps_ind_01'],2)) :
    train, test = interaction_features(train, test, x, y, e)

In [9]:
feature_names = list(train)

num_features = [c for c in feature_names if ('cat' not in c and 'calc' not in c)]
num_features.append('missing')

inter_features = [x for x in feature_names if 'inter' in x]

ind_features = [c for c in feature_names if 'ind' in c]

In [14]:
count = 0
for c in ind_features :
    if count == 0 :
        train['new_ind'] = train[c].astype(str)
        count += 1
    else :
        train['new_ind'] += '_' + train[c].astype(str)
count = 0        
for c in ind_features :
    if count == 0 :
        test['new_ind'] = test[c].astype(str)
        count +=1
    else :
        test['new_ind'] += '_' + test[c].astype(str)
        

reg_features = [c for c in feature_names if 'reg' in c]
count = 0
for c in reg_features :
    if count == 0 :
        train['new_reg'] = train[c].astype(str)
        count += 1
    else :
        train['new_reg'] += '_' + train[c].astype(str)
count = 0
for c in reg_features :
    if count == 0 :
        test['new_reg'] = test[c].astype(str)
        count += 1
    else :
        test['new_reg'] += '_' + test[c].astype(str)

        
car_features = [c for c in feature_names if 'car' in c]
count = 0
for c in car_features :
    if count == 0 :
        train['new_car'] = train[c].astype(str)
        count += 1
    else :
        train['new_car'] += '_' + train[c].astype(str)
count = 0
for c in car_features :
    if count == 0 :
        test['new_car'] = test[c].astype(str)
        count += 1
    else :
        test['new_car'] += '_' + test[c].astype(str)

In [16]:
train_cat = train[cat_fea]
train_num = train[[x for x in list(train) if x in num_features]]
test_cat = test[cat_fea]
test_num = test[[x for x in list(train) if x in num_features]]

In [20]:
max_cat_values = [] #라벨 인코딩을 하게 되면, 1,2,3,... 등으로 labeling이 되는데, max 값을 취하면 category의 개수?

for c in cat_fea :
    
    x = le.fit_transform(pd.concat([train_cat, test_cat])[c])
    
    train_cat[c] = le.transform(train_cat[c])
    test_cat[c] = le.transform(test_cat[c])
    
    max_cat_values.append(np.max(x))
    
#카테고리 변수들을 label encoding 한 후
#각 값들의 카운트 수를 다시 새로운 피쳐로!

cat_count_features = []
for c in cat_fea + ['new_ind', 'new_reg','new_car'] :
    d = pd.concat([train[c], test[c]]).value_counts().to_dict()
    
    train['%s_count'%c] = train[c].apply(lambda x : d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x : d.get(x,0))
    cat_count_features.append('%s_count'%c)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# import Xgboost features

In [35]:
train_fea0, test_fea0 = pickle.load(open('./data/fea0.pk', 'rb'))

In [37]:
#replace inf, nan -> 0 in numeric data
train_list = [train_num.replace([np.inf, -np.inf, np.nan], 0), train[cat_count_features], train_fea0]
test_list = [test_num.replace([np.inf, -np.inf, np.nan],0), test[cat_count_features], test_fea0]

In [43]:
for t in ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01'] :
    for g in ['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01', 'ps_ind_05_cat'] :
        if t!=g :
            s_train, s_test = proj_num_on_cat(train, test, target_column=t, group_column=g)
            train_list.append(s_train)
            test_list.append(s_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [45]:
X = sparse.hstack(train_list).tocsr()
X_test = sparse.hstack(test_list).tocsr()
all_data = np.vstack([X.toarray(), X_test.toarray()])

scaler = StandardScaler()
scaler.fit(all_data)

X = scaler.transform(X.toarray())
X_test = scaler.transform(X_test.toarray())

왜 위의 피쳐들만 이용해서 조합을 만들었는지는 알 수 없다 ㅜㅡㅜ

# Neural Network Model

In [49]:
def nn_model() :
    inputs = []
    flatten_layers = []
    
    for e, c in enumerate(cat_fea) :
        input_c = Input(shape = (1,), dtype = 'int32')
        num_c = max_cat_values[e]
        embed_c = Embedding(num_c, 6, input_length = 1)(inpu_c)
        embed_c = Dropout(0.25)(embed_c)
        flatten_c = Flatten()(embed_c)
        inputs.append(input_c)
        flatten_layers.append(flatten_c)
        
        
        

BatchNormalization	 Dense	 Dropout	 Embedding	 Flatten	 Input	 LabelEncoder	 Model	 PReLU	 
StandardScaler	 StratifiedKFold	 X	 X_test	 all_data	 bin_fea	 c	 car_features	 cat_count_features	 
cat_fea	 combinations	 count	 d	 datetime	 e	 feature_names	 g	 ind_features	 
inter_fea	 interaction_features	 le	 max_cat_values	 merge	 mod	 np	 num_features	 pd	 
pickle	 proj_num_on_cat	 reg_features	 s_test	 s_train	 scaler	 sparse	 t	 test	 
test_cat	 test_fea0	 test_id	 test_list	 test_num	 time	 train	 train_cat	 train_fea0	 
train_id	 train_label	 train_list	 train_num	 x	 y	 
