In [1]:
import warnings
warnings.filterwarnings("ignore")

import re
import gc
import datetime
import glob
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

import tensorflow as tf
import keras.backend as K
from keras import optimizers
from keras.models import Model
from keras.optimizers import SGD, RMSprop, Adam
from keras.models import Sequential
from keras.layers import Input, Dropout, Dense, Activation, BatchNormalization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils import np_utils
from keras.regularizers import l2

Using TensorFlow backend.


In [3]:
def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

In [4]:
dir_path = '/disk/Tbrain/'

train_set = pd.read_csv(dir_path+'training-set.csv', header=None, names=['FileID', 'Target'])
test_set = pd.read_csv(dir_path+'testing-set.csv', header=None, names=['FileID', 'Target'])
train_ex = pd.read_table(dir_path+'exception/exception_train.txt', header=None, names=['FileID'])
test_ex = pd.read_table(dir_path+'exception/exception_testing.txt', header=None, names=['FileID'])

train_set = train_set.loc[~train_set['FileID'].isin(train_ex)]
test_set = test_set.loc[~test_set['FileID'].isin(test_ex)]

data = pd.concat([train_set, test_set],axis=0)
data['Target'].replace(0.5, np.nan, inplace=True)

# Given fold number for each FileID (Need for feature engineering with y, prevent for overfitting)
nrow_train = len(train_set)
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, random_state=5566)
data['Fold'] = np.nan
for f, (_, valid_idx) in enumerate(skf.split(data['FileID'].iloc[:nrow_train], data['Target'].iloc[:nrow_train])):
    data['Fold'].iloc[valid_idx] = f

cv_folds = data['Fold']
cv_folds.dropna(inplace=True)
y = data['Target']
y = y.dropna()

X_data = load_sparse_csr('/disk/albert/Top1/new_process_v3.npz')


del train_ex, test_ex, data
gc.collect()

40

In [7]:
X_data = pd.DataFrame(X_data.toarray())
tmp = X_data.isnull().sum()
X_data.drop(tmp[tmp>0].index.tolist(), axis=1, inplace=True)
X_data = csr_matrix(X_data.values)

In [21]:
# Define NN model
def nn_model():
    model = Sequential()
    model.add(Dense(4096, activation='tanh', input_dim=train_X.shape[1]))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
    model.add(Dense(512, activation='elu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))    
    model.add(Dense(128, activation='elu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))
    model.add(Dense(16, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

### Submit Final bagging answer

In [22]:
# Reset data
train_X = X_data[:len(train_set)]
test_X = X_data[len(train_set):]
train_y = y[:len(train_set)]

n_bags = 10
predsN = 0

for fold in range(n_folds):
    train_idx, valid_idx = cv_folds.loc[cv_folds != fold].index, cv_folds.loc[cv_folds == fold].index

    X_train, y_train = train_X[train_idx], train_y.iloc[train_idx]
    X_valid, y_valid = train_X[valid_idx], train_y.iloc[valid_idx]
    
    for bag in range(n_bags):
        # Train model
        model = nn_model()
        model.fit(X_train, y_train, batch_size=256, epochs=10, class_weight='auto', verbose=0)
        tmpN = model.predict_proba(X_valid).squeeze()
        predsN += model.predict_proba(test_X).squeeze()
        
        print("NN validation AUC: {0:.6f}".format(roc_auc_score(y_valid, tmpN)))
        # Clearing session
        K.clear_session()
    print('='*30)

predsN /= (n_folds*n_bags)
print('Predict NN completed.')

gc.collect()

NN validation AUC: 0.966674
NN validation AUC: 0.967656
NN validation AUC: 0.967792
NN validation AUC: 0.968849
NN validation AUC: 0.968289
NN validation AUC: 0.965319
NN validation AUC: 0.967960
NN validation AUC: 0.967596
NN validation AUC: 0.967455
NN validation AUC: 0.968366
NN validation AUC: 0.962891
NN validation AUC: 0.961262
NN validation AUC: 0.964137
NN validation AUC: 0.963599
NN validation AUC: 0.963614
NN validation AUC: 0.966314
NN validation AUC: 0.966764
NN validation AUC: 0.961669
NN validation AUC: 0.964231
NN validation AUC: 0.961384
NN validation AUC: 0.962729
NN validation AUC: 0.963994
NN validation AUC: 0.959669
NN validation AUC: 0.961318
NN validation AUC: 0.960211
NN validation AUC: 0.963135
NN validation AUC: 0.959233
NN validation AUC: 0.961692
NN validation AUC: 0.962204
NN validation AUC: 0.961616
NN validation AUC: 0.971178
NN validation AUC: 0.970822
NN validation AUC: 0.970806
NN validation AUC: 0.970733
NN validation AUC: 0.970908
NN validation AUC: 0

21210

In [23]:
submit = pd.concat([test_set[['FileID']], pd.Series(predsN)], axis=1)
submit.columns = ['FileID', 'Probability']
submit.to_csv('./NN_{0}bag_{1}.csv'.format(n_bags*n_folds, re.sub('-', '', str(datetime.date.today())[5:])),
              index=False)