In [20]:
%matplotlib inline

import h5py
from keras.layers import Input, Dense, Dropout, BatchNormalization, Activation
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.callbacks import Callback as cbs
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
import numpy as np
import pandas as pd
import os
from sklearn import model_selection, preprocessing, metrics
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
import time
import csv
import pickle
import warnings

#warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [21]:
RAW_DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data/raw_data'
DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data'
TRAIN_DATA = os.path.join(RAW_DATA_PATH, 'train.csv')
TEST_DATA = os.path.join(RAW_DATA_PATH, 'test.csv')
SAMPLE_SUBMISSION = os.path.join(RAW_DATA_PATH, 'sample_submission.csv')
SUBMISSION_PATH = os.path.join(DATA_PATH, 'submissions')
MODELS_PATH_NN = os.path.join(DATA_PATH, 'models/nn/')
ENSEMBLE_PATH = os.path.join(DATA_PATH, 'ensemble/malhotra')

In [22]:
train_df = pd.read_csv(TRAIN_DATA)
test_df = pd.read_csv(TEST_DATA)
sample_submission_df = pd.read_csv(SAMPLE_SUBMISSION)

In [23]:
filter_outliers = True
filter_XO_X8 = False
dnn_use_augment_features = True

# Preprocess data
for column in train_df.columns:
    cardinality = len(np.unique(train_df[column]))
    if cardinality == 1:
        train_df.drop(column, axis=1, inplace=True)
        test_df.drop(column, axis=1, inplace=True)

x0_x8 = ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]
original_train_columns = sorted(list(set(train_df.columns) - set(['ID', 'y'])))

for f in ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]:
    lbl = preprocessing.LabelEncoder()
    train_values = set(train_df[f].values)
    test_values = set(test_df[f].values)
    all_values = list(train_values | test_values)
    lbl.fit(all_values) 
    train_df[f] = lbl.transform(list(train_df[f].values))
    test_df[f] = lbl.transform(list(test_df[f].values))

if filter_outliers:
    print('Filtering outliers')
    # Filter out outlier y = 265.32
    train_df = train_df[train_df.y < 200]
    
if filter_XO_X8:
    print('Filtering XO-X8')
    train_df = train_df[list(set(train_df.columns) - set(x0_x8))]
    test_df = test_df[list(set(test_df.columns) - set(x0_x8))]
    
print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

Filtering outliers
train_df.shape (4208, 366)
test_df.shape (4209, 365)


In [24]:
n_comp = 10

# tSVD
# tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
# tsvd_results_train = tsvd.fit_transform(train_df.drop(["y"], axis=1))
# tsvd_results_test = tsvd.transform(test_df)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_df.drop(["y"], axis=1))
pca2_results_test = pca.transform(test_df)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_df.drop(["y"], axis=1))
ica2_results_test = ica.transform(test_df)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_df.drop(["y"], axis=1))
grp_results_test = grp.transform(test_df)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_df.drop(["y"], axis=1))
srp_results_test = srp.transform(test_df)

# Append decomposition components to datasets
for i in range(0, n_comp):
    train_df['pca_' + str(i)] = pca2_results_train[:, i]
    test_df['pca_' + str(i)] = pca2_results_test[:, i]

    train_df['ica_' + str(i)] = ica2_results_train[:, i]
    test_df['ica_' + str(i)] = ica2_results_test[:, i]

#     train_df['tsvd_' + str(i)] = tsvd_results_train[:, i]
#     test_df['tsvd_' + str(i)] = tsvd_results_test[:, i]

    train_df['grp_' + str(i)] = grp_results_train[:, i]
    test_df['grp_' + str(i)] = grp_results_test[:, i]

    train_df['srp_' + str(i)] = srp_results_train[:, i]
    test_df['srp_' + str(i)] = srp_results_test[:, i]

augmented_train_columns = sorted(list(set(train_df.columns) - set(['ID', 'y'])))
print('original columns', len(original_train_columns))
print('augmented columns', len(augmented_train_columns))
print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

original columns 364
augmented columns 404
train_df.shape (4208, 406)
test_df.shape (4209, 405)


In [13]:
train_df.to_csv(os.path.join(DATA_PATH, 'train_df_encoded.csv'))
test_df.to_csv(os.path.join(DATA_PATH, 'test_df_encoded.csv'))

In [25]:
Y = train_df['y'].values
X = train_df.drop(["y"], axis=1).set_index(['ID'], drop=True)
test_X = test_df.set_index(['ID'], drop=True)
print('X.shape', X.shape)
print('Y.shape', Y.shape)

X.shape (4208, 404)
Y.shape (4208,)


In [26]:
X.to_csv(os.path.join(DATA_PATH, 'X.csv'))
test_X.to_csv(os.path.join(DATA_PATH, 'test_X.csv'))
np.savetxt(os.path.join(DATA_PATH, 'Y.csv'), Y, delimiter=',')

In [None]:
BATCH_SIZE = 32
NUM_FOLDS = 5
kf = model_selection.KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
perf = []

def r2_keras(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return (1 - SS_res / (SS_tot + K.epsilon()))

class LossHistory(cbs):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))

for fold, (train_idxs, val_idxs) in enumerate(kf.split(X)):
    timestamp = str(int(time.time()))
    os.mkdir(MODELS_PATH_NN + timestamp)
    print('Fold {} Training {}'.format(fold, timestamp))
    
    np.savetxt(os.path.join(MODELS_PATH_NN + timestamp, 'fold_{}_train_idxs.csv'.format(fold)), train_idxs, delimiter=',')
    np.savetxt(os.path.join(MODELS_PATH_NN + timestamp, 'fold_{}_val_idxs.csv'.format(fold)), val_idxs, delimiter=',')

    
    trn_X, val_X = X.iloc[train_idxs], X.iloc[val_idxs]
    trn_Y, val_Y = Y[train_idxs], Y[val_idxs]
    
    callbacks = [
        LossHistory(),
    #     EarlyStopping(
    #         monitor='val_r2_keras',
    #         min_delta=0.001
    #         patience=20,
    #         mode='max',
    #         verbose=0),
        ModelCheckpoint(
            MODELS_PATH_NN + timestamp + '/' + '{epoch:04d}-{r2_keras:.6f}-{val_r2_keras:.6f}.hdf5', 
            monitor='r2_keras', 
            save_best_only=True, 
            mode='max',
            verbose=0)
    ]

    input_dims = train_X.shape[1]

    # This returns a tensor
    inputs = Input(shape=(input_dims,))

    # a layer instance is callable on a tensor, and returns a tensor

    print('inputs.shape', inputs.shape)

    x = Dense(input_dims, activation='relu')(inputs)
    x = BatchNormalization()(x)
    #x = Activation('relu')(x)
    x = Dropout(0.2)(x)

    x = Dense(input_dims, activation='relu')(inputs)
    x = BatchNormalization()(x)
    #x = Activation('relu')(x)
    x = Dropout(0.3)(x)

    x = Dense(input_dims, activation='relu')(inputs)
    x = BatchNormalization()(x)
    #x = Activation('relu')(x)
    x = Dropout(0.3)(x)

    x = Dense(input_dims, activation='relu')(inputs)
    x = BatchNormalization()(x)
    #x = Activation('relu')(x)
    x = Dropout(0.3)(x)

    x = Dense(input_dims//2, activation='relu')(inputs)
    x = BatchNormalization()(x)
    #x = Activation('relu')(x)
    x = Dropout(0.3)(x)

    # Output Layer.
    outputs = Dense(1)(x)

    print('outputs.shape', outputs.shape)

    adamOptimizer = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=adamOptimizer,
                  loss='mean_squared_error', #'mean_squared_logarithmic_error',
                  metrics=[r2_keras, 'mae', 'mean_squared_logarithmic_error'])

    k_X, k_Y = trn_X.copy().values, trn_Y
    k_val_X = val_X.copy().values

    history = model.fit([k_X], [k_Y], epochs=5000, batch_size=BATCH_SIZE, verbose=0, validation_data=(k_val_X, val_Y), callbacks=callbacks)

Fold 0 Training 1498536945
inputs.shape (?, 404)
outputs.shape (?, 1)
Fold 1 Training 1498539281
inputs.shape (?, 404)
outputs.shape (?, 1)
Fold 2 Training 1498541615
inputs.shape (?, 404)
outputs.shape (?, 1)


In [52]:
def r2_keras(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return (1 - SS_res / (SS_tot + K.epsilon()))

model_id1 = '1498536945/2114-0.478986-0.537906.hdf5'
model_id2 = '1498539281/4127-0.507133-0.542570.hdf5'
model_id3 = '1498541615/4886-0.502786-0.596412.hdf5'
model_id4 = '1498543945/4910-0.501763-0.604505.hdf5'
model_id5 = '1498546273/4721-0.513207-0.574646.hdf5'

dataset_blend_train = np.zeros((X.shape[0]))
dataset_blend_test = np.zeros((test_X.shape[0]))
dataset_blend_test_j = np.zeros((test_X.shape[0], NUM_FOLDS))

X = pd.read_csv(os.path.join(DATA_PATH, 'X.csv')).set_index(['ID'], drop=True)
test_X = pd.read_csv(os.path.join(DATA_PATH, 'test_X.csv')).set_index(['ID'], drop=True)
Y = np.loadtxt(os.path.join(DATA_PATH, 'Y.csv'), delimiter=',')

k_test_X = test_X.copy().values
for fold, model_id in enumerate([model_id1, model_id2, model_id3, model_id4, model_id5]):
    model = load_model(MODELS_PATH_NN + model_id, custom_objects={'r2_keras': r2_keras})
    val_idxs_path = model_id.split('/')[0] + '/' + 'fold_{}_val_idxs.csv'.format(fold)
    val_idxs = np.loadtxt(MODELS_PATH_NN + val_idxs_path, dtype=np.int32)
    
    val_X = X.iloc[val_idxs]
    k_val_X = val_X.copy().values
    
    test_Y = model.predict(k_test_X).ravel()
    val_Y_pred = model.predict(k_val_X).ravel()
    
    dataset_blend_train[val_idxs] = val_Y_pred
    dataset_blend_test_j[:, fold] = test_Y

dataset_blend_test = dataset_blend_test_j.mean(1)

val_r2_mean = '56370'
ts = str(int(time.time()))

train_blend_df = pd.DataFrame(data=dataset_blend_train, index=X.index, columns=['y'])
test_blend_df = pd.DataFrame(data=dataset_blend_test, index=test_X.index, columns=['y'])

train_blend_df.to_csv(os.path.join(ENSEMBLE_PATH, 'malhot_dnn5fold_{}_{}_{}_train.csv'.format(ts, 0, val_r2_mean)), index=True)
test_blend_df.to_csv(os.path.join(ENSEMBLE_PATH, 'malhot_dnn5fold_{}_{}_{}_test.csv'.format(ts, 0, val_r2_mean)), index=True)

In [28]:
submission_df = test_df[['ID']]
submission_df['y'] = test_Y.tolist()
submission_df.to_csv(os.path.join(SUBMISSION_PATH, 'submission-' + str(int(time.time())) + '.csv'), index=False)
print('Generated submission ', os.path.join(SUBMISSION_PATH, 'submission-' + str(int(time.time())) + '.csv'))

NameError: name 'test_Y' is not defined