## AutoEncoder into a Feedforward Neural Net Regressor
_By Nick Brooks, July 2018_

This is my first attempt at an Auto-Encoder. I am **not** an authority.

**Sources:**
- [Keras Blog](https://blog.keras.io/building-autoencoders-in-keras.html)

**Load:**

In [None]:
import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import random
random.seed(2018)

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from keras.layers import Dense, Activation, Dropout, Input
from keras.optimizers import Adam, SGD, RMSprop
from keras import backend as K
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
import gc

# Viz
import matplotlib.pyplot as plt
import seaborn as sns

from keras.layers import TimeDistributed

# Utility
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend loss: square root of the mean squared error.

    The squared difference between ``y_pred`` and ``y_true`` is averaged
    over the last axis and then square-rooted, using the backend ops in ``K``.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error_last_axis = K.mean(squared_error, axis=-1)
    return K.sqrt(mean_squared_error_last_axis)

from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras.models import Model
from keras import regularizers
from sklearn.random_projection import SparseRandomProjection
from sklearn.feature_selection import VarianceThreshold

# Gradient Boosting
import lightgbm as lgb

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')

# Specify index/ target name
id_col = "ID"
target_var = "target"

# House Keeping Parameters
Debug = True  # when True, each file is cut down to a 100-row sample
Home = False  # when True, data is read from the local Santander folder
Build_Results_csv = False # if running for first time

# Log of cross-validation runs; one row is added per experiment.
results = pd.DataFrame(columns = ["Rounds","Score","STDV", "LB", "Parameters"])
# Use a plain boolean `and`: `&` binds tighter than `is`, so writing
# `A is True & B is True` is parsed as the chained `A is (True & B) is True`.
if Build_Results_csv and Home:
    results.to_csv("results.csv")

# The two environments differ only in where the CSV files live.
if Home:
    path = r"D:\My Computer\DATA\Santander"
    os.chdir(path)
    data_dir = ""
else:
    data_dir = "../input/"

print("Data Load Stage")
training = pd.read_csv(data_dir + 'train.csv', index_col = id_col)
if Debug:
    training = training.sample(100)
traindex = training.index
testing = pd.read_csv(data_dir + 'test.csv', index_col = id_col)
if Debug:
    testing = testing.sample(100)
testdex = testing.index

# Row count of the training frame; used later to split the stacked data again.
trainlen = len(traindex)

# The target is modelled on the log1p scale and removed from the features.
y = np.log1p(training[target_var])
del training[target_var]

for shape_message, frame_shape in (
        ('Train shape: {} Rows, {} Columns', training.shape),
        ('Test shape: {} Rows, {} Columns', testing.shape)):
    print(shape_message.format(*frame_shape))

# Stack train on top of test so both go through the same preprocessing.
print("Combine Train and Test")
df = pd.concat([training, testing], axis=0)
del training, testing
gc.collect()
n_rows, n_cols = df.shape
print('\nAll Data shape: {} Rows, {} Columns'.format(n_rows, n_cols))

**Model Dataset and Standardization:**

- Delete zero-variance columns and duplicate columns, using the [script](https://www.kaggle.com/seiya1998/lgbm-with-random-projection-and-aggregate-lb-1-41/code) by [Siya(Japan)](https://www.kaggle.com/seiya1998/lgbm-with-random-projection-and-aggregate-lb-1-41/code)

In [None]:
# # Remove Zero Variance Variables
# print("DF Variables # Before Zero Threshold Variance Transformer: ", df.shape[1])
# VT = VarianceThreshold(threshold=0.0)
# df = VT.fit_transform(df)
# print("DF Variables # after Zero Threshold Variance Transformer: ", df.shape[1])

In [None]:
# Training and Testing
# Explicit copies: the drops and later column assignments then act on
# independent frames instead of on slices of `df`.
test_df = df[trainlen:].copy()
X = df[0:trainlen].copy()

# Siya's Script
# A column is constant when its standard deviation over the training rows is 0.
colsToRemove = [col for col in X.columns if X[col].std() == 0]

# remove constant columns in the training set and in the test set
X = X.drop(colsToRemove, axis=1)
test_df = test_df.drop(colsToRemove, axis=1)

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))


colsToRemove = []   # every duplicate column, listed exactly once
colsScaned = set()  # columns already identified as a duplicate of an earlier one
dupList = {}        # first occurrence -> list of its later duplicates

columns = X.columns
# Pull the per-column arrays once instead of on every pairwise comparison.
column_values = [X[col].values for col in columns]

for i in range(len(columns)-1):
    # A column that duplicates an earlier one has no new duplicates of its own:
    # anything equal to it was already matched against that earlier column.
    if columns[i] in colsScaned:
        continue
    v = column_values[i]
    dupCols = [columns[j] for j in range(i+1, len(columns))
               if columns[j] not in colsScaned
               and np.array_equal(v, column_values[j])]
    if dupCols:
        colsScaned.update(dupCols)
        colsToRemove.extend(dupCols)
        dupList[columns[i]] = dupCols

# remove duplicate columns in the training set and in the testing set
X = X.drop(colsToRemove, axis=1)
test_df = test_df.drop(colsToRemove, axis=1)

# Report the number of columns dropped (len(dupList) would count groups instead).
print("Removed `{}` Duplicate Columns\n".format(len(colsToRemove)))

In [None]:
print("Aggregate")
# Per-column share of non-zero entries, measured on the training rows only.
nonzero_share = (X != 0).sum() / len(X)
weight = nonzero_share.values
print("Weight: ", weight)

# Masked views taken before any aggregate column is added: zeros become NaN,
# so the statistics drawn from them skip zero entries.
tmp_train = X[X != 0]
tmp_test = test_df[test_df != 0]
# tmp = pd.concat([X,test_df]) #RandomProjection

# The same sequence of row-wise features is appended to both frames.
for frame, nonzero in ((X, tmp_train), (test_df, tmp_test)):
    frame["weight_count"] = (nonzero * weight).sum(axis=1)
    # Computed on the frame as it stands, so "weight_count" is included here...
    frame["count_not0"] = (frame != 0).sum(axis=1)
    # ...and "weight_count" plus "count_not0" are included in this total.
    frame["sum"] = frame.sum(axis=1)
    # These use the masked view, i.e. the pre-aggregate columns only.
    for stat in ("var", "mean", "std", "max", "min"):
        frame[stat] = getattr(nonzero, stat)(axis=1)
    # FillNA
    frame.fillna(0, inplace=True)

In [None]:
# Scale for Neural Net
std_scale = preprocessing.StandardScaler()
X = std_scale.fit_transform(X)
# Apply the statistics learned from the training rows. Re-fitting on the test
# rows would put train and test on different scales.
test_df = std_scale.transform(test_df)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Deep AutoEncoder
I made the auto-encoder deeper because my validation error was always lower than my training error, so I tried increasing the model's capacity.

In [None]:
# Architecture
inputshape = X_train.shape[1]
input_data = Input(shape=(inputshape,))

# Auto-Encoder
# Encoder: input -> 512 -> 128
encoded = Dense(512, activation='relu')(input_data)
encoded = Dense(128, activation='relu')(encoded)
# encoded = Dense(64, activation='relu')(encoded)
#encoded = Dense(32, activation='relu')(encoded)
## No Mans Land ##
# Decoder: 128 -> 512 -> input width
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(512, activation='relu')(decoded)
# decoded = Dense(200, activation='relu')(decoded)
# The output layer must consume `decoded`; feeding it `encoded` would leave the
# two decoder layers above out of the model graph.
decoded = Dense(inputshape, activation='linear')(decoded)

# Compile
autoencoder = Model(input_data, decoded)
autoencoder.compile(optimizer='adam', loss=root_mean_squared_error)

In [None]:
# Early stopping watches the validation loss (min_delta=0.001, patience=3).
early_stopping = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=3, mode='auto')
callbacks_list = [early_stopping]

auto_encoder_params = dict(batch_size=32, verbose=1, epochs=200)

# The network is trained to reproduce its own input, so the features are used
# as both input and target for the training and the validation data.
auto_hist = autoencoder.fit(
    X_train, X_train,
    validation_data=(X_test, X_test),
    shuffle=True,
    callbacks=callbacks_list,
    **auto_encoder_params
)

In [None]:
# Model Evaluation
train_curve = auto_hist.history["loss"]
val_curve = auto_hist.history["val_loss"]

# Epoch (0-based index) with the lowest validation loss.
best = np.argmin(val_curve)
print("Optimal Epoch: ",best+1)
print("Train Score: {}, Validation Score: {}".format(train_curve[best], val_curve[best]))

plt.plot(train_curve, label='train')
plt.plot(val_curve, label='validation')
plt.xlabel("Epochs")
# The model is compiled with root_mean_squared_error, so the curves are RMSE.
plt.ylabel("Root Mean Squared Error")
plt.title("Auto-Encoder Train and Validation Error")
plt.legend()
# Save before show(): once the figure has been shown, savefig() writes an
# empty canvas.
plt.savefig("Train and Validation MSE Progression.png")
plt.show()

## Sparse Random Projection

In [None]:
print("Combine Train and Test")
# Stack the scaled train rows on top of the scaled test rows.
df = np.concatenate([X, test_df])
del X, test_df
gc.collect()
n_rows, n_cols = df.shape
print('All Data shape: {} Rows, {} Columns'.format(n_rows, n_cols))

# Run every row, train and test alike, through the trained network.
# NOTE(review): predict() on the full autoencoder returns its output layer
# (the reconstruction), not the bottleneck activations. Confirm this is the
# intended feature set.
encoded_df = autoencoder.predict(df)

# Sparse Random Projection
# No random_state is passed, so the projection differs from run to run.
SRP = SparseRandomProjection(n_components=500, dense_output=True)
encoded_df = SRP.fit_transform(encoded_df)
projected_shape = encoded_df.shape
print('All AutoEncoded and SparseRandomProjected shape: {} Rows, {} Columns'.format(*projected_shape))

In [None]:
# Modeling Datasets
# The first `trainlen` rows are the training rows; the remainder is the test set.
X, test_df = encoded_df[:trainlen], encoded_df[trainlen:]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=55)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

## Feed-Forward Neural Network

In [None]:
## Test with LGBM

In [None]:
# LGBM Dataset
# All training rows go into the Dataset handed to lgb.cv.
lgtrain = lgb.Dataset(data=X, label=y)
train_shape, test_shape = X.shape, test_df.shape
print("Starting LightGBM. Train shape: {}, Test shape: {}".format(train_shape, test_shape))

In [None]:
import lightgbm as lgb
print("Light Gradient Boosting Regressor: ")
lgbm_params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        # LightGBM's key is `bagging_freq`. An unrecognised key such as
        # `bagging_frequency` is ignored, which leaves bagging switched off.
        "bagging_freq" : 5,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }

modelstart= time.time()
# Find Optimal Parameters / Boosting Rounds
lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgtrain,
    num_boost_round=2000,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=75)

rmse_mean = lgb_cv['rmse-mean']
rmse_stdv = lgb_cv['rmse-stdv']
# argmin is a 0-based position in the history; the number of boosting rounds
# needed to reach that score is one more than the position.
best_iteration = int(np.argmin(rmse_mean))
optimal_rounds = best_iteration + 1
best_cv_score = rmse_mean[best_iteration]
best_cv_stdv = rmse_stdv[best_iteration]

print("\nOptimal Round: {}\nOptimal Score: {} + {}".format(
    optimal_rounds,best_cv_score,best_cv_stdv))

# Add this run to the results log. pd.concat is used because DataFrame.append
# no longer exists in pandas 2.x; the columns are pinned to the log's order.
result_row = {"Rounds": optimal_rounds,
              "Score": best_cv_score,
              "STDV": best_cv_stdv,
              "LB": None,
              "Parameters": lgbm_params}
results = pd.concat(
    [results, pd.DataFrame([result_row], columns=results.columns)],
    ignore_index=True)
if Home is True:
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)