# Autoencoder for feature compression using TFLearn

In [15]:
from __future__ import division, print_function, absolute_import # Ensure no issues with 
import tflearn
import tensorflow as tf 

import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import random
from sklearn import preprocessing
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error

def read_csv(csv_file):
    """Load ``<csv_file>.csv`` and return its values plus the DataFrame.

    Parameters
    ----------
    csv_file : str
        Path of the file *without* the ``.csv`` extension; the extension
        is appended here.

    Returns
    -------
    array : numpy.ndarray
        The data columns as float64. Flattened to 1-D when the file has
        exactly one data column.
    df : pandas.DataFrame
        The parsed file; the first CSV column is used as the index and the
        first row as the header.
    """
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
    # index_col=0 and parse_dates=True reproduce its defaults with read_csv.
    df = pd.read_csv(csv_file + ".csv", header=0, sep=',', index_col=0,
                     parse_dates=True)
    array = np.asarray(df, dtype="float64")
    # A single-column file is returned as a flat vector.
    if array.ndim == 2 and array.shape[1] == 1:
        array = array.ravel()
    return array, df

def publish_pred(y_pred, file_name, sample_file="reg_sample_submission.csv"):
    """Write predictions to a CSV that reuses a sample submission's layout.

    Parameters
    ----------
    y_pred : array-like
        Values assigned to the ``Output`` column; must have one entry per
        row of the sample file (pandas raises ``ValueError`` otherwise).
    file_name : str
        Path of the CSV file to write.
    sample_file : str, optional
        Template CSV whose first column is used as the index. Defaults to
        ``reg_sample_submission.csv`` in the working directory.
    """
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
    # index_col=0 and parse_dates=True reproduce its defaults with read_csv.
    df = pd.read_csv(sample_file, header=0, sep=',', index_col=0,
                     parse_dates=True)
    df["Output"] = y_pred
    df.to_csv(file_name)

# Load data

In [2]:
# read_csv already returns float64 NumPy arrays, so no extra cast is needed.
tr_in, df_in = read_csv("reg_train_in")
tr_out, df_out = read_csv("reg_train_out")
te_in, df = read_csv("reg_test_in")  # may contain NaNs that still need handling

# Read reg_test_gp2.csv without a header row or index column.
# DataFrame.from_csv was removed in pandas 1.0; pd.read_csv replaces it.
df = pd.read_csv("reg_test_gp2.csv", header=None, sep=',', index_col=None)

te_gp = np.asarray(df, dtype="float64")

In [3]:
# Boolean mask of the NaN cells in the raw test inputs.
NaN = np.isnan(te_in)
# Row index of every NaN cell in row-major order (a row is listed once per
# NaN it contains), derived from the data instead of fixed 1800 x 14 bounds.
NaN_rows = np.nonzero(NaN)[0].tolist()
# Rows of te_in that contain no NaN at all, in ascending order.
index = np.flatnonzero(~NaN.any(axis=1)).tolist()

# Stack the training inputs on top of the NaN-free test inputs.
X_full = np.concatenate((tr_in, te_in[index, :]), axis=0)

# Split data

In [4]:
# One shuffled 80/20 split; the seed is drawn at random from 0..20 each run.
cv = ShuffleSplit(n_splits=1, test_size=0.20, train_size=None,
                  random_state=random.randint(0, 20))
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(cv.split(X_full))
print("TRAIN:", len(train_index), "TEST:", len(test_index))

X_tr = X_full[train_index]
X_vl = X_full[test_index]

# Fit the scaler on the training part only, then apply it to both parts.
X_tr_scale = preprocessing.StandardScaler()
X_tr_scale.fit(X_tr)
X_train = X_tr_scale.transform(X_tr)
X_val = X_tr_scale.transform(X_vl)

# The separately loaded test matrix goes through the same scaling.
X_test = X_tr_scale.transform(te_gp)

TRAIN: 28480 TEST: 7120


# Build encoder and decoder

### Default settings of `fully_connected`
tflearn.layers.core.fully_connected(incoming, n_units, activation='linear', bias=True, weights_init='truncated_normal', bias_init='zeros', regularizer=None, weight_decay=0.001, trainable=True, restore=True, reuse=False, scope=None, name='FullyConnected')

In [5]:
# Encoder: 14 input features -> 8 units -> 5 units.
# No activation is passed, so fully_connected uses its default.
net_input = tflearn.input_data(shape=[None, 14])
enc_hidden = tflearn.fully_connected(net_input, 8)
encoder = tflearn.fully_connected(enc_hidden, 5)

# Decoder mirrors the encoder: 5 units -> 8 units -> 14 outputs.
dec_hidden = tflearn.fully_connected(encoder, 8)
decoder = tflearn.fully_connected(dec_hidden, 14)

In [6]:
# Training objective on the decoder output: mean-square loss with Adam.
net = tflearn.regression(
    decoder,
    optimizer='adam',
    learning_rate=0.001,
    loss='mean_square',
    metric=None,
)


In [7]:
model = tflearn.DNN(net, tensorboard_verbose=0)
# The network is trained to reproduce its input, so the same array is
# passed as both features and targets (for training and validation alike).
model.fit(
    X_train, X_train,
    n_epoch=50,
    validation_set=(X_val, X_val),
    batch_size=100,
    run_id="auto_encoder",
)


Training Step: 14249  | total loss: 0.01970 | time: 1.010s
| Adam | epoch: 050 | loss: 0.01970 -- iter: 28400/28480
Training Step: 14250  | total loss: 0.01967 | time: 2.035s
| Adam | epoch: 050 | loss: 0.01967 | val_loss: 0.02027 -- iter: 28480/28480
--


In [8]:
# Wrap the 5-unit bottleneck layer and the 14-unit output layer in their own
# DNN objects, both bound to the session of the model fitted above.
trained_session = model.session
encoding_model = tflearn.DNN(encoder, session=trained_session)
decoding_model = tflearn.DNN(decoder, session=trained_session)


In [20]:
# Run the scaled test matrix through the network up to the decoder output.
# reshape(-1, 14) keeps the 14-feature width the input layer expects while
# letting the row count follow the data instead of a fixed 1800.
X_test_auto = np.array(decoding_model.predict(X_test.reshape(-1, 14)))
# Undo the standardization so the reconstruction is in original units.
te_gp_auto = X_tr_scale.inverse_transform(X_test_auto)
np.savetxt('te_gp_auto.csv', te_gp_auto, delimiter=",")