# Setup

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Suppress warning messages: raise TensorFlow's logging threshold to ERROR
tf.logging.set_verbosity(tf.logging.ERROR)

# Set Parameters

In [3]:
NUM_LABELS = 99      # number of classes; passed to the classifier as n_classes
VAL_FRAC = 0.1       # fraction of the training rows set aside for cross validation

# Read and Scale Dataset

In [4]:
# Locations of the two input tables, then load both of them.
train_file, test_file = "data/train.csv", "data/test.csv"
train, test = pd.read_csv(train_file), pd.read_csv(test_file)

# The test ids are kept aside; they become the index of the submission table.
test_ids = test['id']

In [5]:
# Targets: fit an encoder on the species column and map each name to an integer id.
species = train['species']
le = LabelEncoder()
le.fit(species)
y_train = le.transform(species)

# Features: every column except the label and the row identifier.
x_train = train.drop(['species', 'id'], axis=1).values

In [6]:
# Test features: drop the row identifier and keep the remaining columns as an array.
test_features = test.drop(['id'], axis=1)
x_test = test_features.values

In [7]:
# The scaler is fitted on the training features only; the same fitted
# transform is then applied to both the training and the test features.
scaler = StandardScaler()
scaler.fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# Cross Validation Set

In [8]:
# Rows are samples, columns are features (x_train comes from DataFrame.values, so it is 2-D).
num_data, num_features = x_train.shape

# Size of the cross validation hold-out, rounded to a whole number of rows;
# the rest of the rows are used for training.
num_cv = int(np.around(VAL_FRAC * num_data))
num_train = num_data - num_cv

In [9]:
# Draw the hold-out rows from a seeded generator so that the same rows are
# held out on every run. With an unseeded draw the split changes on each
# execution, so validation metrics cannot be compared between runs.
# NOTE(review): the classifier further down writes to a fixed model_dir; if it
# resumes from checkpoints of an earlier run, a changing split would also let
# rows it has already trained on end up in the validation set - confirm.
split_rng = np.random.RandomState(0)

# Indices held out for cross validation (sampled without replacement) and
# the complementary indices used for training.
ind_cv = split_rng.choice(num_data, num_cv, replace=False)
ind_tr = np.delete(np.arange(num_data), ind_cv)

x_cv = x_train[ind_cv, :]
y_cv = y_train[ind_cv]

x_tr = x_train[ind_tr, :]
y_tr = y_train[ind_tr]

# Sanity check: the two subsets together account for every row exactly once.
assert x_cv.shape[0] == num_cv and x_tr.shape[0] == num_data - num_cv

# Fit model

In [10]:
# Select the arrays the model is fitted on. Candidates defined above:
#   x_train / y_train - the full training data
#   x_cv / y_cv       - the cross validation hold-out
#   x_tr / y_tr       - the training data left after removing the hold-out
X, Y = x_tr, y_tr

In [11]:
# Specify that all features have real-value data: a single unnamed
# real-valued column spanning all num_features input dimensions
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=num_features)]

# Choose optimizer
# (all options are disabled here; to use one, uncomment it together with the
# optimizer= argument in the classifier cell)
#opt_adam = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4)
#opt_adagrad = tf.train.AdagradOptimizer(learning_rate=1e-1)
#opt_adadelta = tf.train.AdadeltaOptimizer(learning_rate=1e-1)

# Choose DNN structure: candidate hidden-layer sizes, one entry per layer.
# The classifier cell below passes nn_mid as hidden_units.
nn_wide = [4096, 2048]
nn_mid = [2048, 1024, 512]
nn_narrow = [1024, 512, 256, 512]

# Dropout rate (passed to the classifier as dropout)
dropout = 0.1

# Number of training steps (passed to classifier.fit as steps)
steps_tr = 1000

In [12]:
# Build the DNN classifier from the settings chosen in the previous cell.
# NOTE(review): model_dir is a fixed path; if checkpoints from an earlier run
# are still there, training presumably resumes from them instead of starting
# fresh - confirm, and clear the directory when a clean run is wanted.
classifier = tf.contrib.learn.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=nn_mid,
    dropout=dropout,
    # optimizer=opt_adagrad,
    n_classes=NUM_LABELS,
    model_dir="kleafc_model",
)

# Train for steps_tr steps on the selected data, with features cast to float32.
X_float32 = X.astype("float32")
classifier.fit(x=X_float32, y=Y, steps=steps_tr)

DNNClassifier(dropout=0.1, optimizer=None, feature_columns=[_RealValuedColumn(column_name='', dimension=192, default_value=None, dtype=tf.float32)], hidden_units=[2048, 1024, 512])

# Training and Cross Validation Error

In [14]:
# Training Error
# Evaluate once on the training data and read both metrics from the returned
# dict, instead of running a full evaluation pass per metric.
metrics_tr = classifier.evaluate(X.astype("float32"), Y)
log_loss_tr = metrics_tr['loss']
acc_tr = metrics_tr['accuracy']

In [15]:
# Cross Validation Error
# Evaluate once on the held-out data and read both metrics from the returned
# dict, instead of running a full evaluation pass per metric.
metrics_cv = classifier.evaluate(x_cv.astype("float32"), y_cv)
log_loss_cv = metrics_cv['loss']
acc_cv = metrics_cv['accuracy']

In [16]:
# Report each metric on its own line, training figures first, then cross validation.
metric_report = (
    ('Training Log-Loss: {0:f}', log_loss_tr),
    ('Training Accuracy: {0:f}', acc_tr),
    ('Cross Validation Log-Loss: {0:f}', log_loss_cv),
    ('Cross Validation Accuracy: {0:f}', acc_cv),
)
for template, value in metric_report:
    print(template.format(value))

Training Log-Loss: 0.000784
Training Accuracy: 1.000000
Cross Validation Log-Loss: 0.017190
Cross Validation Accuracy: 1.000000


# Test Data Predictions

In [17]:
# Predicted class probabilities for the test features (cast to float32, as in
# training); the submission cell tabulates them with one column per class.
x_test_float32 = x_test.astype("float32")
y_test_prob = classifier.predict_proba(x_test_float32)

# Submit

In [18]:
# One row per test id, one column per encoded class name, written out as CSV.
submission_path = 'submission_dnn.csv'
submission = pd.DataFrame(data=y_test_prob, index=test_ids, columns=le.classes_)
submission.to_csv(submission_path)