In [1]:
import numpy as np

# Data Extraction
import pandas as pd

# Machine Learning
import tensorflow as tf
import sklearn

# Fetch Clean Data

In [2]:
# Location of the cleaned CSV that the loading cell below reads with pd.read_csv.
# Alternative dataset, kept for quick switching (swap it with the active line):
# clean_data_path = "../dataset/clean_data.csv"
clean_data_path = "../dataset/_ambari_clean_data.csv"

In [3]:
from sklearn.model_selection import train_test_split
def split_data(data, labels, train_perc):
    """Split samples and labels into stratified train and test partitions.

    Args:
        data: Feature matrix with one row per sample.
        labels: Labels aligned row-by-row with ``data``; also used as the
            stratification target.
        train_perc: Forwarded to ``train_test_split`` as ``train_size``
            (a fraction of the data, or an absolute sample count).

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``.
    """
    # test_size is passed as None on purpose: scikit-learn then uses the
    # complement of train_size. Computing it by hand as `1 - train_perc`
    # suffers from floating point error (e.g. 1 - 0.7 == 0.30000000000000004,
    # which makes train + test exceed 1), and rounding that to two decimals
    # yields an invalid test_size of 0.0 for train_perc >= 0.995.
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels,
        train_size=train_perc,
        test_size=None,
        random_state=42,  # fixed seed so the split is reproducible
        stratify=labels,
    )
    return x_train, x_test, y_train, y_test

In [4]:
# Read the cleaned CSV (no header row) and work on it as a plain numpy array.
df = pd.read_csv(clean_data_path, sep=',', encoding='ISO-8859-1', header=None)
clean_data = np.array(df)

# Drop every row that has at least one cell whose string form is "nan".
rows_to_delete = []
for i, row in enumerate(clean_data):
    if any(str(cell).strip() == 'nan' for cell in row):
        print("> Deleting row: " + str(row))
        rows_to_delete.append(i)
clean_data = np.delete(clean_data, rows_to_delete, axis=0)

# Features are all columns but the last one; the last column holds the labels.
data = clean_data[:, :-1]

# One-hot encode the label column with pandas -> array with one column per class.
y = pd.get_dummies(clean_data[:, -1]).values

print("> data matrix shape: " + str(data.shape))
print("> labels (y) shape: " + str(y.shape))

# Fraction of the data handed to split_data as the training share.
train_perc = .1
x_train, x_test, y_train, y_test = split_data(data, y, train_perc)

# m: training rows, n: feature columns, k: width of a one-hot label row.
m, n, k = x_train.shape[0], data.shape[1], len(y[0])

print("> m (training samples) = " + str(m) + "\n> n (num. features)= " + str(n) + "\n> k (num. classes) = " + str(k))

> data matrix shape: (1000, 5)
> labels (y) shape: (1000, 5)
> m (training samples) = 100
> n (num. features)= 5
> k (num. classes) = 5


In [5]:
# Random-guess baseline: one uniformly drawn class per test sample, one-hot encoded.
# Indexing an identity matrix always yields exactly k columns. pd.get_dummies
# would silently drop the column of any class that happens not to be drawn,
# and the class count used to be hard-coded as 5 instead of following k.
random_class_ids = np.floor(np.random.rand(len(y_test)) * k).astype(int)  # values in [0, k-1]
y_rand = np.eye(k, dtype=np.uint8)[random_class_ids]
print("> y_rand shape: " + str(y_rand.shape))

> y_rand shape: (900, 5)


# Neural Network (Luisa)

In [None]:
def apply_activation_function(X, W, b, func='softmax'):
    """Compute ``activation(X @ W + b)`` for one layer.

    Args:
        X: Input tensor of the layer.
        W: Weight matrix multiplied with ``X``.
        b: Bias added to the matrix product.
        func: ``'softmax'`` (default) or ``'relu'``; any other value selects
            the sigmoid activation.

    Returns:
        The activated layer output tensor.
    """
    # The affine part is identical for every activation, so build it once.
    pre_activation = tf.add(tf.matmul(X, W), b)

    if func == 'softmax':
        return tf.nn.softmax(pre_activation)
    if func == 'relu':
        return tf.nn.relu(pre_activation)
    # Fallback for every other name: sigmoid.
    return tf.sigmoid(pre_activation)

In [None]:
def get_cost(y, y_):
    """Return the mean softmax cross-entropy between labels and predictions.

    Args:
        y: Target labels, passed as ``labels``.
        y_: Network output, passed as ``logits``. TensorFlow applies softmax
            to this argument internally, so it is expected to be unscaled.

    Returns:
        Scalar tensor: the cross-entropy averaged over all samples.
    """
    per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_)
    return tf.reduce_mean(per_sample_loss)


In [None]:
# Using multiple layers
def get_output_layer(n_hidden_layers, X, n, k, n_perceptrons, hidden_activation='softmax'):
    """Build a fully connected network on top of ``X`` and return its output logits.

    The network consists of one input-to-hidden layer, ``n_hidden_layers``
    additional hidden-to-hidden layers (so ``n_hidden_layers + 1`` activated
    hidden layers in total) and one linear output layer. All weights and
    biases are fresh ``tf.Variable`` objects initialised with
    ``tf.random_normal``.

    Args:
        n_hidden_layers: Number of hidden-to-hidden layers created after the
            first hidden layer.
        X: Input tensor whose last dimension is ``n``.
        n: Number of input features.
        k: Number of output units (classes).
        n_perceptrons: Width of every hidden layer.
        hidden_activation: Activation name forwarded to
            ``apply_activation_function`` for the hidden layers. Defaults to
            ``'softmax'`` to keep the previous hidden-layer behaviour;
            ``'relu'`` is the more common choice.

    Returns:
        Tensor with ``k`` columns holding the *unscaled* output logits.
    """
    # (fan_in, fan_out) of every layer, from the input side to the output side.
    layer_shapes = ([(n, n_perceptrons)]
                    + [(n_perceptrons, n_perceptrons)] * n_hidden_layers
                    + [(n_perceptrons, k)])

    layer_weights = [{'W': tf.Variable(tf.random_normal([fan_in, fan_out])),
                      'b': tf.Variable(tf.random_normal([fan_out]))}
                     for fan_in, fan_out in layer_shapes]

    # Every layer except the last one is an activated hidden layer.
    aggregated_val = X
    for layer in layer_weights[:-1]:
        aggregated_val = apply_activation_function(aggregated_val, layer['W'], layer['b'],
                                                   hidden_activation)

    # The output layer stays linear: the cost is computed with
    # tf.nn.softmax_cross_entropy_with_logits, which applies softmax itself.
    # Applying softmax here as well (as was done before) squashes the logits
    # twice and distorts the loss and its gradients. argmax-based accuracy is
    # unaffected because softmax preserves the ordering of the logits.
    output_layer = layer_weights[-1]
    return tf.add(tf.matmul(aggregated_val, output_layer['W']), output_layer['b'])

In [None]:
def run_model(n_hidden_layers, X, y, n, learning_rate, epochs, k, init_perceptrons, total_perceptrons, step):
    """Train and evaluate the network once per hidden-layer width.

    For each width a new network is built with ``get_output_layer``, trained
    with full-batch gradient descent for ``epochs`` iterations, and scored on
    the test split. The accuracy of a random benchmark is printed next to it.

    NOTE: besides its arguments, this function reads the notebook globals
    ``x_train``, ``y_train``, ``x_test``, ``y_test`` and ``y_rand``.

    Args:
        n_hidden_layers: Forwarded to ``get_output_layer``.
        X: Placeholder fed with the feature matrix.
        y: Placeholder fed with the one-hot labels.
        n: Number of input features.
        learning_rate: Step size of the gradient descent optimizer.
        epochs: Number of training iterations per width.
        k: Number of classes.
        init_perceptrons: First hidden-layer width to try.
        total_perceptrons: If equal to ``init_perceptrons`` only that single
            width is tried; otherwise widths go up to
            ``init_perceptrons + total_perceptrons`` (inclusive).
        step: Increment between consecutive widths.

    Returns:
        list: Test accuracy for each width tried, in the order they were run.
    """
    # to store the different accuracy values for each number of perceptrons used
    total_accuracy = []

    # if we are only trying with one set of perceptrons, adjust the upper bound for the "range" function below
    if (init_perceptrons == total_perceptrons):
        stop_cond = init_perceptrons + 1
    # otherwise, set the upper bound taking into account both the initial perceptrons, and the total wanted
    else:
        stop_cond = init_perceptrons + total_perceptrons + 1

    # perform the training for each number of perceptrons specified
    for n_nodes in range(init_perceptrons, stop_cond, step):

        print("> Using ", n_nodes, " perceptrons and " + str(n_hidden_layers) + " hidden layers ...")

        y_ = get_output_layer(n_hidden_layers, X, n, k, n_nodes)
        cost_function = get_cost(y, y_)

        # using gradient descent to minimize the cost
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)

        correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1)) # checking how many were predicted correctly
        benchmark_prediction = tf.equal(tf.argmax(y_rand, 1), tf.argmax(y, 1))

        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        benchmark_accuracy = tf.reduce_mean(tf.cast(benchmark_prediction, tf.float32))

        # --- TRAINING ---

        # collecting cost for each epoch for plotting
        total_cost = []
        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init_op)

            for epoch in range(epochs):

                _, c = sess.run([optimizer, cost_function], feed_dict={X:x_train, y:y_train})
                total_cost.append(c)

                if (epoch+1) % 1000 == 0:
                    print("  EPOCH:", (epoch+1), "Cost =", "{:.15f}".format(c))

            a = sess.run(accuracy, feed_dict={X: x_test, y: y_test})
            b_a = sess.run(benchmark_accuracy, feed_dict={y: y_test})
            total_accuracy.append(a)
            print("  >> Accuracy = " + "{:.5f}%".format(a*100) + " vs. Random = " + "{:.5f}%".format(b_a*100))

    # Hand the collected accuracies back to the caller; without this return
    # the function always evaluated to None.
    return total_accuracy
            

In [18]:
# Hyperparameters passed to run_model in the next cell.
n_hidden_layers = 1
learning_rate = 0.01
epochs = 10000 # cycles of feed forward + backpropagation

# used to observe the change in accuracy as number of perceptrons increases
# (init_perceptrons == total_perceptrons makes run_model try that single width only)
init_perceptrons = 200
total_perceptrons = 200
step = 25

# declare training data placeholders
X = tf.placeholder(tf.float32, [None, n]) # input features: one column per feature (n columns)
y = tf.placeholder(tf.float32, [None, k]) # one-hot labels: one column per class (k columns)

In [None]:
# Train and evaluate with the hyperparameters and placeholders declared in the previous cell.
total_acc = run_model(
    n_hidden_layers=n_hidden_layers,
    X=X,
    y=y,
    n=n,
    learning_rate=learning_rate,
    epochs=epochs,
    k=k,
    init_perceptrons=init_perceptrons,
    total_perceptrons=total_perceptrons,
    step=step,
)

# Neural Network (Tutorial)

In [6]:
# learning_rate = 0.01
# epochs = 10000
# batch_size = 100
# num_perceptrons = 100

# # declare training data placeholders
# X = tf.placeholder(tf.float32, [None, n]) # input x1, x2, ..., x5 (5 nodes), features
# y = tf.placeholder(tf.float32, [None, k]) # output (5 nodes), classes

In [7]:
# # now declare the weights connecting the input to the hidden layer
# W1 = tf.Variable(tf.random_normal([n, num_perceptrons], stddev=0.03), name='W1')
# b1 = tf.Variable(tf.random_normal([num_perceptrons]), name='b1')

# # and the weights connecting the hidden layer to the output layer
# W2 = tf.Variable(tf.random_normal([num_perceptrons, k], stddev=0.03), name='W2')
# b2 = tf.Variable(tf.random_normal([k]), name='b2')

In [8]:
# # calculate the output of the hidden layer
# hidden_out = tf.add(tf.matmul(X, W1), b1)
# hidden_out = tf.nn.relu(hidden_out)

In [9]:
# # now calculate the output layer - in this case, let's use a softmax-activated
# # output layer
# y_ = tf.nn.softmax(tf.add(tf.matmul(hidden_out, W2), b2))

In [10]:
# # clip the predictions to the range [1e-10, 0.9999999] so that we never end up with a log(0) operation
# y_clipped = tf.clip_by_value(y_, 1e-10, 0.9999999)
# cross_entropy = -tf.reduce_mean(tf.reduce_sum(y * tf.log(y_clipped) + (1 - y) * tf.log(1 - y_clipped), axis=1))

In [11]:
# # add an optimiser
# optimiser = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cross_entropy)

In [12]:
# # finally setup the initialisation operator
# init_op = tf.global_variables_initializer()

# # define an accuracy assessment operation
# correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

## Training

In [13]:
# def next_batch(num, data, labels):
    
#     idx = np.arange(0 , len(data))
#     np.random.shuffle(idx)
#     idx = idx[:num]
#     data_shuffle = [data[ i] for i in idx]
#     labels_shuffle = [labels[ i] for i in idx]

#     return np.asarray(data_shuffle), np.asarray(labels_shuffle)

In [14]:
# # start the session
# with tf.Session() as sess:
#     # initialise the variables
#     sess.run(init_op)
#     total_batch = int(len(x_train) / batch_size)
#     for epoch in range(epochs):
#         avg_cost = 0
#         for i in range(total_batch):
#             batch_x, batch_y = next_batch(batch_size, x_train, y_train)
#             _, c = sess.run([optimiser, cross_entropy], feed_dict={X: batch_x, y: batch_y})
#             avg_cost += c / total_batch
        
#         if (epoch+1) % 1000 == 0:
#             print("Epoch:", (epoch + 1), "cost =", "{:.9f}".format(avg_cost))
        
#     print(sess.run(accuracy, feed_dict={X: x_test, y: y_test}))

Epoch: 1000 cost = 0.684979320
Epoch: 2000 cost = 0.606092155
Epoch: 3000 cost = 0.522155821
Epoch: 4000 cost = 0.474051654
Epoch: 5000 cost = 0.446948349
Epoch: 6000 cost = 0.406118393
Epoch: 7000 cost = 0.390257120
Epoch: 8000 cost = 0.408621430
Epoch: 9000 cost = 0.379612952
Epoch: 10000 cost = 0.337954223
0.822222


# Neural Network (Alex)

In [15]:
# # neural network
# num_epochs = 4000        # number of Epochs(forward+backward prop) to run
# learning_rate = 0.001     # learning rate of the optimizers
# HL_size = 5            # number of perceptrons in the hidden layer

In [16]:
# def model(data, num_feat, num_class, HL_size):
#     hidden_1_layer = {'weights':tf.Variable(tf.random_normal([num_feat, HL_size])),
#                       'biases': tf.Variable(tf.random_normal([HL_size]))}
    
#     hidden_2_layer = {'weights':tf.Variable(tf.random_normal([HL_size, HL_size])),
#                       'biases': tf.Variable(tf.random_normal([HL_size]))}
    
#     hidden_3_layer = {'weights':tf.Variable(tf.random_normal([HL_size, HL_size])),
#                       'biases': tf.Variable(tf.random_normal([HL_size]))}
    
#     output_layer = {'weights':tf.Variable(tf.random_normal([HL_size, num_class])),
#                       'biases': tf.Variable(tf.random_normal([num_class]))}
    
#     # (input_data * weights) + biases
    
#     l1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
#     l1 = tf.nn.relu(l1)
    
#     l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']), hidden_2_layer['biases'])
#     l2 = tf.nn.relu(l2)
    
#     l3 = tf.add(tf.matmul(l2, hidden_3_layer['weights']), hidden_3_layer['biases'])
#     l3 = tf.nn.relu(l3)
    
#     output = tf.add(tf.matmul(l3, output_layer['weights']), output_layer['biases'])
    
#     return output

# def run_neural_net(train_x, test_x, train_y, test_y):
#     #Get the number of features and number of classes
#     num_feat, num_class = len(train_x[0,:]), len(train_y[0,:])
    
#     # height x width
#     x = tf.placeholder('float',[None, num_feat])
#     y = tf.placeholder('float')
    
#     #Run the model
#     prediction = model(x, num_feat, num_class, HL_size)
#     cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = prediction,labels = y))
    
#     # learning_default = 0.001
#     optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)    
    
#     #The benchmark prediction
#     benchmark_prediction = tf.equal(tf.argmax(y_rand, 1), tf.argmax(y, 1))
    
#     with tf.Session() as s:
#         s.run(tf.global_variables_initializer())
        
#         for epoch in range(num_epochs):
#             #print("epoch_x: ",epoch_x.shape,"epoch_y:",epoch_x.shape);
#             _, epoch_loss = s.run([optimizer, cost], feed_dict = {x:train_x, y:train_y})
#             if (epoch+1) % 1000 == 0:
#                 print('Epoch',epoch+1,'completed out of',num_epochs,'loss:',epoch_loss)
            
#         #Actual Prediction
#         correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))        
#         accuracy = tf.reduce_mean(tf.cast(correct,'float'))
#         accuracy_val = accuracy.eval({x:test_x, y:test_y})
        
#         #Benchmark Prediction
#         correct_bench = tf.equal(tf.argmax(prediction,1), tf.argmax(y_rand,1))
#         accuracy_bench = tf.reduce_mean(tf.cast(correct_bench,'float'))
#         accuracy_bench = accuracy_bench.eval({x:test_x, y:test_y})
        
#         print('Accuracy:', accuracy_val, " Benchmark:",accuracy_bench)

In [17]:
# run_neural_net(x_train, x_test, y_train, y_test)