In [1]:
import numpy as np

# Data Extraction
import pandas as pd

# Machine Learning
import tensorflow as tf
import sklearn

# Parameters

In [2]:
#data
clean_data_path = "../dataset/clean_data.csv"

# 2. Implementation <a class="anchor" id="implementation"></a>

## 2.2. Fetch Clean Data

In [5]:
from sklearn.model_selection import train_test_split
def split_data(data, labels, train_perc):
    
    test_perc = round(1-train_perc, 2)
    x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=train_perc, test_size=test_perc, shuffle=False)

    return x_train, x_test, y_train, y_test

In [6]:
df = pd.read_csv(clean_data_path, sep=',', encoding='ISO-8859-1', header=None)
clean_data = np.array(df)

# get rid of rows containing "nan" in clean data file
rows_to_delete = []
for i, row in enumerate(clean_data):
    for j, val in enumerate(row):
        if (str(row[j]).strip() == 'nan'):
            print("> Deleting row: " + str(row))
            rows_to_delete.append(i)
            break
clean_data = np.delete(clean_data, rows_to_delete, 0)

# don't include the last column; where the labels are
data = (clean_data[:,:-1])

# reshape from (m,) to (m,1), then convert into one-hot vector (m,k)
y = pd.get_dummies(clean_data[:,-1]).values # also converting to one-hot vector using pandas

print("> data matrix shape: " + str(data.shape))
print("> labels (y) shape: " + str(y.shape))

train_perc = .7 # percentage of total data used for training
x_train, x_test, y_train, y_test = split_data(data, y, train_perc) # randomly splitting up the data
m = x_train.shape[0] # number of tuples for training
n = data.shape[1] # number of features
k = len(y[0]) # number of classes

print("> m (training samples) = " + str(m) + "\n> n (num. features)= " + str(n) + "\n> k (num. classes) = " + str(k))

> data matrix shape: (4000, 5)
> labels (y) shape: (4000, 5)


TypeError: Invalid parameters passed: {'shuffle': False}

In [None]:
y_rand = pd.get_dummies((np.floor(np.random.rand(len(y_test), 1)*5).astype(int)).flatten()).values
print("> y_rand shape: " + str(y_rand.shape))

# Neural Network Specific Functions

In [None]:
# neural network
num_epochs = 4000        # number of Epochs(forward+backward prop) to run
learning_rate = 0.001     # learning rate of the optimizers
HL_size = 5            # number of perceptrons in the hidden layer

In [None]:
def model(data, num_feat, num_class, HL_size):
    hidden_1_layer = {'weights':tf.Variable(tf.random_normal([num_feat, HL_size])),
                      'biases': tf.Variable(tf.random_normal([HL_size]))}
    
    hidden_2_layer = {'weights':tf.Variable(tf.random_normal([HL_size, HL_size])),
                      'biases': tf.Variable(tf.random_normal([HL_size]))}
    
    hidden_3_layer = {'weights':tf.Variable(tf.random_normal([HL_size, HL_size])),
                      'biases': tf.Variable(tf.random_normal([HL_size]))}
    
    output_layer = {'weights':tf.Variable(tf.random_normal([HL_size, num_class])),
                      'biases': tf.Variable(tf.random_normal([num_class]))}
    
    # (input_data * weights) + biases
    
    l1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)
    
    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)
    
    l3 = tf.add(tf.matmul(l2, hidden_3_layer['weights']), hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)
    
    output = tf.add(tf.matmul(l3, output_layer['weights']), output_layer['biases'])
    
    return output

def run_neural_net(train_x, test_x, train_y, test_y):
    #Get the number of features and number of classes
    num_feat, num_class = len(train_x[0,:]), len(train_y[0,:])
    
    # height x width
    x = tf.placeholder('float',[None, num_feat])
    y = tf.placeholder('float')
    
    #Run the model
    prediction = model(x, num_feat, num_class, HL_size)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = prediction,labels = y))
    
    # learning_default = 0.001
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)    
    
    #The benchmark prediction
    benchmark_prediction = tf.equal(tf.argmax(y_rand, 1), tf.argmax(y, 1))
    
    with tf.Session() as s:
        s.run(tf.global_variables_initializer())
        
        for epoch in range(num_epochs):
            #print("epoch_x: ",epoch_x.shape,"epoch_y:",epoch_x.shape);
            _, epoch_loss = s.run([optimizer, cost], feed_dict = {x:train_x, y:train_y})
            if (epoch+1) % 1000 == 0:
                print('Epoch',epoch+1,'completed out of',num_epochs,'loss:',epoch_loss)
            
        #Actual Prediction
        correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))        
        accuracy = tf.reduce_mean(tf.cast(correct,'float'))
        accuracy_val = accuracy.eval({x:test_x, y:test_y})
        
        #Benchmark Prediction
        correct_bench = tf.equal(tf.argmax(prediction,1), tf.argmax(y_rand,1))
        accuracy_bench = tf.reduce_mean(tf.cast(correct_bench,'float'))
        accuracy_bench = accuracy_bench.eval({x:test_x, y:test_y})
        
        print('Accuracy:', accuracy_val, " Benchmark:",accuracy_bench)

In [None]:
run_neural_net(x_train, x_test, y_train, y_test)