In [1]:
import numpy as np

# Data Extraction
import pandas as pd

# Machine Learning
import tensorflow as tf
import sklearn

# Fetch Clean Data

In [2]:
def fetch_data(data_path):
    
    df = pd.read_csv(clean_data_path, sep=',', encoding='ISO-8859-1', header=None)
    clean_data = np.array(df)

    # get rid of rows containing "nan" in clean data file
    rows_to_delete = []
    for i, row in enumerate(clean_data):
        for j, val in enumerate(row):
            if (str(row[j]).strip() == 'nan'):
                print("> Deleting row: " + str(row))
                rows_to_delete.append(i)
                break
    clean_data = np.delete(clean_data, rows_to_delete, 0)

    # don't include the last column; where the labels are
    x = (clean_data[:,:-1])

    # retrieve the last column: the target/labels
    # reshape from (m,) to (m,1), then convert into one-hot vector (m,k)
    y = pd.get_dummies(clean_data[:,-1]).values # also converting to one-hot vector using pandas
    
    return x, y

# Class Distribution

In [3]:
def get_class_distribution(array):
    
    dist = []
    for elem in array: dist.append(np.argmax(elem))
        
    unique, counts = np.unique(dist, return_counts=True)
    
    counts = ["{:.2f}%".format(num/len(dist)*100) for num in counts]

    return (dict(zip(unique, counts)))

# Handle Imbalanced Data

In [4]:
from sklearn.datasets import make_classification

def oversample_data(x, y_onehot, alg='naive'):
        
    # convert y from one-hot to 1D
    y = []
    for elem in y_onehot: y.append(np.argmax(elem))

    if alg=='smote':
        from imblearn.over_sampling import SMOTE
        x_oversampled, y_oversampled = SMOTE().fit_sample(x, y)
    
    elif alg=='adasyn':
        from imblearn.over_sampling import ADASYN
        x_oversampled, y_oversampled = ADASYN().fit_sample(x, y)
        
    elif alg=='naive':
        from imblearn.over_sampling import RandomOverSampler
        ros = RandomOverSampler(random_state=0)
        x_oversampled, y_oversampled = ros.fit_sample(x, y)
        
    else:
        print("ERROR: This is not a valid algorithm.")

    # convert y back into a one-hot vector
    y_oversampled = pd.get_dummies(y_oversampled).values
    
    return x_oversampled, y_oversampled

# Split Data into Testing and Training Sets

In [5]:
from sklearn.model_selection import train_test_split
def split_data(data, labels, train_perc):
    
    test_perc = round(1-train_perc, 2)
    x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=train_perc,
                                                        test_size=test_perc, random_state=42, stratify=labels)
    return x_train, x_test, y_train, y_test

# Neural Network

In [6]:
def apply_activation_function(X, W, b, func='softmax'):
    
    if (func == 'softmax'): # softmax
        return tf.nn.softmax(tf.add(tf.matmul(X, W), b))
    if (func == 'relu'): # relu
        return tf.nn.relu(tf.add(tf.matmul(X, W), b))
    else: # sigmoid
        return tf.sigmoid(tf.add(tf.matmul(X, W), b))

In [7]:
def get_cost(y, y_):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_, labels=y))

In [8]:
# Using multiple layers
def get_output_layer(n_hidden_layers, X, n, k, n_perceptrons):
    
    layer_weights = []
    
    # input layer to first hidden layer
    layer_weights.append({'W': tf.Variable(tf.random_normal([n, n_perceptrons])),
                          'b': tf.Variable(tf.random_normal([n_perceptrons]))})
    
    # generate this many hidden layers
    for i in range(n_hidden_layers):
        layer_weights.append({'W': tf.Variable(tf.random_normal([n_perceptrons, n_perceptrons])),
                              'b': tf.Variable(tf.random_normal([n_perceptrons]))})

    # last hidden layer to output layer
    layer_weights.append({'W': tf.Variable(tf.random_normal([n_perceptrons, k])),
                          'b': tf.Variable(tf.random_normal([k]))})
            
    # calculate output-first hidden inner layer
    aggregated_val = apply_activation_function(X, layer_weights[0]['W'], layer_weights[0]['b'])
    
    # print("  aggregated_val.shape: " + str(aggregated_val.shape))
    
    # calculate all hidden layers and output layer
    for i in range(1, len(layer_weights)):
        aggregated_val = apply_activation_function(aggregated_val, layer_weights[i]['W'], layer_weights[i]['b'])
    
    # return final layer
    return aggregated_val

## Variables

In [9]:
def run_model(n_hidden_layers, X, y, n, learning_rate, epochs, k, init_perceptrons, total_perceptrons, step):
   
    # to store the different accuracy values for each number of perceptrons used
    total_accuracy = []
    
    # if we are only trying with one set of perceptrons, adjust the upper bound for the "range" function below
    if (init_perceptrons == total_perceptrons):
        stop_cond = init_perceptrons + 1
    # otherwise, set the upper bound taking into accout both the initial perceptrons, and the total wanted
    else:
        stop_cond = init_perceptrons + total_perceptrons + 1

    # perform the training for each number of perceptrons specified
    for n_nodes in range(init_perceptrons, stop_cond, step):

        print("> Using ", n_nodes, " perceptrons and " + str(n_hidden_layers) + " hidden layer(s) ...")

        y_ = get_output_layer(n_hidden_layers, X, n, k, n_nodes)
        cost_function = get_cost(y, y_)
        
        # using gradient descent to minimize the cost
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)

        correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1)) # checking how many were predicted correctly
        benchmark_prediction = tf.equal(tf.argmax(y_rand, 1), tf.argmax(y, 1))
        
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        benchmark_accuracy = tf.reduce_mean(tf.cast(benchmark_prediction, tf.float32))

        # --- TRAINING ---

        # collecting cost for each epoch for plotting
        total_cost = []
        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init_op)

            for epoch in range(epochs):

                _, c = sess.run([optimizer, cost_function], feed_dict={X:x_train, y:y_train})
                total_cost.append(c)

                if (epoch+1) % 1000 == 0:
                    print("  EPOCH:", (epoch+1), "Cost =", "{:.15f}".format(c))

            a = sess.run(accuracy, feed_dict={X: x_test, y: y_test})
            b_a = sess.run(benchmark_accuracy, feed_dict={y: y_test})
            
            total_accuracy.append(a)
            
            print("\n  >> Accuracy = " + "{:.5f}%".format(a*100) + " vs. Random = " + "{:.5f}%".format(b_a*100))

# Implementation

### Fetching Clean Data

In [10]:
project_name = 'all data'
clean_data_path = "../dataset/clean_data.csv"
# clean_data_path = "../dataset/" + project_name + "_clean_data.csv"

In [11]:
x, y = fetch_data(clean_data_path)

<font color=red>DELETING FEATURES
</font>

In [12]:
# TEMP: Delete some features and see how the net performs
x = np.delete(x,[1,2,4], axis=1)

**Obtain the class distribution of the data, and adjust it if it's imbalanced.**

In [13]:
dist = get_class_distribution(y)

print("\nProject: " + (project_name).upper())
print("\nData Distribution")
print(dist)


Project: ALL DATA

Data Distribution
{0: '1.20%', 1: '4.12%', 2: '65.00%', 3: '25.07%', 4: '4.60%'}


In [14]:
alg = 'naive' # naive, smote, adasyn
x, y = oversample_data(x, y, alg)

In [15]:
dist = get_class_distribution(y)

print("\nProject: " + (project_name).upper())
print("\nData Distribution")
print(dist)


Project: ALL DATA

Data Distribution
{0: '20.00%', 1: '20.00%', 2: '20.00%', 3: '20.00%', 4: '20.00%'}


### Neural Network

**Declare variables**

In [16]:
n_hidden_layers = 1
learning_rate = 0.01
epochs = 10000 # cycles of feed forward + backpropagation

# used to observe the change in accuracy as number of perceptrons increases
init_perceptrons = 10
total_perceptrons = 10
step = 25 # changing from init_perceptrons to total_perceptrons

**Split data into training and testing sets**

In [17]:
train_perc = .7 # percentage of total data used for training
x_train, x_test, y_train, y_test = split_data(x, y, train_perc) # randomly splitting up the data

# setting m, n and k variables for placeholder definitions later on
m = x_train.shape[0] # number of tuples for training
n = x.shape[1] # number of features
k = len(y[0]) # number of classes

print("> m (training samples) = " + str(m) + "\n> n (num. features)= " + str(n) + "\n> k (num. classes) = " + str(k))

> m (training samples) = 9100
> n (num. features)= 2
> k (num. classes) = 5


**Based on the testing set, generate a random solution as a benchmark for comparisson in terms of accuracy.**

In [18]:
y_rand = pd.get_dummies((np.floor(np.random.rand(len(y_test), 1)*5).astype(int)).flatten()).values
print("> y_rand shape: " + str(y_rand.shape))

> y_rand shape: (3900, 5)


**Run the neural network model**

In [19]:
# declare training data placeholders
X = tf.placeholder(tf.float32, [None, n]) # input x1, x2, x3, ..., x12 (12 nodes)
y = tf.placeholder(tf.float32, [None, k]) # output (5 nodes)

In [20]:
# run model
total_acc = run_model(n_hidden_layers, X, y, n, learning_rate, epochs, k, init_perceptrons,
                        total_perceptrons, step)

> Using  10  perceptrons and 1 hidden layer(s) ...
  EPOCH: 1000 Cost = 1.604899287223816
  EPOCH: 2000 Cost = 1.603291988372803
  EPOCH: 3000 Cost = 1.601563811302185
  EPOCH: 4000 Cost = 1.599710464477539
  EPOCH: 5000 Cost = 1.597723484039307
  EPOCH: 6000 Cost = 1.595571517944336
  EPOCH: 7000 Cost = 1.593298673629761
  EPOCH: 8000 Cost = 1.590878605842590
  EPOCH: 9000 Cost = 1.588327646255493
  EPOCH: 10000 Cost = 1.585708022117615

  >> Accuracy = 32.64103% vs. Random = 20.51282%
