## import the library we need.

In [1]:
import pandas as pd

# Dataset

## read csv file.

In [2]:
def read_dataset(path='wheat-seeds.csv'):
    """Load the wheat-seeds CSV into a DataFrame with named columns.

    The file has no header row, so it is read with ``header=None`` and the
    column names are assigned afterwards.

    Args:
        path: Location of the comma-separated data file. Defaults to
            'wheat-seeds.csv', the file this notebook reads.

    Returns:
        pandas.DataFrame with seven feature columns followed by 'variety'.

    Raises:
        ValueError: if the file does not contain exactly eight columns
            (raised by pandas when the column names are assigned).
    """
    column_names = [
        'area', 'perimeter', 'compactness', 'kernel_length', 'kernel_width',
        'asymmetry_coef', 'kernel_groove_length', 'variety',
    ]
    dataset = pd.read_csv(path, header=None, delimiter=',')
    dataset.columns = column_names
    return dataset
dataset = read_dataset()

**Reference:**
* read without header: https://stackoverflow.com/questions/29287224/pandas-read-in-table-without-headers
* columns names: https://archive.ics.uci.edu/ml/datasets/seeds

## check the data.

In [3]:
dataset.head()

Unnamed: 0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry_coef,kernel_groove_length,variety
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


# Code

In [4]:
# Backprop on the Seeds Dataset
from random import seed
from random import randrange
from random import random
from csv import reader
from math import exp

# Load a CSV file
def load_csv(filename):
	"""Read a CSV file into a list of rows, skipping blank lines.

	Args:
		filename: Path of the CSV file to read.

	Returns:
		A list with one entry per non-empty row; each entry is the list of
		field strings produced by ``csv.reader``.
	"""
	# newline='' hands line-ending handling to the csv module, so quoted
	# fields that contain line breaks are read back unchanged.
	with open(filename, 'r', newline='') as file:
		return [row for row in reader(file) if row]

# Convert string column to float
def str_column_to_float(dataset, column):
	"""Replace the text in one column with its float value, in place.

	Surrounding whitespace is stripped from each entry before conversion.

	Args:
		dataset: List of rows (each row a mutable sequence).
		column: Index of the column to convert.
	"""
	for record in dataset:
		raw_text = record[column]
		record[column] = float(raw_text.strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	"""Replace each distinct value in a column with an integer index, in place.

	Args:
		dataset: List of rows (each row a mutable sequence).
		column: Index of the column holding the class values.

	Returns:
		The dict mapping each original value to the index that replaced it.
	"""
	# Sorting makes the value -> index assignment deterministic. Iterating a
	# bare set of strings can give a different order from one interpreter
	# run to the next, which would change which index each class receives.
	labels = sorted(set(row[column] for row in dataset))
	lookup = {label: index for index, label in enumerate(labels)}
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Find the min and max values for each column
def dataset_minmax(dataset):
	"""Return ``[min, max]`` for every column of the dataset.

	Args:
		dataset: List of equally long rows.

	Returns:
		A list with one ``[min, max]`` pair per column, in column order.
		An empty dataset yields an empty list.
	"""
	# zip(*dataset) transposes the rows so each tuple holds one column.
	return [[min(column), max(column)] for column in zip(*dataset)]

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
	"""Rescale every column except the last to the range 0-1, in place.

	Each value becomes ``(value - column_min) / (column_max - column_min)``.
	The last column of each row is left untouched.

	Args:
		dataset: List of rows of numbers.
		minmax: One ``[min, max]`` pair per column, as returned by
			``dataset_minmax``.
	"""
	for row in dataset:
		for i in range(len(row)-1):
			low = minmax[i][0]
			span = minmax[i][1] - low
			# A constant column has zero span; map it to 0.0 instead of
			# dividing by zero.
			row[i] = (row[i] - low) / span if span else 0.0

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	"""Randomly partition the rows into ``n_folds`` equally sized folds.

	Rows are drawn without replacement from a copy of the dataset, so the
	caller's list is not modified. Each fold holds
	``int(len(dataset) / n_folds)`` rows; any remainder rows are not placed
	in a fold.

	Args:
		dataset: List of rows.
		n_folds: Number of folds to build.

	Returns:
		A list of ``n_folds`` lists of rows.
	"""
	remaining = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	folds = []
	for _ in range(n_folds):
		picked = []
		while len(picked) < fold_size:
			picked.append(remaining.pop(randrange(len(remaining))))
		folds.append(picked)
	return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	"""Return the percentage of positions where prediction equals truth.

	Args:
		actual: Sequence of true labels.
		predicted: Sequence of predicted labels, indexed like ``actual``.

	Returns:
		``matches / len(actual) * 100`` as a float.
	"""
	matches = sum(1 for idx in range(len(actual)) if actual[idx] == predicted[idx])
	return matches / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	"""Score an algorithm with k-fold cross validation.

	Each fold in turn is held out as the test set while the remaining folds
	are concatenated into the training set. Test rows are copies with the
	last element (the label) set to None before being given to the algorithm.

	Args:
		dataset: List of rows whose last element is the label.
		algorithm: Callable invoked as ``algorithm(train, test, *args)``
			that returns one prediction per test row.
		n_folds: Number of folds.
		*args: Extra arguments forwarded to ``algorithm``.

	Returns:
		A list with one accuracy percentage per fold.
	"""
	folds = cross_validation_split(dataset, n_folds)
	scores = []
	for held_out in folds:
		remaining = list(folds)
		remaining.remove(held_out)
		train_set = [row for part in remaining for row in part]
		test_set = []
		for row in held_out:
			masked = list(row)
			masked[-1] = None
			test_set.append(masked)
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in held_out]
		scores.append(accuracy_metric(actual, predicted))
	return scores

# Calculate neuron activation for an input
def activate(weights, inputs):
	"""Compute a neuron's weighted sum of inputs plus its bias.

	Args:
		weights: Input weights followed by the bias as the last element.
		inputs: Input values; only the first ``len(weights) - 1`` are read.

	Returns:
		``bias + sum(weights[i] * inputs[i])``.
	"""
	total = weights[-1]
	for idx, weight in enumerate(weights[:-1]):
		total += weight * inputs[idx]
	return total

# Transfer neuron activation
def transfer(activation):
	"""Apply the sigmoid function ``1 / (1 + e^-activation)``.

	Args:
		activation: The neuron's weighted input sum.

	Returns:
		A float between 0.0 and 1.0.
	"""
	try:
		return 1.0 / (1.0 + exp(-activation))
	except OverflowError:
		# exp() overflows for very negative activations, where the sigmoid
		# is indistinguishable from 0.
		return 0.0

# Forward propagate input to a network output
def forward_propagate(network, row):
	"""Feed a row through every layer and return the final layer's outputs.

	Each neuron's result is also stored under its 'output' key, where the
	backward pass and weight update read it.

	Args:
		network: List of layers; each layer is a list of neuron dicts with a
			'weights' entry.
		row: Input values for the first layer.

	Returns:
		The list of outputs of the last layer.
	"""
	signal = row
	for layer in network:
		layer_outputs = []
		for neuron in layer:
			neuron['output'] = transfer(activate(neuron['weights'], signal))
			layer_outputs.append(neuron['output'])
		signal = layer_outputs
	return signal

# Calculate the derivative of an neuron output
def transfer_derivative(output):
	"""Return ``output * (1 - output)``, the sigmoid's slope at that output.

	Args:
		output: A value already produced by the sigmoid transfer function.
	"""
	complement = 1.0 - output
	return output * complement

# Backpropagate error and store in neurons
def backward_propagate_error(network, expected):
	"""Compute and store each neuron's 'delta', working from the last layer back.

	For the last layer the error of neuron j is ``output - expected[j]``.
	For earlier layers it is the sum, over the neurons of the following
	layer, of that neuron's j-th weight times its delta. Each delta is the
	error multiplied by the transfer derivative of the neuron's output.

	Args:
		network: List of layers of neuron dicts; every neuron must already
			have an 'output' entry.
		expected: Target values, one per neuron of the last layer.
	"""
	last = len(network) - 1
	for depth in range(last, -1, -1):
		layer = network[depth]
		if depth == last:
			errors = [layer[j]['output'] - expected[j] for j in range(len(layer))]
		else:
			errors = []
			for j in range(len(layer)):
				error = 0.0
				for downstream in network[depth + 1]:
					error += (downstream['weights'][j] * downstream['delta'])
				errors.append(error)
		for j, neuron in enumerate(layer):
			neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])

# Update network weights with error
def update_weights(network, row, l_rate):
	"""Adjust every weight and bias using the stored deltas, in place.

	The first layer's inputs are the row without its last element; every
	later layer's inputs are the 'output' values of the layer before it.
	Each weight is reduced by ``l_rate * delta * input`` and each bias by
	``l_rate * delta``.

	Args:
		network: List of layers of neuron dicts with 'weights' and 'delta'
			(and 'output' for every layer that feeds another).
		row: Training row whose last element is the label.
		l_rate: Learning rate.
	"""
	for depth, layer in enumerate(network):
		if depth == 0:
			inputs = row[:-1]
		else:
			inputs = [prev['output'] for prev in network[depth - 1]]
		for neuron in layer:
			step = l_rate * neuron['delta']
			weights = neuron['weights']
			for j, value in enumerate(inputs):
				weights[j] -= step * value
			# The bias is the last weight and has no input to scale by.
			weights[-1] -= step

# Train a network for a fixed number of epochs
def train_network(network, train, l_rate, n_epoch, n_outputs):
	"""Train the network in place, updating weights after every row.

	For each row the target is a one-hot list with a 1 at the index given by
	the row's last element.

	Args:
		network: Network to train.
		train: Training rows; the last element of each is the class index.
		l_rate: Learning rate passed to ``update_weights``.
		n_epoch: Number of passes over ``train``.
		n_outputs: Length of the one-hot target list.
	"""
	for _ in range(n_epoch):
		for row in train:
			# Called for its side effect: it stores each neuron's 'output'.
			forward_propagate(network, row)
			expected = [0] * n_outputs
			expected[row[-1]] = 1
			backward_propagate_error(network, expected)
			update_weights(network, row, l_rate)

# Initialize a network
def initialize_network(n_inputs, n_hidden, n_outputs):
	"""Build a network with one hidden layer and random initial weights.

	Every neuron is a dict with a 'weights' list holding one weight per
	input plus one extra entry for the bias, each drawn with ``random()``.

	Args:
		n_inputs: Number of input values.
		n_hidden: Number of neurons in the hidden layer.
		n_outputs: Number of neurons in the output layer.

	Returns:
		``[hidden_layer, output_layer]``.
	"""
	hidden_layer = []
	for _ in range(n_hidden):
		hidden_layer.append({'weights': [random() for _ in range(n_inputs + 1)]})
	output_layer = []
	for _ in range(n_outputs):
		output_layer.append({'weights': [random() for _ in range(n_hidden + 1)]})
	return [hidden_layer, output_layer]

# Make a prediction with a network
def predict(network, row):
	"""Return the index of the output neuron with the largest value.

	Args:
		network: Trained network.
		row: Input row to classify.

	Returns:
		Index of the first maximum in the network's output list.
	"""
	outputs = forward_propagate(network, row)
	strongest = max(outputs)
	return outputs.index(strongest)

# Backpropagation Algorithm With Stochastic Gradient Descent
def back_propagation(train, test, l_rate, n_epoch, n_hidden):
	"""Train a fresh network on ``train`` and predict a class for each test row.

	The number of inputs is the row length minus one (the label), and the
	number of outputs is the count of distinct labels present in ``train``.

	Args:
		train: Training rows; the last element of each is the class index.
		test: Rows to classify.
		l_rate: Learning rate.
		n_epoch: Number of training epochs.
		n_hidden: Hidden-layer size passed to ``initialize_network``.

	Returns:
		A list with one predicted class index per test row.
	"""
	feature_count = len(train[0]) - 1
	class_count = len({row[-1] for row in train})
	network = initialize_network(feature_count, n_hidden, class_count)
	train_network(network, train, l_rate, n_epoch, class_count)
	return [predict(network, row) for row in test]

# Test Backprop on Seeds dataset
# Seed the random module so the fold assignment (randrange) and the initial
# weights (random) repeat from run to run.
seed(1)
# load and prepare data
filename = 'seeds_dataset.csv'
dataset = load_csv(filename)
# every column except the last is a feature stored as text; convert to float
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers (used as the index of the target output)
str_column_to_int(dataset, len(dataset[0])-1)
# normalize input variables (the class column is left as it is)
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5      # number of cross-validation folds
l_rate = 0.3     # learning rate
n_epoch = 500    # training epochs per fold
n_hidden = 5     # neurons in the hidden layer
scores = evaluate_algorithm(dataset, back_propagation, n_folds, l_rate, n_epoch, n_hidden)
# one accuracy percentage per fold, followed by their mean
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [92.85714285714286, 92.85714285714286, 97.61904761904762, 92.85714285714286, 90.47619047619048]
Mean Accuracy: 93.333%


# Q1

## (a) What is the maximum value of the dataset?

In [5]:
dataset = read_dataset()

### for each column

In [6]:
dataset.max()

area                    21.1800
perimeter               17.2500
compactness              0.9183
kernel_length            6.6750
kernel_width             4.0330
asymmetry_coef           8.4560
kernel_groove_length     6.5500
variety                  3.0000
dtype: float64

### for each record

In [7]:
dataset.max(axis=1)

0      15.26
1      14.88
2      14.29
3      13.94
4      16.14
       ...  
205    13.20
206    12.88
207    13.66
208    13.21
209    13.34
Length: 210, dtype: float64

### for all the values

In [8]:
max(dataset.max())

21.18

### for the sample code

In [9]:
# Test Backprop on Seeds dataset
seed(1)
# load and prepare data
filename = 'seeds_dataset.csv'
dataset = load_csv(filename)
# every column except the last is a feature stored as text; convert to float
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# Column statistics are taken before normalization, so they show the raw
# feature values.
minmax = dataset_minmax(dataset)
# Each entry is [min, max]; print the max of every column. The last line is
# the largest class index assigned above, not an original class label.
for col_minmax in minmax:
    print(col_minmax[-1])

21.18
17.25
0.9183
6.675
4.033
8.456
6.55
2


**Reference:**
* max from row: https://stackoverflow.com/questions/44300989/get-max-value-from-row-of-a-dataframe-in-python 

## (b) What is the minimum value of the dataset?

In [10]:
dataset = read_dataset()

### for each column

In [11]:
dataset.min()

area                    10.5900
perimeter               12.4100
compactness              0.8081
kernel_length            4.8990
kernel_width             2.6300
asymmetry_coef           0.7651
kernel_groove_length     4.5190
variety                  1.0000
dtype: float64

### for each record

In [12]:
dataset.min(axis=1)

0      0.8710
1      0.8811
2      0.9050
3      0.8955
4      0.9034
        ...  
205    0.8783
206    0.8511
207    0.8883
208    0.8521
209    0.8684
Length: 210, dtype: float64

### for all the values

In [13]:
min(dataset.min())

0.7651

### for the sample code

In [14]:
# Test Backprop on Seeds dataset
seed(1)
# load and prepare data
filename = 'seeds_dataset.csv'
dataset = load_csv(filename)
# every column except the last is a feature stored as text; convert to float
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# Column statistics are taken before normalization, so they show the raw
# feature values.
minmax = dataset_minmax(dataset)
# Each entry is [min, max]; print the min of every column. The last line is
# the smallest class index assigned above, not an original class label.
for col_minmax in minmax:
    print(col_minmax[0])

10.59
12.41
0.8081
4.899
2.63
0.7651
4.519
0


## (c) There was a scaling done on the data before the machine learning process. 
## Write down the formulae used for the scaling and state what would be the max and min values after the scaling.

$$NewValue = \frac{OldValue - ColMin}{ColMax - ColMin}$$

* "def normalize_dataset(dataset, minmax)" part in the sample code.
* rescale dataset columns to the range 0-1.
* new value = (old value - column's min) / (column's max - column's min)
* after the scaling, max = 1, min = 0

**Reference:**
* Latex: https://towardsdatascience.com/write-markdown-latex-in-the-jupyter-notebook-10985edb91fd

## (d) What does the function cross_validation_split do?

* Randomly split a dataset into k folds.
* In this case, it randomly splits the 210 records into 5 folds of 42 records each.
* The function itself only builds the folds. `evaluate_algorithm` then trains on k - 1 folds and tests on the remaining fold, repeating this k times so that each of the k folds is used exactly once as the validation data.

## (e) What does the function accuracy_metric do?

$$Accuracy = \frac{CorrectNumbers}{TotalNumbers} * 100\%$$

* Calculate accuracy percentage
* Help us to see the performance of the network

## (f) What was the non-linear perceptron activation function used?
## Name the function in the code that perform this activation.

$$Output = \frac{1}{1 + e^{-x}}$$

* "def transfer(activation)" part in the sample code.
* transfer neuron activation.
* sigmoid activation function can take any input value and produce a number between 0 and 1 on an S-curve.

## (g) In the function “transfer_derivative”, explain why the return value is output * (1-output)

$$
\begin{aligned}
\sigma(x) &= \frac{1}{1 + e^{-x}} = (1 + e^{-x})^{-1}\\
\sigma'(x) &= -(1 + e^{-x})^{-2} \cdot (-e^{-x})\\
&= \frac{1}{1 + e^{-x}} \cdot \frac{e^{-x}}{1 + e^{-x}}\\
&= \sigma(x) \cdot (1 - \sigma(x))\\
&= SigmoidOutput \cdot (1 - SigmoidOutput)
\end{aligned}
$$

* the slope of the neuron’s sigmoid output value

## (h) In the function “backward_propagate_error”, explain what is “errors”.

$$
\begin{aligned}
\text{output layer:}\quad & errors_j = Output_j - ExpectedValue_j\\
\text{hidden layer:}\quad & errors_j = \sum_{k \,\in\, \text{next layer}} Weight_{k,j} \cdot Delta_k
\end{aligned}
$$

* For the output layer, "errors" is the difference between the values produced by the neurons and the expected values.
* For a hidden layer there are no expected values, so "errors" is instead the sum, over the neurons of the next layer, of the weight that connects to the hidden neuron multiplied by the delta of that next-layer neuron.
* We store the errors of the neurons in the layers to “errors”, and use it to train the network.

## (i) In the function “backward_propagate_error”, explain what is “neuron[‘delta’]”

$$neuron['delta'] = Errors * TransferDerivative$$

* neuron['delta'] is the neuron's error signal: its error multiplied by the slope of the sigmoid at its output. It tells us how much the neuron's value before the activation (transfer) function needs to change, and `update_weights` uses it to adjust each weight by `l_rate * delta * input`.

## (j) Modify the code so that the total training error (a scalar) for epochs at 100, 200, 300, 400, 500 will be printed out.

In [15]:
# Train a network for a fixed number of epochs
def train_network_100epoch_show(network, train, l_rate, n_epoch, n_outputs, report_every=100):
	"""Train the network in place and print the total training error periodically.

	Works like ``train_network`` but, on every ``report_every``-th epoch,
	sums the squared difference between the one-hot target and the network
	outputs over all training rows and prints that single number. Each row's
	error is measured before the weights are updated for that row.

	Args:
		network: Network to train.
		train: Training rows; the last element of each is the class index.
		l_rate: Learning rate passed to ``update_weights``.
		n_epoch: Number of passes over ``train``.
		n_outputs: Length of the one-hot target list.
		report_every: Print the error on epochs that are a multiple of this
			value (default 100). A value of 0 or None disables reporting.
	"""
	# Epochs are counted from 1 so reports land on 100, 200, ... n_epoch.
	for epoch in range(1, n_epoch + 1):
		report = bool(report_every) and epoch % report_every == 0
		# Reset once per epoch (not per row) so the total covers every row.
		sum_error = 0
		for row in train:
			outputs = forward_propagate(network, row)
			expected = [0 for i in range(n_outputs)]
			expected[row[-1]] = 1
			if report:
				sum_error += sum([(expected[expected_id] - outputs[expected_id]) ** 2 for expected_id in range(len(expected))])
			backward_propagate_error(network, expected)
			update_weights(network, row, l_rate)
		if report:
			print('>epoch=%d, error=%.3f' % (epoch, sum_error))
	# Blank line separating the output of one training run from the next.
	print()
            
train_network = train_network_100epoch_show

In [16]:
# Test Backprop on Seeds dataset
# Same pipeline as the baseline run. train_network was rebound to
# train_network_100epoch_show in the previous cell, so each fold also prints
# its total training error at epochs 100, 200, ..., 500.
seed(1)
# load and prepare data
filename = 'seeds_dataset.csv'
dataset = load_csv(filename)
# every column except the last is a feature stored as text; convert to float
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# normalize input variables
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5      # number of cross-validation folds
l_rate = 0.3     # learning rate
n_epoch = 500    # training epochs per fold
n_hidden = 5     # neurons in the hidden layer
scores = evaluate_algorithm(dataset, back_propagation, n_folds, l_rate, n_epoch, n_hidden)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

>epoch=100, error=0.001
>epoch=200, error=0.000
>epoch=300, error=0.000
>epoch=400, error=0.000
>epoch=500, error=0.000

>epoch=100, error=0.001
>epoch=200, error=0.001
>epoch=300, error=0.001
>epoch=400, error=0.001
>epoch=500, error=0.000

>epoch=100, error=0.000
>epoch=200, error=0.000
>epoch=300, error=0.000
>epoch=400, error=0.000
>epoch=500, error=0.000

>epoch=100, error=0.000
>epoch=200, error=0.000
>epoch=300, error=0.000
>epoch=400, error=0.000
>epoch=500, error=0.000

>epoch=100, error=0.039
>epoch=200, error=0.002
>epoch=300, error=0.000
>epoch=400, error=0.000
>epoch=500, error=0.000

Scores: [92.85714285714286, 92.85714285714286, 97.61904761904762, 92.85714285714286, 90.47619047619048]
Mean Accuracy: 93.333%


**Reference:**
* redefine functions: https://stackoverflow.com/questions/3692159/how-do-i-redefine-functions-in-python

# Q2

## Modify the code so that the neural network has two hidden layers. The added hidden layer will be in-between the existing hidden layer and the output layer. Also, the added hidden layer has 3 neurons.

In [17]:
# Initialize a network
def initialize_network_new(n_inputs, n_hidden, n_outputs, n_hidden_added=3):
	"""Build a network with two hidden layers and random initial weights.

	The added hidden layer sits between the first hidden layer and the
	output layer. Every neuron is a dict with a 'weights' list holding one
	weight per input from the preceding layer plus one extra entry for the
	bias, each drawn with ``random()``.

	Args:
		n_inputs: Number of input values.
		n_hidden: Number of neurons in the first hidden layer.
		n_outputs: Number of neurons in the output layer.
		n_hidden_added: Number of neurons in the added hidden layer
			(default 3).

	Returns:
		``[hidden_layer, hidden_layer_added, output_layer]``.
	"""
	network = list()
	hidden_layer = [{'weights':[random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
	network.append(hidden_layer)
	# The added layer is fed by the first hidden layer's n_hidden outputs.
	hidden_layer_added = [{'weights':[random() for i in range(n_hidden + 1)]} for i in range(n_hidden_added)]
	network.append(hidden_layer_added)
	# The output layer is fed by the added layer, not by the first hidden layer.
	output_layer = [{'weights':[random() for i in range(n_hidden_added + 1)]} for i in range(n_outputs)]
	network.append(output_layer)
	return network

initialize_network = initialize_network_new

In [18]:
# Test Backprop on Seeds dataset
# Same pipeline as before. initialize_network was rebound to
# initialize_network_new in the previous cell, so back_propagation now builds
# a network with two hidden layers.
seed(1)
# load and prepare data
filename = 'seeds_dataset.csv'
dataset = load_csv(filename)
# every column except the last is a feature stored as text; convert to float
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# normalize input variables
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5      # number of cross-validation folds
l_rate = 0.3     # learning rate
n_epoch = 500    # training epochs per fold
n_hidden = 5     # neurons in the first hidden layer
scores = evaluate_algorithm(dataset, back_propagation, n_folds, l_rate, n_epoch, n_hidden)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

>epoch=100, error=0.003
>epoch=200, error=0.000
>epoch=300, error=0.000
>epoch=400, error=0.000
>epoch=500, error=0.000

>epoch=100, error=0.003
>epoch=200, error=0.002
>epoch=300, error=0.002
>epoch=400, error=0.001
>epoch=500, error=0.001

>epoch=100, error=0.002
>epoch=200, error=0.001
>epoch=300, error=0.001
>epoch=400, error=0.001
>epoch=500, error=0.000

>epoch=100, error=0.002
>epoch=200, error=0.000
>epoch=300, error=0.000
>epoch=400, error=0.000
>epoch=500, error=0.000

>epoch=100, error=0.055
>epoch=200, error=0.014
>epoch=300, error=0.001
>epoch=400, error=0.000
>epoch=500, error=0.000

Scores: [92.85714285714286, 88.09523809523809, 97.61904761904762, 92.85714285714286, 88.09523809523809]
Mean Accuracy: 91.905%
