In [1]:
import numpy as np

class MultiplyGate:
    def forward(self,W, X):
        return np.dot(X, W)

    def backward(self, W, X, dZ):
        dW = np.dot(np.transpose(X), dZ)
        dX = np.dot(dZ, np.transpose(W))
        return dW, dX

class AddGate:
    def forward(self, X, b):
        return X + b

    def backward(self, X, b, dZ):
        dX = dZ * np.ones_like(X)
        db = np.dot(np.ones((1, dZ.shape[0]), dtype=np.float64), dZ)
        return db, dX

class Sigmoid:
    def forward(self, X):
        return 1.0 / (1.0 + np.exp(-X))

    def backward(self, X, top_diff):
        output = self.forward(X)
        return (1.0 - output) * output * top_diff

class Tanh:
    def forward(self, X):
        return np.tanh(X)

    def backward(self, X, top_diff):
        output = self.forward(X)
        return (1.0 - np.square(output)) * top_diff
    
class Softmax:
    def predict(self, X):
        exp_scores = np.exp(X)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def loss(self, X, y):
        num_examples = X.shape[0]
        probs = self.predict(X)
        print(probs[:5], y[:5])
        corect_logprobs = -np.log(probs[range(num_examples), y])
        data_loss = np.sum(corect_logprobs)
#         print('data loss', 1./num_examples * data_loss)
        return 1./num_examples * data_loss
#         log_likelihood = -np.log(probs[range(y.shape[0]), y])
#         print('loss', np.sum(log_likelihood) / y.shape[0])
#         loss = np.sum(log_likelihood) / y.shape[0]
# #         return loss
#         return 1./num_examples * data_loss
    
    def diff(self, X, y):
        num_examples = X.shape[0]
        probs = self.predict(X)
        probs[range(num_examples), y] -= 1
        return probs

class Model:
    def __init__(self, layers_dim):
        self.b = []
        self.W = []
        for i in range(len(layers_dim)-1):
            self.W.append(np.random.randn(layers_dim[i], layers_dim[i+1]) / np.sqrt(layers_dim[i]))
            self.b.append(np.random.randn(layers_dim[i+1]).reshape(1, layers_dim[i+1]))
#             self.W.append(np.zeros(layers_dim[i], layers_dim[i+1]))
#             self.b.append(np.ones(layers_dim[i+1]).reshape(1, layers_dim[i+1]))

    def predict(self, X):
        mulGate = MultiplyGate()
        addGate = AddGate()
        layer = Tanh()
        softmaxOutput = Softmax()

        input = X
        for i in range(len(self.W)):
            mul = mulGate.forward(self.W[i], input)
            add = addGate.forward(mul, self.b[i])
            input = layer.forward(add)

        probs = softmaxOutput.predict(input)
        return np.argmax(probs, axis=1)

    def calculate_loss(self, X, y):
        mulGate = MultiplyGate()
        addGate = AddGate()
        layer = Tanh()
        softmaxOutput = Softmax()

        input = X
        for i in range(len(self.W)):
            mul = mulGate.forward(self.W[i], input)
            add = addGate.forward(mul, self.b[i])
            input = layer.forward(add)

        return softmaxOutput.loss(input, y)

    def train(self, X, y, num_passes=70, epsilon=0.01, reg_lambda=0.01, print_loss=False):
        mulGate = MultiplyGate()
        addGate = AddGate()
        layer = Tanh()
        softmaxOutput = Softmax()

        for epoch in range(num_passes):
            # Forward propagation
            input = X
            forward = [(None, None, input)]
            for i in range(len(self.W)):
                mul = mulGate.forward(self.W[i], input)
                add = addGate.forward(mul, self.b[i])
                input = layer.forward(add)
                forward.append((mul, add, input))
#                 print('ff size z, a, mul', add.shape, input.shape, mul.shape)

            # Back propagation
            dtanh = softmaxOutput.diff(forward[len(forward)-1][2], y)
#             print('dtanh init', dtanh.shape)
            for i in range(len(forward)-1, 0, -1):
                dadd = layer.backward(forward[i][1], dtanh)
                db, dmul = addGate.backward(forward[i][0], self.b[i-1], dadd)
                dW, dtanh = mulGate.backward(self.W[i-1], forward[i-1][2], dmul)
                
                self.W[i-1] += -epsilon * dW

                # Add regularization terms (b1 and b2 don't have regularization terms)
                dW += reg_lambda * self.W[i-1]
                # Gradient descent parameter update
                self.b[i-1] += -epsilon * db

            if print_loss and epoch % 1000 == 0:
#             if print_loss:
                print("Loss after iteration %i: %f" %(epoch, self.calculate_loss(X, y)))

In [2]:
import sys
import csv
import os.path

def load_and_prep_data(csvfile):

	# category to int function for y
	def f(i):
		if i[1] == 'M':
			return 1
		else:
			return 0

	#open file proc
	def load_data(csvfile):
		if not os.path.isfile(csvfile):
			exit_error('can\'t find the file ' + csvfile)
		data = []
		with open(csvfile) as csv_iterator:
			data_reader = csv.reader(csv_iterator, delimiter=',')
			for row in data_reader:
				data.append(row)
		csv_iterator.close()
		if len(data) < 1:
			exit_error('file ' + csvfile + ' is empty')
		return data

	# load data from csvfile
	dataRaw = np.array(load_data(csvfile))
	dataTemp = []

	# fill y / replace categorical values with numeric values (1 is for 'M')
	y = np.array([f(i) for i in dataRaw])

	# remove unwanted columns/features
	dataRaw = np.delete(dataRaw, [0,1,4,5], 1)

	# cast to float
	dataRaw = dataRaw.astype('float')

	# normalize data using transpose
	dataTemp = np.zeros((dataRaw.shape[1], dataRaw.shape[0]))
	for index, feature in enumerate(dataRaw.T):
		dataTemp[index] = [(x - min(feature)) / (max(feature) - min(feature)) for x in feature]
	
	print('\n\033[32mData loaded...\033[0m')
	print('\033[32m%d data rows for %d features...\033[0m' % (dataTemp.T.shape[0], dataTemp.T.shape[1]))
	return dataTemp.T, y

def divide_dataset(data, y, train_share):
	limit = int(len(data) * train_share)
	p = np.random.permutation(len(data))
	data = data[p]
	y = y[p]
	print('\033[32mShuffling the dataset...\033[0m')
	return data[:limit], data[limit:], y[:limit], y[limit:]

np.random.seed(42)
train_share = 0.8			#share of the dataset to use as train set
mlp_layers = [10,20]		#size of each hidden layer
mlp_init = ''				#random sur distrib 'uniform' or 'normal'(default normal)
mlp_activation = ''			#'relu' (rectified linear unit) or 'sigmoid' or 'tanh'(hyperboloid tangent) (default tanh)
nb_cats = 2					#size of the output layer
epochs = 3
batch_size = 64
learningR = 0.01

csvfile = './data/data.csv'
# Data retrieval and cleaning
data, y = load_and_prep_data(csvfile)

# Creation of train and validation dataset
x_train, x_valid, y_train, y_valid = divide_dataset(data, y, train_share)
batch_size = x_train.shape[0]
print('\033[32m%d rows for the train dataset (%d%%), %d rows for validation...\033[0m\n' % \
    (x_train.shape[0], train_share * 100, x_valid.shape[0]))

layers_dim = [28, 30, 10, 2]

model = Model(layers_dim)
model.train(x_train, y_train, num_passes=70, epsilon=0.01, reg_lambda=0.01, print_loss=True)


[32mData loaded...[0m
[32m569 data rows for 28 features...[0m
[32mShuffling the dataset...[0m
[32m455 rows for the train dataset (80%), 114 rows for validation...[0m

[[ 0.52298337  0.47701663]
 [ 0.53404563  0.46595437]
 [ 0.53037174  0.46962826]
 [ 0.52198885  0.47801115]
 [ 0.51709471  0.48290529]] [0 1 1 0 0]
[[-0.47701663  0.47701663]
 [ 0.53404563 -0.53404563]
 [ 0.53037174 -0.53037174]
 [-0.47801115  0.47801115]
 [-0.48290529  0.48290529]]
Loss after iteration 0: 0.811482
[[ 0.86551733  0.13448267]
 [ 0.86549615  0.13450385]
 [ 0.86443273  0.13556727]
 [ 0.86514741  0.13485259]
 [ 0.86653003  0.13346997]] [0 1 1 0 0]
[[-0.13448267  0.13448267]
 [ 0.86549615 -0.86549615]
 [ 0.86443273 -0.86443273]
 [-0.13485259  0.13485259]
 [-0.13346997  0.13346997]]
Loss after iteration 1: 0.691014
[[ 0.50382205  0.49617795]
 [ 0.50379994  0.49620006]
 [ 0.50380424  0.49619576]
 [ 0.50381723  0.49618277]
 [ 0.50385128  0.49614872]] [0 1 1 0 0]
[[-0.49617795  0.49617795]
 [ 0.50379994 -

Loss after iteration 56: 0.693152
[[ 0.4999613   0.5000387 ]
 [ 0.49995374  0.50004626]
 [ 0.49995483  0.50004517]
 [ 0.4999654   0.5000346 ]
 [ 0.49996759  0.50003241]] [0 1 1 0 0]
[[-0.5000387   0.5000387 ]
 [ 0.49995374 -0.49995374]
 [ 0.49995483 -0.49995483]
 [-0.5000346   0.5000346 ]
 [-0.50003241  0.50003241]]
Loss after iteration 57: 0.693152
[[ 0.49996172  0.50003828]
 [ 0.4999542   0.5000458 ]
 [ 0.49995528  0.50004472]
 [ 0.4999658   0.5000342 ]
 [ 0.49996799  0.50003201]] [0 1 1 0 0]
[[-0.50003828  0.50003828]
 [ 0.4999542  -0.4999542 ]
 [ 0.49995528 -0.49995528]
 [-0.5000342   0.5000342 ]
 [-0.50003201  0.50003201]]
Loss after iteration 58: 0.693152
[[ 0.49996214  0.50003786]
 [ 0.49995466  0.50004534]
 [ 0.49995573  0.50004427]
 [ 0.49996619  0.50003381]
 [ 0.49996838  0.50003162]] [0 1 1 0 0]
[[-0.50003786  0.50003786]
 [ 0.49995466 -0.49995466]
 [ 0.49995573 -0.49995573]
 [-0.50003381  0.50003381]
 [-0.50003162  0.50003162]]
Loss after iteration 59: 0.693151
[[ 0.4999625

In [5]:
import matplotlib.pyplot as plt
import numpy as np

# Helper function to plot a decision boundary.
def plot_decision_boundary(pred_func, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole gid
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)

In [6]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model

# Generate a dataset and plot it
np.random.seed(0)
X, y = sklearn.datasets.make_moons(200, noise=0.20)
# plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral)
# plt.show()

layers_dim = [2, 3, 2]

model = Model(layers_dim)
model.train(X, y, num_passes=70, epsilon=0.01, reg_lambda=0.01, print_loss=True)

# # Plot the decision boundary
# plot_decision_boundary(lambda x: model.predict(x), X, y)
# plt.title("Decision Boundary for hidden layer size 3")
# plt.show()

[[ 0.63543161  0.36456839]
 [ 0.69455111  0.30544889]
 [ 0.62039558  0.37960442]
 [ 0.46777197  0.53222803]
 [ 0.67857082  0.32142918]] [0 1 1 0 1]
[[-0.36456839  0.36456839]
 [ 0.69455111 -0.69455111]
 [ 0.62039558 -0.62039558]
 [-0.53222803  0.53222803]
 [ 0.67857082 -0.67857082]]
Loss after iteration 0: 0.700174
[[ 0.53317122  0.46682878]
 [ 0.54034963  0.45965037]
 [ 0.52750566  0.47249434]
 [ 0.4129937   0.5870063 ]
 [ 0.53836556  0.46163444]] [0 1 1 0 1]
[[-0.46682878  0.46682878]
 [ 0.54034963 -0.54034963]
 [ 0.52750566 -0.52750566]
 [-0.5870063   0.5870063 ]
 [ 0.53836556 -0.53836556]]
Loss after iteration 1: 0.694472
[[ 0.52692121  0.47307879]
 [ 0.52770369  0.47229631]
 [ 0.5215541   0.4784459 ]
 [ 0.42481264  0.57518736]
 [ 0.52984545  0.47015455]] [0 1 1 0 1]
[[-0.47307879  0.47307879]
 [ 0.52770369 -0.52770369]
 [ 0.5215541  -0.5215541 ]
 [-0.57518736  0.57518736]
 [ 0.52984545 -0.52984545]]
Loss after iteration 2: 0.686350
[[ 0.52211796  0.47788204]
 [ 0.50840124  0.49159