In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import itertools
import sys

sys.path.append("..")

from utility_functions import (calculate_model_performance,
                               plot_ROC,
                               one_hot_encode,
                               split_data_as,
                               grid_search,
                               shuffled,
                               timeit)

EPSILON = 10e-08


def get_shapes(any_):
    for array in any_:
        try:
            print(array.shape)
        except:
            print("NONE")
    print("\n")


# ============= ACTIVATION FUNCTIONS ===============#

def sigmoid(Z, prime=False):
    # np.
    if prime:
        return sigmoid(Z) * (1 - sigmoid(Z))
    return 1 / (1 + np.exp(-Z))


def linear(Z, prime=False):
    if prime:
        return np.ones_like(Z)
    return Z


def relu(Z, alpha=0.01, prime=False):
    if prime:
        Z_relu = np.ones_like(Z, dtype=np.float64)
        Z_relu[Z < 0] = alpha
        return Z_relu
    return np.where(Z < 0, alpha * Z, Z)


def tanh(Z, prime=False):
    # np.tanh() could be used directly to speed this up
    if prime:
        return 1 - np.power(tanh(Z), 2)
    return (2 / (1 + np.exp(-2 * Z))) - 1


def elu(Z, prime=False):
    # https://mlfromscratch.com/activation-functions-explained/#/
    alpha = 0.2
    if prime:
        return np.where(Z < 0, alpha * (np.exp(Z)), 1)
    return np.where(Z < 0, alpha * (np.exp(Z) - 1), Z)


def softmax(Z, prime=False):
    # https://deepnotes.io/softmax-crossentropy
    # max(Z) term is added to stabilise the function.
    exps = np.exp(Z - np.max(Z))
    return exps / np.sum(exps, axis=0)


# References
# https://mc.ai/multilayered-neural-network-from-scratch-using-python/
# https://mattmazur.com/2015/03/17/a-step-by-step-backpropagation-example/
# https://www.coursera.org/learn/machine-learning/home/week/5
# https://www.coursera.org/specializations/deep-learning
# https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/src/network.py
# https://github.com/JWarmenhoven/Coursera-Machine-Learning

class NeuralNetwork:

    def __init__(
            self,
            input_layer: tuple,
            hidden_layer: list,  # list of tuples
            output_layer: int,
            batch_size=16,
            alpha=1,
            optimizer="SGD",
            penalty="l2",
            lambd="0.1",
            keep_prob = None,
            epoch=500,
            random_state=42,
            verbose=True,
            metrics="accuracy"
    ):
        self.input_layer = input_layer
        self.hidden_layer = hidden_layer
        self.output_layer = output_layer
        self.mini_batch_size = batch_size
        self.alpha = alpha
        self.optimizer = optimizer
        self.penalty = penalty
        self.lambd = lambd
        self.keep_prob = keep_prob
        # dropout: http://jmlr.org/papers/volume15/srivastava14a.old/srivastava14a.pdf
        self.dropout = True if isinstance(self.keep_prob, float) else False
        self.epoch = epoch
        self.seed = random_state
        self.verbose = verbose
        self.metrics = metrics
        self.layers = len(self.weight_set_dimensions) + 1
        self.EPSILON = 10e-10


    def __str__(self):
        parameters = (
            "Input layer: {0}\n"
            "Hidden layer: {1}\n"
            "Output layer: {2}\n"
            "Batch size: {3}\n"
            "Learning rate: {4}\n"
            "Epoch: {5}\n"
            "Seed: {6}\n"
            "Verbose: {7}\n"
            "Metric: {8}"
        ).format(
            self.input_layer,
            " - ".join(map(str, self.hidden_layer)),
            self.output_layer,
            self.mini_batch_size,
            self.alpha,
            self.epoch,
            self.seed,
            self.verbose,
            self.metrics
        )
        return parameters

    def get_A(self, X):
        A, _ = self.forwardpass(X)
        return A

    def get_Z(self, X):
        _, Z = self.forwardpass(X)
        return Z
    
    # ============== LOSS FUNCTIONS ===============#

    # https://deepnotes.io/softmax-crossentropy

    def calculate_error(self, Y, Y_hat):
        # Y and Y_hat should be in the form of (no_of_classes, no_of_training_examples)
        cost = -np.sum(Y * np.log(Y_hat + EPSILON)) / self.m
        penalise_by = 0
        if self.penalty == "l1":
            for layer in range(1, self.layers):
                penalise_by += np.sum(np.abs(self.W[layer])) * self.lambd / (2 * self.m)
            return cost + penalise_by
        elif self.penalty == "l2":
            for layer in range(1, self.layers):
                penalise_by += np.sum(np.square(self.W[layer])) * self.lambd / (2 * self.m)
            return cost + penalise_by
        else:
            return cost


    def display_information(self, X, Y, epoch_no):
        model_performance_metrics = calculate_model_performance(
            np.argmax(Y, axis=0),
            self.predict(X)
        )
        print("%s: %.10f - epoch %s    iteration %s - loss %.20f" % (
            self.metrics,
            model_performance_metrics[self.metrics],
            epoch_no,
            self.no_of_iterations,
            self.calculate_error(Y,
                            self.get_A(X)[-1])
        )
              )

    def get_dimensions_and_activations(self):
        self.dimensions = []
        self.activation_functions = []

        self.dimensions.append(self.input_layer[0])
        self.activation_functions.append(self.input_layer[1])

        for dim, act_func in self.hidden_layer:
            self.dimensions.append(dim)
            self.activation_functions.append(act_func)

        self.dimensions.append(self.output_layer)

    @property
    def weight_set_dimensions(self):
        self.get_dimensions_and_activations()
        a, b = itertools.tee(self.dimensions[::-1])
        next(b, None)
        weight_set_dimensions = list(zip(a, b))[::-1]
        return weight_set_dimensions

    def initialise_weights(self, layer=None):
        self.W = np.empty_like(range(self.layers), dtype=object)
        self.B = np.empty_like(range(self.layers), dtype=object)
        for layer, (y, x) in zip(range(1, self.layers), self.weight_set_dimensions):
            np.random.seed(self.seed)
            self.W[layer] = np.random.rand(y, x) / np.sqrt(self.dimensions[layer - 1])
            self.B[layer] = np.random.rand(y, 1)

    def forwardpass(self, X):
        Z = np.empty_like(range(self.layers), dtype=object)
        A = np.empty_like(range(self.layers), dtype=object)
        A[0] = X

        for layer in range(1, self.layers):
            # activation_function starts from 0 whereas layer starts from 1
            active_function = self.activation_functions[layer - 1]
            arg_to_pass_to_eval = "(Z[layer])"

            Z[layer] = self.W[layer] @ A[layer - 1] + self.B[layer]
            A[layer] = eval(active_function + arg_to_pass_to_eval)
            
            # dropout is only applied to first hidden layer
            # https://www.kaggle.com/mtax687/dropout-regularization-of-neural-net-using-numpy
            if self.dropout and layer == 1:
                self.D = np.random.randn(A[layer].shape[0], A[layer].shape[1])
                self.D = (self.D < self.keep_prob)
                A[layer] = np.multiply(A[layer], self.D) / self.keep_prob
        return A, Z

    def backpropagation(self, Y, A, Z):
        self.delta = np.empty_like(range(self.layers), dtype=object)

        self.gradient_W = np.empty_like(range(self.layers), dtype=object)
        self.gradient_B = np.empty_like(range(self.layers), dtype=object)

        self.delta[-1] = A[-1] - Y

        # We substract 1 here as delta_final is calculated seperately above
        for layer in reversed(range(1, self.layers - 1)):
            # 1 is substracted from layer as activation_functions start indexing from 0
            active_function = self.activation_functions[layer - 1]
            arg_to_pass_to_eval = "(Z[layer], prime=True)"
            
            DA = self.W[layer + 1].T @ self.delta[layer + 1]
            if self.dropout and layer == 1:
                DA = np.multiply(DA, self.D) / self.keep_prob

            self.delta[layer] = (
                    DA *
                    eval(active_function + arg_to_pass_to_eval)
            )

        for layer in range(1, self.layers):
            self.gradient_W[layer] = (self.delta[layer] @ A[layer - 1].T) / self.m
            self.gradient_B[layer] = np.sum(self.delta[layer], axis=1, keepdims=True) / self.m
            
            if self.penalty == "l1":
                # https://towardsdatascience.com/only-numpy-implementing-different-combination-of-l1-norm-l2-norm-l1-regularization-and-14b01a9773b
                self.gradient_W[layer] += np.where(self.W[layer] < 0, -1, 1) * (self.lambd / self.m)
            elif self.penalty == "l2":
                self.gradient_W[layer] += self.W[layer] * (self.lambd / self.m)
            
        self.update_weights()


    def update_weights(self):
        if self.optimizer == "SGD":
            for layer in range(1, self.layers):
                self.W[layer] -= self.alpha * self.gradient_W[layer]
                self.B[layer] -= self.alpha * self.gradient_B[layer]

        elif self.optimizer["method"] == "SGDM":
            for layer in range(1, self.layers):
                beta = self.optimizer["beta"]
                self.v_dw[layer] = beta * self.v_dw[layer] + (1 - beta) * self.gradient_W[layer]
                self.v_db[layer] = beta * self.v_db[layer] + (1 - beta) * self.gradient_B[layer]

                self.W[layer] -= self.alpha * self.v_dw[layer]
                self.B[layer] -= self.alpha * self.v_db[layer]

        elif self.optimizer["method"] == "RMSP":
            for layer in range(1, self.layers):
                beta = self.optimizer["beta"]
                self.s_dw[layer] = beta * self.s_dw[layer] + (1 - beta) * np.square(self.gradient_W[layer])
                self.s_db[layer] = beta * self.s_db[layer] + (1 - beta) * np.square(self.gradient_B[layer])

                w_rms_grad = self.gradient_W[layer] / (np.sqrt(self.s_dw[layer]) + self.EPSILON)
                b_rms_grad = self.gradient_B[layer] / (np.sqrt(self.s_db[layer]) + self.EPSILON)

                self.W[layer] -= self.alpha * w_rms_grad
                self.B[layer] -= self.alpha * b_rms_grad

        elif self.optimizer["method"] == "ADAM":
            # EWA: Exponential weighted average
            # ToDo: Check if bias correction is necessary. The EWA will be inaccurate initially,
            # but it shouldn't take many iterations to compute correct EWA.
            for layer in range(1, self.layers):
                beta1 = self.optimizer["beta1"]
                beta2 = self.optimizer["beta2"]
                self.v_dw[layer] = beta1 * self.v_dw[layer] + (1 - beta1) * self.gradient_W[layer]
                self.v_db[layer] = beta1 * self.v_db[layer] + (1 - beta1) * self.gradient_B[layer]

                self.s_dw[layer] = beta2 * self.s_dw[layer] + (1 - beta2) * np.square(self.gradient_W[layer])
                self.s_db[layer] = beta2 * self.s_db[layer] + (1 - beta2) * np.square(self.gradient_B[layer])

                v_dw_corrected = self.v_dw[layer] / (1 - beta1 ** self.no_of_iterations)
                s_dw_corrected = self.s_dw[layer] / (1 - beta2 ** self.no_of_iterations)

                v_db_corrected = self.v_db[layer] / (1 - beta1 ** self.no_of_iterations)
                s_db_corrected = self.s_db[layer] / (1 - beta2 ** self.no_of_iterations)

                self.W[layer] -= self.alpha * (v_dw_corrected / (np.sqrt(s_dw_corrected) + self.EPSILON))
                self.B[layer] -= self.alpha * (v_db_corrected / (np.sqrt(s_db_corrected) + self.EPSILON))


    def initialise_cache(self):
        self.v_dw = np.empty_like(range(self.layers), dtype=object)
        self.v_db = np.empty_like(range(self.layers), dtype=object)
    
        self.s_dw = np.empty_like(range(self.layers), dtype=object)
        self.s_db = np.empty_like(range(self.layers), dtype=object)

        for layer, (y, x) in zip(range(1, self.layers), self.weight_set_dimensions):
            self.v_dw[layer] = np.zeros((y, x))
            self.v_db[layer] = np.zeros((y, 1))
            
            self.s_dw[layer] = np.zeros((y, x))
            self.s_db[layer] = np.zeros((y, 1))


    @timeit
    def fit(self, X, Y):
        self.m = X.shape[1] # where (no_of_features, no_of_training_examples)
        self.initialise_weights()
        self.initialise_cache()

        # By default the method is SGD(Stochastic Gradient Descent) if one wishes to use
        # the whole batch, simply pass the number of traning examples available as the
        # batch size when instantiating the class
        self.no_of_iterations = 0
        shuffled = np.arange(self.m)
        if self.verbose:
            print("Initialising weights...")
            print("Starting the training...")
            print("Initial cost: %.10f\n" % self.calculate_error(Y, self.get_A(X)[-1]))
        for epoch_no in range(1, self.epoch + 1):
            np.random.shuffle(shuffled)
            X_shuffled = X[:, shuffled]
            Y_shuffled = Y[:, shuffled]
            for i in range(0, self.m, self.mini_batch_size):
                self.no_of_iterations += 1
                X_mini_batch = X_shuffled[:, i: i + self.mini_batch_size]
                Y_mini_batch = Y_shuffled[:, i: i + self.mini_batch_size]

                A, Z = self.forwardpass(X_mini_batch)
                self.backpropagation(Y_mini_batch, A, Z)
                if self.no_of_iterations % 100 == 0 and self.verbose:
                    self.display_information(X, Y, epoch_no)

    def predict(
            self,
            X: np.ndarray,
            return_prob_matrix=False
    ):
        """Predict the output given the training data.

            Returns the predicted values in two forms:

            1.either by picking up the highest value along the columns for every row,
                i.e. "np.argmax(self.A[-1].T, axis=1)"
            2.or by returning a matrix that is in the shape of Y.T where each column
                represents the probability of the instance belonging to that class.
                Please note that every column in Y.T represents a class. To be able to
                return the probability matrix, the final activation function must be
                softmax!
                i.e. "array([0.9650488423, 0.0354737543, 0.0005225966])"

        Args:
            X (numpy.ndarray): Training set in the shape of
                (no_of_features, no_of_training examples).
            return_prob_matrix (bool, optional): Returns the probability matrix if True.
                Defaults to False.

        Returns:
            numpy.ndarray:

            if return_prob_matrix is False, the output is in the shape of
                (no_of_training_examples, 1)
            if return_prob_matrix is True, the output is in the shape of
                (no_of_training_examples, no_of_features)
        """
        A, Z = self.forwardpass(X)
        if return_prob_matrix:
            np.set_printoptions(precision=10, suppress=True)
            return A[-1].T
        return np.argmax(A[-1].T, axis=1)

# Testing with benchmark datasets

## 1.Iris Dataset

In [10]:
from sklearn.datasets import load_iris
data = load_iris()

x = data.data[:,[0,2]]
y = data.target

X = x.T
Y = one_hot_encode(y).T

In [11]:
print(X.shape)
print(Y.shape)

(2, 150)
(3, 150)


In [12]:
model = NeuralNetwork(
    input_layer=(2, 'relu'),
    hidden_layer=[(4,'relu'),(4,'softmax')],
    output_layer=3,
    batch_size=8,
    optimizer={"method": "ADAM", "beta1": 0.9, "beta2": 0.999},
    keep_prob=0.5,
    penalty=None,
    lambd=0.0001,
    epoch=2000,
    alpha=0.01
)

model.fit(X,Y)

Initialising weights...
Starting the training...
Initial cost: 2.6587031456

accuracy: 66.6666666662 - epoch 6    iteration 100 - loss 0.75586593755764930336
accuracy: 71.9999999995 - epoch 11    iteration 200 - loss 0.55747452395122465418
accuracy: 81.3333333328 - epoch 16    iteration 300 - loss 0.48229831076182466676
accuracy: 81.9999999995 - epoch 22    iteration 400 - loss 0.45386067197895857417
accuracy: 84.6666666661 - epoch 27    iteration 500 - loss 0.37294113968164377404
accuracy: 83.3333333328 - epoch 32    iteration 600 - loss 0.34163518121146330131
accuracy: 84.6666666661 - epoch 37    iteration 700 - loss 0.31364021128498503765
accuracy: 85.3333333328 - epoch 43    iteration 800 - loss 0.46334548920048407306
accuracy: 89.3333333327 - epoch 48    iteration 900 - loss 0.25348162351330327802
accuracy: 89.3333333327 - epoch 53    iteration 1000 - loss 0.29836301239307472244
accuracy: 90.6666666661 - epoch 58    iteration 1100 - loss 0.23936236395863313975
accuracy: 91.3333333

In [None]:
calculate_model_performance(np.argmax(Y, axis=0),
                           model.predict(X))

In [None]:
results_dict_all_models, results_average_dict, models = grid_search(
    x,
    y,
    clf=NeuralNetwork,
    lst_metrics=["F1", "accuracy"],
    sort_by = "accuracy",
    n_folds=5,
    dict_param_grid={
        'batch_size': [8, 16, 32],
        'input_layer': [(2, 'relu')],
        'hidden_layer': [
            [(4,'relu'), (4,'softmax')],
            [(4,'sigmoid'),(4,'softmax')]
        ],
        'optimizer': [
            {
                "method": "RMSP",
                "beta": 0.9
            }
        ],
        'output_layer': [3],
        'alpha': [0.001],
        'verbose': [False],
        'epoch': [1000]
    }
)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
dt = data.data[:,[0,2]]
x_min, x_max = dt[:, 0].min() - 1, dt[:, 0].max() + 1
y_min, y_max = dt[:, 1].min() - 1, dt[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# here "model" is your model's prediction (classification) function
Z = model.predict(np.c_[xx.ravel(), yy.ravel()].T) 

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(12,8))
plt.contourf(xx, yy, Z,alpha=0.4)
#plt.axis('off')

plt.scatter(dt[:, 0], dt[:, 1], c=y,s=20, edgecolor='k')
plt.xlabel('sepal length')
plt.ylabel('petal length')

## 2.Make Moons dataset

In [None]:
from sklearn.datasets import make_moons

x,y =make_moons(n_samples=1500, noise=.05)
X = x.T
Y = one_hot_encode(y).T

print(X.shape)
print(Y.shape)

In [None]:
model = NeuralNetwork(
    input_layer=(X.shape[0], 'relu'),
    hidden_layer=[(10,'relu'), (4,'softmax')],
    output_layer=Y.shape[0],
    batch_size=8,
    optimizer=
    {
        "method": "ADAM",
        "beta1": 0.9,
        "beta2": 0.999
    },
    penalty = "l2",
    keep_prob=0.5,
    lambd=0.001,
    epoch=250,
    alpha=0.01
)

model.fit(X,Y)

In [None]:
### Decision Boundaries
%matplotlib inline
import matplotlib.pyplot as plt
dt = x
x_min, x_max = dt[:, 0].min() - 0.5, dt[:, 0].max() + 0.5
y_min, y_max = dt[:, 1].min() - 0.5, dt[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# here "model" is your model's prediction (classification) function
Z = model.predict(np.c_[xx.ravel(), yy.ravel()].T) 

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(12,8))
plt.contourf(xx, yy, Z,alpha=0.4)
#plt.axis('off')
plt.scatter(dt[:, 0], dt[:, 1], c=y, s=20, edgecolor='k')
plt.title('Decision Boundaries')

## 3.Andrew NG Assignment 2 Dataset

In [None]:
ex2data2 = np.loadtxt("../ex2/data/ex2data2.txt", delimiter=",")

x = ex2data2[:, :-1]
y = ex2data2[:, -1]

X = x.T
Y = one_hot_encode(y).T

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
model = NeuralNetwork(
    input_layer=(X.shape[0], 'relu'),
    hidden_layer=[(10,'relu'), (4,'softmax')],
    output_layer=Y.shape[0],
    batch_size=8,
    optimizer={"method": "ADAM", "beta1": 0.9, "beta2": 0.999},
    penalty = None,
    dropout=False,
    keep_prob=0.9,
    lambd=0.05,
    epoch=1500,
    alpha=0.001
)

model.fit(X,Y)

In [None]:
results_dict_all_models, results_average_dict, models = grid_search_stratified(
    x,
    y,
    clf=NeuralNetwork,
    metrics=["F1", "accuracy"],
    sort_by = "accuracy",
    n_fold=6,
    param_grid_dict={
        'batch_size': [16, 32],
        'input_layer': [(2, 'relu')],
        'hidden_layer': [
            [(4,'relu'), (4,'relu'), (4,'softmax')],
            [(4,'sigmoid'),(4,'softmax')]
        ],
        'output_layer': [2],
        'alpha': [2, 4],
        'verbose': [False],
        'epoch': [5000]
    }
)

In [None]:
results_average_dict

In [None]:
print(models["model_5"])

In [None]:
### Decision Boundaries
%matplotlib inline
import matplotlib.pyplot as plt
X = ex2data2

x1_min, x1_max = X[:, 0].min() - 0.3, X[:, 0].max() + 0.3,
x2_min, x2_max = X[:, 1].min() - 0.3, X[:, 1].max() + 0.3,
xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))

# here "model" is your model's prediction (classification) function
Z = model.predict(np.c_[xx1.ravel(), xx2.ravel()].T) 

negatives = ex2data2[ex2data2[:, -1] == 0]
positives = ex2data2[ex2data2[:, -1] == 1]

# Put the result into a color plot
Z = Z.reshape(xx1.shape)
plt.figure(figsize=(12,8))
plt.contourf(xx1, xx2, Z,alpha=0.4)
#plt.axis('off')
plt.scatter(negatives[:, 0], negatives[:, 1],s=50, color='k')
plt.scatter(positives[:, 0], positives[:, 1],s=50, color='r')
plt.title('Decision Boundaries')

plt.contour(xx1, xx2, Z, [0.5], linewidths=2, colors="g")

# MNIST dataset

In [None]:
from scipy.io import loadmat
data = loadmat('../ex3/data/ex3data1.mat')
data.keys()

In [None]:
x = data["X"]
y = data["y"]
y[y==10] = 0

In [None]:
dataset_test, dataset_train = split_data_as(x, y, train=0.9, test=0.1)
X_train = dataset_train[:, :-1].T
Y_train = one_hot_encode(dataset_train[:, -1]).T

X_test = dataset_test[:, :-1].T
Y_test = one_hot_encode(dataset_test[:, -1]).T

In [None]:
# X = x.T
# Y = one_hot_encode(y).T

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20,20))

sample = np.random.choice(data["X"].shape[0], 20)
ax.imshow(data["X"][sample,1:].reshape(-1,20).T)
ax.axis('off');

In [None]:
# http://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results.html#4d4e495354

model = NeuralNetwork(
    input_layer=(X_train.shape[0], 'relu'),
    hidden_layer=[(200,'relu'),(100,'relu'),(4,'softmax')],
    output_layer=Y_train.shape[0],
    batch_size=16,
    optimizer={"method": "ADAM", "beta1": 0.9, "beta2": 0.999},
    penalty = "l2",
    dropout=True,
    lambd=0.1,
    epoch=500,
    alpha=0.001
)

model.fit(X_train, Y_train)

In [None]:
calculate_model_performance(np.argmax(Y_test, axis=0), model.predict(X_test))

In [None]:
model = NeuralNetwork(
    (X.shape[0], 'relu'),
    [(200, 'relu'), (100, 'relu'), (50, 'relu'), (10,'softmax')],
    Y.shape[0],
    batch_size=50,
    optimizer={
        "method": "RMSP",
        "beta": 0.9
                },
    epoch=100,
    alpha=0.001
)

model.fit(X_test,Y_test)

In [None]:
model = NeuralNetwork(
    (X.shape[0], 'relu'),
    [(25,'relu'), (4,'softmax')],
    Y.shape[0],
    batch_size=50,
    optimizer={
        "method": "SGDM",
        "beta": 0.9
                },
    epoch=1000,
    alpha=6)

model.fit(X,Y)

In [None]:
results_dict_all_models, results_average_dict, models = grid_search(
    x,
    y,
    clf=NeuralNetwork,
    lst_metrics=["F1", "accuracy"],
    sort_by = "F1",
    n_folds=10,
    dict_param_grid={
        'batch_size': [64, 128, 256],
        'input_layer': [(x.shape[1], 'relu')],
        'hidden_layer': [
            [(50, 'relu'), (25, 'relu'), (10,'softmax')],
            [(200, 'relu'), (100, 'relu'), (50, 'relu'), (10,'softmax')]
        ],
        'optimizer':[
            {
                "method": "ADAM",
                "beta1": 0.9,
                "beta2": 0.999
            }
        ],
        'output_layer': [10],
        'alpha': [0.0001, 0.001],
        'verbose': [False],
        'epoch': [250]
    }
)

In [None]:
results_average_dict

In [None]:
print(models["model_1"])

In [None]:
def display_miss_clasifications(model, digits_to_display):
    count = 0
    for index, (act, predicted) in enumerate(zip(np.argmax(Y,axis=0), model.predict(X))):
        if act != predicted:
            fig, ax = plt.subplots(figsize = (2,2))
            ax.set_title("%s: act %s --- predicted %s" %(index, act, predicted))
            ax.imshow(X[:, index].reshape(-1,20).T)
            ax.axis('off');
            count += 1
        if count == digits_to_display:
            break

In [None]:
display_miss_clasifications(model, 50)