In [1]:
import urllib.request
import os
import pickle, sys, random
import gzip, time
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
sys.path.append(os.path.pardir)

In [3]:
from data.mnist_1 import load_data
from data.mnist_2 import load_data_2

In [16]:
class Network:
    def __init__(self, sizes=[100, 100], activation="relu", dropout_rate=0.0):
        """
        :param sizes: list of layers
        :param activations: activation_functions
        """
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.weights = [np.random.randn(back_layer, forward_layer) * np.sqrt(2.0 / forward_layer) \
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(back_layer, 1) for back_layer in sizes[1:]]
        self.dropout_rate = dropout_rate

        # TODO  activation_functions = {'sigmoid': sigmoid, 'relu': relu} tanh
        if activation.lower() == "sigmoid":
            self.activation = Network.sigmoid
            self.activation_derivative = Network.sigmoid_derivative
        elif activation.lower() == "relu":
            self.activation = Network.relu
            self.activation_derivative = Network.relu_derivative

    def predict(self, a):
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            a = self.activation(np.dot(w, a) + b)
            a *= (1.0 - self.dropout_rate)  ######### test dropout
        a = np.dot(self.weights[-1], a) + self.biases[-1]
        return a

    def backprop(self, x, y):
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_b = [np.zeros(b.shape) for b in self.biases]

        # forward pass #
        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(w, a) + b

            self.mask = np.random.rand(*z.shape) > self.dropout_rate
            z *= self.mask
        #    z /= (1 - self.dropout_rate)

            a = self.activation(z)
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(self.weights[-1], a) + self.biases[-1]
        z_hold.append(final_layer)
        a_hold.append(Network.softmax(final_layer))

        # backward pass#
        delta = Network.softmax_derivative(a_hold[-1], y)
        gradient_w[-1] = np.dot(delta, a_hold[-2].T)
        gradient_b[-1] = delta

        for l in range(2, self.num_layers):
            delta = np.dot(self.weights[-l + 1].T, delta) * self.activation_derivative(z_hold[-l])
            gradient_w[-l] = np.dot(delta, a_hold[-l - 1].T)
            gradient_b[-l] = delta

        return gradient_w, gradient_b


    @staticmethod
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def sigmoid_derivative(z):
        return Network.sigmoid(z) * (1 - Network.sigmoid(z))

    @staticmethod
    def relu(z):
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        mask = (z <= 0)
        dout = np.ones(z.shape)
        dout[mask] = 0.0
        return dout

    @staticmethod
    def softmax(z):
        z = z - np.max(z)
        return np.exp(z) / np.sum(np.exp(z))

    @staticmethod
    def softmax_batch(z):
        z = z.T
        z = z - np.max(z, axis=0)
        t = np.exp(z) / np.sum(np.exp(z), axis=0)
        return t.T

    @staticmethod
    def softmax_derivative(a, b):
        return a - b


class Network_mini_batch(Network):
    def __init__(self, sizes=[100, 100], activation="relu"):
        super().__init__(sizes, activation)
        self.weights = [np.random.randn(forward_layer, back_layer) \
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(layer) for layer in sizes[1:]]

    def predict(self, a):
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            a = self.activation(np.dot(a, w) + b)
        a = np.dot(a, self.weights[-1]) + self.biases[-1]
        return a

    def backprop(self, x, y):
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_b = [np.zeros(b.shape) for b in self.biases]

        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(a, w) + b  # batch  z = a * w + b
            a = self.activation(z)
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(a, self.weights[-1]) + self.biases[-1]
        z_hold.append(final_layer)
        a_hold.append(self.softmax_batch(final_layer))
        
        delta = self.softmax_derivative(a_hold[-1], y)
        gradient_w[-1] = np.dot(a_hold[-2].T, delta)
        gradient_b[-1] = np.sum(delta, axis=0)

        for l in range(2, self.num_layers):
            delta = np.dot(delta, self.weights[-l + 1].T) * self.activation_derivative(z_hold[-l])
            gradient_w[-l] = np.dot(a_hold[-l - 1].T, delta)
            gradient_b[-l] = np.sum(delta, axis=0)

        return gradient_w, gradient_b

In [17]:
def train_DNN_minibatch(X_train, y_train, num_epochs, learning_rate, batch_size, network, X_test=None, y_test=None):
    for epoch in range(num_epochs):
        random_mask = np.random.choice(len(X_train), len(X_train), replace=False)
        X_train = X_train[random_mask]
        y_train = y_train[random_mask]

        for i in range(0, len(X_train), batch_size):
            X_batch = X_train[i : i + batch_size]
            y_batch = y_train[i : i + batch_size]
            gradient_w = [np.zeros(w.shape) for w in network.weights]
            gradient_b = [np.zeros(b.shape) for b in network.biases]
            add_w, add_b = network.backprop(X_batch, y_batch)
            gradient_w = [gw + aw for gw, aw in zip(gradient_w, add_w)]
            gradient_b = [gb + ab for gb, ab in zip(gradient_b, add_b)]
            network.weights = [weight - learning_rate * gw / batch_size for weight, gw in zip(network.weights, gradient_w)]
            network.biases = [bias - learning_rate * gb / batch_size for bias, gb in zip(network.biases, gradient_b)]

        if X_test is not None:
            print("Epoch {0}, training_accuracy: {1},\t validation accuracy: {2}".
                  format(epoch + 1, evaluate(X_train, y_train, network), evaluate(X_test, y_test, network)))
        else:
            print("Epoch {0}, training_accuracy: {1}".
                  format(epoch + 1, evaluate(X_train, y_train, network)))

def evaluate(X_val, y_val, network):
    y_pred = [np.argmax(network.predict(x)) for x in X_val]
    return np.mean([int(y_p == np.argmax(y)) for y_p, y in zip(y_pred, y_val)])

In [None]:
def one_hot(y_, n_classes=10):  ####
    # Function to encode neural one-hot output labels from number indexes
    # e.g.:
    # one_hot(y_=[[5], [0], [3]], n_classes=6):
    #     return [[0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0]]

    y_ = y_.reshape(len(y_))
    return np.eye(n_classes)[np.array(y_, dtype=np.int32)]  # Returns FLOATS


def load_data_2(normalize=True, flatten=True, one_hot=True, batch=True):
    if not os.path.exists(save_file):
        init_mnist()

    with open(save_file, 'rb') as f:
        dataset = pickle.load(f)

    if normalize:
        for t in [0, 2]:
    #        dataset[t] = dataset[t].astype(np.float32)
     #       dataset[t] /= 255.0
            dataset[t] = [x.astype(np.float32) / 255.0 for x in dataset[t]]

    if one_hot:
        for y in [1, 3]:
            dataset[y] = one_hot(dataset[y])

    if batch:
        for i in range(4):
            length = len(dataset[i])
            dataset[i] = np.array(dataset[i]).reshape(length, -1)

    if not flatten:
        for t in [0, 2]:
            dataset[t] = dataset[t].reshape(-1, 1, 28, 28)

    return (dataset[0], dataset[1]), (dataset[2], dataset[3])

In [23]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [19]:
(X_train, y_train), (X_test, y_test) = load_data_2(batch=True)

Done!


In [20]:
dnn = Network_mini_batch(sizes=[784, 30, 10], activation="relu")
train_DNN_minibatch(X_train, y_train, 30, 0.005, 32, dnn, X_test, y_test)

Epoch 1, training_accuracy: 0.6496666666666666,	 validation accuracy: 0.6625
Epoch 2, training_accuracy: 0.6859333333333333,	 validation accuracy: 0.6963
Epoch 3, training_accuracy: 0.7047,	 validation accuracy: 0.7151
Epoch 4, training_accuracy: 0.7294,	 validation accuracy: 0.7398
Epoch 5, training_accuracy: 0.7442333333333333,	 validation accuracy: 0.7566
Epoch 6, training_accuracy: 0.7512333333333333,	 validation accuracy: 0.7613
Epoch 7, training_accuracy: 0.7658833333333334,	 validation accuracy: 0.7761
Epoch 8, training_accuracy: 0.7694,	 validation accuracy: 0.7817
Epoch 9, training_accuracy: 0.7753333333333333,	 validation accuracy: 0.7821
Epoch 10, training_accuracy: 0.7921333333333334,	 validation accuracy: 0.7996
Epoch 11, training_accuracy: 0.7964333333333333,	 validation accuracy: 0.8049


KeyboardInterrupt: 

## Cifar10

In [19]:
import warnings
warnings.filterwarnings("ignore")
import urllib.request
import os, tarfile
import pickle, sys, random
import gzip, time
import numpy as np
from sklearn.preprocessing import StandardScaler

In [4]:
type(aa)

dict

In [13]:
aa.keys()

dict_keys([b'labels', b'data', b'batch_label', b'filenames'])

In [2]:
class standardscale:
    def __init__(self):
        self.mean = None
        self.std = None
        
    def fit_transform(self, X):
        self.mean = np.mean(X, axis=0)   # .astype(np.float32)
        self.std = np.std(X, axis=0)   # .astype(np.float32)
        return (X - self.mean) / self.std
        
    def transform(self, X):
        return (X - self.mean) / self.std

In [3]:
np.random.seed(42)

In [4]:
data, labels = [], []
for i in range(1, 6):
    with open('cifar10/data_batch_%d' % i, 'rb') as f:
        whole = pickle.load(f, encoding='bytes')
        data.extend(whole[b'data'])
        labels.extend(whole[b'labels'])

In [5]:
test_data, test_labels = [], []
with open('cifar10/test_batch', 'rb') as f:
    whole = pickle.load(f, encoding='bytes')
    test_data = whole[b'data']
    test_labels = np.array(whole[b'labels'])

In [5]:
# X_train = np.array(data) / 255.0
# X_test = np.array(test_data) / 255.0
# y_train = np.eye(10)[np.array(labels, dtype=np.int32)]
# y_test = np.eye(10)[np.array(test_labels, dtype=np.int32)]

In [24]:
np.iinfo(np.int16), np.finfo(np.float32)

(iinfo(min=-32768, max=32767, dtype=int16),
 finfo(resolution=1e-06, min=-3.4028235e+38, max=3.4028235e+38, dtype=float32))

In [6]:
X_train = np.array(data).astype(np.float32)
X_test = np.array(test_data).astype(np.float32)
y_train = np.eye(10)[np.array(labels, dtype=np.int32)]
y_test = np.eye(10)[np.array(test_labels, dtype=np.int32)]

# ss = StandardScaler()
# X_train = ss.fit_transform(X_train)
# X_test = ss.transform(X_test)

ss = standardscale()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [11]:
sys.getsizeof(X_train) / 1024 / 1024, sys.getsizeof(y_train) / 1024 / 1024

(585.9376068115234, 3.8148040771484375)

In [51]:
# X_train = [np.reshape(row, (3072, 1)) for row in data]
# X_test = [np.reshape(row, (3072, 1)) for row in test_data]
# two_dim = np.eye(10)[np.array(labels, dtype=np.int32)]
# two_dim_test = np.eye(10)[np.array(test_labels, dtype=np.int32)]
# y_train = [row.reshape(10, 1) for row in two_dim]
# y_test = [row.reshape(10, 1) for row in two_dim_test]

In [12]:
class Network:
    def __init__(self, sizes=[100, 100], activation="relu", dropout_rate=0.0):
        """
        :param sizes: list of layers
        :param activations: activation_functions
        """
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.weights = [np.random.randn(back_layer, forward_layer)  * np.sqrt(2.0 / forward_layer) 
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(back_layer, 1) for back_layer in sizes[1:]]
        self.dropout_rate = dropout_rate

        # TODO  activation_functions = {'sigmoid': sigmoid, 'relu': relu} tanh
        if activation.lower() == "sigmoid":
            self.activation = Network.sigmoid
            self.activation_derivative = Network.sigmoid_derivative
        elif activation.lower() == "relu":
            self.activation = Network.relu
            self.activation_derivative = Network.relu_derivative

    def predict(self, a):
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            a = self.activation(np.dot(w, a) + b)
            a *= (1.0 - self.dropout_rate)  ######### test dropout
        a = np.dot(self.weights[-1], a) + self.biases[-1]
        return a

    def backprop(self, x, y):
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_b = [np.zeros(b.shape) for b in self.biases]

        # forward pass #
        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(w, a) + b

            self.mask = np.random.rand(*z.shape) > self.dropout_rate
            z *= self.mask
        #    z /= (1 - self.dropout_rate)

            a = self.activation(z)
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(self.weights[-1], a) + self.biases[-1]
        z_hold.append(final_layer)
        a_hold.append(Network.softmax(final_layer))

        # backward pass#
        delta = Network.softmax_derivative(a_hold[-1], y)
        gradient_w[-1] = np.dot(delta, a_hold[-2].T)
        gradient_b[-1] = delta

        for l in range(2, self.num_layers):
            delta = np.dot(self.weights[-l + 1].T, delta) * self.activation_derivative(z_hold[-l])
            gradient_w[-l] = np.dot(delta, a_hold[-l - 1].T)
            gradient_b[-l] = delta

        return gradient_w, gradient_b


    @staticmethod
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def sigmoid_derivative(z):
        return Network.sigmoid(z) * (1 - Network.sigmoid(z))

    @staticmethod
    def relu(z):
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        mask = (z <= 0)
        dout = np.ones(z.shape)
        dout[mask] = 0.0
        return dout

    @staticmethod
    def softmax(z):
        z = z - np.max(z)
        return np.exp(z) / np.sum(np.exp(z))

    @staticmethod
    def softmax_batch(z):
        z = z.T
        z = z - np.max(z, axis=0)
        t = np.exp(z) / np.sum(np.exp(z), axis=0)
        return t.T

    @staticmethod
    def softmax_derivative(a, b):
        return a - b


class Network_mini_batch(Network):
    def __init__(self, sizes=[100, 100], activation="relu"):
        super().__init__(sizes, activation)
        self.weights = [np.random.randn(forward_layer, back_layer) * np.sqrt(2.0 / forward_layer) \
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(layer) for layer in sizes[1:]]

    def predict(self, a):
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            a = self.activation(np.dot(a, w) + b)
        a = np.dot(a, self.weights[-1]) + self.biases[-1]
        return a

    def backprop(self, x, y):
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_b = [np.zeros(b.shape) for b in self.biases]

        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(a, w) + b  # batch  z = a * w + b
            a = self.activation(z)
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(a, self.weights[-1]) + self.biases[-1]
        z_hold.append(final_layer)
        a_hold.append(self.softmax_batch(final_layer))
        
        delta = self.softmax_derivative(a_hold[-1], y)
        gradient_w[-1] = np.dot(a_hold[-2].T, delta)
        gradient_b[-1] = np.sum(delta, axis=0)

        for l in range(2, self.num_layers):
            delta = np.dot(delta, self.weights[-l + 1].T) * self.activation_derivative(z_hold[-l])
            gradient_w[-l] = np.dot(a_hold[-l - 1].T, delta)
            gradient_b[-l] = np.sum(delta, axis=0)

        return gradient_w, gradient_b

In [13]:
def train_DNN_minibatch(X_train, y_train, num_epochs, learning_rate, batch_size, network, X_test=None, y_test=None):
    for epoch in range(num_epochs):
        start = time.time()
        random_mask = np.random.choice(len(X_train), len(X_train), replace=False)
        X_train = X_train[random_mask]
        y_train = y_train[random_mask]

        for i in range(0, len(X_train), batch_size):
            X_batch = X_train[i : i + batch_size]
            y_batch = y_train[i : i + batch_size]
        #    gradient_w = [np.zeros(w.shape) for w in network.weights]
        #    gradient_b = [np.zeros(b.shape) for b in network.biases]
            add_w, add_b = network.backprop(X_batch, y_batch)
        #    gradient_w = [gw + aw for gw, aw in zip(gradient_w, add_w)]
        #    gradient_b = [gb + ab for gb, ab in zip(gradient_b, add_b)]
        #    network.weights = [weight - learning_rate * gw / batch_size for weight, gw in zip(network.weights, gradient_w)]
        #    network.biases = [bias - learning_rate * gb / batch_size for bias, gb in zip(network.biases, gradient_b)]

            network.weights = [weight - learning_rate * gw / batch_size for weight, gw in zip(network.weights, add_w)]
            network.biases = [bias - learning_rate * gb / batch_size for bias, gb in zip(network.biases, add_b)]
            
        if X_test is not None:
            print("Epoch {}, training_accuracy: {:>6},  validation accuracy: {:>6},  epoch time: {:.2f}s".format(
                  epoch + 1, 
                  evaluate(X_train, y_train, network), 
                  evaluate(X_test, y_test, network), 
                  time.time() - start))
        else:
            print("Epoch {0}, training_accuracy: {1}".
                  format(epoch + 1, evaluate(X_train, y_train, network)))

def evaluate(X_val, y_val, network):
    y_pred = [np.argmax(network.predict(x)) for x in X_val]
    return np.mean([int(y_p == np.argmax(y)) for y_p, y in zip(y_pred, y_val)])

In [8]:
dnn = Network_mini_batch(sizes=[3072, 50, 10], activation="relu")   # , dropout_rate=0.5  / 255.0
train_DNN_minibatch(X_train, y_train, 30, 0.001, 32, dnn, X_test, y_test)

Epoch 1, training_accuracy: 0.29736,  validation accuracy: 0.2978,  epoch time: 13.59s
Epoch 2, training_accuracy: 0.33598,  validation accuracy: 0.3335,  epoch time: 12.80s
Epoch 3, training_accuracy: 0.35918,  validation accuracy: 0.3543,  epoch time: 11.79s
Epoch 4, training_accuracy: 0.36418,  validation accuracy: 0.3584,  epoch time: 11.24s
Epoch 5, training_accuracy:  0.383,  validation accuracy: 0.3718,  epoch time: 12.83s
Epoch 6, training_accuracy: 0.38762,  validation accuracy: 0.3756,  epoch time: 15.84s
Epoch 7, training_accuracy: 0.39564,  validation accuracy: 0.3871,  epoch time: 16.17s
Epoch 8, training_accuracy: 0.40156,  validation accuracy: 0.3906,  epoch time: 11.76s
Epoch 9, training_accuracy: 0.40632,  validation accuracy: 0.3933,  epoch time: 12.26s
Epoch 10, training_accuracy: 0.40862,  validation accuracy: 0.3952,  epoch time: 13.12s
Epoch 11, training_accuracy: 0.41696,  validation accuracy:  0.402,  epoch time: 14.71s
Epoch 12, training_accuracy: 0.4181,  vali

KeyboardInterrupt: 

In [8]:
dnn = Network_mini_batch(sizes=[3072, 50, 10], activation="relu")   # standardscaler lr=1e-3 batch=32 he_initialization
train_DNN_minibatch(X_train, y_train, 30, 0.001, 32, dnn, X_test, y_test)

Epoch 1, training_accuracy: 0.37208,  validation accuracy: 0.3695,  epoch time: 10.94s
Epoch 2, training_accuracy: 0.41092,  validation accuracy: 0.4089,  epoch time: 14.32s
Epoch 3, training_accuracy: 0.43184,  validation accuracy: 0.4226,  epoch time: 13.79s
Epoch 4, training_accuracy: 0.44864,  validation accuracy: 0.4357,  epoch time: 14.46s
Epoch 5, training_accuracy: 0.45824,  validation accuracy: 0.4426,  epoch time: 11.38s
Epoch 6, training_accuracy: 0.47076,  validation accuracy: 0.4534,  epoch time: 15.47s
Epoch 7, training_accuracy: 0.47942,  validation accuracy: 0.4587,  epoch time: 12.56s
Epoch 8, training_accuracy: 0.48462,  validation accuracy: 0.4607,  epoch time: 16.80s
Epoch 9, training_accuracy: 0.4916,  validation accuracy: 0.4634,  epoch time: 10.64s
Epoch 10, training_accuracy: 0.49618,  validation accuracy: 0.4628,  epoch time: 13.11s
Epoch 11, training_accuracy: 0.50256,  validation accuracy: 0.4713,  epoch time: 10.54s
Epoch 12, training_accuracy: 0.50498,  val

In [7]:
dnn = Network_mini_batch(sizes=[3072, 50, 10], activation="relu")   # , dropout_rate=0.5   standardscaler batch=2
train_DNN_minibatch(X_train, y_train, 30, 0.001, 2, dnn, X_test, y_test)

Epoch 1, training_accuracy: 0.47472,  validation accuracy: 0.4514,  epoch time: 110.88s
Epoch 2, training_accuracy: 0.49816,  validation accuracy: 0.4606,  epoch time: 101.24s
Epoch 3, training_accuracy: 0.51818,  validation accuracy: 0.4761,  epoch time: 116.53s
Epoch 4, training_accuracy: 0.52858,  validation accuracy:  0.473,  epoch time: 106.83s
Epoch 5, training_accuracy: 0.53798,  validation accuracy: 0.4799,  epoch time: 97.94s
Epoch 6, training_accuracy: 0.5605,  validation accuracy: 0.4982,  epoch time: 98.75s
Epoch 7, training_accuracy: 0.55068,  validation accuracy: 0.4841,  epoch time: 113.24s
Epoch 8, training_accuracy: 0.56852,  validation accuracy: 0.4883,  epoch time: 100.15s
Epoch 9, training_accuracy: 0.5784,  validation accuracy: 0.4966,  epoch time: 93.72s
Epoch 10, training_accuracy: 0.57538,  validation accuracy: 0.4857,  epoch time: 97.17s
Epoch 11, training_accuracy: 0.58556,  validation accuracy: 0.4979,  epoch time: 98.52s
Epoch 12, training_accuracy: 0.58198,

KeyboardInterrupt: 

In [7]:
dnn = Network_mini_batch(sizes=[3072, 1024, 512, 100, 10], activation="relu")   # , dropout_rate=0.5   standardscaler lr=1e-3 batch=32
train_DNN_minibatch(X_train, y_train, 30, 0.001, 32, dnn, X_test, y_test)

Epoch 1, training_accuracy: 0.41644,  validation accuracy: 0.4038,  epoch time: 287.32s
Epoch 2, training_accuracy: 0.46092,  validation accuracy: 0.4347,  epoch time: 245.60s
Epoch 3, training_accuracy: 0.4903,  validation accuracy: 0.4519,  epoch time: 244.37s
Epoch 4, training_accuracy: 0.5083,  validation accuracy: 0.4693,  epoch time: 244.65s
Epoch 5, training_accuracy: 0.52918,  validation accuracy:  0.482,  epoch time: 247.81s
Epoch 6, training_accuracy: 0.54614,  validation accuracy: 0.4875,  epoch time: 244.36s
Epoch 7, training_accuracy: 0.5569,  validation accuracy: 0.4959,  epoch time: 238.25s
Epoch 8, training_accuracy: 0.57158,  validation accuracy: 0.4959,  epoch time: 235.04s
Epoch 9, training_accuracy: 0.5887,  validation accuracy: 0.5019,  epoch time: 235.43s
Epoch 10, training_accuracy: 0.60128,  validation accuracy: 0.5103,  epoch time: 239.37s
Epoch 11, training_accuracy: 0.60844,  validation accuracy: 0.5086,  epoch time: 240.70s
Epoch 12, training_accuracy: 0.625

KeyboardInterrupt: 

In [7]:
dnn = Network_mini_batch(sizes=[3072, 8000, 4000, 2000, 500, 10], activation="relu")   # , dropout_rate=0.5   standardscaler lr=1e-3 batch=32
train_DNN_minibatch(X_train, y_train, 30, 0.001, 32, dnn, X_test, y_test)

KeyboardInterrupt: 

## Adam & Momentum

In [1]:
import warnings
warnings.filterwarnings("ignore")
import urllib.request
import os
import pickle, sys, random
import gzip, time
import numpy as np
from sklearn.preprocessing import StandardScaler
import pdb

In [2]:
data, labels = [], []
for i in range(1, 6):
    with open('cifar10/data_batch_%d' % i, 'rb') as f:
        whole = pickle.load(f, encoding='bytes')
        data.extend(whole[b'data'])
        labels.extend(whole[b'labels'])

In [3]:
test_data, test_labels = [], []
with open('cifar10/test_batch', 'rb') as f:
    whole = pickle.load(f, encoding='bytes')
    test_data = whole[b'data']
    test_labels = np.array(whole[b'labels'])

In [4]:
ss = StandardScaler()
X_train = np.array(data) 
X_test = np.array(test_data)
y_train = np.eye(10)[np.array(labels, dtype=np.int32)]
y_test = np.eye(10)[np.array(test_labels, dtype=np.int32)]

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [5]:
class Network:
    def __init__(self, sizes=[100, 100], activation="relu", dropout_rate=0.0):
        """
        :param sizes: list of layers
        :param activations: activation_functions
        """
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.weights = [np.random.randn(back_layer, forward_layer)  * np.sqrt(2.0 / forward_layer) 
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(back_layer, 1) for back_layer in sizes[1:]]
        self.dropout_rate = dropout_rate

        # TODO  activation_functions = {'sigmoid': sigmoid, 'relu': relu} tanh
        if activation.lower() == "sigmoid":
            self.activation = Network.sigmoid
            self.activation_derivative = Network.sigmoid_derivative
        elif activation.lower() == "relu":
            self.activation = Network.relu
            self.activation_derivative = Network.relu_derivative

    def predict(self, a):
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            a = self.activation(np.dot(w, a) + b)
            a *= (1.0 - self.dropout_rate)  ######### test dropout
        a = np.dot(self.weights[-1], a) + self.biases[-1]
        return a

    def backprop(self, x, y):
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_b = [np.zeros(b.shape) for b in self.biases]

        # forward pass #
        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(w, a) + b

            self.mask = np.random.rand(*z.shape) > self.dropout_rate
            z *= self.mask
        #    z /= (1 - self.dropout_rate)

            a = self.activation(z)
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(self.weights[-1], a) + self.biases[-1]
        z_hold.append(final_layer)
        a_hold.append(Network.softmax(final_layer))

        # backward pass#
        delta = Network.softmax_derivative(a_hold[-1], y)
        gradient_w[-1] = np.dot(delta, a_hold[-2].T)
        gradient_b[-1] = delta

        for l in range(2, self.num_layers):
            delta = np.dot(self.weights[-l + 1].T, delta) * self.activation_derivative(z_hold[-l])
            gradient_w[-l] = np.dot(delta, a_hold[-l - 1].T)
            gradient_b[-l] = delta

        return gradient_w, gradient_b


    @staticmethod
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def sigmoid_derivative(z):
        return Network.sigmoid(z) * (1 - Network.sigmoid(z))

    @staticmethod
    def relu(z):
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        mask = (z <= 0)
        dout = np.ones(z.shape)
        dout[mask] = 0.0
        return dout

    @staticmethod
    def softmax(z):
        z = z - np.max(z)
        return np.exp(z) / np.sum(np.exp(z))

    @staticmethod
    def softmax_batch(z):
        z = z.T
        z = z - np.max(z, axis=0)
        t = np.exp(z) / np.sum(np.exp(z), axis=0)
        return t.T

    @staticmethod
    def softmax_derivative(a, b):
        return a - b


class Network_mini_batch(Network):
    def __init__(self, sizes=[100, 100], activation="relu"):
        super().__init__(sizes, activation)
        self.weights = [np.random.randn(forward_layer, back_layer) * np.sqrt(2.0 / forward_layer) \
                        for forward_layer, back_layer in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(layer) for layer in sizes[1:]]

    def predict(self, a):
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            a = self.activation(np.dot(a, w) + b)
        a = np.dot(a, self.weights[-1]) + self.biases[-1]
        return a

    def backprop(self, x, y):
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_b = [np.zeros(b.shape) for b in self.biases]

        a = x
        a_hold = [x]
        z_hold = []
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(a, w) + b  # batch  z = a * w + b
            a = self.activation(z)
            z_hold.append(z)
            a_hold.append(a)
        final_layer = np.dot(a, self.weights[-1]) + self.biases[-1]
        z_hold.append(final_layer)
        a_hold.append(self.softmax_batch(final_layer))
        
        delta = self.softmax_derivative(a_hold[-1], y)
        gradient_w[-1] = np.dot(a_hold[-2].T, delta)
        gradient_b[-1] = np.sum(delta, axis=0)

        for l in range(2, self.num_layers):
            delta = np.dot(delta, self.weights[-l + 1].T) * self.activation_derivative(z_hold[-l])
            gradient_w[-l] = np.dot(a_hold[-l - 1].T, delta)
            gradient_b[-l] = np.sum(delta, axis=0)

        return gradient_w, gradient_b

In [6]:
class Momentum:
    def __init__(self, lr, momentum, batch_size):
        self.lr = lr
        self.momentum = momentum
        self.batch_size = batch_size
        self.velocity = None

    def update(self, weights, biases, grad_w, grad_b):
        if self.velocity is None:
            self.velocity = dict()
            self.velocity['w'] = [np.zeros(w.shape) for w in weights]
            self.velocity['b'] = [np.zeros(b.shape) for b in biases]

        self.velocity['w'] = [self.momentum * vw - self.lr * gw / self.batch_size
                              for vw, gw in zip(self.velocity['w'], grad_w)]
        self.velocity['b'] = [self.momentum * vb - self.lr * gb / self.batch_size
                              for vb, gb in zip(self.velocity['b'], grad_b)]

    #    weights = [w + vw for w, vw in zip(weights, self.velocity['w'])].copy()
    #    biases = [b + vb for b, vb in zip(biases, self.velocity['b'])].copy()

        for i, (w, vw) in enumerate(zip(weights, self.velocity['w'])):
            weights[i] = w + vw

        for i, (b, vb) in enumerate(zip(biases, self.velocity['b'])):
            biases[i] = b + vb

In [None]:
class Adam:
    def __init__(self, lr, batch_size, rho1=0.9, rho2=0.999):
        self.lr = lr
        self.rho1 = rho1
        self.rho2 = rho2
        self.batch_size = batch_size
        self.iteration = 0
        self.ps = None

    def update(self, weights, biases, grad_w, grad_b):
        self.iteration += 1
        grad_w = [gw / self.batch_size for gw in grad_w]
        grad_b = [gb / self.batch_size for gb in grad_b]

        if self.ps is None:
            self.ps = {}
            self.ps['vw'] = [np.zeros(w.shape) for w in weights]
            self.ps['vb'] = [np.zeros(b.shape) for b in biases]
            self.ps['hw'] = [np.zeros(w.shape) for w in weights]
            self.ps['hb'] = [np.zeros(b.shape) for b in biases]

        self.ps['vw'] = [self.rho1 * vw + (1 - self.rho1) * gw for vw, gw in zip(self.ps['vw'], grad_w)]
        self.ps['vb'] = [self.rho1 * vb + (1 - self.rho1) * gb for vb, gb in zip(self.ps['vb'], grad_b)]
        self.ps['hw'] = [self.rho2 * hw + (1 - self.rho2) * (gw ** 2) for hw, gw in zip(self.ps['hw'], grad_w)]
        self.ps['hb'] = [self.rho2 * hb + (1 - self.rho2) * (gb ** 2) for hb, gb in zip(self.ps['hb'], grad_b)]
        unbias_vw = [vw / (1 - self.rho1 ** self.iteration) for vw in self.ps['vw']]
        unbias_vb = [vb / (1 - self.rho1 ** self.iteration) for vb in self.ps['vb']]
        unbias_hw = [hw / (1 - self.rho2 ** self.iteration) for hw in self.ps['hw']]
        unbias_hb = [hb / (1 - self.rho2 ** self.iteration) for hb in self.ps['hb']]

        for i, (w, vw, hw) in enumerate(zip(weights, unbias_vw, unbias_hw)):
            weights[i] = w - self.lr * vw / (np.sqrt(hw) + 1e-8)

        for i, (b, vb, hb) in enumerate(zip(biases, unbias_vb, unbias_hb)):
            biases[i] = b - self.lr * vb / (np.sqrt(hb) + 1e-8)

In [7]:
def train_DNN_minibatch(X_train, y_train, num_epochs, optimizer, batch_size, network, X_test=None, y_test=None):
    for epoch in range(num_epochs):
        start = time.time()
        random_mask = np.random.choice(len(X_train), len(X_train), replace=False)
        X_train = X_train[random_mask]
        y_train = y_train[random_mask]

        for i in range(0, len(X_train), batch_size):
            X_batch = X_train[i : i + batch_size]
            y_batch = y_train[i : i + batch_size]
    #        gradient_w = [np.zeros(w.shape) for w in network.weights]
    #        gradient_b = [np.zeros(b.shape) for b in network.biases]
            add_w, add_b = network.backprop(X_batch, y_batch)
    #        gradient_w = [gw + aw for gw, aw in zip(gradient_w, add_w)]
    #        gradient_b = [gb + ab for gb, ab in zip(gradient_b, add_b)]
            
            optimizer.update(network.weights, network.biases, add_w, add_b)   # gradient_w, gradient_b, add_w, add_b
    #        pdb.set_trace()
    #      network.weights = [weight - learning_rate * gw / batch_size for weight, gw in zip(network.weights, gradient_w)]
    #       network.biases = [bias - learning_rate * gb / batch_size for bias, gb in zip(network.biases, gradient_b)]

        if X_test is not None:
            print("Epoch {}, training_accuracy: {:>6},  validation accuracy: {:>6},  epoch time: {:.2f}s".format(
                  epoch + 1, 
                  evaluate(X_train, y_train, network), 
                  evaluate(X_test, y_test, network), 
                  time.time() - start))
        else:
            print("Epoch {0}, training_accuracy: {1}".
                  format(epoch + 1, evaluate(X_train, y_train, network)))

def evaluate(X_val, y_val, network):
    y_pred = [np.argmax(network.predict(x)) for x in X_val]
    return np.mean([int(y_p == np.argmax(y)) for y_p, y in zip(y_pred, y_val)])

In [None]:
# pdb.set_trace()
dnn = Network_mini_batch(sizes=[3072, 50, 10], activation="relu")   # standardscaler lr=1e-3 batch=32 he_initialization
optimizer = Momentum(lr=0.001, momentum=0.9, batch_size=32)
train_DNN_minibatch(X_train, y_train, 30, optimizer, 32, dnn, X_test, y_test)