<a href="https://colab.research.google.com/github/nicolez9911/colab/blob/main/AdvML_L4S1N1_Continous_Learning_Adaline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Continuous Learning


The focus of this notebook is the implementation of the continuous learning approaches introduced in the lecture.


## Imports

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt


## MNIST Import


In [None]:

def sort_by_target(mnist, n_train=60000):
    """Sort an MNIST-style dataset by label, in place, within each partition.

    The first ``n_train`` rows (training partition) and all remaining rows
    (test partition) are reordered independently so that each partition is
    ascending in ``mnist.target``. Rows with equal labels keep their original
    relative order.

    Parameters
    ----------
    mnist : object with ``data`` and ``target`` attributes
      Both attributes must be NumPy arrays of the same length; they are
      overwritten in place.
    n_train : int
      Number of leading rows that form the training partition.

    Returns
    -------
    None
    """
    # Compute both permutations before mutating anything, so that the second
    # one is still derived from the unsorted labels.
    train_order = np.argsort(mnist.target[:n_train], kind="stable")
    # Shift the test permutation so it indexes into the full arrays. An empty
    # test partition simply yields an empty permutation.
    test_order = np.argsort(mnist.target[n_train:], kind="stable") + n_train

    mnist.data[:n_train] = mnist.data[train_order]
    mnist.target[:n_train] = mnist.target[train_order]
    mnist.data[n_train:] = mnist.data[test_order]
    mnist.target[n_train:] = mnist.target[test_order]


# Download MNIST, sort it by digit label and build a two-class subset.
from sklearn.utils import shuffle

try:
    from sklearn.datasets import fetch_openml
    try:
        # Ask for NumPy arrays explicitly: recent scikit-learn releases may
        # return pandas objects for this dataset, which the positional slicing
        # and in-place assignment used below do not support.
        mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
    except TypeError:
        # Older scikit-learn releases do not know the as_frame parameter
        # (see the fetch_openml documentation) and return arrays anyway.
        mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
except ImportError:
    # Fallback for scikit-learn versions that do not provide fetch_openml.
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

print(mnist.target.shape)
print(mnist.data.shape)

# Take two consecutive runs of 5000 rows from the sorted training data.
# NOTE(review): the slices are positional. Rows 0:5000 are the lowest label
# (digit 0) only if that class has at least 5000 training samples, and rows
# 5000:10000 are whatever labels follow in sort order (presumably the
# remaining 0s and then 1s) -- confirm against the class counts. The variable
# names mention 4/5 and an earlier comment mentioned 0/9; neither matches
# these slices.
mnist_4_target = mnist.target[0:5000]
mnist_4_data = mnist.data[0:5000]
mnist_5_target = mnist.target[5000:10000]
mnist_5_data = mnist.data[5000:10000]
#mnist_4_target = mnist.target[25000:30000]
#mnist_4_data = mnist.data[25000:30000]
#mnist_5_target = mnist.target[31000:36000]
#mnist_5_data = mnist.data[31000:36000]
mnist_bin_4_5_data = np.concatenate((mnist_4_data, mnist_5_data))
mnist_bin_4_5_target = np.concatenate((mnist_4_target, mnist_5_target))
# Encode the labels for Adaline: digit 0 -> -1, every other digit -> +1.
mnist_bin_4_5_target = np.where(mnist_bin_4_5_target == 0, -1, 1)

# Shuffle should be something that the students have to identify as an element that impacts the learning
# A fixed random_state keeps the sample order identical across notebook runs.
X, y = shuffle(mnist_bin_4_5_data, mnist_bin_4_5_target, random_state=1)
len(mnist_bin_4_5_data)

## Adaline with Gradient Descent

This is an implementation of Adaline trained with batch gradient descent: every weight update uses the whole training set.

Note: This implementation is taken from Raschka et al. Machine Learning with Python and seems to
contain a bug.

In [None]:
class AdalineGD(object):
    """ADAptive LInear NEuron classifier trained with batch gradient descent.

    Parameters
    ------------
    eta : float
      Learning rate (between 0.0 and 1.0)
    n_iter : int
      Passes over the training dataset.
    random_state : int
      Random number generator seed for random weight
      initialization.

    Attributes
    -----------
    w_ : 1d-array
      Weights after fitting; ``w_[0]`` is the bias unit.
    cost_ : list
      Half of the mean squared error, recorded once per epoch.
    """
    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """ Fit training data.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
          Training vectors, where n_samples is the number of samples and
          n_features is the number of features.
        y : array-like, shape = [n_samples]
          Target values.

        Returns
        -------
        self : object
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.cost_ = []

        for _ in range(self.n_iter):
            # The activation is the identity, so `output` equals the net
            # input; it is kept as a separate step because swapping in
            # another activation (e.g. a sigmoid) is the conceptual hook for
            # other linear models.
            output = self.activation(self.net_input(X))
            errors = y - output
            # The gradient of the cost below with respect to the weights is
            # -X.T.dot(errors) / n_samples, so gradient *descent* adds
            # eta * X.T.dot(errors) / n_samples. Subtracting it would climb
            # the cost surface instead.
            self.w_[1:] += self.eta * X.T.dot(errors) / n_samples
            # The bias step is averaged over the samples as well, so both
            # updates follow the gradient of the same (mean) cost.
            self.w_[0] += self.eta * errors.sum() / n_samples
            cost = (errors ** 2).sum() / (2.0 * n_samples)
            self.cost_.append(cost)
        return self

    def net_input(self, X):
        """Calculate net input"""
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def activation(self, X):
        """Compute linear activation"""
        return X

    def predict(self, X):
        """Return class label after unit step"""
        return np.where(self.activation(self.net_input(X)) >= 0.0, 1, -1)

## Plotting the Cost

The next cell plots the cost of the Adaline implementation in order to demonstrate the sensitivity
of the gradient descent with regard to the step size.




In [None]:
# Train AdalineGD with two learning rates and show the cost curves side by side.

# X_image_normalized is otherwise only defined further down the notebook, so
# it is computed here to keep the cell runnable in top-to-bottom order.
X_image_normalized = X * (1.0 / 255)

num_epochs = 100
learning_rates = (0.000001, 0.000005)

ada1 = AdalineGD(n_iter=num_epochs, eta=learning_rates[0]).fit(X_image_normalized, y)
ada2 = AdalineGD(n_iter=num_epochs, eta=learning_rates[1]).fit(X_image_normalized, y)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
for axis, model, eta in zip(ax, (ada1, ada2), learning_rates):
    axis.plot(range(1, len(model.cost_) + 1), model.cost_, marker='o')
    axis.set_xlabel('Epochs')
    # cost_ holds half of the mean squared error per epoch; no log is applied.
    axis.set_ylabel('Cost (half mean squared error)')
    # Build the title from the value actually used so the two cannot drift apart.
    axis.set_title(f'Adaline - Learning rate {eta:g}')

# plt.savefig('images/02_11.png', dpi=300)
plt.show()


## Stochastic Gradient Descent

Unlike batch gradient descent, stochastic gradient descent updates the weights after each individual sample.
The training samples are reshuffled at the start of every epoch (pass over the data).




In [None]:
class AdalineSGD(object):
    """Adaline classifier whose weights are updated one sample at a time.

    Parameters
    ------------
    eta : float
      Learning rate (between 0.0 and 1.0)
    n_iter : int
      Number of epochs, i.e. passes over the training dataset.
    shuffle : bool (default: True)
      If True, the training data is reshuffled at the start of every
      epoch to prevent cycles.
    random_state : int
      Seed of the random number generator used for the weight
      initialization and for shuffling.

    Attributes
    -----------
    w_ : 1d-array
      Weights after fitting; ``w_[0]`` is the bias unit.
    cost_ : list
      Per-epoch average of the per-sample cost ``0.5 * error**2``.
    """

    def __init__(self, eta=0.01, n_iter=10, shuffle=True, random_state=None):
        self.eta = eta
        self.n_iter = n_iter
        self.shuffle = shuffle
        self.random_state = random_state
        # Flipped to True by _initialize_weights; lets partial_fit keep weights.
        self.w_initialized = False

    def fit(self, X, y):
        """Train from scratch on the given data.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
          Training vectors.
        y : array-like, shape = [n_samples]
          Target values.

        Returns
        -------
        self : object
        """
        self._initialize_weights(X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            if self.shuffle:
                X, y = self._shuffle(X, y)
            # One weight update per sample, in the current sample order.
            sample_costs = [self._update_weights(row, label)
                            for row, label in zip(X, y)]
            self.cost_.append(sum(sample_costs) / len(y))
        return self

    def partial_fit(self, X, y):
        """Continue training on new data, keeping already learned weights."""
        if not self.w_initialized:
            self._initialize_weights(X.shape[1])
        if y.ravel().shape[0] <= 1:
            # A single sample: apply one update directly.
            self._update_weights(X, y)
            return self
        for row, label in zip(X, y):
            self._update_weights(row, label)
        return self

    def _shuffle(self, X, y):
        """Return X and y reordered by one common random permutation."""
        order = self.rgen.permutation(len(y))
        return X[order], y[order]

    def _initialize_weights(self, m):
        """Create the generator and draw 1 + m small random weights."""
        self.rgen = np.random.RandomState(self.random_state)
        self.w_ = self.rgen.normal(loc=0.0, scale=0.01, size=1 + m)
        self.w_initialized = True

    def _update_weights(self, xi, target):
        """Apply the Adaline rule for one sample and return its cost."""
        error = target - self.activation(self.net_input(xi))
        self.w_[1:] += self.eta * xi.dot(error)
        self.w_[0] += self.eta * error
        return 0.5 * error**2

    def net_input(self, X):
        """Weighted sum of the inputs plus the bias unit."""
        weights, bias = self.w_[1:], self.w_[0]
        return np.dot(X, weights) + bias

    def activation(self, X):
        """Linear (identity) activation."""
        return X

    def predict(self, X):
        """Class label (1 or -1) obtained by thresholding the activation at 0."""
        scores = self.activation(self.net_input(X))
        return np.where(scores >= 0.0, 1, -1)

## Exercise

Normalisation of the features is a commonly applied technique to improve the convergence of
gradient descent. Approach:

1. normalize the Feature Matrix using the cell below
2. analyze the effect of normalization on the feature matrix
3. train AdalineSGD with a normalized and non-normalized feature matrix.




In [None]:
from sklearn.preprocessing import normalize

# Two alternative rescalings of the feature matrix X.

# Variant 1: multiply every value by the same constant, 1/255.
X_image_normalized = X*(1.0/255)

# Variant 2: rescale each feature column (axis=0) by its own 'max' norm
# (see sklearn.preprocessing.normalize for how that norm is defined).
X_normed = normalize(X, axis=0, norm='max')

In [None]:
# Batch gradient descent on the column-normalised features.
ada = AdalineGD(n_iter=100, eta=0.000001).fit(X_normed, y)  # fit() returns the model itself

# One marker per epoch, numbered from 1.
epochs = range(1, len(ada.cost_) + 1)
fig, ax = plt.subplots()
ax.plot(epochs, ada.cost_, marker='o')
ax.set_xlabel('Epochs')
ax.set_ylabel('Sum-squared-error')

fig.tight_layout()
# plt.savefig('images/02_14_2.png', dpi=300)
plt.show()



In [None]:
# Stochastic gradient descent on the column-normalised features.
ada = AdalineSGD(n_iter=100, eta=0.001, random_state=1)
ada.fit(X_normed, y)

# Only one figure is drawn here: calling tight_layout()/show() before any
# plotting call would just render an empty figure.
fig, ax = plt.subplots()
ax.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o')
ax.set_xlabel('Epochs')
# AdalineSGD.cost_ stores the per-epoch average of the per-sample cost.
ax.set_ylabel('Average cost')

fig.tight_layout()
# plt.savefig('images/02_15_2.png', dpi=300)
plt.show()