# Solution to my first NN: Logistic regression

Here we follow the pipeline suggested in the [assignment](https://www.coursera.org/learn/intro-to-deep-learning/peer/0AgYP/my1stnn).

In [None]:
%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from preprocessed_mnist import load_dataset

In [None]:
# Random seed: fixes NumPy's global random generator, so that the random weight
# initialisation and the shuffling of the batch indices below are reproducible
np.random.seed(42)

* Begin with logistic regression from the previous assignment to classify some number against others (e.g. zero vs nonzero)

From `preprocessed_mnist` the data has already been:

1. Normalized (note that the images only have one channel)
2. Split into train, validation and test

## Logistic regression separating zeros from non-zeros

### Data preparation

In [None]:
# Load the dataset: load_dataset() hands back six arrays, which are unpacked
# into features (X) and labels (y) for the train, validation and test splits
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

In [None]:
# Parameters
# Scale factor applied to the random initial weights and biases
small_number = 1e-3
# Number of training examples used in each optimisation step
batch_size = 1000
# Number of epochs; multiplied by the iterations per epoch to get the total number of iterations
epoch = 15

In [None]:
def reshaper_3d_to_2d(var):
    """
    Flattens every sample of a 3-d array into a single row

    The first axis is kept as it is, whereas the two trailing axes are merged into one.

    Parameters
    ----------
    var : array, shape (samples, image-rows, image-columns)
        The array to flatten

    Returns
    -------
    reshaped : array, shape (samples, image-rows * image-columns)
        The flattened array
    """

    n_samples = var.shape[0]
    n_pixels = var.shape[1] * var.shape[2]

    return var.reshape(n_samples, n_pixels)

In [None]:
def cast_non_zero_to_one(var):
    """
    Binarizes an array: zeros stay zero, every other value becomes one

    The input is left untouched, the transformation is done on a copy.

    Parameters
    ----------
    var : array-like
        The values to binarize

    Returns
    -------
    binary : array-like
        A copy of the input, containing only zeros and ones
    """

    binary = var.copy()

    # Boolean mask of the elements which are to be overwritten
    is_non_zero = binary != 0
    binary[is_non_zero] = 1

    return binary

In [None]:
# Flatten the images of every split, so that each example becomes one row of pixels
X_train_r, X_val_r, X_test_r = (
    reshaper_3d_to_2d(images) for images in (X_train, X_val, X_test)
)

In [None]:
# Casting the label vectors to column matrices of shape (n_examples, 1), so that
# they line up element-wise with the (n_examples, 1) output of the network.
# reshape(-1, 1) is used rather than indexing with np.newaxis in order to keep
# the cell idempotent: re-running it does not stack yet another axis onto the labels
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [None]:
# Binary targets: 0 for the digit zero, 1 for all the other digits
y_train_b, y_val_b, y_test_b = (
    cast_non_zero_to_one(labels) for labels in (y_train, y_val, y_test)
)

In [None]:
# The rows of the flattened training matrix are the examples, the columns are the features
n_training_ex, n_features = X_train_r.shape
# One output node, as we are doing a binary classification
n_output = 1

### Tensorflow preparation

#### How can a network take in all training examples at once?
* When predicting one example, we are essentially sending in a row-vector (`1 x n`-matrix)
* When training on several examples, we are sending in several row-vectors stacked on top of each other (`m x n`-matrix)
* The loss will still be a scalar as the `input_y` will be `m x 1`-dimensional, where we will take an inner product with `predicted_y`, which is also `m x 1` dimensional

In [None]:
# Leaving the first dimension as None makes it possible to feed any number of examples

input_X = tf.placeholder(dtype=tf.float32, shape=(None, n_features), name="input_x")
input_y = tf.placeholder(dtype=tf.float32, shape=(None, n_output), name="input_y")

# Fed with a column of ones (one per example), which is multiplied with the bias row
identity = tf.placeholder(dtype=tf.float32, shape=(None, 1), name='identity')

We will use a notation similar to Michael Nielsen's [notation](http://neuralnetworksanddeeplearning.com/chap2.html), but altered to be consistent with pandas [tidy data](https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf)

$X_{\text{e}, \text{f}}W^{\text{l}}_{\text{ f}, \text{t}} + \mathbb{I}_{\text{e}, 1}b_{1, \text{t}}$

* `e` - number of examples
* `l` - layer
* `f` - from node (or activation) id
* `t` - to node id

Each `to node id` is a sum over all `from node (or activation) id`.
This means that the rows are trained separately, and only depend on the `to node id`

In [None]:
def get_w_and_b(n_to_nodes, n_from_nodes):
    """
    Creates the weight and bias variables connecting two layers

    Both are initialized with standard normal random numbers, scaled by the
    global `small_number`.

    Parameters
    ----------
    n_to_nodes : int
        Number of nodes in the next layer (t in notation above)
    n_from_nodes : int
        Number of nodes in the previous layer (f in notation above)

    Returns
    -------
    W : Variable, shape (n_from_nodes, n_to_nodes)
        The weights variable
    b : Variable, shape (1, n_to_nodes)
        The bias variable (a single row)
    """

    # Random (rather than constant) initial values are used in order to break symmetry.
    # NOTE: The weights are drawn before the biases, the order matters for reproducibility
    initial_weights = np.random.randn(n_from_nodes, n_to_nodes)*small_number
    W = tf.Variable(initial_value=initial_weights, name="weights", dtype='float32')

    initial_biases = np.random.randn(1, n_to_nodes)*small_number
    b = tf.Variable(initial_value=initial_biases, name="bias", dtype='float32')

    return W, b

In [None]:
# A single layer, mapping all the features onto the output node(s).
# n_output is used (rather than a literal 1), so that the shape of the weights
# always agrees with the shape of the input_y placeholder
W, b = get_w_and_b(n_to_nodes=n_output, n_from_nodes=n_features)

### The model code

In [None]:
# The pre-activation (logits) of the output node: `identity` is fed with a column
# of ones, so that `identity @ b` repeats the bias row once per example
logits = input_X @ W + identity @ b
predicted_y = tf.nn.sigmoid(logits)
# Mean binary cross-entropy over the examples.
# It is calculated from the logits instead of as
# -y*log(p) - (1-y)*log(1-p), since the latter evaluates log(0) (yielding inf/NaN
# in the loss and in the gradients) as soon as the sigmoid saturates to exactly
# 0 or 1. The two expressions are mathematically identical.
# NOTE: Instead of using matrix-transpose followed by a matrix-multiplication, we do a
#       reduce_mean operation. The 1/m factor when taking reduce_mean contra taking
#       matmul will not change the location of the minima
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=input_y, logits=logits))
# Gradient descent with momentum (learning rate 0.01, momentum 0.5)
optimizer = tf.train.MomentumOptimizer(0.01, 0.5).minimize(loss)

In [None]:
def calc_f_score(pred, true):
    """
    Calculates the f-score

    Notes
    -----
    Assuming here that 0 is the positive value (as we are distinguishing zeros from non-zeros)

    The score is calculated as 2*tp/(2*tp + fp + fn), which equals
    2*precision*recall/(precision + recall) whenever the latter is defined.
    If there are no true positives, false positives or false negatives at all
    (so that precision and recall are undefined), 0.0 is returned instead of NaN.

    Parameters
    ----------
    pred : array-like
        The predicted values
    true : array-like
        The actual values

    Returns
    -------
    f_score : float
        The f-score
    """

    # Makes the element-wise comparisons below work for plain sequences as well
    pred = np.asarray(pred)
    true = np.asarray(true)

    tp = np.sum(np.logical_and(pred == 0, true == 0))
    fp = np.sum(np.logical_and(pred == 0, true != 0))
    fn = np.sum(np.logical_and(pred != 0, true == 0))

    denominator = 2*tp + fp + fn

    # Guard against 0/0 (no positives among neither the predictions nor the ground truth)
    if denominator == 0:
        return 0.0

    f_score = float(2*tp/denominator)

    return f_score

In [None]:
class Batch(object):
    """
    Class to generate batches without replacement.

    The example indices are shuffled once (on construction). The batches are
    consecutive slices of the shuffled indices. When all the complete batches
    have been handed out, the generation starts over from the first batch, in
    the same order. Examples which do not fill a complete last batch are never
    returned.

    Inspired by [0].

    References
    ----------
    [0] https://medium.com/wwblog/getting-started-with-tensorflow-a65bc824141b
    """

    def __init__(self, X, y, batch_size):
        """
        Set the member data and generates the indices

        Parameters
        ----------
        X : array, shape (n_examples, n_features)
            The data matrix
        y : array, shape (n_examples, 1)
            The ground truth
        batch_size : int
            Number of examples per batch.
            Will be set to n_examples if it is larger than n_examples.

        Raises
        ------
        ValueError
            If X and y do not contain the same number of examples, if there
            are no examples, or if batch_size is not positive
        """

        size = y.shape[0]

        # Fail early with a clear message, rather than with a ZeroDivisionError,
        # an IndexError or a StopIteration when the first batch is requested
        if X.shape[0] != size:
            raise ValueError(f'X has {X.shape[0]} examples, but y has {size}')
        if size == 0:
            raise ValueError('Cannot generate batches from an empty data set')
        if batch_size <= 0:
            raise ValueError(f'batch_size must be positive, got {batch_size}')

        self.batch_size = batch_size
        self.X = X
        self.y = y

        if batch_size > size:
            print('batch_size > size, setting batch_size = size')
            self.batch_size = size

        # Random shuffling of the indices
        self.indices = np.arange(size)
        np.random.shuffle(self.indices)

        self.get_next_indices = self.indices_generator()

    def indices_generator(self):
        """
        Generator which returns indices

        Yields
        ------
        indices : array, shape (batch_size,)
            The (shuffled) indices of the examples of the next complete batch
        """

        # Integer division: an incomplete last batch is left out
        n_batches = self.indices.shape[0] // self.batch_size

        for i in range(n_batches):
            start = i*self.batch_size
            yield self.indices[start:start + self.batch_size]

    def get_batch(self):
        """
        Returns a batch

        Returns
        -------
        X : array, shape (batch_size, n_features)
            The data matrix
        y : array, shape (batch_size, 1)
            The ground truth
        """

        try:
            indices = next(self.get_next_indices)
        except StopIteration:
            # Replenish exhausted generator
            self.get_next_indices = self.indices_generator()
            indices = next(self.get_next_indices)

        X = self.X[indices, :]
        y = self.y[indices, :]
        return X, y

Calculate the number of iterations

In [None]:
# A batch cannot contain more examples than there are in the training set
batch_size = min(batch_size, n_training_ex)

# Only complete batches are counted
iter_per_epoch = n_training_ex // batch_size
total_iter = epoch * iter_per_epoch

Initialize plotting variables

In [None]:
# One loss value per training iteration, for each of the three data splits
training_loss, validation_loss, test_loss = (
    np.zeros(total_iter) for _ in range(3)
)

# The iteration numbers: drives the training loop and is the x-axis of the loss plot
iterations = np.arange(total_iter, dtype=int)

In [None]:
# The session through which all the graph operations below are run
# (it is closed explicitly after the predictions have been made)
sess = tf.Session()

# The variables must be initialized
# https://stackoverflow.com/questions/44299666/when-global-variables-initializer-is-actually-required
sess.run(tf.global_variables_initializer())

# Batch generator over the flattened training images and their binary labels
batch = Batch(X_train_r, y_train_b, batch_size)

### Training

**NOTE**: Calculating the loss is expensive, and is not usually done for each iteration (rather for each epoch, or only for the training batch)

In [None]:
def _make_feed(X_data, y_data):
    """
    Builds the feed dict for a data set

    Parameters
    ----------
    X_data : array, shape (n_examples, n_features)
        The data matrix
    y_data : array, shape (n_examples, 1)
        The ground truth

    Returns
    -------
    feed : dict
        Maps the placeholders to the data, including the column of ones
        (one per example) needed for adding the bias
    """

    return {input_X: X_data,
            input_y: y_data,
            identity: np.ones((y_data.shape[0], 1))}


# The evaluation sets never change, so their feed dicts are built only once
# (instead of being rebuilt in every iteration)
loss_histories_and_feeds = (
    (training_loss, _make_feed(X_train_r, y_train_b)),
    (validation_loss, _make_feed(X_val_r, y_val_b)),
    (test_loss, _make_feed(X_test_r, y_test_b)),
)

for i in iterations:
    print(f'Iteration: {i+1}/{total_iter}', end='\r')
    X, y = batch.get_batch()

    # One optimization step on the current batch
    sess.run(optimizer, _make_feed(X, y))

    # Store the loss on the full train, validation and test sets after this step
    for loss_history, feed in loss_histories_and_feeds:
        loss_history[i] = sess.run(loss, feed)

### Prediction

In [None]:
# The sigmoid output of the trained model for every example of each split.
# Only the features and the column of ones are fed, as the ground truth is
# not needed in order to make predictions
y_pred_train, y_pred_validation, y_pred_test = (
    sess.run(predicted_y,
             {input_X: features,
              identity: np.ones((labels.shape[0], 1))})
    for features, labels in ((X_train_r, y_train_b),
                             (X_val_r, y_val_b),
                             (X_test_r, y_test_b))
)

In [None]:
# Training and prediction are done: close the session opened above
sess.close()

In [None]:
# Casting predictions
y_pred_train_b = (y_pred_train > 0.5).astype(int)
y_pred_validation_b = (y_pred_validation > 0.5).astype(int)
y_pred_test_b = (y_pred_test > 0.5).astype(int)

### Plotting loss

In [None]:
# Calc f-score of the thresholded predictions against the binary ground truth,
# separately for the train, validation and test split
print('Calcualting f-scores')
f_score_train = calc_f_score(y_pred_train_b, y_train_b)
f_score_validation = calc_f_score(y_pred_validation_b, y_val_b)
f_score_test = calc_f_score(y_pred_test_b, y_test_b)

In [None]:
fig, ax = plt.subplots()

# One loss curve per data split, the f-score of the split is shown in the legend
curves = (
    (training_loss, f'Train, f-score={f_score_train:.2}'),
    (validation_loss, f'Validation, f-score={f_score_validation:.2}'),
    (test_loss, f'Test, f-score={f_score_test:.2}'),
)
for loss_values, curve_label in curves:
    ax.plot(iterations, loss_values, label=curve_label)

ax.set(xlabel='Iterations', ylabel='Loss')

ax.grid()

_ = ax.legend(loc='best', fancybox=True, framealpha=0.5)

### Plotting highest and lowest prediction probability

In [None]:
# Row indices of the training examples with the highest and the lowest model output.
# NOTE: The model output is the probability of the label 1, i.e. of a non-zero digit.
# argmax/argmin find the index in a single pass (returning the first occurrence
# on ties), and do not fail with an IndexError should the predictions contain NaN
largest_ind = np.argmax(y_pred_train[:, 0])
lowest_ind = np.argmin(y_pred_train[:, 0])

title_1 = f'Highest prediction probability\n {y_pred_train[largest_ind]}'
title_2 = f'Lowest prediction probability\n {y_pred_train[lowest_ind]}'

fig, (ax1, ax2) = plt.subplots(ncols=2)

for panel, ind, title in ((ax1, largest_ind, title_1),
                          (ax2, lowest_ind, title_2)):
    # The un-flattened image is needed for plotting
    panel.imshow(X_train[ind], cmap="Greys")
    panel.set_title(title)

    # The pixel coordinates carry no information, so the ticks are removed
    panel.xaxis.set_major_locator(plt.NullLocator())
    panel.yaxis.set_major_locator(plt.NullLocator())