# Neural Networks

A **neural network** is a data processing pipeline composed of layers. Each layer transforms its input into a new representation, and the layers are chained until the final representation closely resembles the desired output.

**Gradient descent** is the optimization technique that powers neural networks:

* A prediction is made on a training example.

* The **loss function** compares the prediction to the actual target, producing a **loss value**: a measure of how well the model's prediction matches what was expected.

* A **backward pass** computes the gradient of the loss with respect to the model parameters, which describes how the loss varies as you move the model's coefficients in different directions. You can use this gradient to move the coefficients all at once in a direction that decreases the loss.

* The **optimizer** uses this gradient (computed by the **backpropagation algorithm**) to update the model's weights. For example, the optimizer might subtract learning_rate * gradient from each weight (the **learning rate** represents the "speed" of the gradient descent process).

* This process is usually performed in randomly selected batches, so it is called **mini-batch stochastic gradient descent**. Running the process on all the data at once would be more accurate but more expensive.

TensorFlow is capable of automatic differentiation (autodifferentiation) using `tf.GradientTape`.


## Example of Autodifferentiation with TensorFlow

In [1]:
import tensorflow as tf
# A trainable scalar, initialised to zero.
x = tf.Variable(0.0)

# Operations executed inside the tape's scope are recorded so that they can
# be differentiated afterwards.
with tf.GradientTape() as tape:
    y = 3 + 2 * x

# dy/dx for y = 2x + 3 (analytically, this is 2).
grad_of_y_wrt_x = tape.gradient(y, x)

2023-11-22 11:44:53.832101: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-22 11:45:01.777630: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Naive Implementation of a Neural Network

In [2]:
# Dense Layer
class NaiveDense:
    """A minimal fully connected layer: activation(inputs @ W + b)."""

    def __init__(self, input_size, output_size, activation):
        """Create the layer's kernel and bias variables.

        Args:
            input_size: Number of rows of the kernel matrix W.
            output_size: Number of columns of W and length of the bias b.
            activation: Callable applied to the result of ``inputs @ W + b``.
        """
        self.activation = activation

        # Kernel: drawn uniformly from [0, 0.1).
        self.W = tf.Variable(
            tf.random.uniform((input_size, output_size), minval=0, maxval=1e-1)
        )

        # Bias: starts at zero.
        self.b = tf.Variable(tf.zeros((output_size,)))

    def __call__(self, inputs):
        """Apply the layer to ``inputs`` and return the activated output."""
        affine = tf.matmul(inputs, self.W) + self.b
        return self.activation(affine)

    @property
    def weights(self):
        """The layer's variables, kernel first, then bias."""
        return [self.W, self.b]

In [3]:
# Neural Network
class NaiveSequential:
    """Chains layers so that each layer's output feeds the next layer."""

    def __init__(self, layers):
        """Store ``layers``, an ordered collection of callables exposing ``.weights``."""
        self.layers = layers

    def __call__(self, inputs):
        """Run ``inputs`` through every layer in order and return the final output."""
        activations = inputs
        for layer in self.layers:
            activations = layer(activations)
        return activations

    @property
    def weights(self):
        """A new flat list of every layer's weights, in layer order."""
        return [weight for layer in self.layers for weight in layer.weights]

In [4]:
# Optimizer
# Step size used by update_weights for every update.
learning_rate = 1e-3


def update_weights(gradients, weights):
    """Move each weight against its gradient by ``learning_rate * gradient``.

    Args:
        gradients: Iterable of gradients, paired positionally with ``weights``.
        weights: Iterable of objects exposing ``assign_sub`` (the in-place
            ``-=`` of TensorFlow variables).
    """
    for grad, weight in zip(gradients, weights):
        step = grad * learning_rate
        weight.assign_sub(step)

In [5]:
# Stochastic Gradient Descent (One Step)
def one_training_step(model, images_batch, labels_batch):
    """Run one gradient-descent update on a single batch.

    Args:
        model: Callable producing predictions for a batch; must expose
            ``weights``.
        images_batch: Inputs for this batch, passed straight to ``model``.
        labels_batch: Targets for this batch, passed as ``y_true`` to
            ``sparse_categorical_crossentropy``.

    Returns:
        The mean loss over the batch, computed before the weights are updated.
    """
    with tf.GradientTape() as tape:
        # Forward pass, recorded by the tape.
        batch_predictions = model(images_batch)

        # One loss value per example, averaged into a single scalar.
        average_loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels_batch, batch_predictions
            )
        )

    # Backward pass: d(average_loss) / d(weight) for every model weight.
    grads = tape.gradient(average_loss, model.weights)

    # Nudge each weight in the direction that decreases the loss.
    update_weights(grads, model.weights)

    return average_loss

In [6]:
# Batch Generator
import math

class BatchGenerator:
    """Hands out consecutive, fixed-size slices of a paired images/labels dataset."""

    def __init__(self, images, labels, batch_size=128):
        """Remember the dataset and work out how many batches cover it.

        Args:
            images: Sliceable collection of inputs.
            labels: Sliceable collection of targets, same length as ``images``.
            batch_size: Maximum number of examples per batch.
        """
        # Every image needs exactly one label.
        assert len(images) == len(labels)

        self.images, self.labels = images, labels
        self.batch_size = batch_size
        self.index = 0  # start position of the next batch
        # Round up so that a final, shorter batch is still counted.
        self.num_batches = math.ceil(len(images) / batch_size)

    def next(self):
        """Return the next ``(images, labels)`` batch and advance the cursor."""
        window = slice(self.index, self.index + self.batch_size)
        batch = self.images[window], self.labels[window]
        self.index = window.stop
        return batch

In [7]:
# Fit (Training)
def fit(model, images, labels, epochs, batch_size=128):
    """Train ``model`` with mini-batch gradient descent.

    Args:
        model: Passed unchanged to ``one_training_step`` with each batch.
        images: Training inputs; split into batches by ``BatchGenerator``.
        labels: Training targets, same length as ``images``.
        epochs: Number of full passes over the training data.
        batch_size: Number of examples per mini-batch.
    """
    # For each epoch (pass through the whole training dataset)
    for epoch_counter in range(epochs):

        print(f"Epoch {epoch_counter}")

        # A fresh generator per epoch restarts slicing at the first example.
        # batch_size must be forwarded explicitly; otherwise the generator
        # falls back to its own default and the caller's value is ignored.
        batch_generator = BatchGenerator(images, labels, batch_size)

        # Batches are consecutive slices in dataset order
        # (BatchGenerator does not shuffle).
        for batch_counter in range(batch_generator.num_batches):

            images_batch, labels_batch = batch_generator.next()

            # Forward pass, loss, backward pass and weight update
            loss = one_training_step(model, images_batch, labels_batch)

            # Report progress every 100 batches
            if batch_counter % 100 == 0:

                print(f"loss at batch {batch_counter}: {loss:.2f}")

## Example of a Neural Network model using the Naive Implementation

In [8]:
# Example model definition
# Two dense layers: 784 inputs -> 512 hidden units (ReLU) -> 10 outputs (softmax).
model = NaiveSequential(
    layers=[
        NaiveDense(28 * 28, 512, tf.nn.relu),
        NaiveDense(512, 10, tf.nn.softmax),
    ]
)
# Each dense layer contributes a kernel and a bias.
assert len(model.weights) == 4

In [9]:
# Train the example model on the MNIST image dataset
from tensorflow.keras.datasets import mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Flatten each 28x28 image into a 784-element row, convert to float32 and
# divide by 255 to rescale the pixel values.
train_images = train_images.reshape((60000, 28 * 28)).astype("float32") / 255
test_images = test_images.reshape((10000, 28 * 28)).astype("float32") / 255

# Ten passes over the training set in mini-batches.
fit(model, train_images, train_labels, epochs=10, batch_size=128)

Epoch 0
loss at batch 0: 5.08
loss at batch 100: 2.25
loss at batch 200: 2.23
loss at batch 300: 2.11
loss at batch 400: 2.27
Epoch 1
loss at batch 0: 1.96
loss at batch 100: 1.90
loss at batch 200: 1.85
loss at batch 300: 1.76
loss at batch 400: 1.88
Epoch 2
loss at batch 0: 1.64
loss at batch 100: 1.60
loss at batch 200: 1.53
loss at batch 300: 1.46
loss at batch 400: 1.55
Epoch 3
loss at batch 0: 1.38
loss at batch 100: 1.37
loss at batch 200: 1.27
loss at batch 300: 1.24
loss at batch 400: 1.30
Epoch 4
loss at batch 0: 1.17
loss at batch 100: 1.19
loss at batch 200: 1.07
loss at batch 300: 1.07
loss at batch 400: 1.12
Epoch 5
loss at batch 0: 1.02
loss at batch 100: 1.05
loss at batch 200: 0.92
loss at batch 300: 0.95
loss at batch 400: 0.99
Epoch 6
loss at batch 0: 0.91
loss at batch 100: 0.94
loss at batch 200: 0.82
loss at batch 300: 0.85
loss at batch 400: 0.90
Epoch 7
loss at batch 0: 0.82
loss at batch 100: 0.85
loss at batch 200: 0.73
loss at batch 300: 0.78
loss at batch 40

In [10]:
# Evaluation of the example model
import numpy as np
# Forward pass over the whole test set, converted to a NumPy array.
predictions = model(test_images).numpy()
# Predicted class = column index of the largest score in each row.
predicted_labels = predictions.argmax(axis=1)
# Element-wise comparison with the true labels; the mean is the hit rate.
matches = test_labels == predicted_labels
print(f"accuracy: {matches.mean():.2f}")

accuracy: 0.82


## Example of a Neural Network model using TensorFlow

In [11]:
from tensorflow import keras
from tensorflow.keras import layers

In [12]:
# Example model definition
# Same architecture as the naive model, built layer by layer.
model = keras.Sequential()
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))

# Configure the optimizer, the loss to minimise and the metric to report.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [13]:
# Train the example model on the MNIST image dataset
from tensorflow.keras.datasets import mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Flatten each 28x28 image into a 784-element row, convert to float32 and
# divide by 255 to rescale the pixel values.
train_images = train_images.reshape((60000, 28 * 28)).astype("float32") / 255
test_images = test_images.reshape((10000, 28 * 28)).astype("float32") / 255

# Five passes over the training set in mini-batches of 128.
model.fit(train_images, train_labels, epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb9a0cdc250>

In [14]:
# Evaluation of the example model
import numpy as np
# Forward pass over the whole test set, converted to a NumPy array.
predictions = model(test_images).numpy()
# Predicted class = column index of the largest score in each row.
predicted_labels = predictions.argmax(axis=1)
# Element-wise comparison with the true labels; the mean is the hit rate.
matches = test_labels == predicted_labels
print(f"accuracy: {matches.mean():.2f}")

accuracy: 0.98


# Neural Networks for Regression

- Mean squared error (MSE) is a loss function commonly used for regression.
- Mean absolute error (MAE) is a common metric used for regression.
- When features in the input data have values in different ranges, each feature should be scaled independently as a preprocessing step.
- When there is little data available:
    - Using K-fold validation is a great way to reliably evaluate a model.
    - It is preferable to use a small model with few intermediate layers (typically only one or two) in order to avoid severe overfitting.

## Regression Use Cases

### Predicting house prices: A regression example

(Predict a continuous value instead of a discrete variable.)

In [None]:
from tensorflow.keras.datasets import boston_housing
(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()
# train_data.shape == (404, 13): 404 training examples, each with 13 numerical features (e.g. crime rate)
# test_data.shape == (102, 13): 102 test examples, each with 13 numerical features (e.g. crime rate)

# Prepare the data (normalize)
# Per-feature statistics are taken from the training split only.
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)

# Standardise both splits in place with the training statistics.
for split in (train_data, test_data):
    split -= mean
    split /= std

# It would be problematic to feed into a neural network values that all take wildly different ranges. 
# The model might be able to automatically adapt but it would make learning more difficult. 
# A common best practice is to normalize the data — that is, to ensure that all values are centered around 0 and have a standard deviation of 1.
# normalization = (x - mean) / std
# Note that the quantities used for normalizing the test data are computed using the training data.

In [None]:
# Build the model
def build_model():
    """Return a freshly compiled Keras model for scalar regression."""
    # Two 64-unit ReLU hidden layers followed by a single linear output unit.
    # The output has no activation (the typical setup for scalar regression):
    # an activation function would constrain the range the output can take.
    regressor = keras.Sequential(
        [layers.Dense(64, activation="relu") for _ in range(2)]
        + [layers.Dense(1)]
    )

    # loss "mse": mean squared error, the square of the difference between
    #   the predictions and the targets (widely used for regression).
    # metric "mae": mean absolute error, the absolute value of that difference.
    regressor.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
    return regressor

In [None]:
# Because the dataset is small, the validation score can change a lot depending on which examples we choose to use for validation and which we choose for training.
# The best practice in such situations is to use K-fold cross-validation.
# It consists of splitting the available data into K partitions (typically K=4 or 5),
# instantiating K identical models, and training each one on K-1 partitions while evaluating on the remaining partition.
# The validation score for the model used is then the average of the K validation scores obtained.

# K-fold cross-validation
import numpy as np
k = 4
num_val_samples = len(train_data) // k
num_epochs = 100
all_scores = []

for fold in range(k):

    print(f"processing fold #{fold}")

    # Boundaries of this fold's validation slice
    val_start = num_val_samples * fold
    val_stop = val_start + num_val_samples

    # Held-out partition for this fold
    val_data = train_data[val_start:val_stop]
    val_targets = train_targets[val_start:val_stop]

    # Training data: everything before and after the validation slice
    partial_train_data = np.concatenate(
        [train_data[:val_start], train_data[val_stop:]], axis=0
    )
    partial_train_targets = np.concatenate(
        [train_targets[:val_start], train_targets[val_stop:]], axis=0
    )

    # A fresh, already compiled model for each fold
    model = build_model()

    # Train silently (verbose=0)
    model.fit(
        partial_train_data,
        partial_train_targets,
        epochs=num_epochs,
        batch_size=1,
        verbose=0,
    )

    # Score on the held-out partition and keep the MAE
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)

print(f"all_scores: {all_scores}")

In [None]:
# Average of the per-fold validation MAE values collected in all_scores.
np.mean(all_scores)