# Effect of different initial bias settings for relu

In [None]:
import sys
sys.path.append('../..')
from deep_bottleneck.eval_tools.experiment_loader import ExperimentLoader
from deep_bottleneck.eval_tools.utils import format_config, find_differing_config_keys
import matplotlib.pyplot as plt
from io import BytesIO

import pandas as pd
import numpy as np

In [None]:
# Loader through which the stored experiments are retrieved in the cells below.
loader = ExperimentLoader()

In [None]:
# Ids of the runs that are compared in this notebook.
experiment_ids = [862, 859, 860, 868, 867, 863, 864, 866, 861, 865, 869, 870]
experiments = loader.find_by_ids(experiment_ids)
# Keys passed to format_config to build each panel title; presumably the config
# entries whose values differ between the loaded runs (going by the helper's
# name) - confirm against eval_tools.utils.
differing_config_keys = find_differing_config_keys(experiments)

In [None]:
# 6x2 grid showing the stored 'infoplane_test' image of every experiment,
# each panel titled with the formatted differing config values.
fig, ax = plt.subplots(nrows=6, ncols=2, figsize=(14, 48))
ax = ax.flat

for i, experiment in enumerate(experiments):
    # The artifact content is raw bytes; wrap it so imread can treat it as a file.
    buffer = BytesIO(experiment.artifacts['infoplane_test'].content)
    img = plt.imread(buffer)

    panel = ax[i]
    panel.axis('off')
    panel.imshow(img)

    title = format_config(experiment.config, *differing_config_keys)
    panel.set_title(title, fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# 6x2 grid of training vs. test accuracy curves, one panel per experiment.
fig, ax = plt.subplots(nrows=6, ncols=2, figsize=(12, 21))
ax = ax.flat

for i, experiment in enumerate(experiments):
    train_metric = experiment.metrics['training.accuracy']
    test_metric = experiment.metrics['test.accuracy']

    # Stack both curves as columns; rows are indexed by the test metric's index.
    stacked = np.array([train_metric.values, test_metric.values]).T
    df = pd.DataFrame(data=stacked,
                      index=test_metric.index,
                      columns=['train_acc', 'val_acc'])

    panel = ax[i]
    # Markers only (no connecting line).
    df.plot(linestyle='', marker='.', markersize=5, ax=panel)

    title = format_config(experiment.config, *differing_config_keys)
    panel.set_title(title, fontsize=12)
    panel.set_ylim([0, 1])
    panel.set_xlabel('epoch')
    panel.set_ylabel('accuracy')

plt.tight_layout()
plt.show()

In [None]:
# Same 6x2 layout as the test information-plane grid, but reading the
# 'infoplane_train' artifact of each experiment.
fig, ax = plt.subplots(nrows=6, ncols=2, figsize=(14, 48))
ax = ax.flat

for i, experiment in enumerate(experiments):
    artifact = experiment.artifacts['infoplane_train']
    img = plt.imread(BytesIO(artifact.content))

    panel = ax[i]
    panel.axis('off')
    panel.imshow(img)
    panel.set_title(
        format_config(experiment.config, *differing_config_keys),
        fontsize=16,
    )

plt.tight_layout()
plt.show()

In [None]:
# Single column of 12 panels, each showing one experiment's 'snr_train' image.
fig, ax = plt.subplots(nrows=12, ncols=1, figsize=(14, 48))
ax = ax.flat

for i, experiment in enumerate(experiments):
    image_bytes = experiment.artifacts['snr_train'].content
    img = plt.imread(BytesIO(image_bytes))

    panel = ax[i]
    panel.axis('off')
    panel.imshow(img)

    panel_title = format_config(experiment.config, *differing_config_keys)
    panel.set_title(panel_title, fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# Single column of 12 panels with each experiment's 'activations_train' image.
fig, ax = plt.subplots(nrows=12, ncols=1, figsize=(15, 105))
ax = ax.flat

for i, experiment in enumerate(experiments):
    stream = BytesIO(experiment.artifacts['activations_train'].content)
    img = plt.imread(stream)

    current_axis = ax[i]
    current_axis.axis('off')
    current_axis.imshow(img)

    label = format_config(experiment.config, *differing_config_keys)
    current_axis.set_title(label, fontsize=20)

plt.tight_layout()
plt.show()

## Experiment with max_weight_norm=0.4

In the following we present an example with `relu` in which the norm of the weight vector of each layer is restricted to 0.4.
This is a significantly stronger regularization, which this time also affects the performance of the network.

In [None]:
# Load the single run with id 603 and show its configuration as the cell output.
relu04 = loader.find_by_id(603)
relu04.config

In the infoplane plot below it can be seen that training is impaired for the choice of such strict weight regularization.

In [None]:
# Show the 'infoplane' artifact stored with this run.
relu04.artifacts['infoplane'].show()

In [None]:
# Show the 'activations' artifact stored with this run, with an explicit figure size.
relu04.artifacts['activations'].show(figsize=(12,16))

The activation pattern of several peaks is even more pronounced with a stronger restriction on the size of the weights.

The performance of the network is worse than with a higher weight norm, but the training dynamics still look reasonable: the network learns the task up to a certain accuracy without overfitting.

In [None]:
# Training and validation accuracy of the max_weight_norm=0.4 run in one plot.
# NOTE(review): the comparison cells above read 'test.accuracy'; confirm that
# this run really logs its held-out metric under 'validation.accuracy'.
relu04.metrics['training.accuracy'].plot()
relu04.metrics['validation.accuracy'].plot()
# Axis label spelled consistently with the accuracy grid above (was 'accurcy').
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend()

## Supplementary material

Below we find plots indicating the development of means and standard deviation of the gradient, its signal to noise ratio as well as the norm of the weight vector for all layers over the course of training. Comparing plots for unconstrained vs. constrained weight vector, we can reassure ourselves that rescaling the weights worked as we expected.

In [None]:
# One 'snr' panel per experiment, stacked vertically.
# The grid is sized from the number of loaded experiments: a fixed 4x1 grid
# raises IndexError on the fifth experiment, and twelve ids are loaded above.
# NOTE(review): the earlier SNR cell reads the 'snr_train' artifact; confirm
# that these runs actually store an artifact named 'snr'.
n_panels = len(experiments)
# Keep 5 inches of height per panel (the original 4x1 figure was 16x20).
# squeeze=False keeps `ax` an array even for a single panel, so `.flat` works.
fig, ax = plt.subplots(n_panels, 1, figsize=(16, 5 * n_panels), squeeze=False)
ax = ax.flat

for i, experiment in enumerate(experiments):
    img = plt.imread(BytesIO(experiment.artifacts['snr'].content))
    ax[i].axis('off')
    ax[i].imshow(img)
    ax[i].set_title(format_config(experiment.config, *differing_config_keys),
                    fontsize=20)

plt.tight_layout()
plt.show()

Below we find the configuration of all non-varied parameters that we used for the experiments above.

In [None]:
# Shared configuration of the runs: take one run's config and replace every
# varied key with the placeholder '<var>'.
variable_config_dict = {k: '<var>' for k in differing_config_keys}
# `experiment` is the loop variable left over from the plotting cells above.
# Work on a shallow copy so the placeholders do not overwrite the loaded
# experiment's own config (which would corrupt titles if cells are re-run).
config = dict(experiment.config)
config.update(variable_config_dict)
config