# Spec 0.1

In [88]:
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from sklearn.metrics import classification_report
from torchvision.datasets import KMNIST
from torchvision.transforms import transforms


This is similar to NAS with RL. There, the actions decided hyperparameters; here, the actions decide which weights to go to. Rather than updating weights (such as with backpropagation), we move to certain weights (i.e., detached layers).


<u> States</u>  Continuous state space with each state representing the output of the CNN.

<u> Actions </u> Continuous action space with each action being a probability distribution of which next layer to go to.

<u> Reward </u> Loss from CNN.


### Notes

What information does the gradient give us? How can we use it?
- The partial derivatives tell us the rate of change of the loss due to that weight. It is a vector of slopes. So then we have some interesting pieces of information:
    - the weight 
    - the weight's rate of change
    - the weight's importance (in terms of the partial derivative of the loss w.r.t. this weight)

Using the gradient at each layer
- Keep track of information about the weights by index
- This is one benefit of having the same dimension for the layer/weights in each detached layer
- The trend for a weight can indicate what change we want to make
- But maybe this doesn't work, because each change to a weight only makes sense in context (i.e., given the changes to all the other weights; e.g., a certain change to one weight may be unnecessary if we make some other change to a different weight)

- A connection is that if I knew how a weight was changing, then I could know what to look for (here, what next layer we want because it has that weight at a desired value).

When does the network stop?
- One possibility is in the case of a CNN, it stops when it reaches the dense layer (i.e., when the RL agent chooses that for its action). 

What is the reward for the controller?
- The loss at each layer

What is my state space? Is it continuous or discrete?
- If my states are the detached layers, then one state will have the same value regardless of where it sits in the sequence of layers and which layers preceded it - but of course that information (sequence and composition) is critically important to how the built network performs
- Discrete: Each state is a network (i.e., the network after that action, which was the attachment of some layer)
- Continuous: If each state is a network, how can we approximate networks and use a continuous model?
    - We can represent the network as a vector with something like [number_of_detached_layers, weights]
    - The downside of this approach is there is a set number of layers (set architecture)
    - But this doesn't work either because it doesn't seem to do anything with the data

Do the weights of individual detached layers get updated? When do they get updated?
- TODO

How is the value of the state (so that detached layer in this network, so far) getting updated?
- TODO

Is there an opportunity to use square matrices and the inverse property to engineer backwards a solution? Can we work back from some output to the matrices (weights) we need?
- TODO

In [70]:
# --- optimisation settings ---
INIT_LR = 0.001   # learning rate handed to the optimizer
BATCH_SIZE = 64   # samples per mini-batch
EPOCHS = 10       # number of passes over the training set

# --- dataset partitioning: fraction kept for training, remainder for validation ---
TRAIN_SPLIT = 0.75
VAL_SPLIT = 1 - TRAIN_SPLIT

# --- compute device: use CUDA when it is available, otherwise fall back to the CPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [118]:
# load the MNIST dataset (this is the dataset the code actually requests);
# both splits are downloaded into ./data when missing and converted to tensors
print("[INFO] loading the MNIST dataset...")
trainData = torchvision.datasets.MNIST(root="data", train=True, download=True,
	transform=transforms.ToTensor())
testData = torchvision.datasets.MNIST(root="data", train=False, download=True,
	transform=transforms.ToTensor())

# calculate the train/validation split
print("[INFO] generating the train/validation split...")
numTrainSamples = int(len(trainData) * TRAIN_SPLIT)
# the validation set takes the remainder so the two sizes always sum to
# len(trainData); truncating len * VAL_SPLIT independently can come up one
# sample short (floating-point error in 1 - TRAIN_SPLIT), and random_split
# rejects integer lengths that do not add up to the dataset size
numValSamples = len(trainData) - numTrainSamples
# fixed generator seed keeps the train/validation partition reproducible
(trainData, valData) = torch.utils.data.random_split(trainData,
	[numTrainSamples, numValSamples],
	generator=torch.Generator().manual_seed(42))

[INFO] loading the CIFAR10 dataset...
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100.0%


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz


100.0%

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz
Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100.0%


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz


100.0%

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw

[INFO] generating the train/validation split...





In [122]:
# initialize the train, validation, and test data loaders
# (only the training loader shuffles; validation and test keep dataset order)
trainDataLoader = torch.utils.data.DataLoader(trainData, shuffle=True,batch_size=BATCH_SIZE)
valDataLoader = torch.utils.data.DataLoader(valData, batch_size=BATCH_SIZE)
testDataLoader = torch.utils.data.DataLoader(testData, batch_size=BATCH_SIZE)

# number of batches each loader yields per epoch; len(loader) counts the final
# partial batch, whereas len(dataset) // BATCH_SIZE drops it (and is 0 for a
# dataset smaller than one batch), which skews any per-step loss average
trainSteps = len(trainDataLoader)
valSteps = len(valDataLoader)

In [127]:
class DetachedLayer(nn.Module):
    '''
    Layer that will be chained with other layers by the
    RL agent.

    Internally this is a small CNN classifier: two conv -> ReLU -> 2x2 max-pool
    stages followed by two fully connected layers and a log-softmax over the
    class dimension.

    Args:
        in_channels: number of channels of the input images.
        classes: number of output classes.
        kernel_size: kernel size shared by both convolutions.
        input_size: (height, width) of the input images. It is used only to
            size the first fully connected layer; the default of (28, 28) with
            the default 5x5 kernels yields the original 800 features.
    '''
    def __init__(self, in_channels, classes, kernel_size=(5,5), input_size=(28, 28)):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels=20, kernel_size=kernel_size)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))

        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=kernel_size)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))

        # size fc1 from the actual conv/pool output instead of hard-coding 800,
        # so non-default kernel sizes / input sizes do not fail in forward()
        flat_features = self._count_flat_features(in_channels, input_size)
        self.fc1 = nn.Linear(in_features=flat_features, out_features=500)
        self.relu3 = nn.ReLU()

        self.fc2 = nn.Linear(in_features=500, out_features=classes)  # one output per class
        self.logSoftmax = nn.LogSoftmax(dim=1)

    def _extract_features(self, x):
        '''Run both conv/ReLU/pool stages and flatten everything but the batch dim.'''
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        # start_dim=1 keeps dimension 0 (the batch dimension) intact
        return torch.flatten(x, 1)

    def _count_flat_features(self, in_channels, input_size):
        '''Return the per-sample feature count produced by the conv stages.'''
        # a single all-zero probe image; no gradients and no random numbers are
        # involved, so parameter initialisation is unaffected
        with torch.no_grad():
            probe = torch.zeros(1, in_channels, *input_size)
            return self._extract_features(probe).shape[1]

    def forward(self, x):
        '''
        Map a batch of images to per-class log-probabilities.

        Returns a tensor with one row per input sample and one column per class.
        '''
        x = self._extract_features(x)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        output = self.logSoftmax(x)

        return output


In [128]:
print("[INFO] initializing the model...")
# trainData was produced by random_split, so the class names are read from the
# dataset it wraps (`.dataset`); the network is built first, then moved to `device`
model = DetachedLayer(classes=len(trainData.dataset.classes), in_channels=1)
model = model.to(device)


[INFO] initializing the model...


In [129]:
# negative log-likelihood criterion (the model's forward ends in a log-softmax)
lossFn = nn.NLLLoss()
# Adam over every model parameter, stepping with the configured learning rate
opt = optim.Adam(params=model.parameters(), lr=INIT_LR)

In [134]:
# initialize a dictionary to store training history (one entry per epoch)
H = {
	"train_loss": [],
	"train_acc": [],
	"val_loss": [],
	"val_acc": []
}
print("[INFO] training the network...")
startTime = time.time()
for e in range(0, EPOCHS):
	# ---- training pass ----
	model.train()

	# per-epoch accumulators, reset at the start of every epoch
	totalTrainLoss = 0
	totalValLoss = 0
	trainCorrect = 0
	valCorrect = 0
	for (x, y) in trainDataLoader:
		(x, y) = (x.to(device), y.to(device))
		pred = model(x)
		loss = lossFn(pred, y)

		opt.zero_grad()
		loss.backward()
		opt.step()

		# accumulate a detached value so the running total does not stay
		# attached to every batch's autograd history
		totalTrainLoss += loss.detach()
		trainCorrect += (pred.argmax(1) == y).type(
			torch.float).sum().item()

	# ---- validation pass (no gradients, layers in eval mode) ----
	with torch.no_grad():
		model.eval()
		for (x, y) in valDataLoader:
			(x, y) = (x.to(device), y.to(device))
			pred = model(x)
			totalValLoss += lossFn(pred, y)
			valCorrect += (pred.argmax(1) == y).type(
				torch.float).sum().item()

	# ---- per-epoch statistics ----
	# losses are per-batch means, so average over the number of batches;
	# accuracies are correct-prediction counts over the number of samples
	# (max(..., 1) avoids a division by zero for an empty split)
	avgTrainLoss = float(totalTrainLoss) / max(len(trainDataLoader), 1)
	avgValLoss = float(totalValLoss) / max(len(valDataLoader), 1)
	trainAcc = trainCorrect / max(len(trainDataLoader.dataset), 1)
	valAcc = valCorrect / max(len(valDataLoader.dataset), 1)

	# update our training history
	H["train_loss"].append(avgTrainLoss)
	H["train_acc"].append(trainAcc)
	H["val_loss"].append(avgValLoss)
	H["val_acc"].append(valAcc)

	print("[INFO] EPOCH: {}/{}".format(e + 1, EPOCHS))
	print("Average train loss: {:.5f}, Average train accuracy: {:.4f}".format(
		avgTrainLoss, trainAcc))
	print("Average validation loss: {:.5f}, Average validation accuracy: {:.4f}\n".format(
		avgValLoss, valAcc))

endTime = time.time()
print("[INFO] total time taken to train the model: {:.2f}s".format(
	endTime - startTime))
print("[INFO] evaluating network...")
with torch.no_grad():
	model.eval()
	
	# collect the predicted class index for every test sample, in loader order
	preds = []
	for (x, y) in testDataLoader:
		x = x.to(device)
		
		pred = model(x)
		preds.extend(pred.argmax(axis=1).cpu().numpy())

# the test loader does not shuffle, so preds line up with testData.targets
print(classification_report(testData.targets.cpu().numpy(),
	np.array(preds), target_names=testData.classes))

[INFO] training the network...


In [133]:
	# NOTE(review): every statement in this cell is indented one level but the
	# cell has no enclosing statement, so Python rejects it as written when run
	# on its own. It looks like the per-epoch bookkeeping of the epoch loop —
	# confirm where it is meant to live. None of the names read here
	# (totalTrainLoss, totalValLoss, trainCorrect, valCorrect, trainSteps,
	# valSteps, H, the data loaders) are defined in this cell.
	# calculate the average training and validation loss
	avgTrainLoss = totalTrainLoss / trainSteps
	avgValLoss = totalValLoss / valSteps

	# calculate the training and validation accuracy
	# (the counters are overwritten by their own fraction, so running these two
	# lines twice on the same values divides twice)
	trainCorrect = trainCorrect / len(trainDataLoader.dataset)
	valCorrect = valCorrect / len(valDataLoader.dataset)

	# update our training history
	# .cpu().detach().numpy() requires the averages to be tensors; a plain
	# Python number has no .cpu() — NOTE(review): verify the validation total is
	# actually accumulated as a tensor before this runs
	H["train_loss"].append(avgTrainLoss.cpu().detach().numpy())
	H["train_acc"].append(trainCorrect)
	H["val_loss"].append(avgValLoss.cpu().detach().numpy())
	H["val_acc"].append(valCorrect)

	# report the statistics computed above for this pass
	print("Average train loss: {:.5f}, Average train accuracy: {:.4f}".format(
		avgTrainLoss, trainCorrect))
	print("Average validation loss: {:.5f}, Average validation accuracy: {:.4f}\n".format(
		avgValLoss, valCorrect))

[INFO] EPOCH: 9/10
Train loss: 0.00514, Train accuracy: 0.0000
Val loss: 0.00000, Val accuracy: 0.0000

