# Homework 2

In [1]:
from fastai.text.all import *

# import pytorch

## Porblem 1 (25 ponits): Deep Learning Basics

### Note: You can only use tensors for this problem.

In [None]:
path = untar_data(URLs.MNIST)
Path.BASE_PATH = path
path.ls()

In [None]:
(path/'training').ls().sorted()

In [None]:
(path/'testing').ls().sorted()

### 1a (10 points). Create a 'typical' digit for each by averaging and classify the test data with them. Report your overall accuracy. 

In [None]:
def mnist_distance(a, b): return (a - b).abs().mean((-1, -2))

def is_digit(x, mean_digit, other_mean_digit): return mnist_distance(x,mean_digit) < mnist_distance(x,other_mean_digit)

In [None]:
# train
digits = ['1', '2', '3', '4', '5', '6', '7', '8', '9']

mean_representations = {}

for digit in digits:
    paths = (path/'training'/digit).ls().sorted()

    digit_tensors = [tensor(Image.open(o)) for o in paths]

    stacked_digits = torch.stack(digit_tensors).float() / 255

    mean_rep = stacked_digits.mean(0)

    mean_representations[digit] = mean_rep

In [None]:
# evaluate
accs = []
for digit in tqdm(digits):
    paths = (path/'testing'/digit).ls().sorted()

    digit_tensors = torch.stack([tensor(Image.open(o)) for o in paths])

    mean_digit = mean_representations[digit]

    final_preds = (torch.zeros(digit_tensors.shape[0], dtype=torch.bool) + 1).bool()
    for other_digit in digits:

        if other_digit == digit: continue

        mean_other_digit = mean_representations[other_digit]

        preds = is_digit(digit_tensors, mean_digit, mean_other_digit)

        final_preds = torch.logical_and(preds, final_preds)
    
    acc = final_preds.float().mean()
    accs.append(acc)
    print("Accuracy for digit '{}':\t{}".format(digit,acc))
print("Overall ACC: {}".format(tensor(accs).float().mean()))


### 1b (10 points). Create a NN to classify digits (trained with regular SGD). Note that:
- Since you deal with 10 categories here, it is preferrable to use cross entropy loss.
- Again, anything other than tensors are not allowed in this problem so build your NN from scratch.

In [None]:
import torch
from fastai.vision.all import *

In [None]:
class Flatten(Transform):
    def encodes(self, x: PIL.Image.Image):
        return tensor(np.array(x).flatten())

# Create the DataBlock with the Flatten transform
datablock = DataBlock(blocks=(ImageBlock(cls=PILImageBW), CategoryBlock),
                      get_items=get_image_files,
                      splitter=RandomSplitter(),
                      get_y=parent_label,
                      item_tfms=[Resize(28, method=ResizeMethod.Squish, resamples=(1, 1)), 
                                 Flatten(),
                                 ToTensor(),
                                    Normalize.from_stats(*imagenet_stats)
                                ])

# Create the DataLoaders with batch size of 64
dls = datablock.dataloaders(path, bs=64)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
dls.to(torch.device(device))

In [45]:
class MyNN:
    def __init__(self, layers, lr, epochs, optim='sgd', gamma=0.9) -> None:
        # self.input_size = input_size
        # self.hidden_size = hidden_size
        # self.output_size = output_size

        self.activation = torch.nn.ReLU()
        self.loss = torch.nn.CrossEntropyLoss()

        self.lr = lr
        self.epochs = epochs    

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        self.soft_max = torch.nn.Softmax(dim=1)

        self.eps = 1e-6

        if optim == 'sgd':
            self.opt = self.sgd
        elif optim == 'sgd_mom':
            self.opt = self.sgd_momentum
        elif optim == 'rms':
            self.opt = self.rms_prop
        self.gamma = gamma

        # Create random weight tensors for input to hidden and hidden to output layers
        # self.weights_input_hidden = torch.randn(input_size, hidden_size, requires_grad=True, device=self.device, dtype=torch.float64)
        # self.weights_hidden_output = torch.randn(hidden_size, output_size, requires_grad=True, device=self.device, dtype=torch.float64)

        self.weights = {}
        self.n_layers = len(layers)
        for i in range(len(layers)):
            self.weights['w'+str(i)] = torch.randn(layers[i][0], layers[i][1], requires_grad=True, device=self.device, dtype=torch.float64) 

            if optim == 'sgd_mom' or optim == 'rms':
                self.weights['v'+str(i)] = torch.zeros_like(self.weights['w'+str(i)])

    def train(self, dls):
        for i in range(self.epochs):
            losses = []
            accs = []
            print('Starting epoch: {}'.format(i))
            for batch in dls.train:
                data, labels = batch[0], batch[1]
                data = tensor(data, dtype=torch.float32, device=self.device, requires_grad=True)
                labels = tensor(labels, dtype=torch.uint8, device=self.device,  requires_grad=True)

                data = data.to(torch.float64)
                labels = labels.to(torch.uint8)
                x = self.forward(data)

                loss = self.cross_entropy_loss(x, labels)
                
                loss.backward()

                losses.append(loss.detach().float())


                accs.append(self.accuracy(x, labels))

                with torch.no_grad():
                    self.opt()

            print('\t loss = {}'.format(tensor(loss).float().mean()))
            print('\t acc = {}'.format(tensor(accs).float().mean()))

    def accuracy(self, preds, labels):
        """sumary_line
        
        Keyword arguments:
        labels -- list of integer labels
        preds -- list of float probabilities for each label per data point
        Return: accuract of preds
        """
        # s = self.soft_max(preds)
        s = self.soft_max(preds)
        preds = torch.argmax(s, dim=1)
        return (preds == labels).float().mean()
    
    def eval(self, dls):
        accs = []
        with torch.no_grad:
            for batch in dls.test:
                data, labels = batch[0], batch[1]
                data = tensor(data, dtype=torch.float32, device=self.device)
                labels = tensor(labels, dtype=torch.uint8, device=self.device)

                data = data.to(torch.float64)
                labels = labels.to(torch.uint8)
                x = self.forward(data)

                preds = torch.argmax(x, dim=1)
                acc = (preds == labels).float().mean()
                accs.append(acc)
        print('Accuracy: {}'.format(tensor(accs).float().mean()))

    def sgd(self):
        for i in range(self.n_layers):
            self.weights['w'+str(i)] -= self.lr * self.weights['w'+str(i)].grad
            # self.weights['b'+str(i)] -= self.lr * self.weights['b'+str(i)].grad

    def sgd_momentum(self):
        for i in range(self.n_layers):
            self.weights['v'+str(i)] = self.gamma * self.weights['v'+str(i)] - self.lr * self.weights['w'+str(i)]
            self.weights['w'+str(i)] += self.weights['v'+str(i)]
            
    def rms_prop(self):
        for i in range(self.n_layers):
            self.weights['v'+str(i)] = self.gamma * self.weights['v'+str(i)] + (1 - self.gamma) * self.weights['w'+str(i)].grad**2
            self.weights['w'+str(i)] += (self.lr * self.weights['w'+str(i)].grad) / (torch.sqrt(self.weights['v'+str(i)]+ self.eps) )

    def forward(self, batch):
        x = torch.matmul(batch, self.weights['w'+str(0)])
        # x = self.sigmoid_activation(x)
        x = self.activation(x)

        # print(self.weights['w'+str(0)].is_leaf)
        for i in range(1, self.n_layers):
            x = torch.matmul(x, self.weights['w'+str(i)])
            if i != self.n_layers-1:
                # x = self.sigmoid_activation(x)
                x = self.activation(x)
        # print(self.weights['w'+str(i)].is_leaf)
            
        return x

    def sigmoid_activation(self, x):
        return 1 / (1 + torch.exp(-x + self.eps))
    
    def cross_entropy_loss(self, predictions, targets):
        batch_size = predictions.size(0)
        # s = self.soft_max(predictions)
        s = self.soft_max(predictions)

        log_liklihood = -torch.log(s[range(batch_size), targets.tolist()] + self.eps)
        loss = log_liklihood.sum()
        return loss

In [46]:
torch.autograd.set_detect_anomaly(True)
model = MyNN([(784, 5), (5, 10)], lr=0.01, epochs=10)
model.train(dls)
model.eval(dls)

Starting epoch: 0
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


KeyboardInterrupt: 

In [None]:
model = MyNN([784, 256, 128, 64, 10], lr=0.01, epochs=10, optim='sgd_mom')
model.train(dls)
model.eval(dls)

In [None]:
model = MyNN([784, 256, 128, 64, 10], lr=0.01, epochs=30, optim='rms')
model.train(dls)
model.eval(dls)

#### Cant use torch ig

In [None]:
model = nn.Sequential(
    nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Flatten(),
    nn.Linear(64 * 7 * 7, 128),
    nn.ReLU(),
    nn.Linear(128, 10)
)

model.to(torch.device('cuda'))

In [None]:
learn = Learner(dls, model, opt_func=SGD, loss_func=nn.CrossEntropyLoss(), metrics=accuracy)
learn.fit(5)


### 1c (5 points). Implement your own:
- SGD with Momemtum
- RMSProp

by using two more tensors to tracked the smoothed gradients and L2 norms, respectively. Re-train your model on them and report performance.

#### SGD W/ Momentum 

Code selected from NN class code above:

```

def fit_sgd_momentum(self, dls, momentum=0.9):
        velocity_input = torch.zeros_like(self.weights_input_hidden)
        velocity_output = torch.zeros_like(self.weights_hidden_output)

        overall_loss = []
        for i in range(self.epochs):
            losses = []
            print('Starting epoch: {}'.format(i))
            for batch in dls.train:
                data, labels = batch[0], batch[1]
                data = tensor(data, dtype=torch.float32, device=self.device, requires_grad=True)
                labels = tensor(labels, dtype=torch.uint8, device=self.device,  requires_grad=True)

                data = data.to(torch.float64)
                labels = labels.to(torch.uint8)
                x = self.forward(data)

                loss = self.cross_entropy_loss(x, labels)
                # self.loss(x, labels)
                losses.append(loss.detach().float())
                overall_loss.append(loss.detach().float())
                loss.backward()
                with torch.no_grad():
                    velocity_input = momentum * velocity_input - self.lr * self.weights_input_hidden.grad
                    velocity_output = momentum * velocity_output - self.lr * self.weights_hidden_output.grad

                    self.weights_input_hidden += velocity_input
                    self.weights_hidden_output += velocity_output

            print('\t loss = {}'.format(tensor(loss).float().mean()))
        print('\t overall loss = {}'.format(tensor(overall_loss).float().mean()))

```

#### RMSProp

Code selected from NN class code above:

```
def fit_rms_prop(self, dls, decay_rate=0.9):
        cache_input = torch.zeros_like(self.weights_input_hidden)
        cache_output = torch.zeros_like(self.weights_hidden_output)

        overall_loss = []
        for i in range(self.epochs):
            losses = []
            print('Starting epoch: {}'.format(i))
            for batch in dls.train:
                data, labels = batch[0], batch[1]
                data = tensor(data, dtype=torch.float32, device=self.device, requires_grad=True)
                labels = tensor(labels, dtype=torch.uint8, device=self.device,  requires_grad=True)

                data = data.to(torch.float64)
                labels = labels.to(torch.uint8)
                x = self.forward(data)

                loss = self.cross_entropy_loss(x, labels)
                # self.loss(x, labels)
                losses.append(loss.detach().float())
                overall_loss.append(loss.detach().float())
                loss.backward()
                with torch.no_grad():
                    cache_input = decay_rate * cache_input + (1 - decay_rate) * self.weights_input_hidden.grad
                    cache_output = decay_rate * cache_output + (1 - decay_rate) * self.weights_hidden_output.grad

                    self.weights_input_hidden -= (self.lr * self.weights_input_hidden.grad) / (torch.sqrt(cache_input) + self.eps)
                    self.weights_hidden_output -= (self.lr * self.weights_hidden_output.grad) / (torch.sqrt(cache_output) + self.eps)

            print('\t loss = {}'.format(tensor(loss).float().mean()))
        print('\t overall loss = {}'.format(tensor(overall_loss).float().mean()))
```

## Problem 2 (25 points): Getting Your Data Ready For a NN Language Model

In [2]:
path = untar_data(URLs.HUMAN_NUMBERS)
Path.BASE_PATH = path
path.ls()

(#2) [Path('train.txt'),Path('valid.txt')]

In [3]:
lines = [line for p in path.ls()[::-1] for line in p.read_text().split(' \n')]
text = ' . '.join([l for l in lines])
text[:1000]

'eight thousand one . eight thousand two . eight thousand three . eight thousand four . eight thousand five . eight thousand six . eight thousand seven . eight thousand eight . eight thousand nine . eight thousand ten . eight thousand eleven . eight thousand twelve . eight thousand thirteen . eight thousand fourteen . eight thousand fifteen . eight thousand sixteen . eight thousand seventeen . eight thousand eighteen . eight thousand nineteen . eight thousand twenty . eight thousand twenty one . eight thousand twenty two . eight thousand twenty three . eight thousand twenty four . eight thousand twenty five . eight thousand twenty six . eight thousand twenty seven . eight thousand twenty eight . eight thousand twenty nine . eight thousand thirty . eight thousand thirty one . eight thousand thirty two . eight thousand thirty three . eight thousand thirty four . eight thousand thirty five . eight thousand thirty six . eight thousand thirty seven . eight thousand thirty eight . eight thou

### 2a (5 points). Split **text** by space to get **tokens** and create a **vocab** with unique **tokens**.

In [55]:
def tokenize_by_space(text):
    return text.split(' ')

def create_vocab_from_tokens(tokens):
    # vocab = {token for i, token in enumerate(set(tokens))}
    # return list(vocab)
    return L(*tokens).unique()

tokens = tokenize_by_space(text)
vocab = create_vocab_from_tokens(tokens)

In [56]:
vocab

(#31) ['eight','thousand','one','.','two','three','four','five','six','seven'...]

### 2b (5 points). Create a mapping from word to indices and convert **tokens** into **nums**, which is a list of indices.

In [58]:
def create_word2idx(vocab):
    return {token: idx for idx, token in enumerate(vocab)}

def convert_tokens_to_ids(tokens, word2idx):
    return L(word2idx[i] for i in tokens)

word2idx = create_word2idx(vocab)
nums = convert_tokens_to_ids(tokens, word2idx)

### 2c (5 points). Create a _dataset_ where each tuple is the input and output of a NN language model. The sequence length of inputs and outputs should be 32. Then create a _training set_ with the first 80% data and a _validation set_ with the rest.

In [77]:
sl = 32

def create_tuples(nums):
    return L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))

def train_test_split(sequences):
    train_size = int(len(sequences) * 0.8)
    train, test = sequences[:train_size], sequences[train_size:]
    return train, test

In [78]:
dset = create_tuples(nums)
# dset
train_ds, valid_ds = train_test_split(dset)

### 2d (10 points). Reorder your training set and validation set to be ready for _dataloaders_ to be used in a NN language model. Note that:
- `m = len(dset) // 64` batches will be created in the new order without shuffling (the first 64 rows will be the first batch, the next 64 rows will be the second batch, and so on).
- The new first 64 rows should be the `1st, (1+m)-th, (1+2m)-th, ..., (1+64m)-th` rows in the original corresponding dataset; The next 64 rows should be the `2nd, (2+m)-th, (2+2m)-th, ..., (2+64m)-th` rows; and so on.

In [79]:
def reorder_dataset(dataset):
    m = len(dataset) // 64
    new_dset = []
    for i in range(64):
        for j in range(m):
            t = dataset[i+j*64]
            new_dset.append(t)
            # new_dset.append(dataset[i+j*64])
    # print(torch.stack(new_dset).shape)
    # print(torch.tensor(new_dset).shape)
    # return torch.stack(new_dset)
    return new_dset


In [80]:
train_ds_reordered = reorder_dataset(train_ds)
valid_ds_reordered = reorder_dataset(valid_ds)

In [81]:
dls = DataLoaders.from_dsets(
    train_ds_reordered,
    valid_ds_reordered,
    bs=64, drop_last=True, shuffle=False
)

## Problem 3 (25 points): Simple NN Language Model

### 3a (15 points). The stacked/unrolled representation depict the same 2-layer RNN. Implement this RNN with only:
- torch.tensor
- tensor functions (from torch.tensor or torch)
- torch.nn.Embedding
- torch.nn.Linear
- torch.nn.relu

<img alt="2-layer RNN" width="550" caption="2-layer RNN" src="./att_00025.png">

<img alt="2-layer unrolled RNN" width="550" caption="Two-layer unrolled RNN" src="./att_00026.png">

In [91]:
class ACoolRNN(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)     
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
        
    def forward(self, x):
        outs = []
        for i in range(seq_len):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self): self.h = 0

def loss_func(inp, targ):
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

### 3b (10 points). Find the best learning rate and train your NN with 1cycle policy and Adam (using _Learner_ class from fastai). Report your result with a reasonable metric.

In [92]:
# Find the best learning rate and train your NN with 1cycle policy and Adam (using Learner class from fastai). Report your result with a reasonable metric

learner = Learner(dls, ACoolRNN(len(vocab), 64), loss_func=loss_func, metrics=accuracy, opt_func=Adam, cbs=ModelResetter)
# learner = Learner(dls, TwoLayerRNN(seq_len, 512, seq_len), loss_func=nn.CrossEntropyLoss(), metrics=accuracy, opt_func=Adam)
learner.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time


ValueError: Expected input batch_size (2048) to match target batch_size (64).

## Problem 4 (25 points):

### 4a (10 points). Reuse the IMDB reviews dataset from Homework 1. Tokenize (experiment with the vocab size yourself) it with the subword tokenizer you developed in Homework 1 and create _Dataloaders_ in the similar way of Problem 2 to get ready for a LSTM Language Model.

### 4b (10 points). Implement a 2-layer LSTM Language Model that has:
- One dropout layer (with 50% chance)
- Weight tying between the input and output embedding layers
- Activation regularization
- Temporal activation regularization

You can use anything, such as the LSTM module from pytorch.

In [None]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p_dropout, bs):

    def reset(self): 
        

In [None]:
class MyRNNRegularizer(Callback):
    def __init__(self, alpha=0., beta=0.): self.alpha, self.beta = alpha, beta

    # This is called after the forward pass of the model
    def after_pred(self):
        # Store inside this class the raw output of the model and the dropped out output, respectively 
        self.raw, self.out = 
        # Modify the model's output to be just the activation of the last layer
        self.learn.pred = 

    # This is called after the normal loss is computed
    def after_loss(self):
        if not self.training: return
        if self.alpha != 0.:
            self.learn.loss += 
        if self.beta != 0.:
            self.learn.loss += 

### 4c (5 points). Find the best learning rate and train your NN with 1cycle policy and Adam. Report your result with a reasonable metric.

In [None]:
learn = Learner(dls, 
                # ...
                cbs=[ModelResetter, MyRNNRegularizer(alpha=2, beta=1)])