# Homework 2

In [1]:
from fastai.text.all import *

# import pytorch

## Porblem 1 (25 ponits): Deep Learning Basics

### Note: You can only use tensors for this problem.

In [None]:
path = untar_data(URLs.MNIST)
Path.BASE_PATH = path
path.ls()

In [None]:
(path/'training').ls().sorted()

In [None]:
(path/'testing').ls().sorted()

### 1a (10 points). Create a 'typical' digit for each by averaging and classify the test data with them. Report your overall accuracy. 

In [None]:
def mnist_distance(a, b): return (a - b).abs().mean((-1, -2))

def is_digit(x, mean_digit, other_mean_digit): return mnist_distance(x,mean_digit) < mnist_distance(x,other_mean_digit)

In [None]:
# train
digits = ['1', '2', '3', '4', '5', '6', '7', '8', '9']

mean_representations = {}

for digit in digits:
    paths = (path/'training'/digit).ls().sorted()

    digit_tensors = [tensor(Image.open(o)) for o in paths]

    stacked_digits = torch.stack(digit_tensors).float() / 255

    mean_rep = stacked_digits.mean(0)

    mean_representations[digit] = mean_rep

In [None]:
# evaluate
accs = []
for digit in tqdm(digits):
    paths = (path/'testing'/digit).ls().sorted()

    digit_tensors = torch.stack([tensor(Image.open(o)) for o in paths])

    mean_digit = mean_representations[digit]

    final_preds = (torch.zeros(digit_tensors.shape[0], dtype=torch.bool) + 1).bool()
    for other_digit in digits:

        if other_digit == digit: continue

        mean_other_digit = mean_representations[other_digit]

        preds = is_digit(digit_tensors, mean_digit, mean_other_digit)

        final_preds = torch.logical_and(preds, final_preds)
    
    acc = final_preds.float().mean()
    accs.append(acc)
    print("Accuracy for digit '{}':\t{}".format(digit,acc))
print("Overall ACC: {}".format(tensor(accs).float().mean()))


### 1b (10 points). Create a NN to classify digits (trained with regular SGD). Note that:
- Since you deal with 10 categories here, it is preferrable to use cross entropy loss.
- Again, anything other than tensors are not allowed in this problem so build your NN from scratch.

In [1]:
from fastai.vision.all import *
import torch
import torchvision.transforms as transforms
import torchvision


In [2]:
path = untar_data(URLs.MNIST)
Path.BASE_PATH = path
path.ls()

(#2) [Path('training'),Path('testing')]

In [8]:
transform = transforms.Compose(
    [
    transforms.Grayscale(), 
    transforms.ToTensor(), 
    transforms.Normalize([0.5], [0.5]), 
    transforms.Lambda(lambda x: torch.flatten(x)),
    # transforms.Lambda(lambda x:F.one_hot(x,10)),
    ]
)

In [9]:
full_dataset = torchvision.datasets.ImageFolder((path/"training").as_posix(), transform = transform)

# train_size = int(0.8 * len(full_dataset))
# valid_size = 0
train_size = len(full_dataset)
valid_size = 0

train_dataset, valid_dataset = torch.utils.data.random_split(full_dataset, [train_size, valid_size])

testing_set = torchvision.datasets.ImageFolder((path/"testing").as_posix(), transform = transform)

In [10]:
train_dataset.dataset.classes

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [11]:
bs = 64
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=bs, shuffle=True)
# valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=bs, shuffle=True)
test_loader = torch.utils.data.DataLoader(testing_set, batch_size=len(testing_set), shuffle=True)
dataloaders = {
    "train": train_loader,
    "test": test_loader
}


In [13]:
eps = 1e-6

def init_params(size, std=1.0): return (torch.randn(size)*std).requires_grad_()

def linear(xb, w, b):
    return xb@w + b

def softmax(logits):
    return torch.exp(logits + eps) / (torch.exp(logits + eps).sum(-1, keepdim=True) + eps)

def log_prob(preds):
    preds = preds.squeeze()
    return -torch.log(preds + eps)

def ce_loss(preds, yb):
    # probs = softmax(preds)
    # log_probs = log_prob(probs)
    # return log_probs.mean()
    return F.cross_entropy(preds, yb)

def relu(x):
    return x.clamp_min(0.)

def accuracy(preds, yb):
    preds = torch.argmax(softmax(preds), dim=1) == yb
    return preds.float().mean()
    # return correct.float().mean()

def train(train_dl, epochs=10, lr=1e-2):
    weights = init_params((28*28, 128))
    bias = init_params(1)

    h1_w = init_params((128, 64))
    bias_h1 = init_params(1)    

    out_w = init_params((64, 10))
    out_b = init_params(1)

    accs = []
    losses = []
    for epoch in range(epochs):
        print("Epoch: {}".format(epoch))
        per_epoch_loss = []
        per_epoch_acc = []
        for xb, yb in train_dl:
            preds = relu(linear(xb, weights, bias))
            preds = relu(linear(preds, h1_w, bias_h1))
            preds = linear(preds, out_w, out_b)
            # loss = ce_loss(preds, yb)
            loss = ce_loss(preds, yb)

            loss.backward()

            per_epoch_loss.append(loss.item())
            per_epoch_acc.append(accuracy(preds, yb).item())

            weights.data -= weights.grad * lr
            bias.data -= bias.grad * lr

            h1_w.data -= h1_w.grad * lr
            bias_h1.data -= bias_h1.grad * lr

            out_w.data -= out_w.grad * lr
            out_b.data -= out_b.grad * lr

            weights.grad.zero_()
            bias.grad.zero_()

            h1_w.grad.zero_()
            bias_h1.grad.zero_()

            out_w.grad.zero_()
            out_b.grad.zero_()

        accs.extend(per_epoch_acc)
        losses.extend(per_epoch_loss)

        print('\tLoss: {}'.format(torch.mean(tensor(per_epoch_loss))))
        print('\tAccuracy: {}'.format(torch.mean(tensor(per_epoch_acc))))

    
    print('Overall Loss: {}'.format(torch.mean(tensor(losses))))
    print('Overall Accuracy: {}'.format(torch.mean(tensor(accs))))
        
    return weights, bias

def evaluate(test_dl, w, b):
    # with torch.no_grad:
    for xb, yb in test_dl:
        preds = linear(xb, w, b)
        acc = accuracy(preds, yb)
        print("Test Accuracy: {}".format(acc))

w, b = train(train_loader, epochs=10, lr=1e-1)
evaluate(test_loader, w, b)

Epoch: 0
	Loss: 3257.454345703125
	Accuracy: 0.09874733537435532
Epoch: 1
	Loss: 2.3025851249694824
	Accuracy: 0.09868070483207703
Epoch: 2
	Loss: 2.3025851249694824
	Accuracy: 0.0987306758761406
Epoch: 3
	Loss: 2.3025851249694824
	Accuracy: 0.09871401637792587
Epoch: 4
	Loss: 2.3025851249694824
	Accuracy: 0.0987306758761406
Epoch: 5
	Loss: 2.3025851249694824
	Accuracy: 0.0987306758761406
Epoch: 6


In [None]:
def sgd_with_momentum(params, lr, mom):
    for p in params:
        p.data -= p.grad.data * lr + p.data * mom
        p.grad.data.zero_()

### 1c (5 points). Implement your own:
- SGD with Momemtum
- RMSProp

by using two more tensors to tracked the smoothed gradients and L2 norms, respectively. Re-train your model on them and report performance.

#### SGD W/ Momentum 

In [None]:
def train(train_dl, epochs=10, lr=1e-2, mom=0.9):
    weights = init_params((28*28, 128))
    bias = init_params(1)
    v_weights = 0
    v_bias = 0

    h1_w = init_params((128, 64))
    bias_h1 = init_params(1)    
    v_h1 = 0
    v_bias_h1 = 0

    out_w = init_params((64, 10))
    out_b = init_params(1)
    v_out_w = 0
    v_out_b = 0

    accs = []
    losses = []
    for epoch in range(epochs):
        print("Epoch: {}".format(epoch))
        per_epoch_loss = []
        per_epoch_acc = []
        for xb, yb in train_dl:
            preds = relu(linear(xb, weights, bias))
            preds = relu(linear(preds, h1_w, bias_h1))
            preds = linear(preds, out_w, out_b)
            # loss = ce_loss(preds, yb)
            loss = ce_loss(preds, yb)

            loss.backward()

            per_epoch_loss.append(loss.item())
            per_epoch_acc.append(accuracy(preds, yb).item())

            mom_inv = (1-mom)
            v_weights = mom * v_weights + mom_inv * weights.grad
            v_bias = mom * v_bias + mom_inv * bias.grad

            v_h1 = mom * v_h1 + h1_w.grad * mom_inv
            v_bias_h1 = mom * v_bias_h1 + bias_h1.grad * mom_inv

            v_out_w = mom * v_out_w + out_w.grad * mom_inv
            v_out_b = mom * v_out_b + out_b.grad * mom_inv

            weights.data -= v_weights * lr
            bias.data -= v_bias * lr

            h1_w.data -= v_h1 * lr
            bias_h1.data -= v_bias_h1 * lr

            out_w.data -= v_out_w * lr
            out_b.data -= v_out_b * lr

            weights.grad.zero_()
            bias.grad.zero_()

            h1_w.grad.zero_()
            bias_h1.grad.zero_()

            out_w.grad.zero_()
            out_b.grad.zero_()

        accs.extend(per_epoch_acc)
        losses.extend(per_epoch_loss)

        print('\tLoss: {}'.format(torch.mean(tensor(per_epoch_loss))))
        print('\tAccuracy: {}'.format(torch.mean(tensor(per_epoch_acc))))

    
    print('Overall Loss: {}'.format(torch.mean(tensor(losses))))
    print('Overall Accuracy: {}'.format(torch.mean(tensor(accs))))
        
    return weights, bias

#### RMSProp

Code selected from NN class code above:

```
def fit_rms_prop(self, dls, decay_rate=0.9):
        cache_input = torch.zeros_like(self.weights_input_hidden)
        cache_output = torch.zeros_like(self.weights_hidden_output)

        overall_loss = []
        for i in range(self.epochs):
            losses = []
            print('Starting epoch: {}'.format(i))
            for batch in dls.train:
                data, labels = batch[0], batch[1]
                data = tensor(data, dtype=torch.float32, device=self.device, requires_grad=True)
                labels = tensor(labels, dtype=torch.uint8, device=self.device,  requires_grad=True)

                data = data.to(torch.float64)
                labels = labels.to(torch.uint8)
                x = self.forward(data)

                loss = self.cross_entropy_loss(x, labels)
                # self.loss(x, labels)
                losses.append(loss.detach().float())
                overall_loss.append(loss.detach().float())
                loss.backward()
                with torch.no_grad():
                    cache_input = decay_rate * cache_input + (1 - decay_rate) * self.weights_input_hidden.grad
                    cache_output = decay_rate * cache_output + (1 - decay_rate) * self.weights_hidden_output.grad

                    self.weights_input_hidden -= (self.lr * self.weights_input_hidden.grad) / (torch.sqrt(cache_input) + self.eps)
                    self.weights_hidden_output -= (self.lr * self.weights_hidden_output.grad) / (torch.sqrt(cache_output) + self.eps)

            print('\t loss = {}'.format(tensor(loss).float().mean()))
        print('\t overall loss = {}'.format(tensor(overall_loss).float().mean()))
```

## Problem 2 (25 points): Getting Your Data Ready For a NN Language Model

In [2]:
path = untar_data(URLs.HUMAN_NUMBERS)
Path.BASE_PATH = path
path.ls()

(#2) [Path('train.txt'),Path('valid.txt')]

In [3]:
lines = [line for p in path.ls()[::-1] for line in p.read_text().split(' \n')]
text = ' . '.join([l for l in lines])
text[:1000]

'eight thousand one . eight thousand two . eight thousand three . eight thousand four . eight thousand five . eight thousand six . eight thousand seven . eight thousand eight . eight thousand nine . eight thousand ten . eight thousand eleven . eight thousand twelve . eight thousand thirteen . eight thousand fourteen . eight thousand fifteen . eight thousand sixteen . eight thousand seventeen . eight thousand eighteen . eight thousand nineteen . eight thousand twenty . eight thousand twenty one . eight thousand twenty two . eight thousand twenty three . eight thousand twenty four . eight thousand twenty five . eight thousand twenty six . eight thousand twenty seven . eight thousand twenty eight . eight thousand twenty nine . eight thousand thirty . eight thousand thirty one . eight thousand thirty two . eight thousand thirty three . eight thousand thirty four . eight thousand thirty five . eight thousand thirty six . eight thousand thirty seven . eight thousand thirty eight . eight thou

### 2a (5 points). Split **text** by space to get **tokens** and create a **vocab** with unique **tokens**.

In [55]:
def tokenize_by_space(text):
    return text.split(' ')

def create_vocab_from_tokens(tokens):
    # vocab = {token for i, token in enumerate(set(tokens))}
    # return list(vocab)
    return L(*tokens).unique()

tokens = tokenize_by_space(text)
vocab = create_vocab_from_tokens(tokens)

In [56]:
vocab

(#31) ['eight','thousand','one','.','two','three','four','five','six','seven'...]

### 2b (5 points). Create a mapping from word to indices and convert **tokens** into **nums**, which is a list of indices.

In [58]:
def create_word2idx(vocab):
    return {token: idx for idx, token in enumerate(vocab)}

def convert_tokens_to_ids(tokens, word2idx):
    return L(word2idx[i] for i in tokens)

word2idx = create_word2idx(vocab)
nums = convert_tokens_to_ids(tokens, word2idx)

### 2c (5 points). Create a _dataset_ where each tuple is the input and output of a NN language model. The sequence length of inputs and outputs should be 32. Then create a _training set_ with the first 80% data and a _validation set_ with the rest.

In [93]:
sl = 32

def create_tuples(nums):
    return L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))

def train_test_split(sequences):
    train_size = int(len(sequences) * 0.8)
    train, test = sequences[:train_size], sequences[train_size:]
    return train, test

In [94]:
dset = create_tuples(nums)
# dset
train_ds, valid_ds = train_test_split(dset)

### 2d (10 points). Reorder your training set and validation set to be ready for _dataloaders_ to be used in a NN language model. Note that:
- `m = len(dset) // 64` batches will be created in the new order without shuffling (the first 64 rows will be the first batch, the next 64 rows will be the second batch, and so on).
- The new first 64 rows should be the `1st, (1+m)-th, (1+2m)-th, ..., (1+64m)-th` rows in the original corresponding dataset; The next 64 rows should be the `2nd, (2+m)-th, (2+2m)-th, ..., (2+64m)-th` rows; and so on.

In [98]:

def reorder_dataset(ds, bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))
    return new_ds


In [99]:
train_ds_reordered = reorder_dataset(train_ds, bs=64)
valid_ds_reordered = reorder_dataset(valid_ds, bs=64)

In [100]:
dls = DataLoaders.from_dsets(
    train_ds_reordered,
    valid_ds_reordered,
    bs=64, drop_last=True, shuffle=False
)

## Problem 3 (25 points): Simple NN Language Model

### 3a (15 points). The stacked/unrolled representation depict the same 2-layer RNN. Implement this RNN with only:
- torch.tensor
- tensor functions (from torch.tensor or torch)
- torch.nn.Embedding
- torch.nn.Linear
- torch.nn.relu

<img alt="2-layer RNN" width="550" caption="2-layer RNN" src="./att_00025.png">

<img alt="2-layer unrolled RNN" width="550" caption="Two-layer unrolled RNN" src="./att_00026.png">

In [108]:
class ACoolRNN(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)   
        self.h_h_2 = nn.Linear(n_hidden, n_hidden)  
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
        
    def forward(self, x):
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            self.h2 = F.relu(self.h_h_2(self.h)) + self.h
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self): self.h = 0

def loss_func(inp, targ):
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

### 3b (10 points). Find the best learning rate and train your NN with 1cycle policy and Adam (using _Learner_ class from fastai). Report your result with a reasonable metric.

In [111]:
# Find the best learning rate and train your NN with 1cycle policy and Adam (using Learner class from fastai). Report your result with a reasonable metric

learner = Learner(dls, ACoolRNN(len(vocab), 64), loss_func=loss_func, metrics=accuracy, opt_func=Adam, cbs=ModelResetter)
# learner = Learner(dls, TwoLayerRNN(seq_len, 512, seq_len), loss_func=nn.CrossEntropyLoss(), metrics=accuracy, opt_func=Adam)
learner.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.290276,3.195672,0.206217,00:00
1,2.664793,1.972941,0.470215,00:00
2,2.093985,1.853706,0.467936,00:00
3,1.780326,1.837062,0.475505,00:00
4,1.584092,1.764385,0.498942,00:00
5,1.445986,1.736075,0.522542,00:00
6,1.347024,1.806055,0.532715,00:00
7,1.272966,1.828314,0.534424,00:00
8,1.218623,1.81997,0.540365,00:00
9,1.181584,1.822541,0.538981,00:00


## Problem 4 (25 points):

### 4a (10 points). Reuse the IMDB reviews dataset from Homework 1. Tokenize (experiment with the vocab size yourself) it with the subword tokenizer you developed in Homework 1 and create _Dataloaders_ in the similar way of Problem 2 to get ready for a LSTM Language Model.

### 4b (10 points). Implement a 2-layer LSTM Language Model that has:
- One dropout layer (with 50% chance)
- Weight tying between the input and output embedding layers
- Activation regularization
- Temporal activation regularization

You can use anything, such as the LSTM module from pytorch.

In [None]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p_dropout, bs):

    def reset(self): 
        

In [None]:
class MyRNNRegularizer(Callback):
    def __init__(self, alpha=0., beta=0.): self.alpha, self.beta = alpha, beta

    # This is called after the forward pass of the model
    def after_pred(self):
        # Store inside this class the raw output of the model and the dropped out output, respectively 
        self.raw, self.out = 
        # Modify the model's output to be just the activation of the last layer
        self.learn.pred = 

    # This is called after the normal loss is computed
    def after_loss(self):
        if not self.training: return
        if self.alpha != 0.:
            self.learn.loss += 
        if self.beta != 0.:
            self.learn.loss += 

### 4c (5 points). Find the best learning rate and train your NN with 1cycle policy and Adam. Report your result with a reasonable metric.

In [None]:
learn = Learner(dls, 
                # ...
                cbs=[ModelResetter, MyRNNRegularizer(alpha=2, beta=1)])