In [2]:
from torch import nn
import torch.nn.functional as F

In [3]:
import pdb
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
torch.manual_seed(42)

mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f: 
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])     

# Model setup

In [4]:
n, m = x_train.shape
c = y_train.max()+1
nh = 50
n_in = 784
n_out = 10

In [5]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        
    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [6]:
model = Model(784, 50, 10) # now 10 cuz we have 10 labels
preds = model(x_train)

In [7]:
preds.shape

torch.Size([50000, 10])

# Cross entropy

Given that we have 10 categories
We compare a 10 els vector of probabilities -> argmax -> vs the integer target

Let's compute softmax: (activation func of last layer to send lin layer out into probability space)
$$ \hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum\limits_{0 \leq j \lt n} e^{x_{j} }}$$

but we actually want log of softmax

In [8]:
test = preds[0].clone()
test, test.shape

(tensor([-0.05,  0.03,  0.22,  0.02,  0.00, -0.09, -0.04, -0.12, -0.15,  0.20], grad_fn=<CloneBackward0>),
 torch.Size([10]))

In [9]:
test.exp().sum()

tensor(10.08, grad_fn=<SumBackward0>)

In [10]:
((test.exp()/test.exp().sum()).log()).shape

torch.Size([10])

In [11]:
def log_softmax(x): 
    # return (x.exp()/x.exp().sum(-1, keepdim=True)).log()
    return (x.exp()/x.exp().sum(-1, keepdim=True)).log()

In [12]:
log_softmax(test).shape

torch.Size([10])

Recalling the properties of logs, we can simply the computation as: (s.t. avoid division/multiplication cuz they might lead to 
numerical instability)

In [13]:
def log_softmax(x):
    return x - x.exp().sum(-1, keepdim=True).log()

This definition of log softmax might incurr in overflow: in the denominator sum of e^xi if any xi is a large number-> e^xi will be much larger -> overflow. Thus we apply the LogSumExp trick:
consider:

$$ a=max(x_{j}) $$
then:
$$ log \left(\sum_{j=1}^{n} e^{x_{j}}\right) = log \left(e^{a} \sum_{j=1}^{n} e^{x_{j}-a}\right) = a + log \left(\sum_{j=1}^{n}{e^{x_{j}-a}} \right)$$

In [14]:
def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x-m[:,None]).exp().sum(-1).log()    

def log_softmax(x):
    return x - x.logsumexp(-1,keepdim=True) # x.logsumexp pyt version of logsumexp

In [15]:
test_close(logsumexp(preds), preds.logsumexp(-1))

since we have a multilabel plroblem (not binar label) we can use as softmax:
    $$ -\sum_{c=1}^My_{o,c}\log(p_{o,c}) $$
- M - number of classes
- y binary indicator (0 or 1) if class label c is the correct classification for observation o
- p predicted probability observation o is of class c

Since y is one hot encoded, the loss for an obs i and supposing that the correct class is the third (idx = 3), is:
$$ L_{i} = -\log(\hat{y}_{i, 3}) $$
that is called negative cross entropy loss

In [16]:
# plot here y = -log(x) with 0<=x<=1

# Indexing 
To avoid making all targets 1-hot-encoded, we can use indexing

In [17]:
y_train[:3]
# these are the correct classes for the first 3 obs in x_train

tensor([5, 0, 4])

In [18]:
# the probability extimated for the correct class are:
preds[0,5],preds[1,0], preds[2,4]


(tensor(-0.09, grad_fn=<SelectBackward0>),
 tensor(-0.07, grad_fn=<SelectBackward0>),
 tensor(0.15, grad_fn=<SelectBackward0>))

In [19]:
# we can do the same of above:
preds[[0,1,2], y_train[:3]]
# [0,1,2] list of idxs to select rows from preds
# y_train[:3] list of idxs to select from each row

tensor([-0.09, -0.07,  0.15], grad_fn=<IndexBackward0>)

In [20]:
# negative cross entropy loss
def nll(inp, targets):
    return -inp[range(targets.shape[0]), targets].mean()

In [21]:
sm_pred = log_softmax(preds) # take raw preds and pass em thru log_soft_max
loss = nll(sm_pred, y_train) # compute loss

# nll in pytorch is called F.nll_loss
test_close(F.nll_loss(F.log_softmax(preds,-1), y_train), loss, 1e-3)

In [22]:
# in pytorch F.log_softmax and F.nll_loss are combined in one optimized func
# called F.cross_entropy
test_close(F.cross_entropy(preds, y_train), loss, 1e-3)

# Training Loop

Basically the training loop repeats over the following steps:

- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [23]:
loss_func = F.cross_entropy

In [24]:
bs=50                  # batch size

xb = x_train[:bs]     # a mini-batch from x
preds = model(xb)      # predictions
preds[0], preds.shape

(tensor([-0.05,  0.03,  0.22,  0.02,  0.00, -0.09, -0.04, -0.12, -0.15,  0.20], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [25]:
yb = y_train[:bs]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7,
        6, 1, 8, 7, 9, 3, 9, 8, 5, 9, 3])

In [26]:
loss_func(preds, yb)

tensor(2.28, grad_fn=<NllLossBackward0>)

In [27]:
preds.argmax(dim=1) # for predictions use argmax over each pred row

tensor([2, 2, 4, 2, 2, 9, 2, 2, 1, 9, 9, 9, 2, 2, 1, 2, 2, 2, 9, 9, 2, 2, 9, 2, 2, 2, 2, 2, 2, 1, 9, 1, 2, 2, 2, 9, 2, 9, 2,
        9, 9, 2, 9, 2, 9, 2, 2, 2, 2, 2])

In [28]:
def accuracy(out, y):
    return (out.argmax(dim=1) == y).float().mean()

In [29]:
accuracy(preds, yb)

tensor(0.16)

In [30]:
lr = 0.5
epochs = 3

In [31]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs)) # list of idxs of bach
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        if i == 0: # at the beginning of each epoch
            print(f'loss {loss}, acc {accuracy(preds, yb)}')
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'): # it is a layer that has weights
                    l.weight -= l.weight.grad * lr
                    l.bias -= l.bias.grad * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()
            

loss 2.2825334072113037, acc 0.1599999964237213
loss 0.21871758997440338, acc 0.9399999976158142
loss 0.13389278948307037, acc 0.9599999785423279


Let's now start refactoring -> doing the same things with less code
# Parameters

In [32]:
# let's see how the nn.Module() class works
# we don't normally use it like this, so let's just look at how it works
module = nn.Module()
module.foo = nn.Linear(3,4) # create an attribute of the module
module

Module(
  (foo): Linear(in_features=3, out_features=4, bias=True)
)

In [33]:
# we can grab all the children of the module, that acts as root
list(module.named_children()) # list of tuples

[('foo', Linear(in_features=3, out_features=4, bias=True))]

In [34]:
# get all params of the module
list(module.parameters())

[Parameter containing:
 tensor([[-0.24, -0.11,  0.46],
         [-0.17, -0.46, -0.07],
         [-0.29,  0.31,  0.08],
         [-0.24,  0.13, -0.17]], requires_grad=True),
 Parameter containing:
 tensor([ 0.01,  0.51, -0.11, -0.47], requires_grad=True)]

In [35]:
# usually we inherit nn.module and then we build our own network on top of it
# nn.module allows automatic param tracking as we've seen above
class MLP(nn.Module):
    # a custom pyt module, it knows its attribute and its parameters
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
        self.relu = nn.ReLU()
        self.asd = "hello"
        
    def forward(self, x):
        return self.l2(self.relu(self.l1(x)))
        
        

In [36]:
model = MLP(784, 50, 10)
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [37]:
model # prints all data members

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
  (relu): ReLU()
)

In [38]:
for name, pytObj in model.named_children():
    print(f"{name}: {pytObj}")

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)
relu: ReLU()


In [39]:
# a custom pyt module, it knows its parameters
for p in model.parameters():
    print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [40]:
# this is useful cuz we can change the train loop as following

def fit():
    for epoch in range(epochs):
        for i in range(0, n, bs):
            s = slice(i, min(n, i+bs)) # list of idxs of bach
            xb, yb = x_train[s], y_train[s]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            if i == 0: # at the beginning of each epoch
                print(f'loss {loss}, acc {accuracy(preds, yb)}')
            with torch.no_grad():
                for param in model.parameters():
                    param -= param.grad * lr
                model.zero_grad()            

In [41]:
fit()

loss 2.3002638816833496, acc 0.14000000059604645
loss 0.1547200083732605, acc 0.9200000166893005
loss 0.164123997092247, acc 0.9200000166893005


In [42]:
# But how can a nn.Module keep track of the params it contains?
# __setattr__(self,args)

# let's create nn.Module from scratch
class MyModule(object): # object is always omitted but we inherit from it
    def __init__(self, n_in, nh, n_out):
        self._modules = {} # private container dict
        self.l1 = nn.Linear(n_in, nh) # this actually calls __setattr__
        self.l2 = nn.Linear(nh, n_out) # this actually calls __setattr__
    
    # __setattr__ to create a datamember at runtime for this obj:
    # this.foo = x will create a dmember self.foo = x
    def __setattr__(self, k, v):
        if not k.startswith("_"): # if the datamember is not private
            self._modules[k] = v
        super().__setattr__(k,v) # calls the method of python Class code that
        # actually does the creation of the dmember
        
    # defines how to print this
    def __repr__(self):
        return f'{self._modules}'

    def parameters(self):
        for l in self._modules.values(): # for each layer
            yield from l.parameters() # creates generator from list

In [43]:
myMod = MyModule(784, 50, 10)
myMod

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [44]:
for p in myMod.parameters():
    print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [45]:
# so now we can use pytorch nn.Module!

# Registering Layers

In [46]:
from functools import reduce

In [47]:
# in the model above we had as data member:
# self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]


In [48]:
# this model now takes as input in ctor a list of layers
class Model(nn.Module):
    def __init__(self, layers):
        super().__init__() # always call this!
        self.layers = layers
        for i, layer in enumerate(self.layers):
            self.add_module(f'layer_{i}', l) # base class method
    
    def forward(self,x):
        # reduce/reduction: 
        # reduce(func, sequence, initial_val) -> val
        # start with x, then apply f(x,l) where x is init val and l is first
        # el of the list, then l is iterated over sequence
        # while val is the output of the prev f(x,l)
        
        return reduce(lambda val, layer: layer(val), self.layers, x)

In [49]:
myMod = Model(layers)
myMod

Model(
  (layer_0): Linear(in_features=50, out_features=10, bias=True)
  (layer_1): Linear(in_features=50, out_features=10, bias=True)
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

In [50]:
# the pytorch implementation of the above:
class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers) # now this handles the loop above
        # it registres Layers automatically for us
        
    def forward(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [51]:
myMod = SequentialModel(layers) # nn.Squential
myMod

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [52]:
model = nn.Sequential(nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out))
fit()

loss 2.319960594177246, acc 0.07999999821186066
loss 0.1238829717040062, acc 0.9599999785423279
loss 0.06840486824512482, acc 0.9599999785423279


# Optim
Optimizer of params is crucial, let's implement it on our own!

In [53]:
class Optimizer():
    def __init__(self, params, lr = 0.5):
        self.params = list(params) # impo to take them as list! (no generator)
        self.lr = lr
    
    def step(self):
        with torch.no_grad():
            for p in self.params:
                p -= p.grad * self.lr
    
    def zero_grad(self):
        for p in self.params:
            p.grad.data.zero_() # .data is a way to avoid with torch.no_grad():


In [54]:
model = nn.Sequential(nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out))
opt = Optimizer(model.parameters())

In [55]:
def report(loss, preds, yb):
    print(f'loss {loss}, acc {accuracy(preds, yb)}')

In [56]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs))
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

loss 0.10364146530628204, acc 0.9800000190734863
loss 0.0641181617975235, acc 1.0
loss 0.05478580296039581, acc 0.9800000190734863


In [57]:
# so now we can use pyt optimizers!
from torch import optim

def get_model():
    model = nn.Sequential(nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out))
    return model, optim.SGD(model.parameters(), lr = lr)

In [58]:
model, opt = get_model()

# Dataset and DataLoader
We can improve our training loop code by creating an abstraction to handle
batching, xb, yb.

Goal: 

x_batch, y_batch = trainDataset[batchSize]

In [59]:
class Dataset():
    def __init__(self, x, y):
        self.x, self.y = x, y
        
    def __len__(self): 
        return len(self.x)
    
    def __getitem__(self, i): # overloader for [] operator, supports slicing
        # this method could have a lot of logic, usually is parallelized
        return self.x[i], self.y[i]

In [60]:
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)
assert len(train_ds) == len(x_train)
assert len(valid_ds) == len(y_valid)

In [61]:
xb = train_ds[0:5]; xb

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([5, 0, 4, 1, 9]))

In [62]:
model, opt = get_model()

In [63]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs))
        xb, yb = train_ds[s] # could also use directly [i:min(n, i+bs)]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

loss 0.13820990920066833, acc 0.9399999976158142
loss 0.096625916659832, acc 0.9599999785423279
loss 0.07814852893352509, acc 0.9599999785423279


Now we want to reduce the train loop even further by creating an iterator (by implementing the \_\_iter\_\_() method) that returns an iterable of batches of (x_batch,y_batch), i.e. a data loader
Goal:

for x_batch, y_batch in dataloader:

In [63]:
class DataLoader():
    def __init__(self, dataset, batchsize):
        self.dataset, self.batchsize = dataset, batchsize
    def __iter__(self):
        for i in range(0, len(self.dataset), self.batchsize):
            yield self.dataset[i:i+self.batchsize]

In [64]:
# so now we can
# 1) create dataset
# 2) use it to create a dataloader
# 3) iterate over batches of data 

In [65]:
batchSize = 64
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)
trainDataLoader, validDataLoader = DataLoader(train_ds, batchSize), DataLoader(valid_ds, batchSize)

In [66]:
for epoch in range(epochs):
    for xb, yb in trainDataLoader:
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

loss 0.18502801656723022, acc 0.9375
loss 0.0874142050743103, acc 0.9375
loss 0.03932715207338333, acc 1.0


So now we have the core backbone of a training loop; thus we can start adding features!
# Random Sampling
Training set must be shuffled at each epoch (nb: validation shouldnt).
To do so we are going to create a class "Sampler" that will shuffle train data idxs (that have to be applied)

In [68]:
import random

class Sampler():
    
    # returns an iterator of shuffled idxs for input dataset
    
    def __init__(self, dataset, shuffle=False):
        self.n = len(dataset)
        self.shuffle = shuffle
    
    def __iter__(self): # this must return an iterator object
        res = list(range(self.n))
        if self.shuffle:
            random.shuffle(res)
        return iter(res) 

In [69]:
from itertools import islice
s = Sampler(train_ds) # not shuffled
list(islice(s,5)) 
# islice make an iterator that returns selected elements from the iterable. 

[0, 1, 2, 3, 4]

In [69]:
s = Sampler(train_ds, shuffle=True)
list(islice(s,5)) 

[8583, 40714, 37864, 19868, 36819]

So now we can create a BatchSampler: a BatchSampler will return batchSized
randomized idxs that are going to be used to grab data from DataLoader.    

In [65]:
import fastcore.all as fc

class BatchSampler():
    def __init__(self, sampler, batchSize, drop_last=False):
        self.sampler = sampler
        self.batchSize = batchSize 
        self.drop_last = drop_last
    
    # look at lecture2 for chunk
    # it splits a list in a list of lists of desired size
    def __iter__(self):
        # returns list of random idxs for given batch
        yield from fc.chunked(iter(self.sampler), self.batchSize, self.drop_last) 

In [71]:
batchesIdxs = BatchSampler(s, 5)
list(islice(batchesIdxs,5)) 

# so now we have a way to iterate thru all dataset grabbing
# batches of random idxs 

[[25551, 47757, 13043, 36752, 44096],
 [504, 37364, 42213, 47094, 29046],
 [2645, 45715, 7041, 13751, 17650],
 [6643, 39060, 6680, 26018, 5960],
 [3130, 2236, 35033, 8108, 28336]]

In [72]:
def collate(b): # b = [(xi,yi), (xj,yj), ...]
    xs, ys = zip(*b) # creates 2 lists with all xis and all yis
    return torch.stack(xs), torch.stack(ys)

In [121]:
train_ds[5][0].shape, train_ds[5][1].shape

(torch.Size([784]), torch.Size([]))

In [122]:
collate([train_ds[5], train_ds[6]])

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([2, 1]))

In [73]:
# so we can now rewrite our DataLoader with this new functionality
class DataLoader():
    
    # using dataset
    # and batchSampler
    # returns batches of data
    
    def __init__(self, dataset, batchSampler, collate_fn = collate):
        self.dataset = dataset
        self.batchSampler = batchSampler
        self.collate_fn = collate_fn
    
    def __iter__(self):
        yield from (self.collate_fn(self.dataset[i] for i in batch) for batch in self.batchSampler)


In [74]:
# pyt DataLoader works as:
# dataset # its idxing returns (xi,yi)
# samplers # shuffles idxs
# batch sampler # randomizes idxs
# collate 
# dataloader 

In [75]:
batchSize = 64
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)
train_sampler = BatchSampler(Sampler(train_ds), batchSize)
train_dataLoader = DataLoader(train_ds, train_sampler)

In [76]:
xb,yb = next(iter(train_dataLoader))
import matplotlib.pyplot as plt
#plt.imshow(xb[0].view(28,28))
xb.shape # batchsize rows, each row is an obs

torch.Size([64, 784])

# MultiProcessing DataLoader
Dataloaders in pyt do exactly what we have seen above, but the process
to pre-process data to be returned when indexing into a dataset is done on 
different processors. The standard python api for multiprocessing is: _multiprocessing_, so we might use it, but instead we use _torch.multiprocessing_ because standard py lib does not work well with tensors.

In [77]:
import torch.multiprocessing as mp

In [78]:
# indexing in dataset and calling its __getitem__ is equivalent

x1, y1 = train_ds[[3,6,9]]
x2, y2 = train_ds.__getitem__([3,6,9])

assert torch.all(x1 == x2) == tensor(True)
assert torch.all(y1 == y2) == tensor(True)

x1, y1 = train_ds[3:9]
x2, y2 = train_ds.__getitem__(range(3, 9))

assert torch.all(x1 == x2) == tensor(True)
assert torch.all(y1 == y2) == tensor(True)

In [79]:
# so to parellalize we can call map with the __getitem__ func:
# map(func, sequence)
# map takes a seq and applies a func to each el of that sequence 
# (with that el as input of the func that is being mapped)
# returns a seq of the same length of the input seq

In [80]:
# thus for example to break a batch of 4 els 
# with idxs 3, 6, 8, 1

for o in map(train_ds.__getitem__, ([3, 6], [8, 1])):
    print(o)
# this grabs the first el of input seq, whihc is the list [3, 6]
# calls and f:train_ds.__getitem__ with that input
# we do this because multiprocessing has a class that is able to instanciate
# serveral threads (i.e. a pool of threads -> where each thread is a "worker")
# thus we can run map where each func is applied over the seq in a separate thread

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([1, 1]))
(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([1, 0]))


In [81]:
class DataLoader():
    def __init__(self, dataset, batchSampler, n_workers=1, collate_fn=collate):
        self.dataset = dataset
        self.batchSampler = batchSampler
        self.n_workers = n_workers
        self.collate_fn = collate_fn
    
    def __iter__(self):
        # using n threads execute map
        with mp.Pool(self.n_workers) as ex:
            yield from ex.map(self.dataset.__getitem__, iter(self.batchSampler))

In [82]:
# # this cell hangs in jupyter
# train_dl = DataLoader(train_ds, batchSampler= train_sampler, n_workers=2)
# it = iter(train_dl)
# xb, yb = next(it)
# xb, yb

# PyTorch DataLoader

In [72]:
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, BatchSampler

RandomSampler returns randomized idxs

SequentialSampler returns normal ordering of idxs

In [84]:
bs = 64
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)

train_sampler = BatchSampler(RandomSampler(train_ds), batch_size=bs, drop_last= False)
valid_sampler = BatchSampler(SequentialSampler(valid_ds), batch_size=bs, drop_last= False)


In [85]:
train_dataLoader = DataLoader(dataset=train_ds, 
                              # batch_size=bs, # in this setting bs is in sampler
                              sampler=train_sampler, 
                              collate_fn=collate)

val_dataLoader = DataLoader(valid_ds, 
                            sampler=valid_sampler, 
                            collate_fn=collate)

In [87]:
def get_model(n_in, nh, n_out):
    model = nn.Sequential(nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out))
    return model, optim.SGD(model.parameters(), lr = lr)

model, opt = get_model(784, 50, 10)

for epoch in range(epochs):
    for xb, yb in train_dataLoader:
        xb.squeeze_() # there are some issues on shapes, keep going
        yb.squeeze_()
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

loss 0.12716785073280334, acc 0.9375
loss 0.13605919480323792, acc 0.9375
loss 0.0028977771289646626, acc 1.0


In [88]:
# we can avoid defining the BatchSampler: we can just provide bs to datalader
# and wheter to shuffle data or not (auto creation of Sequential/Random Samplers)

train_dataLoader = DataLoader(train_ds, bs, shuffle=True, drop_last=True) # num_workers = 2
valid_dataLoader = DataLoader(valid_ds, bs, shuffle=False) # num_workers = 2

In [74]:
DataLoader?

In [89]:
model, opt = get_model(784, 50, 10)

for epoch in range(epochs):
    for xb, yb in train_dataLoader:        
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

loss 0.1906106024980545, acc 0.9375
loss 0.08068399131298065, acc 0.984375
loss 0.014962541870772839, acc 1.0


In [90]:
# Trick to speed up things:

train_sampler = BatchSampler(RandomSampler(train_ds), batch_size=bs, drop_last= False)
valid_sampler = BatchSampler(SequentialSampler(valid_ds), batch_size=bs, drop_last= False)

train_dl = DataLoader(train_ds, sampler = train_sampler)
valid_dl = DataLoader(valid_ds, sampler = train_sampler)

# pass to dataloader the sampler directly, cuz dataset knows how to retrieve
# set of idxs

# DataSet -> grab idxs -> returns obs, returns x[i],y[i], supports slicing

# Sampler -> returns list of idxs of the whole dataset (shuffled or not)

# BatchSampler -> breaks in batches of idxs the idxs returned by the Sampler

# DataLoader -> indexes in DataSet using batches of idxs returned by BatchSampler, collate list of DataSet[idx] together in torch.tensor(x), torch.tensor(y)

# The trick is to pass the sampler directly to the data loader with a batchSize, s.t. the dataloader itself uses the idxs returned by the Sampler direectly to index in DataSet, using the batchSize to split up 
    # 	and without using/instaciating BatchSampler and DataLoader

In [91]:
model, opt = get_model(784, 50, 10)

for epoch in range(epochs):
    for xb, yb in train_dataLoader:        
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    report(loss, preds, yb)

loss 0.21624523401260376, acc 0.9375
loss 0.12360577285289764, acc 0.96875
loss 0.1926768571138382, acc 0.953125


# Validation

In [104]:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
            
        model.eval()
        with torch.no_grad():
            tot_loss, tot_acc, count = 0.,0.,0
            for xb, yb in valid_dl:
                pred = model(xb)
                n = len(xb)
                count += n
                tot_loss += loss_func(pred, yb).item()*n
                tot_acc += accuracy(pred, yb).item()*n
        print(epoch, tot_loss/count, tot_acc/count)
    return tot_loss/count, tot_acc/count

In [105]:
def get_dataLoaders(train_ds, valid_ds, bs, **kwards):        
    return (DataLoader(train_ds, batch_size=bs, shuffle=True, **kwards), DataLoader(valid_ds, batch_size=bs*2, **kwards))
# you can double bs for validation data loader cuz it wont have to do backprop and thus half mem


In [106]:
train_dl, valid_dl = get_dataLoaders(train_ds, valid_ds, 64)
model, opt = get_model(784, 50, 10)
loss,acc= fit(5, model, loss_func, opt, train_dl, valid_dl)

0 0.16301606733798982 0.9556
1 0.144886371332407 0.959
2 0.15856560630053282 0.9515
3 0.3052870091378689 0.9228
4 0.10596682603061199 0.9704
