In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.transforms import transforms
from torchvision.datasets.mnist import MNIST
from trainer import Trainer

# Tensors

In [2]:
# populate a 3x3 tensor with samples drawn from a random distribution
x = torch.randn(3, 3)
y = torch.randn(3, 3)
x

tensor([[ 1.8257,  0.5835,  0.8889],
        [ 0.2154,  0.2320,  0.0178],
        [-1.7530, -0.0785, -0.2000]])

In [3]:
y

tensor([[ 0.2445, -0.1320,  0.4195],
        [ 1.1207, -1.2937,  0.0059],
        [-1.4143, -0.5531, -0.3274]])

In [4]:
# we can index into this guy just like a python list!
x[0]

tensor([1.8257, 0.5835, 0.8889])

In [5]:
# we can also do some boolean stuff with him
x > 0

tensor([[1, 1, 1],
        [1, 1, 1],
        [0, 0, 0]], dtype=torch.uint8)

In [6]:
# can also concatenate two tensors together on various dimensions
z = torch.cat((x, y), 1)
print("z:", z)
print("z.size()", z.size())

z: tensor([[ 1.8257,  0.5835,  0.8889,  0.2445, -0.1320,  0.4195],
        [ 0.2154,  0.2320,  0.0178,  1.1207, -1.2937,  0.0059],
        [-1.7530, -0.0785, -0.2000, -1.4143, -0.5531, -0.3274]])
z.size() torch.Size([3, 6])


In [7]:
# and reshape them
z.view(9, -1)

tensor([[ 1.8257,  0.5835],
        [ 0.8889,  0.2445],
        [-0.1320,  0.4195],
        [ 0.2154,  0.2320],
        [ 0.0178,  1.1207],
        [-1.2937,  0.0059],
        [-1.7530, -0.0785],
        [-0.2000, -1.4143],
        [-0.5531, -0.3274]])

In [8]:
# in different ways
z.view(3, 3, 2)

tensor([[[ 1.8257,  0.5835],
         [ 0.8889,  0.2445],
         [-0.1320,  0.4195]],

        [[ 0.2154,  0.2320],
         [ 0.0178,  1.1207],
         [-1.2937,  0.0059]],

        [[-1.7530, -0.0785],
         [-0.2000, -1.4143],
         [-0.5531, -0.3274]]])

In [9]:
# can add elementwise
x + y

tensor([[ 2.0702,  0.4515,  1.3083],
        [ 1.3360, -1.0617,  0.0237],
        [-3.1673, -0.6317, -0.5274]])

In [10]:
# can also matrix multiply! (this is important)
torch.matmul(x, y)

tensor([[-0.1569, -1.4874,  0.4783],
        [ 0.2875, -0.3384,  0.0859],
        [-0.2338,  0.4436, -0.6703]])

In [11]:
# most important lesson from linear algebra: if you want to multiply some matrix `a` with some matrix `b`
# `a` needs to be of size (m x n) and `b` needs to be of size (n x p)
# this will give you a final matrix c of size (m x p)
# here is the error message you will get if that's not the case
a = torch.randn((4, 3))
b = torch.randn((4, 3))
print("size of matrix a:", a.size())
print("size of matrix b:", b.size())
torch.matmul(a, b)

# literally the bane of my existence

size of matrix a: torch.Size([4, 3])
size of matrix b: torch.Size([4, 3])


RuntimeError: size mismatch, m1: [4 x 3], m2: [4 x 3] at /Users/soumith/miniconda2/conda-bld/pytorch_1532624435833/work/aten/src/TH/generic/THTensorMath.cpp:2070

In [None]:
# solutions! 
torch.matmul(a, torch.transpose(b, dim0=0, dim1=1))  # be careful though if you're working with batches

# Layers (Low-Level)

In [12]:
# define a new matrix x
x = torch.randn(3, 3)
print("x: ", x)
print("x.size():", x.size())

x:  tensor([[-1.5729,  0.8726, -0.8991],
        [-0.0950,  0.0925,  1.8261],
        [-0.1823,  0.8657,  0.6824]])
x.size(): torch.Size([3, 3])


In [13]:
# define a weight matrix 
w1 = torch.randn(6, 3)
b1 = torch.randn(6)

In [14]:
w1

tensor([[ 0.4739,  0.8955, -0.0811],
        [ 0.6793, -0.7189,  1.6245],
        [ 0.4240, -0.0701,  0.0375],
        [-0.4710, -0.7020,  0.3119],
        [-1.8891, -1.5711,  1.2038],
        [-1.0248,  1.8811,  0.7875]])

In [15]:
b1

tensor([-0.4235,  2.0292,  0.0034, -1.7639, -0.5050, -1.3253])

In [16]:
# multiply input by weight matrix
layer1_output = torch.matmul(x, torch.transpose(w1, 0, 1))
layer1_output 

tensor([[ 0.1089, -3.1563, -0.7617, -0.1521,  0.5181,  2.5452],
        [-0.1103,  2.8354,  0.0216,  0.5493,  2.2323,  1.7094],
        [ 0.6335,  0.3625, -0.1124, -0.3090, -0.1943,  2.3526]])

In [17]:
# add the bias term 
layer1_output = layer1_output + b1
layer1_output

tensor([[-0.3146, -1.1271, -0.7583, -1.9160,  0.0130,  1.2198],
        [-0.5338,  4.8646,  0.0250, -1.2146,  1.7272,  0.3841],
        [ 0.2099,  2.3917, -0.1090, -2.0729, -0.6993,  1.0273]])

# Layers (High-Level)

In [18]:
linear_layer1 = nn.Linear(in_features=3, out_features=6) 

In [19]:
x2 = linear_layer1(x)
x2

tensor([[ 0.9879,  0.0411,  0.8826, -0.5743,  0.5697,  0.6398],
        [ 0.6288, -1.0389, -0.7006,  0.4881,  1.3127,  1.4414],
        [ 0.7568, -0.6123, -0.5626,  0.1392,  0.6643,  1.2159]],
       grad_fn=<ThAddmmBackward>)

In [20]:
linear_layer1.weight

Parameter containing:
tensor([[-0.3070,  0.3160,  0.1251],
        [-0.1028,  0.0630, -0.3225],
        [-0.5709, -0.4983, -0.4140],
        [ 0.1493,  0.0390,  0.3200],
        [-0.5555, -0.0908,  0.5479],
        [-0.2313,  0.5253,  0.5699]], requires_grad=True)

In [21]:
linear_layer1.bias

Parameter containing:
tensor([ 0.3419, -0.4655,  0.0472, -0.0857,  0.2678,  0.3301], requires_grad=True)

# Models

In [22]:
mnist = MNIST(root=".", download=True, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))

In [23]:
# check out the size of the first sample in mnist
mnist[0][0].size()

torch.Size([1, 28, 28])

In [30]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        """
        define model attributes
        """
        self.layer1 = nn.Linear(28*28, 512)
        self.layer2 = nn.Linear(512, 512)
        self.layer3 = nn.Linear(512, 10)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        """
        define model
        """
        x = x.view(-1, 28*28)
        h1 = self.relu(self.layer1(x))
        h2 = self.relu(self.layer2(h1))
        logits = self.relu(self.layer3(h2))
        
        return logits  # usually return `logits` (output of final layer) or `probs` (softmax(logits))

# Let's Run This Guy!

In [31]:
# create model and trainer
model = Model()
trainer = Trainer(model=model, save_dir="./models")

In [33]:
trainer.train(train=mnist, val=None, epochs=10, batch_size=32, log_per_batches=10, learning_rate=0.001, device="cpu")



  0%|          | 0/1875 [00:00<?, ?it/s][A[A

  0%|          | 4/1875 [00:00<00:49, 38.14it/s][A[A

Epoch 0




  0%|          | 8/1875 [00:00<00:50, 37.01it/s][A[A

Training Loss: 0.859:   0%|          | 8/1875 [00:00<00:50, 37.01it/s][A[A

Training Loss: 0.859:   1%|          | 12/1875 [00:00<00:50, 36.99it/s][A[A

Training Loss: 0.859:   1%|          | 17/1875 [00:00<00:46, 39.61it/s][A[A

Training Loss: 0.786:   1%|          | 17/1875 [00:00<00:46, 39.61it/s][A[A

Training Loss: 0.786:   1%|          | 22/1875 [00:00<00:46, 40.12it/s][A[A

Training Loss: 0.786:   1%|▏         | 27/1875 [00:00<00:43, 42.30it/s][A[A

Training Loss: 0.988:   1%|▏         | 27/1875 [00:00<00:43, 42.30it/s][A[A

Training Loss: 0.988:   2%|▏         | 32/1875 [00:00<00:43, 42.16it/s][A[A

Training Loss: 0.988:   2%|▏         | 38/1875 [00:00<00:41, 44.72it/s][A[A

Training Loss: 0.842:   2%|▏         | 38/1875 [00:00<00:41, 44.72it/s][A[A

Training Loss: 0.842:   2%|▏         | 43/1875 [00:01<00:40, 45.19it/s][A[A

Training Loss: 0.842:   3%|▎         | 49/1875 [00:01<00:38, 47.25it/s][A

Training Loss: 0.952:  18%|█▊        | 345/1875 [00:08<00:33, 45.70it/s][A[A

Training Loss: 0.999:  18%|█▊        | 345/1875 [00:08<00:33, 45.70it/s][A[A

Training Loss: 0.999:  19%|█▊        | 350/1875 [00:08<00:35, 43.51it/s][A[A

Training Loss: 0.999:  19%|█▉        | 355/1875 [00:08<00:33, 45.17it/s][A[A

Training Loss: 0.929:  19%|█▉        | 355/1875 [00:08<00:33, 45.17it/s][A[A

Training Loss: 0.929:  19%|█▉        | 360/1875 [00:08<00:33, 44.95it/s][A[A

Training Loss: 0.929:  19%|█▉        | 365/1875 [00:08<00:34, 43.53it/s][A[A

Training Loss: 0.994:  19%|█▉        | 365/1875 [00:08<00:34, 43.53it/s][A[A

Training Loss: 0.994:  20%|█▉        | 370/1875 [00:08<00:34, 43.89it/s][A[A

Training Loss: 0.994:  20%|██        | 375/1875 [00:09<00:33, 45.13it/s][A[A

Training Loss: 0.991:  20%|██        | 375/1875 [00:09<00:33, 45.13it/s][A[A

Training Loss: 0.991:  20%|██        | 380/1875 [00:09<00:33, 45.25it/s][A[A

Training Loss: 0.991:  21%|██        | 3

Training Loss: 0.737:  37%|███▋      | 685/1875 [00:16<00:24, 48.98it/s][A[A

Training Loss: 0.656:  37%|███▋      | 685/1875 [00:16<00:24, 48.98it/s][A[A

Training Loss: 0.656:  37%|███▋      | 690/1875 [00:16<00:24, 48.81it/s][A[A

Training Loss: 0.656:  37%|███▋      | 695/1875 [00:16<00:24, 48.49it/s][A[A

Training Loss: 0.805:  37%|███▋      | 695/1875 [00:16<00:24, 48.49it/s][A[A

Training Loss: 0.805:  37%|███▋      | 700/1875 [00:16<00:24, 48.36it/s][A[A

Training Loss: 0.805:  38%|███▊      | 705/1875 [00:16<00:24, 48.36it/s][A[A

Training Loss: 0.795:  38%|███▊      | 705/1875 [00:16<00:24, 48.36it/s][A[A

Training Loss: 0.795:  38%|███▊      | 710/1875 [00:16<00:24, 47.40it/s][A[A

Training Loss: 0.795:  38%|███▊      | 715/1875 [00:16<00:24, 47.23it/s][A[A

Training Loss: 0.867:  38%|███▊      | 715/1875 [00:16<00:24, 47.23it/s][A[A

Training Loss: 0.867:  38%|███▊      | 720/1875 [00:16<00:24, 46.66it/s][A[A

Training Loss: 0.867:  39%|███▊      | 7

Training Loss: 0.711:  54%|█████▎    | 1004/1875 [00:24<00:23, 37.02it/s][A[A

Training Loss: 0.711:  54%|█████▍    | 1009/1875 [00:24<00:22, 38.39it/s][A[A

Training Loss: 0.796:  54%|█████▍    | 1009/1875 [00:24<00:22, 38.39it/s][A[A

Training Loss: 0.796:  54%|█████▍    | 1013/1875 [00:24<00:22, 38.79it/s][A[A

Training Loss: 0.796:  54%|█████▍    | 1018/1875 [00:25<00:21, 39.71it/s][A[A

Training Loss: 0.773:  54%|█████▍    | 1018/1875 [00:25<00:21, 39.71it/s][A[A

Training Loss: 0.773:  55%|█████▍    | 1022/1875 [00:25<00:25, 33.51it/s][A[A

Training Loss: 0.773:  55%|█████▍    | 1027/1875 [00:25<00:23, 36.15it/s][A[A

Training Loss: 0.793:  55%|█████▍    | 1027/1875 [00:25<00:23, 36.15it/s][A[A

Training Loss: 0.793:  55%|█████▍    | 1031/1875 [00:25<00:22, 36.86it/s][A[A

Training Loss: 0.793:  55%|█████▌    | 1035/1875 [00:25<00:22, 37.44it/s][A[A

Training Loss: 0.734:  55%|█████▌    | 1035/1875 [00:25<00:22, 37.44it/s][A[A

Training Loss: 0.734:  55%|█

KeyboardInterrupt: 