[model](https://learn.microsoft.com/en-us/training/modules/intro-machine-learning-pytorch/4-model)

In [4]:

import torch
from torch import nn

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


Our neural network are composed of the following:

- The input layer with 28x28 or 784 features/pixels.
- The first linear module takes the input 784 features and transforms it to a hidden layer with 512 features
- The ReLU activation function will be applied in the transformation
- The second linear module take 512 features as input from the first hidden layer and transforms it to the next hidden layer with 512 features
- The ReLU activation function will be applied in the transformation
- The third linear module take 512 features as input from the second hidden layer and transforms it to the output layer with 10, which is the number of classes
- The ReLU activation function will be applied in the transformation

In [5]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [8]:
model = NeuralNetwork().to(device)
print(model)

print(f"First Linear weights: {model.linear_relu_stack[0].weight} \n")

print(f"First Linear bias: {model.linear_relu_stack[0].bias} \n")

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)
First Linear weights: Parameter containing:
tensor([[ 0.0064,  0.0033, -0.0008,  ..., -0.0349,  0.0304, -0.0191],
        [ 0.0205,  0.0003,  0.0353,  ..., -0.0287, -0.0056,  0.0316],
        [-0.0068, -0.0122, -0.0323,  ...,  0.0143,  0.0106, -0.0085],
        ...,
        [ 0.0293, -0.0348, -0.0090,  ...,  0.0344, -0.0305, -0.0031],
        [-0.0328, -0.0079, -0.0110,  ...,  0.0256,  0.0119,  0.0305],
        [ 0.0029, -0.0303, -0.0121,  ...,  0.0206, -0.0227,  0.0142]],
       device='cuda:0', requires_grad=True) 

First Linear weights: Parameter containing:
tensor([ 3.3697e-02,  3.3230e-02, -1.2195e-02, -1.8797e-02,  2.7887e-02,
         2.7492e-02, -6.

To use the model, we pass it the input data. This executes the model's forward, along with some background operations. However, do not call model.forward() directly! Calling the model on the input returns a 10-dimensional tensor with raw predicted values for each class.

We get the prediction densities by passing it through an instance of the nn.Softmax.

In [7]:
X = torch.rand(1, 28, 28, device=device)
logits = model(X)
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([6], device='cuda:0')


以下举例说明上述模型中的 layer，以 3 张 28*28 的图片为例。

In [9]:
input_image = torch.rand(3, 28, 28)
print(input_image.size())

torch.Size([3, 28, 28])


In [10]:
# Flatten 会将 2D 的 28*28 的图片转换为 1D 的 784 的向量
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.size())

torch.Size([3, 784])


In [11]:
# Linear 会使用其存储的 weight 和 bias 对输入的 784 的向量做线性变换，转换为 20 的向量
layer1 = nn.Linear(in_features=28 * 28, out_features=20)
hidden1 = layer1(flat_image)
print(hidden1.size())

torch.Size([3, 20])


In [12]:
# ReLU 会对输入的 20 的向量做非线性变换，目的是为帮助神经网络学习各种各样的现象
print(f"Before ReLU: {hidden1}\n\n")
hidden1 = nn.ReLU()(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.2866,  0.0505, -0.0978,  0.1137,  0.3252, -0.3445, -0.5225, -0.1804,
          0.0451, -0.4283,  0.2847,  0.1886,  0.1066,  0.8289,  0.1577, -0.3942,
         -0.1343, -0.2590,  0.5462,  0.2448],
        [ 0.1187, -0.0324, -0.1250,  0.3004,  0.3058, -0.3972, -0.3747,  0.1747,
          0.1251, -0.4282,  0.4507,  0.3431,  0.0023,  0.6037,  0.6050, -0.1815,
         -0.0409, -0.3206,  0.0945,  0.2545],
        [ 0.2922,  0.0636, -0.2587,  0.2318,  0.0198, -0.1845, -0.2926,  0.0074,
          0.3746, -0.4456,  0.3713,  0.3546, -0.1526,  0.5471,  0.4274, -0.1127,
         -0.2227, -0.4861, -0.0498, -0.1933]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.2866, 0.0505, 0.0000, 0.1137, 0.3252, 0.0000, 0.0000, 0.0000, 0.0451,
         0.0000, 0.2847, 0.1886, 0.1066, 0.8289, 0.1577, 0.0000, 0.0000, 0.0000,
         0.5462, 0.2448],
        [0.1187, 0.0000, 0.0000, 0.3004, 0.3058, 0.0000, 0.0000, 0.1747, 0.1251,
         0.0000, 0.4507, 0.3431, 0.0023, 0.6037, 0.60

In [14]:
# Sequential 会将上述的操作组合起来，按照顺序执行
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10)
)
input_image = torch.rand(3, 28, 28)
logits = seq_modules(input_image)
print(logits)

tensor([[ 0.1799,  0.0852, -0.3402, -0.2800,  0.0255,  0.3052, -0.1402,  0.0298,
         -0.0684,  0.1986],
        [ 0.2339,  0.0604, -0.1770, -0.2691, -0.0673,  0.1105, -0.0235,  0.2472,
         -0.2349,  0.2803],
        [ 0.0760, -0.0758, -0.2676, -0.2299,  0.1419,  0.2491, -0.2389,  0.0145,
         -0.0516,  0.2138]], grad_fn=<AddmmBackward0>)


In [16]:
# Softmax 会将 logits 转换为预测概率
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)
print(pred_probab)

tensor([[0.1175, 0.1069, 0.0698, 0.0742, 0.1007, 0.1332, 0.0853, 0.1011, 0.0917,
         0.1197],
        [0.1221, 0.1026, 0.0809, 0.0738, 0.0903, 0.1079, 0.0944, 0.1237, 0.0764,
         0.1279],
        [0.1080, 0.0928, 0.0766, 0.0795, 0.1153, 0.1284, 0.0788, 0.1015, 0.0951,
         0.1239]], grad_fn=<SoftmaxBackward0>)


In [17]:
# 模型参数，比如 weights 和 bias 会在训练是进行优化
# 通过 model.parameters() 可以获取模型中的所有参数
print("Model structure: ", model, "\n\n")

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Model structure:  NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
) 


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0064,  0.0033, -0.0008,  ..., -0.0349,  0.0304, -0.0191],
        [ 0.0205,  0.0003,  0.0353,  ..., -0.0287, -0.0056,  0.0316]],
       device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([0.0337, 0.0332], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0002,  0.0346, -0.0395,  ..., -0.0198, -0.0107,  0.0116],
        [-0.0117, -0.0299, -0.0269,  ..., -0.0088,  0.0311,  0.0043]],
       device='cuda