In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np
import torch.optim as optim


### 搭建模型

In [2]:
class LinearBNAC(nn.Module):
    """Fully connected building block that doubles as an output head.

    Depending on the flags, ``self.linear`` is an ``nn.Sequential`` holding:

    * ``is_output`` and ``out_channels == 1``: Linear -> Sigmoid
    * ``is_output`` otherwise: Linear -> Softmax over dim 1
    * hidden layer (default): Linear -> Dropout -> BatchNorm1d -> LeakyReLU

    Args:
        in_channels: Number of input features of the Linear layer.
        out_channels: Number of output features of the Linear layer.
        bias: Whether the Linear layer carries a bias term.
        dropout: Dropout probability; only used by the hidden-layer variant.
        is_output: Build an output head instead of a hidden layer.
    """

    def __init__(self, in_channels, out_channels, bias=True, dropout=0.3, is_output=False):
        super().__init__()
        # The Linear layer is always first, so it stays at index 0 of the Sequential.
        stages = [nn.Linear(in_channels, out_channels, bias=bias)]
        if not is_output:
            stages += [
                nn.Dropout(dropout),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(inplace=True),
            ]
        elif out_channels == 1:
            stages.append(nn.Sigmoid())
        else:
            stages.append(nn.Softmax(dim=1))
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.linear(x)

In [3]:
class Model(nn.Module):
    """Multilayer perceptron: three hidden LinearBNAC blocks plus an output head.

    Args:
        input_dimention: Number of features per input sample.
        output_classes: Width of the output layer; it is forwarded to
            LinearBNAC together with ``is_output=True``.
    """

    def __init__(self, input_dimention, output_classes=1):
        super().__init__()
        # Hidden widths shrink 128 -> 64 -> 32 before the output head.
        self.layer1 = LinearBNAC(input_dimention, 128)
        self.layer2 = LinearBNAC(128, 64)
        self.layer3 = LinearBNAC(64, 32)
        self.output = LinearBNAC(32, output_classes, is_output=True)

    def forward(self, x):
        """Pass ``x`` through the hidden blocks in order, then the output head."""
        for block in (self.layer1, self.layer2, self.layer3, self.output):
            x = block(x)
        return x
        

### 準備輸入資料、優化器、標籤資料、模型輸出

In [4]:
# Ten-class classifier over 256-dimensional inputs, trained with Adam.
model = Model(input_dimention=256, output_classes=10)
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-5)

In [5]:
# Dimensions of the random mini-batch fed to the model.
batch_size, input_features = 4, 256
dummy_input = torch.randn(batch_size, input_features)

# One integer (long) class index per sample in the batch.
target = torch.tensor([9, 5, 4, 4], dtype=torch.long)

In [6]:
# Forward pass on the random batch; the result is bound to `output` and printed.
output = model(dummy_input)
print(output)

tensor([[0.1287, 0.0685, 0.0616, 0.1404, 0.1033, 0.1443, 0.0926, 0.0407, 0.1498,
         0.0702],
        [0.1203, 0.0480, 0.1416, 0.1162, 0.0700, 0.1280, 0.0657, 0.0663, 0.1341,
         0.1098],
        [0.0900, 0.1483, 0.0875, 0.0960, 0.0883, 0.0979, 0.1324, 0.0424, 0.1421,
         0.0752],
        [0.0777, 0.0889, 0.0506, 0.1024, 0.1120, 0.0955, 0.1851, 0.0607, 0.1447,
         0.0824]], grad_fn=<SoftmaxBackward>)


In [17]:
# Display the shape of the model output.
output.shape

torch.Size([4, 10])

### 計算 CrossEntropy Loss
* 請注意哪一個 Loss最適合：我們已經使用 softmax
* 因為我們有使用 dropout，並隨機產生 dummy_input，所以各位學員得到的值會與解答不同，然而步驟原理需要相同

In [7]:
from torch.nn import NLLLoss, LogSoftmax, CrossEntropyLoss

In [8]:
# The model's output head already applies Softmax, and the loss is fed the log
# of those probabilities. NLLLoss consumes log-probabilities directly, whereas
# CrossEntropyLoss would apply log-softmax to them a second time.
criterion = NLLLoss()

In [9]:
# The model returns probabilities, so convert them to log-probabilities
# before handing them to the criterion together with the class targets.
log_probs = torch.log(output)
loss = criterion(log_probs, target)

### 完成back propagation並更新梯度

In [10]:
# Backpropagate the loss so that parameter gradients (.grad) are populated.
loss.backward()

In [11]:
# Inspect the first Linear layer: its weights and the gradient stored on them.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0419, -0.0618,  0.0592,  ...,  0.0120,  0.0128, -0.0565],
        [-0.0279,  0.0093, -0.0039,  ...,  0.0021,  0.0199, -0.0520],
        [ 0.0219, -0.0565, -0.0349,  ..., -0.0075,  0.0203, -0.0376],
        ...,
        [ 0.0091, -0.0071, -0.0009,  ..., -0.0530, -0.0401, -0.0281],
        [-0.0512, -0.0359,  0.0119,  ...,  0.0474, -0.0591,  0.0500],
        [ 0.0113, -0.0410,  0.0444,  ..., -0.0444,  0.0272, -0.0203]],
       requires_grad=True)


grad : tensor([[ 2.7334e-03, -7.1069e-03, -5.6411e-03,  ...,  3.1802e-03,
          1.3541e-03,  2.6503e-03],
        [-3.5636e-03, -5.2345e-03, -6.1796e-04,  ...,  5.6166e-03,
         -2.5874e-03,  1.0849e-02],
        [ 1.6196e-03,  2.2374e-02, -4.4906e-02,  ...,  8.4206e-02,
         -1.9472e-02,  3.8642e-02],
        ...,
        [ 1.0500e-03,  7.3309e-03, -3.4614e-02,  ...,  3.2952e-02,
         -5.9758e-03,  2.6111e-02],
        [-8.6923e-05, -4.2194e-04,  2.2786e-04,  ..., -6.4796e-04,
       

In [12]:
# Apply one optimizer update to the model parameters using their current gradients.
optimizer.step()

In [13]:
# Re-inspect the first Linear layer after the optimizer step.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0419, -0.0618,  0.0593,  ...,  0.0120,  0.0128, -0.0566],
        [-0.0279,  0.0094, -0.0039,  ...,  0.0021,  0.0199, -0.0520],
        [ 0.0219, -0.0566, -0.0349,  ..., -0.0075,  0.0203, -0.0376],
        ...,
        [ 0.0091, -0.0071, -0.0009,  ..., -0.0530, -0.0401, -0.0281],
        [-0.0512, -0.0358,  0.0119,  ...,  0.0474, -0.0591,  0.0500],
        [ 0.0114, -0.0410,  0.0444,  ..., -0.0444,  0.0272, -0.0203]],
       requires_grad=True)


grad : tensor([[ 2.7329e-03, -7.1075e-03, -5.6405e-03,  ...,  3.1803e-03,
          1.3542e-03,  2.6498e-03],
        [-3.5639e-03, -5.2344e-03, -6.1799e-04,  ...,  5.6166e-03,
         -2.5872e-03,  1.0848e-02],
        [ 1.6198e-03,  2.2374e-02, -4.4906e-02,  ...,  8.4206e-02,
         -1.9472e-02,  3.8642e-02],
        ...,
        [ 1.0501e-03,  7.3308e-03, -3.4614e-02,  ...,  3.2951e-02,
         -5.9762e-03,  2.6110e-02],
        [-8.7435e-05, -4.2230e-04,  2.2798e-04,  ..., -6.4749e-04,
       

### 清空 gradient

In [14]:
# Clear the gradients held by the optimizer's parameters before the next backward pass.
optimizer.zero_grad()

In [15]:
# Re-inspect the first Linear layer after the gradients were cleared.
layer1_weight = model.layer1.linear[0].weight
print('weight : {}'.format(layer1_weight))
print('\n')
print('grad : {}'.format(layer1_weight.grad))

weight : Parameter containing:
tensor([[-0.0419, -0.0618,  0.0593,  ...,  0.0120,  0.0128, -0.0566],
        [-0.0279,  0.0094, -0.0039,  ...,  0.0021,  0.0199, -0.0520],
        [ 0.0219, -0.0566, -0.0349,  ..., -0.0075,  0.0203, -0.0376],
        ...,
        [ 0.0091, -0.0071, -0.0009,  ..., -0.0530, -0.0401, -0.0281],
        [-0.0512, -0.0358,  0.0119,  ...,  0.0474, -0.0591,  0.0500],
        [ 0.0114, -0.0410,  0.0444,  ..., -0.0444,  0.0272, -0.0203]],
       requires_grad=True)


grad : tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
