<a href="https://colab.research.google.com/github/mengcius/pytorch-learn/blob/master/5_%E6%A2%AF%E5%BA%A6%E4%B8%8B%E9%99%8D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 5_梯度下降

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F 

### sigmoid


In [0]:
# Ten evenly spaced sample points covering the interval [-100, 100].
x = torch.linspace(start=-100, end=100, steps=10)
x

tensor([-100.0000,  -77.7778,  -55.5556,  -33.3333,  -11.1111,   11.1111,
          33.3333,   55.5555,   77.7778,  100.0000])

In [0]:
# Element-wise sigmoid, written in tensor-method form (same op as torch.sigmoid(x)).
x.sigmoid()

tensor([0.0000e+00, 1.6655e-34, 7.4564e-25, 3.3382e-15, 1.4945e-05, 9.9999e-01,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00])

In [0]:
# The sigmoid from torch.nn.functional is the discouraged spelling;
# torch.sigmoid is the recommended one.
nn.functional.sigmoid(x)



tensor([0.0000e+00, 1.6655e-34, 7.4564e-25, 3.3382e-15, 1.4945e-05, 9.9999e-01,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00])

### tanh

In [0]:
# Ten evenly spaced sample points covering the interval [-10, 10].
x = torch.linspace(start=-10, end=10, steps=10)
x

tensor([-10.0000,  -7.7778,  -5.5556,  -3.3333,  -1.1111,   1.1111,   3.3333,
          5.5556,   7.7778,  10.0000])

In [0]:
# Element-wise hyperbolic tangent, written in tensor-method form (same op as torch.tanh(x)).
x.tanh()

tensor([-1.0000, -1.0000, -1.0000, -0.9975, -0.8045,  0.8045,  0.9975,  1.0000,
         1.0000,  1.0000])

### relu

In [0]:
# Ten evenly spaced sample points covering the interval [-10, 10].
x = torch.linspace(start=-10, end=10, steps=10)
x

tensor([-10.0000,  -7.7778,  -5.5556,  -3.3333,  -1.1111,   1.1111,   3.3333,
          5.5556,   7.7778,  10.0000])

In [0]:
# Element-wise ReLU, written in tensor-method form (same op as torch.relu(x)).
x.relu()

tensor([ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.1111,  3.3333,  5.5556,
         7.7778, 10.0000])

In [0]:
# The same ReLU, reached through the torch.nn.functional namespace.
nn.functional.relu(x)

tensor([ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.1111,  3.3333,  5.5556,
         7.7778, 10.0000])

### autograd.grad求mse_loss的梯度

torch.nn.functional.mse_loss() 计算均方差损失

torch.autograd.grad(loss, [w1, w2,...]) 求导，返回[w1 grad, w2 grad...]


In [0]:
# Single-sample linear model: the prediction is y_hat = x * w.
x = torch.ones(1)
# Use a float fill value: an integer 2 yields an int64 tensor on current PyTorch
# releases, and integer tensors cannot later be marked as requiring gradients.
# Alternatively, create it trainable up front:
#   w = torch.full([1], 2., requires_grad=True)
w = torch.full([1], 2.)
x, w

(tensor([1.]), tensor([2.]))

In [0]:
# Mean squared error between the true value y = 1 and the prediction y_hat = x * w.
mse = F.mse_loss(input=torch.ones(1), target=x * w)
mse

tensor(1.)

In [0]:
# Try to differentiate the MSE loss with respect to w.
# This is expected to fail: w has not been marked as requiring gradients, so the
# loss carries no autograd graph. The error is caught and printed so that
# "Restart & Run All" can continue past this demonstration cell.
try:
    torch.autograd.grad(mse,[w])
except RuntimeError as err:
    print(f"RuntimeError: {err}")
# RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [0]:
# Flag w as needing gradient information (alternatively, create it that way from
# the start: w = torch.full([1], 2., requires_grad=True)).
w.requires_grad_(True)
# PyTorch builds its graph dynamically, so after w changes the loss must be
# computed again to rebuild the graph.
mse = F.mse_loss(input=torch.ones(1), target=x * w)
# Differentiate the MSE loss with respect to w; returns a tuple holding w's gradient.
torch.autograd.grad(outputs=mse, inputs=[w])

(tensor([2.]),)

### backward求mse_loss的梯度

loss.backward() 反向传播，无返回值，计算出的梯度会附在 w1.grad、w2.grad 等成员变量上


In [0]:
# Single-sample linear model: the prediction is y_hat = x * w.
x = torch.ones(1)
# The fill value must be a float: an integer 2 would give an int64 tensor, and
# current PyTorch refuses requires_grad=True on integer tensors.
w = torch.full([1], 2., requires_grad=True)
x, w

(tensor([1.]), tensor([2.], requires_grad=True))

In [0]:
# Mean squared error of the prediction x * w against the true value 1.
mse = F.mse_loss(input=torch.ones(1), target=x * w)
# Back-propagate: computes every gradient the graph needs. Nothing is returned;
# each gradient is stored on the .grad attribute of its tensor.
mse.backward()
w.grad

tensor([2.])

In [0]:
# Norm of the gradient stored on w (function form of w.grad.norm()).
torch.norm(w.grad)

tensor(2.)

### Softmax激活函数及其梯度

F.softmax(a,dim=0) 求softmax


In [0]:
# Three random values, flagged in place so that autograd tracks them.
a = torch.rand(3).requires_grad_()
a

tensor([0.6954, 0.7282, 0.5982], requires_grad=True)

In [0]:
# Softmax activation of a. The dimension is explicit, which also allows a batch
# dimension to be skipped when there is one.
p = F.softmax(input=a, dim=0)
p

tensor([0.3400, 0.3514, 0.3086], grad_fn=<SoftmaxBackward>)

In [0]:
# p holds three elements, so backward() cannot create its implicit starting
# gradient: that shortcut exists only for scalar outputs. The failure is not
# related to retain_graph. The error is caught and printed so that
# "Restart & Run All" can continue past this demonstration cell.
try:
    p.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")
# RuntimeError: grad can be implicitly created only for scalar outputs

In [0]:
#求softmax对参数a的梯度
torch.autograd.grad(p[1],[a],retain_graph=True) #p只能是长度为1的[1]或标量，[a]长度不限制

(tensor([-0.1195,  0.2279, -0.1084]),)

In [0]:
# The gradient is positive only where the input index equals the output index.
torch.autograd.grad(outputs=p[0], inputs=[a])

(tensor([ 0.2244, -0.1195, -0.1049]),)

### 单一输出感知机

In [15]:
# One sample with 10 features, plus a trainable [1, 10] weight for a single output unit.
x = torch.randn((1, 10))
w = torch.randn((1, 10), requires_grad=True)
w

tensor([[ 0.4849, -0.3153, -0.3720,  0.8757,  0.4986,  0.7716,  0.7317,  1.0519,
         -2.2420,  0.3338]], requires_grad=True)

In [16]:
# Linear response x @ w^T passed through the sigmoid activation, which squashes it into (0, 1).
o = torch.matmul(x, w.t()).sigmoid()
(o, o.shape)

(tensor([[0.0064]], grad_fn=<SigmoidBackward>), torch.Size([1, 1]))

In [17]:
# Mean squared error against a target of 1; the result is a scalar.
loss = F.mse_loss(input=torch.ones(1, 1), target=o)
(loss, loss.shape)

(tensor(0.9872, grad_fn=<MeanBackward0>), torch.Size([]))

In [0]:
# Back-propagate from the scalar loss (equivalent to loss.backward()).
torch.autograd.backward(loss)

In [19]:
w.grad # 10 gradient entries: the loss differentiated with respect to each element of the [1, 10] weight w
# w can then be updated against this gradient, repeatedly, so that the loss approaches 0 and the prediction approaches the true value

tensor([[ 1.0688e-03,  8.7082e-05, -1.3061e-03,  1.5318e-02,  1.9848e-02,
         -5.8090e-03, -1.3079e-02,  2.7386e-02, -1.1368e-02, -1.2993e-03]])

### 多输出感知机

In [21]:
# One sample with 10 features, plus a trainable [2, 10] weight: one row per output unit.
x = torch.randn((1, 10))
w = torch.randn((2, 10), requires_grad=True)
w

tensor([[ 0.0605, -0.1957, -0.4953, -1.4759, -0.1880,  1.5185, -2.5982,  0.1016,
         -0.3507, -0.5002],
        [ 0.8593, -0.4353,  0.1071,  0.3676,  0.0844,  0.1094, -0.9263,  1.0166,
         -0.2961,  1.4854]], requires_grad=True)

In [22]:
# Sigmoid activation of x @ w^T; the result now has two output values.
o = torch.matmul(x, w.t()).sigmoid()
(o, o.shape)

(tensor([[0.9988, 0.2463]], grad_fn=<SigmoidBackward>), torch.Size([1, 2]))

In [24]:
# Mean squared error; the true values are also shaped [1, 2] so that every
# activation output is compared with its own target.
loss = F.mse_loss(input=torch.ones(1, 2), target=o)
(loss, loss.shape)

(tensor(0.2840, grad_fn=<MeanBackward0>), torch.Size([]))

In [27]:
# The [1, 2] tensor of ones used as the true values in the loss.
torch.ones((1, 2))

tensor([[1., 1.]])

In [28]:
# Back-propagate from the scalar loss (equivalent to loss.backward()).
torch.autograd.backward(loss)
# The gradient is [2, 10], the same shape as w.
w.grad

tensor([[ 7.2703e-07,  1.6904e-06,  4.6854e-07, -1.0084e-06,  4.4634e-07,
         -2.0258e-06,  2.6752e-06,  1.7831e-06, -1.0550e-06,  1.7076e-06],
        [ 7.2562e-02,  1.6871e-01,  4.6763e-02, -1.0064e-01,  4.4548e-02,
         -2.0219e-01,  2.6700e-01,  1.7797e-01, -1.0530e-01,  1.7043e-01]])