In [87]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## nn.L1Loss,绝对值误差，MAE
1. 计算公式
$$loss(\hat{y},y) = \left | \hat{y} - y \right |$$
2. torch参数
    ```python
    torch.nn.L1Loss(size_average=None, reduce=None, reduction='mean')
    ```
3. 参数解释
    + reduce，没有规约
    $$loss(\hat{y},y) = L = \left \{ L_{1},..., L_{m} \right \}$$
    + reduce，规约
    $$ loss(\hat{y},y) = \left \{ \begin{matrix}mean(L),reduction='mean' \\ sum(L),reduction='sum'\end{matrix}\right.$$
    + 默认情况下
    $$reduce=True,size\_average=True,reduction='mean'$$ 
4. 测试代码
```python
# Sanity check for nn.L1Loss (mean absolute error).
# The target is the prediction perturbed by Gaussian noise, so the reported
# loss equals mean(|noise|). Previously y was identical to y_hat, which made
# the loss always 0 and left `noise` unused.
y_hat = torch.ones(10, 1)
noise = torch.randn(10, 1)
y = y_hat + noise
loss = nn.L1Loss()
# Print the loss together with the noise that produced it.
print(loss(y_hat, y), noise)
```

## nn.SmoothL1Loss
1. 解释：
    + 也被称为Huber Loss
    + 当预测偏差小于 a 时，它采用平方误差,
    + 当预测偏差大于 a 时，采用的线性误差
    + a代表残差$a = \|y-\hat{y}\|$
2. 公式，SmoothL1Loss对应$\delta=1$的情形
$$\begin{split}L_\delta(a)=\left \{\begin{array}{ll}\frac12a^2,&\textrm{for } |a|\leq\delta,\\\delta\cdot(|a|-\frac12\delta),&\textrm{otherwise.}\end{array}\right.\end{split}$$
3. 代码
```python
torch.nn.SmoothL1Loss(size_average=None, reduce=None, reduction='mean')
```
4. 可视化Huber Loss
```python
import numpy as np
from matplotlib import pyplot as plt
def Phi(t, c):
    """Return the Huber penalty of residual ``t`` with threshold ``c``.

    The penalty is ``0.5 * t**2`` where ``|t| <= c`` and
    ``c * (|t| - 0.5 * c)`` elsewhere.

    Args:
        t: Residual(s); a scalar or anything NumPy can broadcast.
        c: Threshold at which the penalty switches from quadratic to linear.

    Returns:
        NumPy array (0-d for scalar input) of penalties, same shape as ``t``.
    """
    t = np.abs(t)
    # np.where selects per element. The previous `(~flag) * ...` mask
    # arithmetic broke for plain Python scalars, where `~False == -1`, and
    # turned infinite residuals into NaN via `0 * inf`.
    return np.where(t > c, c * (t - 0.5 * c), 0.5 * t ** 2)
# Plot the Huber penalty Phi(t) for several thresholds c, with the pure
# quadratic 0.5 * t^2 as a reference curve.
fig = plt.figure(figsize=(5, 3.75))
ax = fig.add_subplot(111)
# x is the residual.
x = np.linspace(-10, 10, 100)
for c in (20, 2, 3, 5, 1000):
    y = Phi(x, c)
    ax.plot(x, y, '-k')
    # Thresholds above the plotted residual range (|x| <= 10) never reach the
    # linear branch, so they are labelled as infinity.
    s = r'\infty' if c > 10 else str(c)
    ax.text(x[6], y[6], '$c=%s$' % s,
            ha='center', va='center',
            bbox=dict(boxstyle='round', ec='k', fc='w'))
# The reference parabola does not depend on c: draw it once, after the loop,
# so it stays on top of the black curves. The former bare `ax.hold` statement
# is dropped; that attribute was removed in Matplotlib 3.0 and raises
# AttributeError there.
ax.plot(x, 0.5 * x * x, '-b')
ax.set_xlabel('$t$')
ax.set_ylabel(r'$\Phi(t)$')
plt.show()
```

## nn.MSELoss() 二次代价函数
1. 公式
$$\text{loss}(\mathbf{x}_i, \mathbf{y}_i) = (\mathbf{x}_i - \mathbf{y}_i)^2$$
2. 用法和MAE相同

## nn.BCELoss  二元交叉熵损失函数
1. 解释
    + 用来处理二分类问题，注意预测值$x$的数值应当在0到1之间，需要利用sigmoid函数处理；标签$y$的数值同样应当在0到1之间（通常取0或1）。reduction默认是$mean$
$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],$$
        $$       \ell(x, y) = \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}$$
2. 定义一下对数损失函数，对于二分类问题，类标签定义为：
$$y_{i} \in \{0,1\}$$
计算得到样本为$1$的概率为$p$，由于是二分类问题，那么样本为$0$的概率为$1-p$,将两者合并在一起，可以表示为：
$$p(Y|X,\theta) =p^{y}(1-p)^{1-y} = \left\{\begin{matrix}p, y=1\\ 1-p,y=0\end{matrix}\right.$$
对于一个样本可以理解为，
    + 当类标签为1时，希望样本为$1$的概率$p$越大越好
    + 当类标签为0时，希望样本为$0$的概率$1-p$越大越好
对于一个批次的样本，可以得到优化目标：
$$Loss = \prod_{i=1}^{N} {p(y_{i}|x_{i},\theta)}$$
添加log等效转化：
$$Loss = \sum_{i=1}^{N} {{log}{\; p(y_{i}|x_{i},\theta)}}$$
$$Loss = \sum_{i=1}^{N} {{log}{\; p^{y}(1-p)^{1-y}}}$$
$$Loss = \sum_{i=1}^{N} [{{y*log}{\; p}+(1-y){log}{ \;(1-p)}}]$$
一般情况下可以转换成均值,并转化为最小化问题：
$$Loss = - \frac{1}{N} \sum_{i=1}^{N} [{{y*log}{\; p}+(1-y){log}{ \;(1-p)}}]$$
公式化展示：
$$\theta =  arg\underset{\theta}{min} {\; Loss}$$
3. <font color=red size=2.8>BCELoss里面的权重$w$方法暂时不知道有什么用 </font>

```python
# BCELoss demo: summed binary cross-entropy between sigmoid-squashed scores
# and hard 0/1 labels.
loss = nn.BCELoss(reduction='sum')
x = torch.ones(1, 10)   # raw scores
y = torch.zeros(1, 10)  # labels

# Only the predictions go through sigmoid. The labels must stay 0/1;
# squashing them as before turned every label into 0.5.
x = torch.sigmoid(x)
a = loss(x, y)
print(a)
```

## nn.BCEWithLogitsLoss
1. 和 nn.BCELoss 功能相同，用于二分类，但是内部加上了sigmoid层，比先手动加sigmoid再使用BCELoss在数值上更加稳定。
$$\text{loss}(\mathbf{x}_i, \mathbf{y}_i) = - \boldsymbol{w}_i \left[{y}_i \log \mathbf \sigma{({x}_i)} + (1-{y}_i)\log(1-\sigma{({x}_i)}) \right ]$$


## nn.CrossEntropyLoss 
1. 用于多分类,注意该损失函数并不对称，也就是说prediction和label的位置不能弄混。
2. 交叉熵损失函数包括三个部分
    + 第一部分：标签$y$的$one\text{-}hot$编码
    + 第二部分：预测值$\hat{y}$的$softmax$转换
    + 第三部分：交叉熵的计算
3. 用公式表示
$$y = one\_hot(y)$$
$$\hat y_{i} = softmax(x)_{i} = \frac{ e^{x_{i}}}{\sum_{j=1}^{N}e^{x_{j}}}$$
$$loss(\hat{y},y) = -log {\; \hat y \cdot y}$$
4. <font color=red size=2.8>在Pytorch中使用时经常会出现一些Long和float的类型问题</font>

```python
import numpy as np
# from sklean.preprocessing import OneHotEncoder
def CrossEntropy(prediction, target):
    """Cross-entropy between a probability vector and a target vector.

    Both arguments are reshaped to a single row; the result is the negated
    matrix product of ``log(prediction)`` (as a row) with ``target`` (as a
    column), so it comes back as a 1x1 array.
    """
    probs_row = prediction.reshape(1, len(prediction))
    target_col = target.reshape(1, len(target)).T
    return -(np.log(probs_row) @ target_col)

def log_softmax(input):
    """Return the log-softmax of ``input`` along its first axis.

    Uses the log-sum-exp trick: subtracting the maximum before exponentiating
    keeps ``np.exp`` from overflowing, so large logits no longer give NaN.

    Args:
        input: Array-like of logits (typically 1-D).

    Returns:
        NumPy array with the same shape as ``input``.
    """
    logits = np.asarray(input)
    if logits.size == 0:
        # Nothing to normalize; np.max would raise on an empty array.
        return logits.astype(float)
    shifted = logits - np.max(logits, axis=0)
    # One vectorized reduction instead of recomputing the sum per element.
    return shifted - np.log(np.sum(np.exp(shifted), axis=0))

# CrossEntropy takes the log itself, so it needs probabilities. No `softmax`
# is defined in this part of the notebook; exponentiating log_softmax yields
# the same probabilities.
CrossEntropy(np.exp(log_softmax(np.array([0.2, 0.6, 0.2]))), np.array([1, 0, 0]))
```

## nn.NLLLoss
1. 多分类
2. 等价于 CrossEntropy = log_softmax + NLLLoss

In [183]:
# Torch reference implementation (log-softmax over dim 1) and the matching
# negative log-likelihood loss.
m = nn.LogSoftmax(dim=1)
loss = nn.NLLLoss()
def log_softmax(input):
    """Return the log-softmax of ``input`` along its first axis.

    Uses the log-sum-exp trick: subtracting the maximum before exponentiating
    keeps ``np.exp`` from overflowing, so large logits no longer give NaN.

    Args:
        input: Array-like of logits (typically 1-D).

    Returns:
        NumPy array with the same shape as ``input``.
    """
    logits = np.asarray(input)
    if logits.size == 0:
        # Nothing to normalize; np.max would raise on an empty array.
        return logits.astype(float)
    shifted = logits - np.max(logits, axis=0)
    # One vectorized reduction instead of recomputing the sum per element.
    return shifted - np.log(np.sum(np.exp(shifted), axis=0))

In [184]:
# Compare torch's LogSoftmax module `m` with the NumPy `log_softmax` helper
# on the same random 1x5 tensor.
input = torch.randn(1,5,requires_grad=True)
# NOTE(review): `target` is created but not used by any statement in this
# cell; it has 3 entries while `input` has a single row -- confirm intent.
target = torch.tensor([1,0,0])
# Disabled NLLLoss call; as written it would invoke log_softmax() with no argument.
# output = loss(torch.tensor(log_softmax()),target)
print(input)
# Torch result: log-softmax over dim 1.
print(m(input))
# Two blank lines separate the torch output from the NumPy output.
print()
print()
print(input)
# Detach from autograd and convert to a NumPy array for the NumPy helper.
input = input.detach().numpy()
# The first (only) row as a 1-D array, then its NumPy log-softmax.
print(input[0])
print(log_softmax(input[0]))

tensor([[-0.4794,  3.2893, -0.6448,  0.5649, -0.3041]], requires_grad=True)
tensor([[-3.8960, -0.1273, -4.0614, -2.8517, -3.7207]],
       grad_fn=<LogSoftmaxBackward>)


tensor([[-0.4794,  3.2893, -0.6448,  0.5649, -0.3041]], requires_grad=True)
[-0.47941372  3.2893145  -0.644789    0.56485087 -0.30408052]
[-3.8960045  -0.12727631 -4.06137983 -2.85173991 -3.72067135]


## <font color=red size=20>nn.KLDivLoss</font> 