In [1]:
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torchvision

## Batch normalization
1. feature scaling
    + $w_{1}$和$w_{2}$的大小范围不同，比如$w_{1}$的大小范围属于$[1,10]$,比如$w_{2}$的大小范围属于$[1,100]$,一般采用标准化进行处理
    + 在深度学习中，可以想象，如果feature scaling对于单层的输入有很好的效果，那么在多层神经网络中，同样的操作是不是会取得更好的结果，因此可以在每层网络输入之前都进行batch normal的处理。缓解了internal covariate shift
2. batch  normalization
    + 每次都会计算一个batch: $\frac{z^{i} - \mu }{\sigma}$,在反向传播的时候，反向传播的路径还是会经过$\mu$和$\sigma$
        + $ z^{i} = \frac{z^{i} - \mu }{\sigma}$
        + $ z^{i} = \gamma \bigodot z^{i} + \beta $
3. 使得输入的数据在深度神经网路传播的过程中更加稳定
4. nn.BatchNorm2d(num_feature),其中num_feature表示的是batch_size * num_feature * width * size,通道数channel

In [2]:
batch_size = 128
image_size = 32
num_channel = 3
num_z = 100 # number of noise input
number_epoch = 10
lr = 0.002

In [3]:
data_loader = data.DataLoader(
    dataset=datasets.CIFAR10(root='./data',download=True,
                             transform=transforms.Compose([
                                 transforms.Resize(image_size),
                                 transforms.CenterCrop(image_size),
                                 transforms.ToTensor(),
                                 transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
                             ])),
    batch_size=128,
    shuffle=True,
    num_workers=2
)

Files already downloaded and verified


In [4]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

1. 反卷积
    + torch.nn.ConvTranspose2d(input=100, output=8, kernel_size = (4, 4), stride=1, padding=0)
2. 数学表达
    + 正常的卷积操作，比如有一个输入是$A_{4\times 4}$的矩阵：卷积核的大小为3，对数据进行展开转换成矩阵的乘法：
    $$Y_{4*1} = C_{4*16} \times X_{16*1}$$
    + 反卷积操作：
    $$X_{16*1} =C^{T}_{4*16} \times Y_{4*1}$$
    + Hint: 记住只有方阵才可逆，这里并不要求卷积核和反卷积核相同，只是反应一种矩阵操作。
3. 反卷积的尺寸大小计算
    + 定义一些参数：$H_{output}$表示输出图片的高度，$H_{input}$表示输入图片的高度，$H_{kernel}$表示卷积核的大小，最后得到的计算公式：
    $$H_{output} = (H_{input} - 1) * stride + H_{kernel} - 2 *padding$$

In [88]:
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            # input is Z, going into a convolution, init 32 * 32
            nn.ConvTranspose2d(100, 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(8),
            nn.ReLU(True),
            # state size.  (1 - 1) * 1 + 4 - 2*0=  4, 8 * 4 * 4
            nn.ConvTranspose2d( 8, 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(4),
            nn.ReLU(True),
            # state size.  (4 - 1) * 2 + 4 - 2 * 1 = 8 
            nn.ConvTranspose2d(4, 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(2),
            nn.ReLU(True),
            # state size. (8 - 1) * 2 + 4 - 2 * 1 = 16 
            nn.ConvTranspose2d(2, 3, 4, 2, 1, bias=False),
            nn.Tanh() # (16 - 1) * 2 + 4 - 2 * 1 = 32
        )

    def forward(self,x):
        insize = x.size(0)
        output = self.main(x)
        return output.view(insize,3,32,32)

class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator,self).__init__()
        self.main = nn.Sequential(
            nn.Conv2d(3,10,kernel_size=4,stride=2,padding=1), # (32 - 4 + 2*1) / 2 + 1 = 16
            nn.ReLU(),
            
            nn.Conv2d(10,15,kernel_size=4,stride=2,padding=1),# (16 - 4 + 2*1) / 2 + 1 = 8
            nn.BatchNorm2d(15),
            nn.ReLU(),
            
            nn.Conv2d(15,5,kernel_size=4,stride=2,padding=1),# (8 - 4 + 2*1) / 2 + 1 = 4
            nn.BatchNorm2d(5),
            nn.ReLU(),
            
            nn.Conv2d(5,1,kernel_size=4),#  4 - 4 + 1 = 1
            nn.Sigmoid() # 输出依旧是概率数值
        )
    
    def forward(self,x):
        insize = x.size(0)
        output = self.main(x).view(insize,1)
        return output

In [89]:
gen = Generator()
dis = Discriminator()
criterion = nn.BCELoss()

optim_G = torch.optim.Adam(gen.parameters())
optim_D = torch.optim.Adam(dis.parameters())

## 为什么是 先更新判别器 后更新生成器 ？？？balck man's mark

In [93]:
for epoch in range(10):
    loss_g_sum = 0.0
    loss_d_sum = 0.0
    for step,(data,label) in enumerate(data_loader):
        data = data.to(device)
        size = data.size(0)
        z = torch.randn(size,100,1,1).to(device)
        
        ones_label = torch.ones(size,1).to(device)
        zeros_label = torch.zeros(size,1).to(device)
        
        """
        固定生成器G，更新判别器D
        """
        print(data.size())
        d_real = dis(data)
        d_fake = dis(gen(z))
        
        loss_real = criterion(d_real,ones_label)
        loss_fake = criterion(d_fake,zeros_label)
        
        loss_d = loss_real + loss_fake
        
        optim_D.zero_grad()
        loss_d.backward()
        optim_D.step()
        
        z = torch.randn(size,100,1,1).to(device)
        loss_g = criterion(dis(gen(z)),ones_label)
        
        optim_G.zero_grad()
        loss_g.backward()
        optim_G.step()
        
        loss_g_sum += loss_g.item()
        loss_d_sum += loss_d.item()
    
    print("Epoch %d G Loss %.3f, D Loss %.3f" % (epoch, loss_g_sum / step,loss_d_sum / step))        

torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])
torch.Size([128, 3, 32, 32])


KeyboardInterrupt: 

In [None]:
# class Net(nn.Module):
#     def __init__(self):
#         super(Net,self).__init__()
        
#         self.layer = nn.BatchNorm2d(8)
    
#     def forward(self,x):
#         output = self.layer(x)
#         return output

# real_batch = next(iter(data_loader))
# plt.figure(figsize=(8,8))
# plt.axis("off")
# plt.title("Training Images")
# plt.imshow(np.transpose(torchvision.utils.make_grid(real_batch[0].to(device)[:64], padding=2, normalize=True).cpu(),(1,2,0)))


## 我现在理解的GANs
1. 在第二步，更新生成器的过程中实际上是将$P_{data}$和$P_{G}$之间的分布距离$JS-Divergence$转化成了最小化判别器损失函数的问题,或者说不断调整生成网络的参数，使得生成函数所得到的数据分布$P_{G}$不断接近真实分布$P_{data}$。
2. __生成器与判别器不均衡的问题，在编码的过程中常常会出现一些问题，就是判别器训练的太好，判别器的损失函数很快地就降为0。__