<a href="https://colab.research.google.com/github/njucs/notebook/blob/master/FirstTry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Scratch test code: sanity-check the torch install and show how Tensor.view reshapes

import torch
print(torch.__version__)
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
print(x.size(), y.size(), z.size())
print(x)
print(y)
print(z)

1.9.0+cu102
torch.Size([5, 3])
torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])
tensor([[-0.0330,  1.2461,  1.2818,  0.5821],
        [-0.9219, -2.0599,  1.7669,  0.2533],
        [-0.9650, -0.8131,  2.0064, -0.3247],
        [-1.2489, -1.4372,  1.0121, -1.6097]])
tensor([-0.0330,  1.2461,  1.2818,  0.5821, -0.9219, -2.0599,  1.7669,  0.2533,
        -0.9650, -0.8131,  2.0064, -0.3247, -1.2489, -1.4372,  1.0121, -1.6097])
tensor([[-0.0330,  1.2461,  1.2818,  0.5821, -0.9219, -2.0599,  1.7669,  0.2533],
        [-0.9650, -0.8131,  2.0064, -0.3247, -1.2489, -1.4372,  1.0121, -1.6097]])


### **参考资料：**
https://pytorch.apachecn.org/docs/1.7/

### **准备工作**

In [1]:
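# `import module`: every use of a function must be qualified with its module name (module.func()).
# `from module import *`: the names are imported directly, so functions can be called without the module prefix.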
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import torchvision as tv
from torchvision import models,transforms,datasets

# 查看Python解释器
import sys
print(sys.executable)

# Check once whether a CUDA GPU is usable, then derive the device from that answer
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print('Using gpu: %s ' % use_gpu)

# 把Tensor转成Image，方便可视化
'''
from torchvision.transforms import ToPILImage
show = ToPILImage()

x = torch.randn(300,500)
show(x)#.resize((100, 100))
'''

### **数据加载和预处理**
**Dataset**对象是一个数据集，可以按下标访问，返回形如(data, label)的数据。

**DataLoader**是一个可迭代的对象，它将dataset返回的每一条数据拼接成一个batch，并提供多进程并行加载（通过num_workers设置）和数据打乱等功能。当程序对dataset的所有数据遍历完一遍之后，相应的对DataLoader也完成了一次迭代。

In [None]:
import torchvision.transforms as transforms

# Preprocessing applied to every image
transform = transforms.Compose([
    transforms.ToTensor(),  # convert to a Tensor
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),  # normalise each channel
])

# Training set (downloaded on first use) and its loader
trainset = tv.datasets.CIFAR10(
    root='./data/tmp/', train=True, download=True, transform=transform)

trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=2)

# Test set and its loader; order is kept fixed for evaluation
testset = tv.datasets.CIFAR10(
    root='./data/tmp/', train=False, download=True, transform=transform)

testloader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=2)

# Human-readable class names, indexed by label id
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
# Optional: peek at a few training samples.
# The snippet is kept disabled inside a string; it needs `show` (ToPILImage) from the
# disabled block in the setup cell. It uses the built-in next() because the
# iterator's .next() method does not exist in newer PyTorch releases.
'''
dataiter = iter(trainloader)
images, labels = next(dataiter) # 返回4张图片及标签
print(' '.join('%11s'%classes[labels[j]] for j in range(4)))
show(tv.utils.make_grid((images + 1) / 2)).resize((400,100))
#show(images[2]).resize((100,100))
'''

### **定义网络**
定义网络时，需要继承nn.Module，并实现它的forward方法，**把网络中具有可学习参数的层放在构造函数\__init__中**。如果某一层(如ReLU)不具有可学习的参数，则既可以放在构造函数中，也可以不放，但建议不放在其中，而在forward中使用nn.functional代替。

**只要在nn.Module的子类中定义了forward函数，backward函数就会自动被实现(利用autograd)**。在forward 函数中可使用任何tensor支持的函数，还可以使用if、for循环、print、log等Python语法，写法和标准的Python写法一致。

torch.nn只支持mini-batches，不支持一次只输入一个样本，即一次必须是一个batch。但如果只想输入一个样本，则用 input.unsqueeze(0)将batch_size设为１。即输入必须是N个samples，但N可以设为1。

In [None]:
class Net(nn.Module):
    """LeNet-style CNN: two 5x5 convolutions followed by three linear layers.

    The first linear layer takes 16*5*5 features, which is what the two
    conv + 2x2 max-pool stages produce for a 3-channel 32x32 input.
    The network outputs 10 raw class scores per sample.
    """

    def __init__(self):
        # The parent constructor must run before any sub-module is assigned;
        # this is equivalent to nn.Module.__init__(self).
        super().__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Fully connected layers
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the class scores for a batch of images ``x``."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, (2, 2))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        # Keep the batch dimension and flatten everything else
        x = x.flatten(start_dim=1)
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

net = Net()
print(net)

# Move the model to the GPU when one is available
if use_gpu:
    net = net.cuda()

### **查看网络的可学习参数**

网络的可学习参数通过net.parameters()返回，net.named_parameters()可同时返回可学习的参数及其名称。

In [None]:
# Collect every learnable tensor of the network and dump them
params = list(net.parameters())
print(params)

# List each learnable tensor by name together with its shape
for param_name, param in net.named_parameters():
    print(param_name, ':', param.size())

### **定义损失函数和优化器**

In [26]:
# Loss function: cross-entropy is the one actually used below.
# The string that follows is a disabled catalogue of alternative losses, kept for reference.
criterion = nn.CrossEntropyLoss()
'''
criterion = nn.MSELoss() # 均方误差损失, 计算 output 和 target 之差的均方差.
criterion = nn.CrossEntropyLoss() # 交叉熵损失函数, 描述两个概率分布的差异, 当训练有 C 个类别的分类问题时很有效.
criterion = nn.KLDivLoss() # 计算 input 和 target 之间的 KL 散度. KL 散度可用于衡量不同的连续分布之间的距离, 在连续的输出分布的空间上(离散采样)上进行直接回归时很有效.
criterion = nn.BCELoss() # 二进制交叉熵损失 BCELoss. 二分类任务时的交叉熵计算函数. 注意目标的值的范围为0到1之间.
criterion = nn.MultiLabelMarginLoss() # 多标签分类损失 MultiLabelMarginLoss
criterion = nn.MultiLabelSoftMarginLoss() # 多标签 one-versus-all 损失
criterion = nn.CosineEmbeddingLoss() # cosine 损失
criterion = nn.MultiMarginLoss(p=1, margin=1.0) # 多类别分类的hinge损失
criterion = nn.TripletMarginLoss(margin=1.0, p=2.0, eps=1e-06, swap=False, reduction='mean') # 三元组损失
criterion = nn.NLLLoss() # 负对数似然损失. 用于训练 C 个类别的分类问题.
'''

# Optimizer: SGD with momentum over all parameters of `net`.
# The string that follows is a disabled catalogue of alternative optimizers.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
'''
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adagrad(net.parameters(), lr=0.01, lr_decay=0, weight_decay=0, initial_accumulator_value=0) # 一种自适应优化方法，是自适应的为各个参数分配不同的学习率
optimizer = optim.RMSprop(net.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False) # 对Adagrad的一种改进，可缓解Adagrad学习率下降较快的问题
optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False) # 结合了Momentum和RMSprop，并进行了偏差修正
'''

# Keep the loss module on the same device as the model
if(use_gpu):
    criterion = criterion.cuda()

### **训练网络并更新网络参数**

所有网络的训练流程都是类似的，不断地执行如下流程：

1. 输入数据
2. 前向传播+反向传播
3. 更新参数

In [None]:
torch.set_num_threads(8)
for epoch in range(20):

    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):

        # Move the batch to the GPU when one is in use
        if use_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        # Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass, loss, backward pass
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Logging: loss is a scalar tensor, so read its value with loss.item()
        # (indexing it as loss[0] does not work)
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:  # report the mean loss every 2000 batches
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
print('Finished Training')

### **测试网络**
测试部分看看效果

In [None]:
# Disabled preview of one test batch (needs `show` from the disabled block in the setup cell).
# Two fixes inside the snippet: the built-in next() replaces the iterator's .next()
# method, which newer PyTorch releases no longer provide, and the images are
# un-normalised with (images + 1) / 2, the inverse of Normalize(0.5, 0.5), matching
# the training preview; the former `images / 2 - 0.5` shifted pixels the wrong way.
'''
dataiter = iter(testloader)
images, labels = next(dataiter) # 一个batch返回4张图片
images, labels = images.to(device), labels.to(device)

print('实际的label: ', ' '.join(\
            '%08s'%classes[labels[j]] for j in range(4)))
show(tv.utils.make_grid((images + 1) / 2)).resize((400,100))

# 计算图片在每个类别上的分数
outputs = net(images)
# 得分最高的那个类
_, predicted = torch.max(outputs.data, 1)

print('预测结果: ', ' '.join('%5s'\
            % classes[predicted[j]] for j in range(4)))
'''

完整的测试结果

In [None]:
correct = 0  # number of correctly classified images
total = 0    # number of images evaluated

# No gradients are needed for evaluation; turning autograd off saves time and memory
with torch.no_grad():
    for images, labels in testloader:
        if use_gpu:
            images = images.cuda()
        outputs = net(images)
        if use_gpu:
            # The labels stay on the CPU, so bring the scores back before comparing
            outputs = outputs.cpu()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        # .item() keeps `correct` a plain Python int rather than a 0-dim tensor,
        # so the percentage below is ordinary Python arithmetic
        correct += (predicted == labels).sum().item()

print('10000张测试集中的准确率为: %d %%' % (100 * correct / total))

### **其他常用技巧**

#### 绘制损失曲线（TBD）

In [None]:
# 绘制损失曲线

#### 注意力可视化（TBD）

In [None]:
# Attention visualisation
# NOTE(review): `trained_attn` and `sentences` are not defined anywhere in the visible
# part of this notebook; presumably they come from the Attention chapter. Confirm before running.
''' 这部分内容在Attention部分详细介绍
'''
import matplotlib.pyplot as plt

# Draw the attention matrix as a heat map on a single 5x5-inch axes
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
ax.matshow(trained_attn, cmap='viridis')
# Tick labels are the whitespace-separated tokens of the two sentences;
# the leading '' presumably pads the first tick slot (TODO confirm)
ax.set_xticklabels([''] + sentences[0].split(), fontdict={'fontsize': 14})
ax.set_yticklabels([''] + sentences[1].split(), fontdict={'fontsize': 14})
plt.show()

#### 模型序列化

In [None]:
# Model serialization

# Save: store the state_dict of the trained network at PATH
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)

# Load: rebuild the architecture, then restore the saved parameters into it
net = Net()
net.load_state_dict(torch.load(PATH))

# Template for keeping the best model seen during training.
# This fragment only makes sense inside a train/validate function: it uses
# `return` and names (`model`, `epoch_acc`, `best_acc`) that do not exist at cell
# level, so executing it here is a SyntaxError. It is therefore kept as a
# non-executed string, like the other templates in this notebook.
'''
import copy

# Inside the epoch loop: remember the weights of the best model so far
if epoch_acc > best_acc:
    best_acc = epoch_acc
    best_model_wts = copy.deepcopy(model.state_dict())

# After the loop: restore the best weights and return the model
model.load_state_dict(best_model_wts)
return model
'''

#### 模型微调

In [None]:
# 模型微调

'''
Two types of transfer learning: finetuning and feature extraction.
1. In finetuning, we start with a pretrained model and update all of the model’s
   parameters for our new task, in essence retraining the whole model. 
2. In feature extraction, we start with a pretrained model and only update the 
   final layer weights from which we derive predictions. 

In general both transfer learning methods follow the same few steps:
1. Initialize the pretrained model
2. Reshape the final layer(s) to have the same number of outputs as the number of classes in the new dataset
3. Define for the optimization algorithm which parameters we want to update during training
4. Run the training step

torch.autograd跟踪所有将其requires_grad标志设置为True的张量的操作。
对于不需要梯度的张量，将此属性设置为False会将其从梯度计算 DAG 中排除。
在 NN 中，不计算梯度的参数通常称为冻结参数。这种策略可以用来调整预训练网络。
'''

from torch import nn, optim

# The setup cell binds torchvision as `tv` and imports `models` from it; the bare
# name `torchvision` is not defined at this point, so build the network via `models`.
model = models.resnet18(pretrained=True)

# Freeze all the parameters in the network
for param in model.parameters():
    param.requires_grad = False

# Initialize and reshape the network.
# This is not an automatic procedure and is unique to each model.
# In ResNet the classifier is the last linear layer, model.fc; replace it with a
# new linear layer (unfrozen by default) that acts as our 10-class classifier.
# The input width is read from the existing layer instead of hard-coding 512.
model.fc = nn.Linear(model.fc.in_features, 10)
'''
其他常见模型的Reshape：
model.classifier[6] = nn.Linear(4096,num_classes) # Alexnet
model.classifier[6] = nn.Linear(4096,num_classes) # VGG
model.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1,1), stride=(1,1)) # Squeezenet
model.classifier = nn.Linear(1024, num_classes) # DenseNet

# Inception v3, to finetune this model we must reshape both layers.
model.AuxLogits.fc = nn.Linear(768, num_classes)
model.fc = nn.Linear(2048, num_classes)

# 以下是一个比较完整的Reshape代码段：
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False
            
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    # Initialize these variables which will be set in this if statement. Each of these
    #   variables is model specific.
    model_ft = None
    input_size = 0

    if model_name == "resnet":
        """ Resnet18
        """
        model_ft = models.resnet18(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "alexnet":
        """ Alexnet
        """
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
        input_size = 224

    elif model_name == "vgg":
        """ VGG11_bn
        """
        model_ft = models.vgg11_bn(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
        input_size = 224

    elif model_name == "squeezenet":
        """ Squeezenet
        """
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1,1), stride=(1,1))
        model_ft.num_classes = num_classes
        input_size = 224

    elif model_name == "densenet":
        """ Densenet
        """
        model_ft = models.densenet121(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "inception":
        """ Inception v3
        Be careful, expects (299,299) sized images and has auxiliary output
        """
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxilary net
        num_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
        # Handle the primary net
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs,num_classes)
        input_size = 299

    else:
        print("Invalid model name, exiting...")
        exit()

    return model_ft, input_size

# Initialize the model for this run
model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)

# Print the model we just instantiated
print(model_ft)
'''

# Optimize only the classifier: just the parameters of model.fc are handed to SGD.
# NOTE: this rebinds the global `optimizer` that the training cell created for `net`.
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)

#### 基于TensorBoard进行模型可视化

In [None]:
# 基于TensorBoard进行模型可视化

'''
Tensorboard的工作流程简单来说是：
1. 将代码运行过程中的，某些你关心的数据保存在一个文件夹中：这一步由代码中的writer完成
2. 再读取这个文件夹中的数据，用浏览器显示出来：这一步通过在命令行运行tensorboard完成。
'''

from torch.utils.tensorboard import SummaryWriter

### Set up TensorBoard
writer = SummaryWriter('./path/to/log')

### Write to TensorBoard
# NOTE: several calls below are API templates. Names such as `tag`, `scalar_value`,
# `img_tensor`, `class_index`, `tensorboard_preds`, `tensorboard_probs`,
# `global_step` and `loss1`..`loss3` are placeholders and must be defined first.

# Scalars
# tag: the name this variable is shown under
# scalar_value: the value to store
# global_step: can be thought of as the x-axis coordinate
writer.add_scalar(tag, scalar_value, global_step=None, walltime=None)

# Images
writer.add_image(tag, img_tensor, global_step=None, walltime=None, dataformats='CHW')
writer.add_images(tag, img_tensor, global_step=None, walltime=None, dataformats='NCHW')

# Visualise the model graph
writer.add_graph(net, images)
writer.close()

# Visualise a low-dimensional projection of high-dimensional data with add_embedding
class_labels = [classes[lab] for lab in labels]
# One flat feature row per image, whatever the image size; the former hard-coded
# 28 * 28 only fits single-channel 28x28 images, not this notebook's CIFAR-10 batches
features = images.view(images.size(0), -1)
# Thumbnails must be NCHW: add a channel axis only when the batch lacks one
label_img = images if images.dim() == 4 else images.unsqueeze(1)
writer.add_embedding(features,
                    metadata=class_labels,
                    label_img=label_img)
writer.close()

# Model evaluation: precision-recall curve for one class
writer.add_pr_curve(classes[class_index],
                    tensorboard_preds,
                    tensorboard_probs,
                    global_step=global_step)
writer.close()

### Viewing the results
'''
命令行：tensorboard --logdir=./path/to/the/folder --port 8123
打开浏览器，访问地址 http://localhost:8123/ 即可
'''
# Grouping variables
# Tags that share a prefix, as below, make the three losses appear in the same section
writer.add_scalar('loss/loss1', loss1, epoch)
writer.add_scalar('loss/loss2', loss2, epoch)
writer.add_scalar('loss/loss3', loss3, epoch)

# Showing several line charts together
# Put the log folders under one common root directory and run the following in a
# shell (it is a shell command, not Python, so it is kept as a comment):
#   tensorboard --logdir=./path/to/the/root --port 8123

#### 并行和分布式训练

In [None]:
# 并行和分布式训练

'''
1. 最简单的是使用DataParallel在多个 GPU 上训练神经网络； 
   此功能将相同的模型复制到所有 GPU，其中每个 GPU 消耗输入数据的不同分区。
   但不适用于模型太大而无法容纳单个 GPU 的某些用例
2. 分布式数据并行训练（DDP）是一种广泛采用的单程序多数据训练范例。 
   使用 DDP，可以在每个流程上复制模型，并且每个模型副本都将获得一组不同的输入数据样本。 
   DDP 负责梯度通信，以保持模型副本同步，并使其与梯度计算重叠，以加快训练速度。
3. 基于 RPC 的分布式训练（RPC）开发来支持无法适应数据并行训练的常规训练结构，
   例如分布式管道并行性，参数服务器范式以及 DDP 与其他训练范式的组合。 
   它有助于管理远程对象的生命周期，并将自动微分引擎扩展到机器范围之外。
'''

#### 模型优化及超参调整

In [None]:
# 模型优化

### 使用Profiler进行性能调试
'''
用于跟踪代码中各种 PyTorch 操作的时间和内存成本，有助于识别模型中的性能瓶颈
使用 Profiler 会产生一些开销，并且最好仅用于调查代码
如果要对运行时进行基准测试，请记住将其删除。
'''
import torch
import numpy as np
from torch import nn
import torch.autograd.profiler as profiler

# 使用profiler.record_function("label")将每个子任务的代码包装在单独的带标签的上下文管理器中
# 在事件探查器输出中，子任务中所有操作的综合性能指标将显示在其相应的标签下
class MyModule(nn.Module):
    """Linear layer plus a mask-thresholding step, each under its own profiler label.

    The two stages are recorded as "LINEAR PASS" and "MASK INDICES" so that the
    profiler output reports them separately.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)

    def forward(self, input, mask):
        """Return ``(out, hi_idx)``.

        ``out`` is the linear projection of ``input``; ``hi_idx`` holds the indices
        of the entries of ``mask`` that exceed the mean row-sum of ``out``, as a
        CUDA tensor.
        """
        with profiler.record_function("LINEAR PASS"):
            out = self.linear(input)

        with profiler.record_function("MASK INDICES"):
            # Threshold: mean over the batch of each row's sum, as a Python float
            cutoff = out.sum(axis=1).mean().item()
            # The comparison runs in NumPy on the host, then the result goes to the GPU
            mask_np = mask.cpu().numpy()
            above = np.argwhere(mask_np > cutoff)
            hi_idx = torch.from_numpy(above).cuda()

        return out, hi_idx

# CUDA must be warmed up before the profiler runs so that the benchmark is accurate.
# with_stack=True attaches the file and line number of each operation to the trace.
# NOTE: everything here is placed on the GPU with .cuda(), so a CUDA device is required.
model = MyModule(500, 10).cuda()
input = torch.rand(128, 500).cuda()
mask = torch.rand((500, 500, 500), dtype=torch.double).cuda()

model(input, mask) # warm-up
with profiler.profile(with_stack=True, profile_memory=True) as prof:
    out, idx = model(input, mask)

# Print the profiling results: 5 rows, sorted by self CPU time, grouped with group_by_stack_n=5
print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5))

### 使用 Ray Tune 进行超参数调整
'''
Ray Tune 是用于分布式超参数调整的行业标准工具，包含最新的超参数搜索算法。

只需要添加一些细微的修改即可：
1. 在函数中包装数据加载和训练，
2. 使一些网络参数可配置，
3. 添加检查点（可选），
4. 定义用于模型调整的搜索空间

需要预先安装以下包：
ray[tune]：分布式超参数调整库
torchvision：用于数据转换器
'''

# 导入
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune ###
from ray.tune import CLIReporter ###
from ray.tune.schedulers import ASHAScheduler ###

# 数据加载器
# 传递一个全局数据目录，可以在不同的试验之间共享数据目录
def load_data(data_dir="./data"):
    """Return the CIFAR-10 ``(trainset, testset)`` pair stored under ``data_dir``.

    Both splits are downloaded when missing and share the same preprocessing:
    conversion to a tensor followed by per-channel normalisation with 0.5 / 0.5.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    # Build the training split first, then the test split
    trainset, testset = (
        torchvision.datasets.CIFAR10(
            root=data_dir, train=is_train, download=True, transform=preprocess)
        for is_train in (True, False)
    )
    return trainset, testset

# 可配置的神经网络
class Net(nn.Module):
    """LeNet-style CNN whose two hidden linear layers have configurable widths.

    Args:
        l1: Output width of the first fully connected layer.
        l2: Output width of the second fully connected layer.
    """

    def __init__(self, l1=120, l2=84):
        super().__init__()
        # Feature extractor: two 5x5 convolutions sharing one 2x2 max-pool
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Classifier head: flattened features -> l1 -> l2 -> 10 class scores
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        """Return the 10 class scores for each image in ``x``."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        flat_features = 16 * 5 * 5
        x = x.view(-1, flat_features)
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

# 训练
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    """Train and validate ``Net`` on CIFAR-10 for one Ray Tune trial.

    After every epoch the model and optimizer state are written to a Tune
    checkpoint, and the validation loss and accuracy are reported to Tune.

    Args:
        config: Hyperparameters of the trial; the keys "l1", "l2", "lr" and
            "batch_size" are read.
        checkpoint_dir: Directory holding a "checkpoint" file to resume from,
            or None to start from scratch.
        data_dir: Directory passed to ``load_data`` for loading/storing the data.
    """
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    # Resume model and optimizer state when a checkpoint is supplied
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, _ = load_data(data_dir)

    # Hold out 20% of the training set for validation
    train_size = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [train_size, len(trainset) - train_size])

    batch_size = int(config["batch_size"])
    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=8)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                # Reset the step counter together with the loss sum; otherwise the
                # next report divides a 2000-step sum by the whole epoch's step count
                running_loss = 0.0
                epoch_steps = 0

        # Validation loss and accuracy (no gradients needed)
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # .item() keeps val_loss a plain Python float
                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        # Save a checkpoint, then report metrics to Ray Tune. Tune uses the metrics
        # to decide which configuration is best and to stop poor trials early.
        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

# 测试集的准确率
def test_accuracy(net, device="cpu"):
    """Return the fraction of CIFAR-10 test images that ``net`` classifies correctly.

    Args:
        net: The model to evaluate; called on batches moved to ``device``.
        device: Device the batches are moved to before the forward pass.
    """
    _, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)
            scores = net(images)
            # Index of the highest score along the class dimension
            predicted = torch.max(scores.data, 1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    return n_correct / n_seen

# main函数
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    """Run the Ray Tune search, then evaluate the best trial on the test set.

    Args:
        num_samples: Number of configurations sampled from the search space.
        max_num_epochs: Upper bound ``max_t`` handed to the ASHA scheduler.
        gpus_per_trial: GPUs requested for each trial.
    """
    data_dir = os.path.abspath("./data")
    load_data(data_dir)

    def _random_layer_width(_spec):
        # Custom sampler: a power of two, 2 ** randint(2, 9)
        return 2 ** np.random.randint(2, 9)

    # Search space; tune.sample_from() lets us plug in the custom sampler above
    config = {
        "l1": tune.sample_from(_random_layer_width),
        "l2": tune.sample_from(_random_layer_width),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
    }

    # ASHAScheduler terminates badly performing trials early
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    # functools.partial pins the constant data_dir argument of train_cifar
    trainable = partial(train_cifar, data_dir=data_dir)
    result = tune.run(
        trainable,
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    final_metrics = best_trial.last_result
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {final_metrics['loss']}")
    print(f"Best trial final validation accuracy: {final_metrics['accuracy']}")

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    cuda_available = torch.cuda.is_available()
    device = "cuda:0" if cuda_available else "cpu"
    if cuda_available and gpus_per_trial > 1:
        best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    # Load the trained weights of the best trial from its checkpoint file
    best_checkpoint_dir = best_trial.checkpoint.value
    checkpoint_path = os.path.join(best_checkpoint_dir, "checkpoint")
    model_state, _ = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print(f"Best trial test set accuracy: {test_acc}")

if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)

#### 模型压缩

In [None]:
# 模型压缩

### 方法一：模型剪裁，基于torch.nn.utils.prune
import torch
from torch import nn
import torch.nn.utils.prune as prune
import torch.nn.functional as F

# NOTE(review): the calls in this cell are API templates. `module`, `new_model` and
# `LeNet` are not defined in the visible part of this notebook and must be supplied first.

# Step 1: pick a pruning technique (or implement your own by subclassing BasePruningMethod)
# Step 2: specify the module and the name of the parameter inside it to prune
# Step 3: pass the keyword arguments that the chosen technique requires
prune.random_unstructured(module, name="weight", amount=0.3)
# An integer amount is presumably an absolute number of entries rather than a fraction;
# confirm against the torch.nn.utils.prune documentation
prune.l1_unstructured(module, name="bias", amount=3)

# Make the pruning permanent with torch.nn.utils.prune.remove:
# the pruned version of `weight` is reassigned to the model parameter
prune.remove(module, 'weight')

# Pruning several parameters of a model:
# choose the technique and amount per layer type while walking over the modules
for name, module in new_model.named_modules():
    # prune 20% of connections in all 2D-conv layers
    if isinstance(module, torch.nn.Conv2d):
        prune.l1_unstructured(module, name='weight', amount=0.2)
    # prune 40% of connections in all linear layers
    elif isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name='weight', amount=0.4)

# Global pruning:
# the resulting pruning percentage may differ from layer to layer
model = LeNet()

# (module, parameter name) pairs that take part in the global pruning
parameters_to_prune = (
    (model.conv1, 'weight'),
    (model.conv2, 'weight'),
    (model.fc1, 'weight'),
    (model.fc2, 'weight'),
    (model.fc3, 'weight'),
)

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2,
)

### 方法二：动态量化，减小模型大小的简单方法，且对精度的影响有限

### 方法三：模型蒸馏