In [3]:
# 从原始图像文件开始读取
# 并将它们转换为张量格式
# https://www.kaggle.com/c/cifar-10
# https://www.zhihu.com/question/54883612/answer/130707137363

import os
import shutil
import random
from collections import Counter
from sched import scheduler

# Dataset file locations; every path below is derived from data_files_dir.
data_files_dir = r'../data/kaggle-cifar-10'
train_images_dir = os.path.join(data_files_dir, 'train')  # folder of training images
test_images_dir = os.path.join(data_files_dir, 'test')  # folder of test images
train_labels_file_path = os.path.join(data_files_dir, 'trainLabels.csv')  # CSV mapping file name -> label
submission_example_file_path = os.path.join(data_files_dir, 'sampleSubmission.csv')


def copy_files(file_path, target_dir):
    """
    Copy a single file into a directory, creating that directory when needed.

    :param file_path: path of the file to copy
    :param target_dir: directory that receives the copy (missing parents are created)
    """
    # exist_ok=True makes repeated calls for the same directory harmless.
    os.makedirs(name=target_dir, exist_ok=True)
    shutil.copy(src=file_path, dst=target_dir)


def read_csv_labels(file_path) -> dict:
    """
    Read a two-column CSV file and return a mapping from file name to label.

    The first line is treated as a header and skipped. Blank lines (for example
    a trailing empty line at the end of the file) are ignored.

    :param file_path: path of the CSV file
    :return: dict mapping the first column (file name) to the second column (label)
    :raises ValueError: if a non-blank data line does not contain exactly two
        comma-separated fields
    """
    labels = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row; tolerate a completely empty file
        # Data rows start on physical line 2, which keeps error messages accurate.
        for line_no, raw_line in enumerate(f, start=2):
            line = raw_line.strip()
            if not line:
                continue  # blank line: nothing to parse
            fields = line.split(',')
            if len(fields) != 2:
                raise ValueError(
                    f'{file_path}: line {line_no}: expected 2 comma-separated fields, '
                    f'got {len(fields)}')
            name, label = fields
            labels[name] = label
    return labels


# Load the file-name -> label mapping for the training images.
train_labels = read_csv_labels(train_labels_file_path)
print('# 训练样本 :', len(train_labels))
print('# 类别数 :', len(set(train_labels.values())))
dict(list(train_labels.items())[:10])  # show the file name and label of the first 10 samples

# 训练样本 : 50000
# 类别数 : 10


{'1': 'frog',
 '2': 'truck',
 '3': 'truck',
 '4': 'deer',
 '5': 'automobile',
 '6': 'automobile',
 '7': 'bird',
 '8': 'horse',
 '9': 'ship',
 '10': 'cat'}

In [4]:
# Show how many training samples each label has.
Counter(train_labels.values())

Counter({'frog': 5000,
         'truck': 5000,
         'deer': 5000,
         'automobile': 5000,
         'bird': 5000,
         'horse': 5000,
         'ship': 5000,
         'cat': 5000,
         'dog': 5000,
         'airplane': 5000})

In [None]:



def reorganize_train_valid(whole_train_dir: str,
                           labels: dict,
                           valid_ratio: float = 0.2):
    """
    Split the labelled training files into a training part and a validation part.

    Every class contributes the same number of validation files:
    ``max(1, int(smallest_class_size * valid_ratio))``. Files are shuffled with
    the ``random`` module before splitting, so seed it for a reproducible split.

    :param whole_train_dir: directory of the full training set (not used by the
        split itself; kept as part of the signature)
    :param labels: mapping {file name: class label}
    :param valid_ratio: fraction of the smallest class to hold out per class
    :return: dict mapping each label to a ``(train_files, valid_files)`` tuple
        of file-name lists; empty dict when ``labels`` is empty
    """
    if not labels:
        return {}

    label_counts = Counter(labels.values())  # number of samples per class
    min_count = min(label_counts.values())  # size of the smallest class
    # Number of validation samples taken from every class (at least one).
    valid_count_per_label = max(1, int(min_count * valid_ratio))

    # Group file names by class. setdefault creates the list AND appends in one
    # step, so the first file seen for a class is not lost.
    files_by_label = {}
    for name, label in labels.items():
        files_by_label.setdefault(label, []).append(name)

    split = {}
    for label, names in files_by_label.items():
        random.shuffle(names)
        valid_files = names[:valid_count_per_label]
        train_files = names[valid_count_per_label:]
        split[label] = (train_files, valid_files)
    return split



In [5]:
def reorg_train_valid(data_dir, labels, valid_ratio):
    """
    Carve a validation set out of the original training images.

    Each image under ``data_dir/train`` is copied into class sub-folders of:
    - train_valid_test/train_valid/<label>  (every image: training + validation)
    - train_valid_test/valid/<label>        (the first images listed for each class)
    - train_valid_test/train/<label>        (the remaining images)

    :param data_dir: dataset root that contains the ``train`` folder
    :param labels: mapping from file name without extension to class label
    :param valid_ratio: fraction of the rarest class's size held out per class
    :return: number of images per class placed in the validation set
    """
    # most_common() is sorted by descending count, so the last entry is the rarest class.
    _, n = Counter(labels.values()).most_common()[-1]
    n_valid_per_label = max(1, int(n * valid_ratio))

    source_dir = os.path.join(data_dir, 'train')
    output_root = os.path.join(data_dir, 'train_valid_test')
    valid_taken = {}  # label -> how many images already went to the validation set

    for train_file in os.listdir(source_dir):
        # The part of the file name before the first '.' is the key into `labels`.
        label = labels[train_file.split('.')[0]]
        fname = os.path.join(source_dir, train_file)
        copy_files(fname, os.path.join(output_root, 'train_valid', label))

        # Fill the class's validation quota first; everything after that is training data.
        goes_to_valid = valid_taken.get(label, 0) < n_valid_per_label
        subset = 'valid' if goes_to_valid else 'train'
        copy_files(fname, os.path.join(output_root, subset, label))
        if goes_to_valid:
            valid_taken[label] = valid_taken.get(label, 0) + 1

    return n_valid_per_label


def reorg_test(data_dir):
    """
    Copy every file from ``data_dir/test`` into
    ``data_dir/train_valid_test/test/unknown``.

    The single ``unknown`` folder stands in for a class folder because the
    test samples are unlabelled.

    :param data_dir: dataset root that contains the ``test`` folder
    """
    source_dir = os.path.join(data_dir, 'test')
    target_dir = os.path.join(data_dir, 'train_valid_test', 'test', 'unknown')
    for test_file in os.listdir(source_dir):
        copy_files(os.path.join(source_dir, test_file), target_dir)


def reorg_cifar10(data_dir, valid_ratio):
    """
    Build the ``train_valid_test`` directory tree for the dataset in ``data_dir``.

    Reads ``trainLabels.csv`` from ``data_dir``, then reorganises the training
    images (train / valid / train_valid) and the test images.

    :param data_dir: dataset root holding ``trainLabels.csv`` and the ``train`` / ``test`` folders
    :param valid_ratio: forwarded to ``reorg_train_valid`` to size the validation split
    """
    labels_csv = os.path.join(data_dir, 'trainLabels.csv')
    name_to_label = read_csv_labels(labels_csv)
    reorg_train_valid(data_dir, name_to_label, valid_ratio)
    reorg_test(data_dir)


BATCH_SIZE = 128  # mini-batch size handed to the DataLoader calls later in the notebook
VALID_RATIO = 0.1  # per-class validation share, relative to the size of the rarest class
# Copies the images into the train_valid_test tree each time this cell is executed.
reorg_cifar10(data_files_dir, VALID_RATIO)

In [6]:
# 图像增强防止过拟合
from torchvision import transforms
from torchvision.datasets import ImageFolder

# Training-time preprocessing: random augmentation followed by tensor conversion
# and per-channel normalisation.
transform_train = transforms.Compose([
    # Upscale to 40x40 so the random crop below has room to vary.
    transforms.Resize((40, 40)),
    # Take a random crop covering 64%-100% of the 40x40 area and resize it to 32x32.
    transforms.RandomResizedCrop(size=32,
                                 scale=(0.64, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # Per-channel mean/std; presumably CIFAR-10 training-set statistics -- TODO confirm.
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                         std=[0.2023, 0.1994, 0.2010]),
])

# Evaluation-time preprocessing: no random augmentation, only tensor conversion
# and the same normalisation constants as transform_train.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                         std=[0.2023, 0.1994, 0.2010]),
])

# Datasets read with the augmenting transform. The folder list must line up
# with the target names on the left:
#   train_ds       <- train_valid_test/train        (training split only)
#   train_valid_ds <- train_valid_test/train_valid  (training + validation images)
train_ds, train_valid_ds = [
    ImageFolder(os.path.join(data_files_dir, 'train_valid_test', folder), transform=transform_train)
    for folder in ['train', 'train_valid']]

# Datasets read with the deterministic evaluation transform:
#   valid_ds <- train_valid_test/valid
#   test_ds  <- train_valid_test/test  (all images sit in the single 'unknown' class folder)
valid_ds, test_ds = [
    ImageFolder(os.path.join(data_files_dir, 'train_valid_test', folder), transform=transform_test)
    for folder in ['valid', 'test']]

In [7]:
from torch.utils.data import DataLoader

# Shuffled loaders for fitting: train_iter draws from the training split,
# train_valid_iter from the combined training + validation dataset.
train_iter, train_valid_iter = [
    DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    for ds in [train_ds, train_valid_ds]]
# NOTE(review): drop_last=True discards the final partial validation batch, so a
# few validation samples are never evaluated -- confirm this is intended.
valid_iter = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)
# Keep every test sample and the original order so predictions can be matched to files.
test_iter = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [None]:
# 模型
from torchvision import models
from torch import nn, optim


def get_net(num_classes: int = 10):
    """
    Build a randomly initialised ResNet-18 adapted to small 32x32 inputs.

    Changes relative to the stock torchvision model:
    - the final fully connected layer outputs ``num_classes`` logits;
    - the 7x7 stride-2 stem convolution is replaced by a 3x3 stride-1 one;
    - the stem max-pooling layer is replaced by an identity.

    :param num_classes: number of output classes (defaults to 10)
    :return: the modified network
    """
    # No weights argument: the model is created without pretrained weights on
    # both old and new torchvision releases, and the deprecated `pretrained`
    # keyword is avoided.
    net = models.resnet18()
    net.fc = nn.Linear(net.fc.in_features, num_classes)
    # The images here are 32x32, while the standard ResNet stem is designed for
    # 224x224 inputs, so use a 3x3 convolution instead of the 7x7 one.
    net.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    # Drop the max-pooling layer: the images are too small to downsample this early.
    net.maxpool = nn.Identity()
    return net


def train(net,
          train_iter,
          valid_iter,
          num_epochs,
          learning_rate,
          wd,
          devices,
          lr_period,
          lr_decay):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=wd)
    # 每隔 lr_period 轮对学习率进行衰减（乘以 lr_decay 的值）
    scheduler = optim.lr_scheduler.StepLR(optimizer, lr_period,lr_decay)
    num_batches, timer = len(train_iter), d2l.Timer()