In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
# Print the build configuration of the installed PyTorch binary for reference.
print(torch.__config__.show())

PyTorch built with:
  - GCC 9.3
  - C++ Version: 201402
  - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.6.0 (Git Hash 52b5f107dd9cf10910aaa19cb47f3abf9b349815)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 11.6
  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86
  - CuDNN 8.3.2  (built against CUDA 11.5)
  - Magma 2.6.1
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.6, CUDNN_VERSION=8.3.2, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -D

In [16]:
from torch.utils.benchmark import Timer
import os
import itertools as it

def m2(sizes=(4, 8, 16, 32, 64, 128, 256, 512, 1024, 7, 96, 150, 225), min_run_time=1):
    """Benchmark ``torch.mm`` on CPU for several problem sizes and print the results.

    For every ``n`` in ``sizes`` and for both float32 and float64, two products
    are timed with ``torch.utils.benchmark.Timer.blocked_autorange``:

    * an ``(n x n) x (n x n)`` matrix-matrix product, and
    * an ``(n x n) x (n x 1)`` matrix-vector product.

    Args:
        sizes: Iterable of matrix dimensions ``n`` to benchmark. The default
            reproduces the original hard-coded list.
        min_run_time: Minimum measurement time in seconds that is passed to
            ``blocked_autorange`` for each individual benchmark.

    Returns:
        None. The list of collected ``Measurement`` objects is printed once,
        after all benchmarks have finished.
    """
    seed = 0
    num_threads = 1
    sub_label = "N/A"
    env = None

    # Seed the RNG so the random operands are the same on every invocation.
    torch.manual_seed(seed)

    dtypes = (("Single", torch.float32), ("Double", torch.float64))
    results = []
    for n in sizes:
        shapes = (
            # Square MatMul
            ((n, n), (n, n), "(n x n) x (n x n)", "Matrix-Matrix Product"),

            # Matrix-Vector product
            ((n, n), (n, 1), "(n x n) x (n x 1)", "Matrix-Vector Product"),
        )
        for (dtype_name, dtype), (x_shape, y_shape, shape_str, blas_type) in it.product(dtypes, shapes):
            measurement = Timer(
                stmt="torch.mm(x, y)",
                label=f"torch.mm {shape_str} {blas_type} ({dtype_name})",
                sub_label=sub_label,
                description=f"n = {n}",
                # `env` is always None here, so this evaluates to None.
                env=os.path.split(env or "")[1] or None,
                globals={
                    "x": torch.rand(x_shape, dtype=dtype),
                    "y": torch.rand(y_shape, dtype=dtype),
                },
                num_threads=num_threads,
            ).blocked_autorange(min_run_time=min_run_time)
            results.append(measurement)

    # Print once at the end; printing inside the size loop re-dumped the whole
    # cumulative list after every size.
    print(results)


In [17]:
# Run the torch.mm CPU benchmark defined above; results are printed to stdout.
m2()

[<torch.utils.benchmark.utils.common.Measurement object at 0x7f2438346790>
torch.mm (n x n) x (n x n) Matrix-Matrix Product (Single): N/A
n = 4
  Median: 2.55 us
  IQR:    0.09 us (2.50 to 2.59)
  4 measurements, 100000 runs per measurement, 1 thread, <torch.utils.benchmark.utils.common.Measurement object at 0x7f2435ca17c0>
torch.mm (n x n) x (n x 1) Matrix-Vector Product (Single): N/A
n = 4
  Median: 2.11 us
  IQR:    0.04 us (2.10 to 2.14)
  5 measurements, 100000 runs per measurement, 1 thread, <torch.utils.benchmark.utils.common.Measurement object at 0x7f24b4161fd0>
torch.mm (n x n) x (n x n) Matrix-Matrix Product (Double): N/A
n = 4
  Median: 2.38 us
  IQR:    0.03 us (2.38 to 2.41)
  5 measurements, 100000 runs per measurement, 1 thread, <torch.utils.benchmark.utils.common.Measurement object at 0x7f2435ca18b0>
torch.mm (n x n) x (n x 1) Matrix-Vector Product (Double): N/A
n = 4
  Median: 2.17 us
  IQR:    0.05 us (2.15 to 2.20)
  5 measurements, 100000 runs per measurement, 1 thr

In [3]:
# Shell escape: show the NVIDIA driver/GPU status of the machine running the kernel.
!nvidia-smi

Wed Dec 14 03:39:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Number of samples in each mini-batch.
BATCH_SIZE = 523
# Number of training loops (epochs).
epochs = 20
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [11]:
# Show the selected device and the cuDNN flag before and after disabling it.
print(DEVICE)
print(torch.backends.cudnn.enabled)
# Turn cuDNN off globally for the rest of this kernel session.
torch.backends.cudnn.enabled=False
print(torch.backends.cudnn.enabled)

cpu
True
False


In [3]:
# Training data loader: fetches the MNIST training split into ./data
# (download=True) and serves it in shuffled mini-batches of BATCH_SIZE.
train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),       # convert the image to a Tensor
                           transforms.Normalize((0.1307,), (0.3081,))       # normalize with mean 0.1307 and std 0.3081
                       ])),
        batch_size=BATCH_SIZE, shuffle=True)            # shuffle=True: the sample order is randomized


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw



In [4]:
# Test data loader: MNIST test split (train=False) read from ./data, with the
# same ToTensor + Normalize preprocessing as the training loader. No download
# is requested here, so the files must already be present under ./data.
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=BATCH_SIZE, shuffle=True)            # shuffle=True: the sample order is randomized


In [5]:
class Net(nn.Module):
    """Small convolutional classifier for single-channel 28x28 images.

    Layout: conv(5x5) -> ReLU -> 2x2 max-pool -> conv(3x3) -> ReLU ->
    conv(3x3) -> ReLU -> BatchNorm2d -> flatten -> Linear -> ReLU -> Linear ->
    log-softmax over 10 classes.
    """

    def __init__(self):
        super().__init__()
        # Spatial sizes in the comments assume a 1 x 28 x 28 input.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=12, kernel_size=5)    # -> 12 x 24 x 24
        self.conv2 = nn.Conv2d(in_channels=12, out_channels=20, kernel_size=3)   # (after pooling to 12 x 12) -> 20 x 10 x 10
        self.conv3 = nn.Conv2d(in_channels=20, out_channels=40, kernel_size=3)   # -> 40 x 8 x 8
        self.batchnorm2d = nn.BatchNorm2d(num_features=40)
        self.fc1 = nn.Linear(in_features=40 * 8 * 8, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10) for input ``x``."""
        batch = x.shape[0]  # number of samples in this mini-batch

        # Convolutional feature extractor.
        feats = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        feats = F.relu(self.conv2(feats))
        feats = F.relu(self.conv3(feats))
        feats = self.batchnorm2d(feats)

        # Flatten every sample's feature maps into one row vector.
        flat = feats.view(batch, -1)

        # Fully connected classifier head.
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)


In [6]:
# Instantiate the network on the selected device and create an Adam optimizer
# over its parameters using Adam's default hyper-parameters.
model = Net().to(DEVICE)
optimizer = optim.Adam(model.parameters())


In [7]:
def train(model, device, train_loader, optimizer, epoch):
    """Train ``model`` with negative log-likelihood loss.

    Args:
        model: Module whose forward pass returns log-probabilities.
        device: Device that every batch is moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches. Its
            ``dataset`` attribute is only read when a progress line is printed.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Index of the last epoch to run. The loop covers
            ``range(epoch + 1)``, i.e. ``epoch + 1`` full passes over the data.

    Returns:
        None. A progress line is printed after every 30th batch.
    """
    # Training mode: layers such as BatchNorm/Dropout must use their training
    # behaviour (batch statistics, running-stat updates) while fitting.
    model.train()
    for epoch_i in range(epoch + 1):
        for batch_idx, (data, target) in enumerate(train_loader):
            # Move the batch to the same device as the model.
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()               # reset accumulated gradients
            output = model(data)                # forward pass
            loss = F.nll_loss(output, target)   # loss on log-probabilities
            loss.backward()                     # back-propagate
            optimizer.step()                    # update the parameters
            if (batch_idx + 1) % 30 == 0:       # periodic progress report
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch_i, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))


In [8]:
def test(model, device, test_loader):
    test_loss = 0                           # 损失函数初始化为0
    correct = 0                             # correct 计数分类正确的数目
    with torch.no_grad():           
        for data, target in test_loader:    # 遍历所有的data和target
            data, target = data.to(device), target.to(device)   # CPU -> GPU
            output = model(data)            # output为预测值，由model计算出
            test_loss += F.nll_loss(output, target, reduction='sum').item()     ### 将一批的损失相加
            pred = output.max(1, keepdim=True)[1]       ### 找到概率最大的下标
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)   # 总损失除数据集总数
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


In [10]:
import time
# Measure the wall-clock duration of the training call in seconds.
old=time.time()
# NOTE: train() iterates over range(epoch+1), so passing 1 runs two passes over the data.
train(model, DEVICE, train_loader, optimizer, 1)
print(time.time()-old)
# Report average loss and accuracy on the MNIST test split.
test(model, DEVICE, test_loader)



Test set: Average loss: 0.0487, Accuracy: 9839/10000 (98%)

