This notebook assumes there are 2 GPUs on a single node.

This can run on Kaggle Notebook.

# Setup

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show the GPUs visible to this runtime (the rest of the notebook assumes two).
!nvidia-smi

# Dist basics on GPUs

In [None]:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

In [None]:
# @title Process for collective communication

# 广播（Broadcast）：广播是一种将数据从一个源进程发送到所有其他进程的通信操作。在 torch.distributed 中，通过 broadcast(tensor, src=0) 可以实现该操作，将 rank 为 0 的进程中的数据广播到所有其他进程。广播操作能够确保所有进程拥有相同的数据，适合需要共享模型参数、初始化权重等场景。比如在分布式训练的初始化阶段，用于将主进程的模型参数广播到所有其他进程，保证训练从同样的初始参数开始。
# 规约（Reduce 和 All-Reduce）：规约操作是一种将多个进程的数据进行计算（如求和、求最大值等）的操作。常用的规约操作有两种，reduce()：一个进程（通常是主进程）收集并合并来自所有进程的数据；all_reduce()：所有进程同时得到合并后的数据。比如 all_reduce(tensor, op=ReduceOp.SUM) 会在所有进程中求和，并将结果存放在每个进程的 tensor 中。规约操作能有效减少通信负担，适用于大规模梯度汇总或模型权重更新。譬如在分布式训练中，all_reduce 常用于梯度求和，以确保在多个进程中的梯度保持一致，实现同步更新。
# 收集（Gather 和 All-Gather）：收集操作是将多个进程的数据收集到一个或多个进程的操作：gather()：将多个进程的数据收集到一个进程中。all_gather()：所有进程都收集到全部进程的数据。例如 all_gather(gathered_tensors, tensor) 会将所有进程中的 tensor 收集到每个进程的 gathered_tensors 列表中。收集操作方便对所有进程中的数据进行后续分析和处理。譬如做 evaluation 时，可以使用 all_gather 来汇总各个进程的中间结果。
# 散发（Scatter）：scatter() 操作是将一个进程的数据分散到多个进程中。例如在 rank 为 0 的进程中有一个包含若干子张量的列表，scatter() 可以将列表中的每个子张量分配给其他进程。适用于数据分发，将大型数据集或模型权重在多个进程中分散，以便每个进程可以处理不同的数据块。

def init_process(rank, world_size, backend="nccl"):
  """Worker entry point that walks through the basic collective operations.

  Joins the default process group and then runs four small demos:
  all_gather, all_reduce, all_reduce inside a sub-group, and a pair of
  sub-group all_reduces whose results are shared with every rank.

  Args:
    rank: Index of this process in the group. With the NCCL backend it also
      selects the GPU (``cuda:{rank}``).
    world_size: Total number of processes taking part.
    backend: Name of the torch.distributed backend. "nccl" runs the demos on
      CUDA tensors; any other backend (e.g. "gloo") runs them on CPU tensors.
  """
  use_cuda = backend == "nccl"
  device = f"cuda:{rank}" if use_cuda else "cpu"
  if use_cuda:
    # Bind this process to its own GPU so that NCCL collectives (barrier
    # included) run on the intended device.
    torch.cuda.set_device(rank)
  print(f"Starting process with {rank=}, {world_size=} {device=}")

  # Honour the requested backend instead of always using NCCL.
  dist.init_process_group(backend=backend, world_size=world_size, rank=rank)

  assert rank == dist.get_rank()
  assert world_size == dist.get_world_size()
  dist.barrier()
  print(f"{rank=} Finished init!!!")

  # Task 1 - all gather
  # Every rank contributes a short string; every rank receives all of them.
  if rank == 0:
    print("\nTask 1 - all gather")
  process_info = f"Process {rank} Information..."
  # Collectives need equally sized tensors, so the UTF-8 bytes are stored in a
  # fixed-length, zero-padded int32 buffer.
  max_len = 100
  process_info_bytes = process_info.encode('utf-8')
  process_info_tensor = torch.zeros(max_len, dtype=torch.int32, device=device)
  process_info_tensor[:len(process_info_bytes)] = torch.tensor(
      list(process_info_bytes), dtype=torch.int32, device=device)

  gathered_tensors = [
      torch.zeros(max_len, dtype=torch.int32, device=device)
      for _ in range(world_size)
  ]

  dist.all_gather(gathered_tensors, process_info_tensor)

  if rank == 0:
    for t in gathered_tensors:
      # Undo the encoding: back to bytes, decode, drop the zero padding.
      info_bytes = t.to('cpu').numpy().astype('uint8').tobytes()
      info_str = info_bytes.decode('utf-8', 'ignore').strip('\x00')
      print(info_str)
  dist.barrier()
  print(f"{rank=} Finished step 1!!!")

  # Task 2 - all reduce (sum)
  if rank == 0:
    print("\nTask 2 - all reduce")
  tensor = torch.ones((4,), device=device)
  dist.all_reduce(tensor)
  print(f"All reduce for all processes: in rank {rank}, tensor = {tensor}")
  dist.barrier()

  # Task 3 - all reduce (sum) in a sub-group made of the odd ranks.
  if rank == 0:
    print("\nTask 3 - all reduce for sub-group")
  sub_group_ranks = list(range(1, world_size, 2))
  # new_group() is called by every rank, including ranks outside the group.
  sub_group = dist.new_group(ranks=sub_group_ranks)
  if rank in sub_group_ranks:
    sub_group_tensor = torch.ones((4,), device=device)
    dist.all_reduce(sub_group_tensor, group=sub_group)
    print(f"Sub group all reduce: in rank {rank}, tensor = {sub_group_tensor}")
  dist.barrier()

  # Task 4 - all reduce (sum) in a sub-group, then sync results to the entire group.
  if rank == 0:
    print("\nTask 4 - all reduce (sum) in a sub-group, then sync results to the entire group.")
  group_1_sum = torch.tensor([1, 1, 1, 1]).to(device)
  group_2_sum = torch.tensor([1.5] * 4).to(device)
  group_1_ranks = list(range(world_size // 2))
  group_2_ranks = list(range(world_size // 2, world_size))
  group_1 = dist.new_group(ranks=group_1_ranks)
  group_2 = dist.new_group(ranks=group_2_ranks)
  if rank in group_1_ranks:
    dist.all_reduce(group_1_sum, group=group_1)
  else:
    dist.all_reduce(group_2_sum, group=group_2)
  # Communicate the sub-group sums to the entire group. Ranks outside a
  # sub-group still hold that group's initial value, which is never larger
  # than the reduced sum, so a global MAX leaves the sum on every rank.
  dist.all_reduce(group_1_sum, op=dist.ReduceOp.MAX)
  dist.all_reduce(group_2_sum, op=dist.ReduceOp.MAX)
  print(f"In rank {rank}, {group_1_sum.to('cpu')=}, {group_2_sum.to('cpu')=}")

  # Finish
  print(f"\nFinishing process with {rank=}, {world_size=}")
  dist.destroy_process_group()

# Launch one worker process per rank and wait until all of them have finished.

# Rendezvous address and port shared by all workers.
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12459'  # Pick a different port if this one is in use.

world_size = 2

processes = []
for rank in range(world_size):
  processes.append(mp.Process(target=init_process, args=(rank, world_size)))
  processes[-1].start()

# Block until every worker has exited.
for p in processes:
  p.join()

# PyTorch DDP Library Usage

* In this section, we demonstrate the usage of the PyTorch DDP library.

## Try 1 - Manually dist the training data

- We expect the trained model weights to be exactly the same as in the single-GPU scenario.


### Single-GPU version

In [None]:
import torch

# Single-device baseline: one SGD step of a 10x10 linear model on 20 random
# samples. The multi-GPU version below is expected to reproduce these weights.

# Use the first GPU when there is one; fall back to the CPU so the cell still runs.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(123)

# Init
# Named `inputs` (not `input`) so the builtin input() is not shadowed for the
# rest of the kernel session.
inputs = torch.randn(20, 10).to(DEVICE) # (20, 10)
labels = torch.randn(20, 10).to(DEVICE)

loss_fn = torch.nn.MSELoss()

model = torch.nn.Linear(10, 10).to(DEVICE)
optimizer = torch.optim.SGD(model.parameters(), lr=1)

for it in range(1):
  # forward
  optimizer.zero_grad()
  outputs = model(inputs)

  # backward
  loss_fn(outputs, labels).backward()
  optimizer.step()

  # check model params
  print(f"In epoch {it}")
  for name, param in model.named_parameters():
    if param.requires_grad:
      print(f"{name=}, {param.data=}")

### Multi-GPU version

In [None]:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP



def run_single_process(rank: int, world_size: int) -> None:
  """Run one DDP training step on this rank's slice of a 20-sample toy set.

  Rank 0 creates the full input/target tensors, splits them into
  ``world_size`` equal chunks and scatters one chunk to every rank. Each rank
  then does a single forward/backward/SGD step through a DDP-wrapped linear
  model, and rank 0 prints the resulting parameters.

  Args:
    rank: Index of this process; also selects the GPU (``cuda:{rank}``).
    world_size: Total number of processes. Must divide the 20 samples evenly.
  """
  num_samples = 20
  # Checked before joining the group: scatter() needs one equally sized chunk
  # per rank, and an uneven torch.split() would produce the wrong chunk count.
  assert num_samples % world_size == 0, (
      f"{num_samples=} must be divisible by {world_size=}")

  DEVICE = f"cuda:{rank}"
  print(f"Starting process with {rank=}, {world_size=}, {DEVICE=}")

  # Bind this process to its own GPU before any NCCL collective runs.
  torch.cuda.set_device(rank)

  # Use the NCCL backend for GPU-based distributed processing
  dist.init_process_group(backend="nccl", world_size=world_size, rank=rank)

  assert rank == dist.get_rank()
  assert world_size == dist.get_world_size()
  dist.barrier()

  split_data_size = num_samples // world_size

  torch.manual_seed(123)

  # Create the train set. Only the source rank passes a scatter list; the
  # other ranks pass None.
  if rank == 0:
    inputs = torch.randn(num_samples, 10).to(DEVICE)
    inputs_split_list = list(torch.split(inputs, split_data_size, dim=0))
    assert len(inputs_split_list) == world_size

    targets = torch.randn(num_samples, 10).to(DEVICE)
    targets_split_list = list(torch.split(targets, split_data_size, dim=0))
    assert len(targets_split_list) == world_size
  else:
    inputs_split_list = None
    targets_split_list = None

  # Split the train set and send to the distributed workers.
  # The receive buffers are allocated directly on this rank's device.
  inputs_split = torch.zeros((split_data_size, 10), dtype=torch.float32, device=DEVICE)
  dist.scatter(inputs_split, inputs_split_list, src=0)

  targets_split = torch.zeros((split_data_size, 10), dtype=torch.float32, device=DEVICE)
  dist.scatter(targets_split, targets_split_list, src=0)

  # Init the model
  model = torch.nn.Linear(10, 10).to(DEVICE)
  ddp_model = DDP(model, device_ids=[rank])
  loss_fn = torch.nn.MSELoss()
  optimizer = torch.optim.SGD(ddp_model.parameters(), lr=1)

  # forward
  optimizer.zero_grad()
  outputs = ddp_model(inputs_split)

  # backward
  loss_fn(outputs, targets_split).backward()

  optimizer.step()

  # check model params (printed after the optimizer step)
  if rank == 0:
    print("After backward")
    for name, param in ddp_model.named_parameters():
      if param.requires_grad:
        print(f"{name=}, {param.data=}")

  dist.destroy_process_group()

# Rendezvous address and port shared by all workers.
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'  # Pick a different port if 12355 is in use.

world_size = 2

# Start one worker per rank, then wait for all of them.
processes = []
for rank in range(world_size):
  processes.append(mp.Process(target=run_single_process, args=(rank, world_size)))
  processes[-1].start()

for p in processes:
  p.join()

## Try 2 - use distributed sampler to load the training data

In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
import torchvision

# Number of passes over the training set.
EPOCHS = 20

# This is the global batch size. In DDP, the per-device batch sizes must sum to this number.
BATCH_SIZE = 16

# Number of worker processes launched for the multi-GPU run (one per rank).
WORLD_SIZE = 2

class ToyModel(nn.Module):
  """Small CNN classifier: two conv+pool stages followed by three linear layers.

  ``fc1`` expects 16 * 5 * 5 features per sample, which is what the two
  conv/pool stages produce for 3-channel 32x32 images. The output has 10
  logits per sample.
  """

  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv2d(3, 6, 5)
    self.pool = nn.MaxPool2d(2, 2)
    self.conv2 = nn.Conv2d(6, 16, 5)
    self.fc1 = nn.Linear(16 * 5 * 5, 120)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, 10)

  def forward(self, x):
    """Map a batch of images (N, 3, H, W) to class logits (N, 10)."""
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    # Flatten everything except the batch dimension. Unlike view(-1, 400),
    # this never reshuffles samples across the batch axis: a wrongly sized
    # input fails loudly in fc1 instead of silently changing the batch size.
    x = torch.flatten(x, 1)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x

# # from the official doc: https://docs.pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# class ToyModel(nn.Module):
#   def __init__(self):
#       super().__init__()
#       self.conv1 = nn.Conv2d(3, 6, 5)
#       self.pool = nn.MaxPool2d(2, 2)
#       self.conv2 = nn.Conv2d(6, 16, 5)
#       self.fc1 = nn.Linear(16 * 5 * 5, 120)
#       self.fc2 = nn.Linear(120, 84)
#       self.fc3 = nn.Linear(84, 10)

#   def forward(self, x):
#       x = self.pool(F.relu(self.conv1(x)))
#       x = self.pool(F.relu(self.conv2(x)))
#       x = torch.flatten(x, 1) # flatten all dimensions except batch
#       x = F.relu(self.fc1(x))
#       x = F.relu(self.fc2(x))
#       x = self.fc3(x)
#       return x

# Preprocessing applied to every image: convert to a tensor, then normalise
# each of the three channels with mean 0.5 and std 0.5.
dataset_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

### Multi GPUs version

In [4]:
# Guard: CUDA must not have been initialised in this notebook process yet,
# otherwise the multi-GPU cell below will error out.
# See https://github.com/huggingface/accelerate/issues/940#issuecomment-1364114196
#
# To un-initialize CUDA, restart the runtime and avoid running other cells that init CUDA.
assert not torch.cuda.is_initialized()

In [5]:
from torch.nn.parallel import DistributedDataParallel as DDP

def run_single_process(rank: int, world_size: int) -> None:
  """Train ToyModel on CIFAR-10 with DDP in one worker process.

  Each rank wraps its own model replica in DDP, reads a disjoint shard of the
  training set through a DistributedSampler and trains for EPOCHS epochs.
  Rank 0 prints the loss every 1000 iterations.

  Args:
    rank: Index of this process; also selects the GPU (``cuda:{rank}``).
    world_size: Total number of processes. Must divide BATCH_SIZE evenly.
  """
  DEVICE = f"cuda:{rank}"

  print(f"Starting process with {rank=}, {world_size=}, {DEVICE=}")

  # Bind this process to its own GPU before any NCCL collective runs.
  torch.cuda.set_device(rank)

  # Use the NCCL backend for GPU-based distributed processing.
  # The group size comes from the argument, not from the notebook-level global.
  dist.init_process_group(backend="nccl", world_size=world_size, rank=rank)

  assert rank == dist.get_rank()
  assert world_size == dist.get_world_size()

  torch.manual_seed(123)

  # Init the model
  # The model is registered first and synchronised afterwards: define an
  # ordinary nn.Module, then wrap it with DDP(model). The wrapping step sends
  # the parameters and buffers of the master (rank 0) model to the other ranks.
  #
  # What DDP initialisation (the DDP(model) call) does:
  # 1. Broadcast parameters and buffers from the master to every other process
  #    so that all processes hold the same state.
  #    Note: this is how DDP guarantees identical initial state, so do not add,
  #    modify or delete any parameter or buffer after this point.
  # 2. (Maybe) with several GPUs per process, create a model replica on each
  #    GPU (similar to DP).
  # 3. Group the parameters into buckets; neighbouring parameters share a bucket.
  #    Note: this is for speed. During gradient communication, a bucket whose
  #    gradients are already computed is communicated immediately, without
  #    waiting for all gradients to be finished.
  # 4. Create the reducer and register a gradient-averaging hook on every
  #    parameter.
  #    Note: this step is implemented in C++ (reducer.h).
  # 5. (Maybe) prepare for SyncBN layers.
  #
  # In every step a DDP model does the following:
  # 1. Sample data: fetch one batch from the dataloader.
  #    Note: the dataloader uses a DistributedSampler, so the processes do not
  #    see overlapping data. For DDP to match single-GPU quality, one DDP epoch
  #    must be equivalent to one single-GPU epoch in terms of data.
  # 2. Forward pass (prediction = model(data)).
  #    2.1 Synchronise state across processes.
  #        2.1.1 (Maybe) in single-process multi-GPU mode, synchronise
  #              parameters and buffers between the GPUs of the process.
  #        2.1.2 Synchronise buffers between processes.
  #    2.2 Only then run the actual forward computation.
  #    2.3 (Maybe) when find_unused_parameters is true, a traversal at the end
  #        of forward marks all unused parameters as ready ahead of time.
  #        Note: the default is false because it slows training down.
  # 3. Compute gradients (loss.backward()).
  #    3.1 Outside the reducer: each process starts computing gradients
  #        backwards on its own.
  #        3.1.1 Note: gradients are computed in reverse, so the last
  #              parameters get their gradients first.
  #    3.2 Outside the reducer: once a parameter's gradient is computed, its
  #        registered grad hook fires and marks the parameter as ready in the
  #        reducer.
  #    3.3 Inside the reducer: when all parameters of a bucket are ready, the
  #        reducer starts an asynchronous all-reduce that averages the
  #        gradients of that bucket.
  #        Notes:
  #        3.3.1 Buckets are processed in order, and that order is the reverse
  #              of the parameter order: the bucket of the first registered
  #              parameters comes last.
  #        3.3.2 So when building a module, register the parameters that are
  #              computed first at the front and later ones at the back.
  #              Otherwise the reducer stalls on a bucket and training slows.
  #              3.3.2.1 "Registering parameters" simply means creating the
  #                      layers, i.e. create them in computation order.
  #    3.4 Inside the reducer: once the averaging of all buckets is finished,
  #        the averaged gradients are written to parameter.grad.
  #        Note: waiting for all buckets does not look strictly necessary;
  #        this should be checked against the source code.
  # 4. The optimizer applies the gradients (optimizer.step()).
  #    Note: this step is independent of DDP.
  model = ToyModel()
  model = model.to(DEVICE)
  ddp_model = DDP(model, device_ids=[rank])
  ddp_model.train()
  loss_fn = torch.nn.CrossEntropyLoss().to(DEVICE)
  print("The model is initiated!!!")

  # Init the optimizer.
  #
  # The optimizer is independent of DDP, so DDP does not guarantee that the
  # optimizer state is identical across processes. Most official optimizers
  # produce the same initial state when created from models in the same state,
  # so creating the optimizer after the DDP wrapper is enough. A custom
  # optimizer has to guarantee this consistency itself.
  optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.001, momentum=0.9)

  # Dataset
  # Every rank downloads into its own directory.
  download_path = f"./data_{rank}"
  my_trainset = torchvision.datasets.CIFAR10(root=download_path, train=True, download=True, transform=dataset_transform)
  # DDP: DistributedSampler hides the sharding details.
  train_sampler = torch.utils.data.distributed.DistributedSampler(my_trainset)
  # DDP: the DataLoader batch_size is the per-process batch size; the global
  # batch size is that value multiplied by the number of processes.
  assert BATCH_SIZE % world_size == 0
  trainloader = torch.utils.data.DataLoader(my_trainset, batch_size=BATCH_SIZE // world_size, sampler=train_sampler)

  for epoch in range(EPOCHS):
    # The distributed training loss is not going to be the same as the single device training.
    # The reason is that the distributed sampler uses "epoch" as the sampling seed in each host.
    #
    # DistributedSampler gives each process a non-overlapping part of the
    # dataset. To keep the per-epoch shuffle consistent across processes, all
    # processes use the same random seed, and that seed is the current epoch.
    # Hence set_epoch() must be called at the start of every epoch, otherwise
    # the data is not reshuffled between epochs.
    train_sampler.set_epoch(epoch)

    for it, (data, label) in enumerate(trainloader):
      data = data.to(DEVICE)
      label = label.to(DEVICE)

      # forward
      optimizer.zero_grad()
      outputs = ddp_model(data)

      # backward
      loss = loss_fn(outputs, label)
      # The updated gradients are communicated to all workers during backward()
      # It divides the models and params to buckets, and eagerly to communicate
      # the buckets whose gradient calculation are finished, to reduce latency.
      #
      # see more in https://zhuanlan.zhihu.com/p/485208899
      loss.backward()

      if rank == 0 and it % 1000 == 0:
        print(f"{epoch=}, {it=}, loss={loss.item():.3f}")

      optimizer.step()

  dist.destroy_process_group()



# Rendezvous address and port shared by all workers.
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12365'  # Pick a different port if this one is in use.

# Start one worker per rank, then wait for all of them.
processes = []
for rank in range(WORLD_SIZE):
  processes.append(mp.Process(target=run_single_process, args=(rank, WORLD_SIZE)))
  processes[-1].start()

for p in processes:
  p.join()

Starting process with rank=0, world_size=2, DEVICE='cuda:0'
Starting process with rank=1, world_size=2, DEVICE='cuda:1'
The model is initiated!!!
The model is initiated!!!
epoch=0, it=0, loss=2.284
epoch=0, it=1000, loss=2.184
epoch=0, it=2000, loss=2.077
epoch=0, it=3000, loss=1.255
epoch=1, it=0, loss=1.566
epoch=1, it=1000, loss=1.656
epoch=1, it=2000, loss=1.148
epoch=1, it=3000, loss=1.347
epoch=2, it=0, loss=1.631
epoch=2, it=1000, loss=1.296
epoch=2, it=2000, loss=1.237
epoch=2, it=3000, loss=1.030
epoch=3, it=0, loss=1.496
epoch=3, it=1000, loss=1.020
epoch=3, it=2000, loss=1.460
epoch=3, it=3000, loss=0.716
epoch=4, it=0, loss=0.923
epoch=4, it=1000, loss=0.725
epoch=4, it=2000, loss=0.722
epoch=4, it=3000, loss=2.054
epoch=5, it=0, loss=2.350
epoch=5, it=1000, loss=1.431
epoch=5, it=2000, loss=0.511
epoch=5, it=3000, loss=0.348
epoch=6, it=0, loss=0.767
epoch=6, it=1000, loss=1.566
epoch=6, it=2000, loss=0.967
epoch=6, it=3000, loss=1.096
epoch=7, it=0, loss=0.919
epoch=7, it

### Single GPU version

In [None]:
from torch.nn.parallel import DistributedDataParallel as DDP

# Single-GPU baseline for the DDP run above: same model, loss, optimizer
# settings and global batch size, but one process and no DistributedSampler.
DEVICE = 'cuda:0'

# Seed the RNG with the same value as the multi-GPU run so the initial weights
# are reproducible from one run to the next.
torch.manual_seed(123)

# Init the model
model = ToyModel().to(DEVICE)
model.train()
loss_fn = torch.nn.CrossEntropyLoss().to(DEVICE)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Dataset
download_path = "./data"
my_trainset = torchvision.datasets.CIFAR10(root=download_path, train=True, download=True, transform=dataset_transform)
# No sampler here: batches are read in dataset order with the full global batch size.
trainloader = torch.utils.data.DataLoader(my_trainset, batch_size=BATCH_SIZE)

for epoch in range(EPOCHS):
  for it, (data, label) in enumerate(trainloader):

    data = data.to(DEVICE)
    label = label.to(DEVICE)
    # forward
    optimizer.zero_grad()
    outputs = model(data)

    # backward
    loss = loss_fn(outputs, label)
    loss.backward()

    # Report the loss of every 1000th batch.
    if it % 1000 == 0:
      print(f"{epoch=}, {it=}, loss={loss.item():.3f}")

    optimizer.step()

epoch=0, it=0, loss=2.295
epoch=0, it=1000, loss=2.300
epoch=0, it=2000, loss=2.129
epoch=0, it=3000, loss=1.813
epoch=1, it=0, loss=1.598
epoch=1, it=1000, loss=1.659
epoch=1, it=2000, loss=1.492
epoch=1, it=3000, loss=1.881
epoch=2, it=0, loss=1.255
epoch=2, it=1000, loss=1.708
epoch=2, it=2000, loss=1.389
epoch=2, it=3000, loss=1.752
epoch=3, it=0, loss=1.004
epoch=3, it=1000, loss=1.540
epoch=3, it=2000, loss=1.190
epoch=3, it=3000, loss=1.657
epoch=4, it=0, loss=0.866
epoch=4, it=1000, loss=1.366
epoch=4, it=2000, loss=0.996
epoch=4, it=3000, loss=1.607
epoch=5, it=0, loss=0.778
epoch=5, it=1000, loss=1.309
epoch=5, it=2000, loss=0.907
epoch=5, it=3000, loss=1.496
epoch=6, it=0, loss=0.729
epoch=6, it=1000, loss=1.210
epoch=6, it=2000, loss=0.931
epoch=6, it=3000, loss=1.399
epoch=7, it=0, loss=0.721
epoch=7, it=1000, loss=1.168
epoch=7, it=2000, loss=0.902
epoch=7, it=3000, loss=1.318
epoch=8, it=0, loss=0.663
epoch=8, it=1000, loss=1.116
epoch=8, it=2000, loss=0.879
epoch=8, it=