Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sgemm和openblas结果不一致,导致推理结果相差很远 #14

Closed
kismit opened this issue Aug 22, 2022 · 2 comments
Closed

sgemm和openblas结果不一致,导致推理结果相差很远 #14

kismit opened this issue Aug 22, 2022 · 2 comments

Comments

@kismit
Copy link

kismit commented Aug 22, 2022

如题, 使用EMLL sgemm计算的结果和Openblas cblas_sgemm计算的结果有较小差异,但是会导致模型推理结果不正常, 然而使用EMLL 动态量化再s8s32gemm计算最后反量化的方式, 推理结果是正常的, 这两者间的差异是什么?
代码如下:

  // Quantization strategy selector for emll_sgemm.
  enum QuantType
  {
    NO_QUANT = 0,   // run the float32 sgemm kernel directly
    SYMMETRIC = 1,  // int8 symmetric quantization + s8s32gemm + dequantize
    ASYMMETRIC = 2  // uint8 asymmetric quantization + u8u32gemm + bias + dequantize
  };

inline int emll_s8s32gemm(bool transpose_a, bool transpose_b,
                            dim_t m, dim_t n, dim_t k,
                            const int8_t *a,
                            const int8_t *b,
                            float beta,
                            int32_t *c)
  {
    // Row-major C = op(A) * op(B) through EMLL's kernel: the operands are
    // passed in swapped order (B first, A second) and the transpose flags
    // follow their operands, matching the original four-way dispatch:
    //   (!ta, !tb) -> (0,0), (ta, !tb) -> (0,1),
    //   (!ta,  tb) -> (1,0), (ta,  tb) -> (1,1).
    const int flag_b = transpose_b ? 1 : 0;
    const int flag_a = transpose_a ? 1 : 0;
    return s8s32gemm(flag_b, flag_a, b, a, c, n, m, k, beta, 0);
  }

  inline int emll_u8u32gemm(bool transpose_a, bool transpose_b,
                            dim_t m, dim_t n, dim_t k,
                            const uint8_t *a,
                            const uint8_t *b,
                            float beta,
                            uint32_t *c)
  {
    // Unsigned 8->32 bit GEMM wrapper; identical operand-swap convention to
    // emll_s8s32gemm: B before A, first flag = transpose_b, second = transpose_a.
    const int flag_b = transpose_b ? 1 : 0;
    const int flag_a = transpose_a ? 1 : 0;
    return u8u32gemm(flag_b, flag_a, b, a, c, n, m, k, beta, 0);
  }

  int emll_sgemm(bool transpose_a, bool transpose_b,
                 dim_t m, dim_t n, dim_t k,
                 float alpha,
                 const float *a,
                 const float *b,
                 float beta,
                 float *c,
                 QuantType quant_type)
  {
    int status;

    // EMLL kernels have no alpha parameter, so alpha != 1 is folded into a
    // scaled temporary copy of A.
    float *a_f = nullptr;
    if (alpha != 1.0f)
    {
      a_f = static_cast<float *>(allocator.allocate(m * k * sizeof(float)));
      cpu::parallel_for(0, m * k, cpu::GRAIN_SIZE / 2, [&](dim_t begin, dim_t end) {
        for (dim_t i = begin; i < end; ++i)
        {
          a_f[i] = static_cast<float>(alpha * a[i]);
        }
      });
    }

    // Source matrix for A in every path below (scaled copy if alpha != 1).
    const float *a_src = (a_f != nullptr) ? a_f : a;

    // EMLL operand-swap convention (row-major result): B first, A second,
    // first transpose flag = transpose_b, second = transpose_a.
    const int flag_b = transpose_b ? 1 : 0;
    const int flag_a = transpose_a ? 1 : 0;

    if (quant_type == QuantType::NO_QUANT)
    {
      // BUG FIX (EMLL issue #14): EMLL's sgemm computes beta * C + A * B and
      // reads C even when beta == 0.  Uninitialized bytes in C that decode as
      // INF/NAN then poison the result (0 * NAN == NAN), which made the float
      // path diverge from OpenBLAS.  Zero-fill C before the call when beta == 0.
      if (beta == 0.0f)
      {
        for (dim_t i = 0; i < m * n; ++i)
        {
          c[i] = 0.0f;
        }
      }
      status = sgemm(flag_b, flag_a, b, a_src, c, n, m, k, beta, 0);
    }
    else if (quant_type == QuantType::SYMMETRIC)
    {
      // Symmetric int8 quantization of both operands, int32 GEMM, then
      // dequantization by the product of the two scales.
      int8_t *const a_s = static_cast<int8_t *>(allocator.allocate(m * k * sizeof(int8_t)));
      int8_t *const b_s = static_cast<int8_t *>(allocator.allocate(n * k * sizeof(int8_t)));
      int32_t *const c_qs = static_cast<int32_t *>(allocator.allocate(m * n * sizeof(int32_t)));

      float scale_a, scale_b;

      quantize_symmetric_f32_s8(a_src, a_s, &scale_a, m * k, 0, -1);
      quantize_symmetric_f32_s8(b, b_s, &scale_b, n * k, 0, -1);

      status = emll_s8s32gemm(transpose_a, transpose_b,
                              m, n, k, a_s, b_s, beta, c_qs);
      if (status != 0)
      {
        // Fixed copy-paste error: this branch calls s8s32gemm, not u8u32gemm.
        fprintf(stderr, "s8s32gemm returns error code %d\n", status);
      }
      else
      {
        dequantize_symmetric_f32_s32(c_qs, c, scale_a * scale_b, m * n);
      }

      allocator.free(a_s);
      allocator.free(b_s);
      allocator.free(c_qs);
    }
    else // ASYMMETRIC
    {
      // Asymmetric uint8 quantization: the int32 GEMM result must be
      // bias-corrected for the two zero points before dequantization.
      uint8_t *const a_u = static_cast<uint8_t *>(allocator.allocate(m * k * sizeof(uint8_t)));
      uint8_t *const b_u = static_cast<uint8_t *>(allocator.allocate(n * k * sizeof(uint8_t)));
      int32_t *const c_qu = static_cast<int32_t *>(allocator.allocate(m * n * sizeof(int32_t)));

      uint32_t *const a_sum = (uint32_t *)(allocator.allocate(m * sizeof(uint32_t)));
      uint32_t *const b_sum = (uint32_t *)(allocator.allocate(n * sizeof(uint32_t)));

      float scale_a, scale_b;
      uint8_t zero_point_a, zero_point_b;

      quantize_asymmetric_f32_u8(a_src, a_u, &zero_point_a, &scale_a, m * k, 0, -1);
      quantize_asymmetric_f32_u8(b, b_u, &zero_point_b, &scale_b, n * k, 0, -1);

      status = emll_u8u32gemm(transpose_a, transpose_b,
                              m, n, k, a_u, b_u, beta, (uint32_t*)c_qu);

      if (status != 0)
      {
        fprintf(stderr, "u8u32gemm returns error code %d\n", status);
      }
      else
      {
        /* sum row/col of source matrices (along K dim) */
        u8u32_sum(a_u, (uint32_t*)(a_sum), m, k, 0);
        u8u32_sum(b_u, (uint32_t*)(b_sum), k, n, 1);
        /* bias the result of 8->32 bit GEMM for the zero points */
        bias_int32_t(c_qu,
                     (int32_t)zero_point_a * (int32_t)zero_point_b * (int32_t)k,
                     (int32_t *)(a_sum), -(int32_t)zero_point_b,
                     (int32_t *)(b_sum), -(int32_t)zero_point_a, m, n);
        /* dequantize the result: dequant(input, output, scale, length) */
        dequantize_symmetric_f32_s32(c_qu, c, scale_a * scale_b, m * n);
      }

      allocator.free(a_u);
      allocator.free(b_u);
      allocator.free(c_qu);
      allocator.free(a_sum);
      allocator.free(b_sum);
    }

    if (a_f != nullptr)
    {
      allocator.free(a_f);
    }

    return status;
  }


@netease-youdao
Copy link
Owner

netease-youdao commented Sep 5, 2022

您好,可以先看一下emll_sgemm后的c元素中有没有NAN,如果有的话可能是传入的参数c指向了未初始化的内存(其中元素被解析为浮点数时可能出现INF或NAN,和零相乘得到NAN,则乘法结束后c的部分元素成为NAN),我们会在后续的版本修复这个问题。如需尽快解决,可以尝试在调用emll_sgemm之前对目标矩阵进行全部填零。

std::fill(c, c + m * n, 0.0f);

若不是以上的问题,那请问方便给一下不量化和量化的结果之间的相似度(余弦和欧式)吗?

@kismit
Copy link
Author

kismit commented Sep 9, 2022

根据这个修改问题已解决,非常感谢!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants