From 500fd0aae944af9fd2b2a8808d770b993dc6061c Mon Sep 17 00:00:00 2001
From: Stuart Abercrombie <stuart.abercrombie@nanoporetech.com>
Date: Mon, 30 Oct 2023 14:38:15 +0000
Subject: [PATCH 1/3] Minor improvements to CPU beam search

---
 CMakeLists.txt                |  2 -
 dorado/decode/CPUDecoder.cpp  |  3 +-
 dorado/decode/beam_search.cpp | 54 ++++++++++++++---------
 dorado/decode/beam_search.h   |  1 -
 dorado/decode/fast_hash.h     | 83 -----------------------------------
 dorado/nn/MetalCRFModel.cpp   |  2 +-
 6 files changed, 35 insertions(+), 110 deletions(-)
 delete mode 100644 dorado/decode/fast_hash.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dbe6bad1..95bb2086 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -200,8 +200,6 @@ set(LIB_SOURCE_FILES
     dorado/demux/BarcodeClassifierSelector.cpp
     dorado/demux/BarcodeClassifierSelector.h
     dorado/decode/beam_search.cpp
-    dorado/decode/fast_hash.cpp
-    dorado/decode/fast_hash.h
     dorado/decode/beam_search.h
     dorado/decode/CPUDecoder.cpp
     dorado/decode/CPUDecoder.h
diff --git a/dorado/decode/CPUDecoder.cpp b/dorado/decode/CPUDecoder.cpp
index 64ddb4da..9d5536d1 100644
--- a/dorado/decode/CPUDecoder.cpp
+++ b/dorado/decode/CPUDecoder.cpp
@@ -131,8 +131,7 @@ std::vector<DecodedChunk> CPUDecoder::beam_search(const torch::Tensor& scores,
                     for (int i = 0; i < t_num_chunks; i++) {
                         auto decode_result = beam_search_decode(
                                 t_scores[i], bwd[i], posts[i], options.beam_width, options.beam_cut,
-                                options.blank_score, options.q_shift, options.q_scale,
-                                options.temperature, 1.0f);
+                                options.blank_score, options.q_shift, options.q_scale, 1.0f);
                         chunk_results[t_first_chunk + i] = DecodedChunk{
                                 std::get<0>(decode_result),
                                 std::get<1>(decode_result),
diff --git a/dorado/decode/beam_search.cpp b/dorado/decode/beam_search.cpp
index cc505205..e1881e44 100644
--- a/dorado/decode/beam_search.cpp
+++ b/dorado/decode/beam_search.cpp
@@ -1,6 +1,5 @@
 #include "beam_search.h"
 
-#include "fast_hash.h"
 #include "utils/simd.h"
 
 #include <math.h>
@@ -39,9 +38,9 @@ struct BeamFrontElement {
     bool stay;
 };
 
-float log_sum_exp(float x, float y, float t) {
-    float abs_diff = std::abs(x - y) / t;
-    return std::max(x, y) + ((abs_diff < 17.0f) ? (std::log1p(std::exp(-abs_diff)) * t) : 0.0f);
+float log_sum_exp(float x, float y) {
+    float abs_diff = std::abs(x - y);
+    return std::max(x, y) + ((abs_diff < 17.0f) ? (std::log1p(std::exp(-abs_diff))) : 0.0f);
 }
 
 int get_num_states(size_t num_trans_states) {
@@ -102,6 +101,22 @@ std::tuple<std::string, std::string> generate_sequence(const std::vector<uint8_t
     return make_tuple(sequence, qstring);
 }
 
+// Incorporates NUM_NEW_BITS into a Castagnoli CRC32, aka CRC32C
+// (not the same polynomial as CRC32 as used in zip/ethernet).
+template <int NUM_NEW_BITS>
+uint32_t crc32c(uint32_t crc, uint32_t new_bits) {
+    // Note that this is the reversed polynomial.
+    constexpr uint32_t POLYNOMIAL = 0x82f63b78u;
+    for (int i = 0; i < NUM_NEW_BITS; ++i) {
+        auto b = (new_bits ^ crc) & 1;
+        crc >>= 1;
+        if (b)
+            crc ^= POLYNOMIAL;
+        new_bits >>= 1;
+    }
+    return crc;
+}
+
 }  // anonymous namespace
 
 template <typename T>
@@ -117,7 +132,6 @@ float beam_search(const T* const scores,
                   std::vector<int32_t>& states,
                   std::vector<uint8_t>& moves,
                   std::vector<float>& qual_data,
-                  float temperature,
                   float score_scale) {
     const size_t num_states = 1 << num_state_bits;
     const auto states_mask = static_cast<state_t>(num_states - 1);
@@ -127,9 +141,9 @@ float beam_search(const T* const scores,
     }
 
     // Some values we need
-    constexpr uint64_t HASH_SEED = 0x880355f21e6d1965ULL;
+    constexpr uint32_t CRC_SEED = 0x12345678u;
     const float log_beam_cut =
-            (beam_cut > 0.0f) ? (temperature * logf(beam_cut)) : std::numeric_limits<float>::max();
+            (beam_cut > 0.0f) ? logf(beam_cut) : std::numeric_limits<float>::max();
 
     // Create the beam.  We need to keep beam_width elements for each block, plus the initial state
     std::vector<BeamElement> beam_vector(max_beam_width * (num_blocks + 1));
@@ -163,7 +177,7 @@ float beam_search(const T* const scores,
          state++) {
         if (back_guide[state] >= beam_init_threshold) {
             // Note that this first element has a prev_element_index of 0
-            prev_beam_front[beam_element] = {fasthash::chainfasthash64(HASH_SEED, state),
+            prev_beam_front[beam_element] = {crc32c<32>(CRC_SEED, state),
                                              static_cast<state_t>(state), 0, false};
             prev_scores[beam_element] = 0.0f;
             ++beam_element;
@@ -208,8 +222,8 @@ float beam_search(const T* const scores,
         // Essentially a k=1 Bloom filter, indicating the presence of steps with particular
         // sequence hashes.  Avoids comparing stay hashes against all possible progenitor
         // states where none of them has the requisite sequence hash.
-        const uint64_t HASH_PRESENT_BITS = 4096;
-        const uint64_t HASH_PRESENT_MASK = HASH_PRESENT_BITS - 1;
+        const uint32_t HASH_PRESENT_BITS = 4096;
+        const uint32_t HASH_PRESENT_MASK = HASH_PRESENT_BITS - 1;
         std::bitset<HASH_PRESENT_BITS> step_hash_present;  // Default constructor zeros content.
 
         // Generate list of candidate elements for this timestep (block).
@@ -219,7 +233,7 @@ float beam_search(const T* const scores,
             const auto& previous_element = prev_beam_front[prev_elem_idx];
 
             // Expand all the possible steps
-            for (size_t new_base = 0; new_base < NUM_BASES; new_base++) {
+            for (int new_base = 0; new_base < NUM_BASES; new_base++) {
                 state_t new_state =
                         (state_t((previous_element.state << NUM_BASE_BITS) & states_mask) |
                          new_base);
@@ -228,7 +242,8 @@ float beam_search(const T* const scores,
                         (((previous_element.state << NUM_BASE_BITS) >> num_state_bits)));
                 float new_score = prev_scores[prev_elem_idx] + fetch_block_score(move_idx) +
                                   static_cast<float>(block_back_scores[new_state]);
-                uint64_t new_hash = fasthash::chainfasthash64(previous_element.hash, new_state);
+                uint32_t new_hash = crc32c<2>(previous_element.hash, new_base);
+
                 step_hash_present[new_hash & HASH_PRESENT_MASK] = true;
 
                 // Add new element to the candidate list
@@ -266,18 +281,16 @@ float beam_search(const T* const scores,
                         current_beam_front[step_elem_idx].hash) {
                         if (current_scores[stay_elem_idx] > current_scores[step_elem_idx]) {
                             // Fold the step into the stay
-                            const float folded_score =
-                                    log_sum_exp(current_scores[stay_elem_idx],
-                                                current_scores[step_elem_idx], temperature);
+                            const float folded_score = log_sum_exp(current_scores[stay_elem_idx],
+                                                                   current_scores[step_elem_idx]);
                             current_scores[stay_elem_idx] = folded_score;
                             max_score = std::max(max_score, folded_score);
                             // The step element will end up last, sorted by score
                             current_scores[step_elem_idx] = std::numeric_limits<float>::lowest();
                         } else {
                             // Fold the stay into the step
-                            const float folded_score =
-                                    log_sum_exp(current_scores[stay_elem_idx],
-                                                current_scores[step_elem_idx], temperature);
+                            const float folded_score = log_sum_exp(current_scores[stay_elem_idx],
+                                                                   current_scores[step_elem_idx]);
                             current_scores[step_elem_idx] = folded_score;
                             max_score = std::max(max_score, folded_score);
                             // The stay element will end up last, sorted by score
@@ -508,7 +521,6 @@ std::tuple<std::string, std::string, std::vector<uint8_t>> beam_search_decode(
         float fixed_stay_score,
         float q_shift,
         float q_scale,
-        float temperature,
         float byte_score_scale) {
     const int num_blocks = int(scores_t.size(0));
     const int num_states = get_num_states(scores_t.size(1));
@@ -542,14 +554,14 @@ std::tuple<std::string, std::string, std::vector<uint8_t>> beam_search_decode(
 
         beam_search<float>(scores, scores_block_stride, back_guides, posts, num_state_bits,
                            num_blocks, max_beam_width, beam_cut, fixed_stay_score, states, moves,
-                           qual_data, temperature, 1.0f);
+                           qual_data, 1.0f);
     } else if (scores_t.dtype() == torch::kInt8) {
         const auto scores = scores_block_contig.data_ptr<int8_t>();
         const auto back_guides = back_guides_contig->data_ptr<float>();
         const auto posts = posts_contig->data_ptr<float>();
         beam_search<int8_t>(scores, scores_block_stride, back_guides, posts, num_state_bits,
                             num_blocks, max_beam_width, beam_cut, fixed_stay_score, states, moves,
-                            qual_data, temperature, byte_score_scale);
+                            qual_data, byte_score_scale);
     } else {
         throw std::runtime_error(std::string("beam_search_decode: unsupported tensor type ") +
                                  std::string(scores_t.dtype().name()));
diff --git a/dorado/decode/beam_search.h b/dorado/decode/beam_search.h
index e52fd691..fc4ede75 100644
--- a/dorado/decode/beam_search.h
+++ b/dorado/decode/beam_search.h
@@ -17,5 +17,4 @@ std::tuple<std::string, std::string, std::vector<uint8_t>> beam_search_decode(
         float fixed_stay_score,
         float q_shift,
         float q_scale,
-        float temperature,
         float byte_score_scale);
diff --git a/dorado/decode/fast_hash.h b/dorado/decode/fast_hash.h
deleted file mode 100644
index a5105a59..00000000
--- a/dorado/decode/fast_hash.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* The MIT License
-   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
-   Permission is hereby granted, free of charge, to any person
-   obtaining a copy of this software and associated documentation
-   files (the "Software"), to deal in the Software without
-   restriction, including without limitation the rights to use, copy,
-   modify, merge, publish, distribute, sublicense, and/or sell copies
-   of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-   The above copyright notice and this permission notice shall be
-   included in all copies or substantial portions of the Software.
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-*/
-
-/*  Modifications licensed by MIT licence.
- *  Copyright (C) 2020 Oxford Nanopore Technologies
- */
-
-#pragma once
-
-#include <cstdint>
-#include <cstdio>
-
-namespace fasthash {
-
-// Compression function for Merkle-Damgard construction.
-// This function is generated using the framework provided.
-inline uint64_t mix(uint64_t h) {
-    h ^= h >> 23;
-    h *= 0x2127599bf4325c37ULL;
-    h ^= h >> 47;
-    return h;
-}
-
-/**
- * fasthash32 - 32-bit implementation of fasthash
- * @buf:  data buffer
- * @len:  data size
- * @seed: the seed
- */
-uint32_t fasthash32(const void *buf, size_t len, uint32_t seed);
-
-/**
- * fasthash64 - 64-bit implementation of fasthash
- * @buf:  data buffer
- * @len:  data size
- * @seed: the seed
- */
-uint64_t fasthash64(const void *buf, size_t len, uint64_t seed);
-
-/**  Chain a new value to hash
- *
- *   `fasthash64` specialised to case of calculating the new hash
- *   from the previous data when a new value is appended.
- *
- *   Note:
- *       It is assumed that the value added is always 64bits wide,
- *   unlike `fasthash64` which combines blocks of bytes into 64bits
- *   values.
- *
- *   Args:
- *       hash (uint64_t): Hash of previous data
- *       val  (uint64_t): Value to chain to hash
- *
- *   Returns:
- *       uint64_t: New hash with value added
- **/
-inline uint64_t chainfasthash64(uint64_t hash, uint64_t val) {
-    const uint64_t m = 0x880355f21e6d1965ULL;
-
-    hash ^= mix(val);
-    hash *= m;
-    return mix(hash);
-}
-
-}  // namespace fasthash
diff --git a/dorado/nn/MetalCRFModel.cpp b/dorado/nn/MetalCRFModel.cpp
index 607f7c5c..9407bbed 100644
--- a/dorado/nn/MetalCRFModel.cpp
+++ b/dorado/nn/MetalCRFModel.cpp
@@ -891,7 +891,7 @@ class MetalCaller {
                     m_bwd.at(out_buf_idx)[buf_chunk_idx], m_posts.at(out_buf_idx)[buf_chunk_idx],
                     m_decoder_options.beam_width, m_decoder_options.beam_cut,
                     m_decoder_options.blank_score, m_decoder_options.q_shift,
-                    m_decoder_options.q_scale, m_decoder_options.temperature, score_scale);
+                    m_decoder_options.q_scale, score_scale);
 
             (*task->out_chunks)[chunk_idx] = DecodedChunk{sequence, qstring, moves};
 

From e7f5a5094bbf59928cdaf0ca34943f3ca5afed71 Mon Sep 17 00:00:00 2001
From: Stuart Abercrombie <stuart.abercrombie@nanoporetech.com>
Date: Mon, 30 Oct 2023 17:50:39 +0000
Subject: [PATCH 2/3] Remove fast_hash.cpp

---
 dorado/decode/fast_hash.cpp | 79 -------------------------------------
 1 file changed, 79 deletions(-)
 delete mode 100644 dorado/decode/fast_hash.cpp

diff --git a/dorado/decode/fast_hash.cpp b/dorado/decode/fast_hash.cpp
deleted file mode 100644
index 4b662ebb..00000000
--- a/dorado/decode/fast_hash.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* The MIT License
-   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
-   Permission is hereby granted, free of charge, to any person
-   obtaining a copy of this software and associated documentation
-   files (the "Software"), to deal in the Software without
-   restriction, including without limitation the rights to use, copy,
-   modify, merge, publish, distribute, sublicense, and/or sell copies
-   of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-   The above copyright notice and this permission notice shall be
-   included in all copies or substantial portions of the Software.
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-*/
-
-/*  Modifications licensed by MIT licence.
- *  Copyright (C) 2020 Oxford Nanopore Technologies
- */
-
-#include "fast_hash.h"
-
-namespace fasthash {
-
-uint64_t fasthash64(const void *buf, size_t len, uint64_t seed) {
-    const uint64_t m = 0x880355f21e6d1965ULL;
-    const uint64_t *pos = (const uint64_t *)buf;
-    const uint64_t *end = pos + (len / 8);
-    const unsigned char *pos2;
-    uint64_t h = seed ^ (len * m);
-    uint64_t v;
-
-    while (pos != end) {
-        v = *pos++;
-        v = mix(v);
-        h ^= v;
-        h *= m;
-    }
-
-    pos2 = (const unsigned char *)pos;
-    v = 0;
-
-    switch (len & 7) {
-    case 7:
-        v ^= (uint64_t)pos2[6] << 48;
-    case 6:
-        v ^= (uint64_t)pos2[5] << 40;
-    case 5:
-        v ^= (uint64_t)pos2[4] << 32;
-    case 4:
-        v ^= (uint64_t)pos2[3] << 24;
-    case 3:
-        v ^= (uint64_t)pos2[2] << 16;
-    case 2:
-        v ^= (uint64_t)pos2[1] << 8;
-    case 1:
-        v ^= (uint64_t)pos2[0];
-        v = mix(v);
-        h ^= v;
-        h *= m;
-    }
-    h = mix(h);
-    return h;
-}
-
-uint32_t fasthash32(const void *buf, size_t len, uint32_t seed) {
-    // the following trick converts the 64-bit hashcode to Fermat
-    // residue, which shall retain information from both the higher
-    // and lower parts of hashcode.
-    uint64_t h = fasthash64(buf, len, seed);
-    return uint32_t(h - (h >> 32));
-}
-
-}  // namespace fasthash

From 267aa022b4c4a5935b6ed2a9afab3afa939a8d0c Mon Sep 17 00:00:00 2001
From: Stuart Abercrombie <stuart.abercrombie@nanoporetech.com>
Date: Mon, 30 Oct 2023 17:56:05 +0000
Subject: [PATCH 3/3] Address review comments.  Correct hash type.

---
 dorado/decode/beam_search.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dorado/decode/beam_search.cpp b/dorado/decode/beam_search.cpp
index e1881e44..2eb91021 100644
--- a/dorado/decode/beam_search.cpp
+++ b/dorado/decode/beam_search.cpp
@@ -32,7 +32,7 @@ struct BeamElement {
 // This is the data we need to retain for only the previous timestep (block) in the beam
 // (and what we construct for the new timestep)
 struct BeamFrontElement {
-    uint64_t hash;
+    uint32_t hash;
     state_t state;
     uint8_t prev_element_index;
     bool stay;
@@ -242,7 +242,7 @@ float beam_search(const T* const scores,
                         (((previous_element.state << NUM_BASE_BITS) >> num_state_bits)));
                 float new_score = prev_scores[prev_elem_idx] + fetch_block_score(move_idx) +
                                   static_cast<float>(block_back_scores[new_state]);
-                uint32_t new_hash = crc32c<2>(previous_element.hash, new_base);
+                uint32_t new_hash = crc32c<NUM_BASE_BITS>(previous_element.hash, new_base);
 
                 step_hash_present[new_hash & HASH_PRESENT_MASK] = true;