From 500fd0aae944af9fd2b2a8808d770b993dc6061c Mon Sep 17 00:00:00 2001 From: Stuart Abercrombie Date: Mon, 30 Oct 2023 14:38:15 +0000 Subject: [PATCH 1/3] Minor improvements to CPU beam search --- CMakeLists.txt | 2 - dorado/decode/CPUDecoder.cpp | 3 +- dorado/decode/beam_search.cpp | 54 ++++++++++++++--------- dorado/decode/beam_search.h | 1 - dorado/decode/fast_hash.h | 83 ----------------------------------- dorado/nn/MetalCRFModel.cpp | 2 +- 6 files changed, 35 insertions(+), 110 deletions(-) delete mode 100644 dorado/decode/fast_hash.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dbe6bad1..95bb2086 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,8 +200,6 @@ set(LIB_SOURCE_FILES dorado/demux/BarcodeClassifierSelector.cpp dorado/demux/BarcodeClassifierSelector.h dorado/decode/beam_search.cpp - dorado/decode/fast_hash.cpp - dorado/decode/fast_hash.h dorado/decode/beam_search.h dorado/decode/CPUDecoder.cpp dorado/decode/CPUDecoder.h diff --git a/dorado/decode/CPUDecoder.cpp b/dorado/decode/CPUDecoder.cpp index 64ddb4da..9d5536d1 100644 --- a/dorado/decode/CPUDecoder.cpp +++ b/dorado/decode/CPUDecoder.cpp @@ -131,8 +131,7 @@ std::vector CPUDecoder::beam_search(const torch::Tensor& scores, for (int i = 0; i < t_num_chunks; i++) { auto decode_result = beam_search_decode( t_scores[i], bwd[i], posts[i], options.beam_width, options.beam_cut, - options.blank_score, options.q_shift, options.q_scale, - options.temperature, 1.0f); + options.blank_score, options.q_shift, options.q_scale, 1.0f); chunk_results[t_first_chunk + i] = DecodedChunk{ std::get<0>(decode_result), std::get<1>(decode_result), diff --git a/dorado/decode/beam_search.cpp b/dorado/decode/beam_search.cpp index cc505205..e1881e44 100644 --- a/dorado/decode/beam_search.cpp +++ b/dorado/decode/beam_search.cpp @@ -1,6 +1,5 @@ #include "beam_search.h" -#include "fast_hash.h" #include "utils/simd.h" #include @@ -39,9 +38,9 @@ struct BeamFrontElement { bool stay; }; -float log_sum_exp(float x, float y, float t) { - float abs_diff = std::abs(x - y) / t; - return std::max(x, y) + ((abs_diff < 17.0f) ? (std::log1p(std::exp(-abs_diff)) * t) : 0.0f); +float log_sum_exp(float x, float y) { + float abs_diff = std::abs(x - y); + return std::max(x, y) + ((abs_diff < 17.0f) ? (std::log1p(std::exp(-abs_diff))) : 0.0f); } int get_num_states(size_t num_trans_states) { @@ -102,6 +101,22 @@ std::tuple generate_sequence(const std::vector +uint32_t crc32c(uint32_t crc, uint32_t new_bits) { + // Note that this is the reversed polynomial. + constexpr uint32_t POLYNOMIAL = 0x82f63b78u; + for (int i = 0; i < NUM_NEW_BITS; ++i) { + auto b = (new_bits ^ crc) & 1; + crc >>= 1; + if (b) + crc ^= POLYNOMIAL; + new_bits >>= 1; + } + return crc; +} + } // anonymous namespace template @@ -117,7 +132,6 @@ float beam_search(const T* const scores, std::vector& states, std::vector& moves, std::vector& qual_data, - float temperature, float score_scale) { const size_t num_states = 1 << num_state_bits; const auto states_mask = static_cast(num_states - 1); @@ -127,9 +141,9 @@ float beam_search(const T* const scores, } // Some values we need - constexpr uint64_t HASH_SEED = 0x880355f21e6d1965ULL; + constexpr uint32_t CRC_SEED = 0x12345678u; const float log_beam_cut = - (beam_cut > 0.0f) ? (temperature * logf(beam_cut)) : std::numeric_limits::max(); + (beam_cut > 0.0f) ? logf(beam_cut) : std::numeric_limits::max(); // Create the beam. We need to keep beam_width elements for each block, plus the initial state std::vector beam_vector(max_beam_width * (num_blocks + 1)); @@ -163,7 +177,7 @@ float beam_search(const T* const scores, state++) { if (back_guide[state] >= beam_init_threshold) { // Note that this first element has a prev_element_index of 0 - prev_beam_front[beam_element] = {fasthash::chainfasthash64(HASH_SEED, state), + prev_beam_front[beam_element] = {crc32c<32>(CRC_SEED, state), static_cast(state), 0, false}; prev_scores[beam_element] = 0.0f; ++beam_element; @@ -208,8 +222,8 @@ float beam_search(const T* const scores, // Essentially a k=1 Bloom filter, indicating the presence of steps with particular // sequence hashes. Avoids comparing stay hashes against all possible progenitor // states where none of them has the requisite sequence hash. - const uint64_t HASH_PRESENT_BITS = 4096; - const uint64_t HASH_PRESENT_MASK = HASH_PRESENT_BITS - 1; + const uint32_t HASH_PRESENT_BITS = 4096; + const uint32_t HASH_PRESENT_MASK = HASH_PRESENT_BITS - 1; std::bitset step_hash_present; // Default constructor zeros content. // Generate list of candidate elements for this timestep (block). @@ -219,7 +233,7 @@ float beam_search(const T* const scores, const auto& previous_element = prev_beam_front[prev_elem_idx]; // Expand all the possible steps - for (size_t new_base = 0; new_base < NUM_BASES; new_base++) { + for (int new_base = 0; new_base < NUM_BASES; new_base++) { state_t new_state = (state_t((previous_element.state << NUM_BASE_BITS) & states_mask) | new_base); @@ -228,7 +242,8 @@ float beam_search(const T* const scores, (((previous_element.state << NUM_BASE_BITS) >> num_state_bits))); float new_score = prev_scores[prev_elem_idx] + fetch_block_score(move_idx) + static_cast(block_back_scores[new_state]); - uint64_t new_hash = fasthash::chainfasthash64(previous_element.hash, new_state); + uint32_t new_hash = crc32c<2>(previous_element.hash, new_base); + step_hash_present[new_hash & HASH_PRESENT_MASK] = true; // Add new element to the candidate list @@ -266,18 +281,16 @@ float beam_search(const T* const scores, current_beam_front[step_elem_idx].hash) { if (current_scores[stay_elem_idx] > current_scores[step_elem_idx]) { // Fold the step into the stay - const float folded_score = - log_sum_exp(current_scores[stay_elem_idx], - current_scores[step_elem_idx], temperature); + const float folded_score = log_sum_exp(current_scores[stay_elem_idx], + current_scores[step_elem_idx]); current_scores[stay_elem_idx] = folded_score; max_score = std::max(max_score, folded_score); // The step element will end up last, sorted by score current_scores[step_elem_idx] = std::numeric_limits::lowest(); } else { // Fold the stay into the step - const float folded_score = - log_sum_exp(current_scores[stay_elem_idx], - current_scores[step_elem_idx], temperature); + const float folded_score = log_sum_exp(current_scores[stay_elem_idx], + current_scores[step_elem_idx]); current_scores[step_elem_idx] = folded_score; max_score = std::max(max_score, folded_score); // The stay element will end up last, sorted by score @@ -508,7 +521,6 @@ std::tuple> beam_search_decode( float fixed_stay_score, float q_shift, float q_scale, - float temperature, float byte_score_scale) { const int num_blocks = int(scores_t.size(0)); const int num_states = get_num_states(scores_t.size(1)); @@ -542,14 +554,14 @@ std::tuple> beam_search_decode( beam_search(scores, scores_block_stride, back_guides, posts, num_state_bits, num_blocks, max_beam_width, beam_cut, fixed_stay_score, states, moves, - qual_data, temperature, 1.0f); + qual_data, 1.0f); } else if (scores_t.dtype() == torch::kInt8) { const auto scores = scores_block_contig.data_ptr(); const auto back_guides = back_guides_contig->data_ptr(); const auto posts = posts_contig->data_ptr(); beam_search(scores, scores_block_stride, back_guides, posts, num_state_bits, num_blocks, max_beam_width, beam_cut, fixed_stay_score, states, moves, - qual_data, temperature, byte_score_scale); + qual_data, byte_score_scale); } else { throw std::runtime_error(std::string("beam_search_decode: unsupported tensor type ") + std::string(scores_t.dtype().name())); diff --git a/dorado/decode/beam_search.h b/dorado/decode/beam_search.h index e52fd691..fc4ede75 100644 --- a/dorado/decode/beam_search.h +++ b/dorado/decode/beam_search.h @@ -17,5 +17,4 @@ std::tuple> beam_search_decode( float fixed_stay_score, float q_shift, float q_scale, - float temperature, float byte_score_scale); diff --git a/dorado/decode/fast_hash.h b/dorado/decode/fast_hash.h deleted file mode 100644 index a5105a59..00000000 --- a/dorado/decode/fast_hash.h +++ /dev/null @@ -1,83 +0,0 @@ -/* The MIT License - Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com) - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without - restriction, including without limitation the rights to use, copy, - modify, merge, publish, distribute, sublicense, and/or sell copies - of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Modifications licensed by MIT licence. - * Copyright (C) 2020 Oxford Nanopore Technologies - */ - -#pragma once - -#include -#include - -namespace fasthash { - -// Compression function for Merkle-Damgard construction. -// This function is generated using the framework provided. -inline uint64_t mix(uint64_t h) { - h ^= h >> 23; - h *= 0x2127599bf4325c37ULL; - h ^= h >> 47; - return h; -} - -/** - * fasthash32 - 32-bit implementation of fasthash - * @buf: data buffer - * @len: data size - * @seed: the seed - */ -uint32_t fasthash32(const void *buf, size_t len, uint32_t seed); - -/** - * fasthash64 - 64-bit implementation of fasthash - * @buf: data buffer - * @len: data size - * @seed: the seed - */ -uint64_t fasthash64(const void *buf, size_t len, uint64_t seed); - -/** Chain a new value to hash - * - * `fasthash64` specialised to case of calculating the new hash - * from the previous data when a new value is appended. - * - * Note: - * It is assumed that the value added is always 64bits wide, - * unlike `fasthash64` which combines blocks of bytes into 64bits - * values. - * - * Args: - * hash (uint64_t): Hash of previous data - * val (uint64_t): Value to chain to hash - * - * Returns: - * uint64_t: New hash with value added - **/ -inline uint64_t chainfasthash64(uint64_t hash, uint64_t val) { - const uint64_t m = 0x880355f21e6d1965ULL; - - hash ^= mix(val); - hash *= m; - return mix(hash); -} - -} // namespace fasthash diff --git a/dorado/nn/MetalCRFModel.cpp b/dorado/nn/MetalCRFModel.cpp index 607f7c5c..9407bbed 100644 --- a/dorado/nn/MetalCRFModel.cpp +++ b/dorado/nn/MetalCRFModel.cpp @@ -891,7 +891,7 @@ class MetalCaller { m_bwd.at(out_buf_idx)[buf_chunk_idx], m_posts.at(out_buf_idx)[buf_chunk_idx], m_decoder_options.beam_width, m_decoder_options.beam_cut, m_decoder_options.blank_score, m_decoder_options.q_shift, - m_decoder_options.q_scale, m_decoder_options.temperature, score_scale); + m_decoder_options.q_scale, score_scale); (*task->out_chunks)[chunk_idx] = DecodedChunk{sequence, qstring, moves}; From e7f5a5094bbf59928cdaf0ca34943f3ca5afed71 Mon Sep 17 00:00:00 2001 From: Stuart Abercrombie Date: Mon, 30 Oct 2023 17:50:39 +0000 Subject: [PATCH 2/3] Remove fast_hash.cpp --- dorado/decode/fast_hash.cpp | 79 ------------------------------------- 1 file changed, 79 deletions(-) delete mode 100644 dorado/decode/fast_hash.cpp diff --git a/dorado/decode/fast_hash.cpp b/dorado/decode/fast_hash.cpp deleted file mode 100644 index 4b662ebb..00000000 --- a/dorado/decode/fast_hash.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* The MIT License - Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com) - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without - restriction, including without limitation the rights to use, copy, - modify, merge, publish, distribute, sublicense, and/or sell copies - of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Modifications licensed by MIT licence. - * Copyright (C) 2020 Oxford Nanopore Technologies - */ - -#include "fast_hash.h" - -namespace fasthash { - -uint64_t fasthash64(const void *buf, size_t len, uint64_t seed) { - const uint64_t m = 0x880355f21e6d1965ULL; - const uint64_t *pos = (const uint64_t *)buf; - const uint64_t *end = pos + (len / 8); - const unsigned char *pos2; - uint64_t h = seed ^ (len * m); - uint64_t v; - - while (pos != end) { - v = *pos++; - v = mix(v); - h ^= v; - h *= m; - } - - pos2 = (const unsigned char *)pos; - v = 0; - - switch (len & 7) { - case 7: - v ^= (uint64_t)pos2[6] << 48; - case 6: - v ^= (uint64_t)pos2[5] << 40; - case 5: - v ^= (uint64_t)pos2[4] << 32; - case 4: - v ^= (uint64_t)pos2[3] << 24; - case 3: - v ^= (uint64_t)pos2[2] << 16; - case 2: - v ^= (uint64_t)pos2[1] << 8; - case 1: - v ^= (uint64_t)pos2[0]; - v = mix(v); - h ^= v; - h *= m; - } - h = mix(h); - return h; -} - -uint32_t fasthash32(const void *buf, size_t len, uint32_t seed) { - // the following trick converts the 64-bit hashcode to Fermat - // residue, which shall retain information from both the higher - // and lower parts of hashcode. - uint64_t h = fasthash64(buf, len, seed); - return uint32_t(h - (h >> 32)); -} - -} // namespace fasthash From 267aa022b4c4a5935b6ed2a9afab3afa939a8d0c Mon Sep 17 00:00:00 2001 From: Stuart Abercrombie Date: Mon, 30 Oct 2023 17:56:05 +0000 Subject: [PATCH 3/3] Address review comments. Correct hash type. --- dorado/decode/beam_search.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dorado/decode/beam_search.cpp b/dorado/decode/beam_search.cpp index e1881e44..2eb91021 100644 --- a/dorado/decode/beam_search.cpp +++ b/dorado/decode/beam_search.cpp @@ -32,7 +32,7 @@ struct BeamElement { // This is the data we need to retain for only the previous timestep (block) in the beam // (and what we construct for the new timestep) struct BeamFrontElement { - uint64_t hash; + uint32_t hash; state_t state; uint8_t prev_element_index; bool stay; @@ -242,7 +242,7 @@ float beam_search(const T* const scores, (((previous_element.state << NUM_BASE_BITS) >> num_state_bits))); float new_score = prev_scores[prev_elem_idx] + fetch_block_score(move_idx) + static_cast(block_back_scores[new_state]); - uint32_t new_hash = crc32c<2>(previous_element.hash, new_base); + uint32_t new_hash = crc32c(previous_element.hash, new_base); step_hash_present[new_hash & HASH_PRESENT_MASK] = true;