From e90b81ff88432d27237bffac5f6c8df3ef5b9bcb Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Tue, 12 Dec 2023 10:55:11 +0000
Subject: [PATCH 1/5] [INSTX-3094] Use find_program() to find autotools
 programs

We can't assume that these are available when building through Xcode
since it doesn't know about brew.

Also remove the reconf logic that doesn't appear to be needed.
---
 cmake/Htslib.cmake | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/cmake/Htslib.cmake b/cmake/Htslib.cmake
index 077100a4..f3141130 100644
--- a/cmake/Htslib.cmake
+++ b/cmake/Htslib.cmake
@@ -11,18 +11,14 @@ if(NOT TARGET htslib) # lazy include guard
             "INTERFACE_INCLUDE_DIRECTORIES" ${HTSLIB_DIR})
         target_link_directories(htslib INTERFACE ${HTSLIB_DIR})
     else()
-        message(STATUS "Building htslib")
-        set(HTSLIB_DIR ${DORADO_3RD_PARTY_SOURCE}/htslib CACHE STRING
-                    "Path to htslib repo")
-        set(MAKE_COMMAND make)
-        set(AUTOCONF_COMMAND autoconf)
-        execute_process(COMMAND bash -c "autoconf -V | sed 's/.* //; q'"
-                OUTPUT_VARIABLE AUTOCONF_VERS)
-        if(AUTOCONF_VERS VERSION_GREATER_EQUAL 2.70)
-            set(AUTOCONF_COMMAND autoreconf --install)
-        endif()
+        message(STATUS "Setting up htslib build")
+        set(HTSLIB_DIR ${DORADO_3RD_PARTY_SOURCE}/htslib CACHE STRING "Path to htslib repo")
         set(htslib_PREFIX ${CMAKE_BINARY_DIR}/3rdparty/htslib)
 
+        find_program(MAKE_COMMAND make REQUIRED)
+        find_program(AUTOCONF_COMMAND autoconf REQUIRED)
+        find_program(AUTOHEADER_COMMAND autoheader REQUIRED)
+
         # The Htslib build apparently requires BUILD_IN_SOURCE=1, which is a problem when
         # switching between build targets because htscodecs object files aren't regenerated.
         # To avoid this, copy the source tree to a build-specific directory and do the build there.
@@ -42,7 +38,7 @@ if(NOT TARGET htslib) # lazy include guard
                 PREFIX ${HTSLIB_BUILD}
                 SOURCE_DIR ${HTSLIB_BUILD}/htslib
                 BUILD_IN_SOURCE 1
-                CONFIGURE_COMMAND autoheader
+                CONFIGURE_COMMAND ${AUTOHEADER_COMMAND}
                 COMMAND ${AUTOCONF_COMMAND}
                 COMMAND ./configure --disable-bz2 --disable-lzma --disable-libcurl --disable-s3 --disable-gcs ${CONFIGURE_FLAGS}
                 BUILD_COMMAND ${MAKE_COMMAND} install prefix=${htslib_PREFIX}

From dd24d0740ff48c08494d3bbd75ba1f772be691e9 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Tue, 12 Dec 2023 11:23:42 +0000
Subject: [PATCH 2/5] [INSTX-3094] Fix precision issues spotted by Xcode

Xcode adds some additional flags on top of the ones we set, though it
seems these are the only issues that get flagged up.
---
 dorado/decode/beam_search.cpp      |  2 +-
 dorado/modbase/modbase_encoder.cpp |  2 +-
 dorado/nn/MetalCRFModel.cpp        | 16 +++++++++-------
 dorado/nn/Runners.cpp              |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/dorado/decode/beam_search.cpp b/dorado/decode/beam_search.cpp
index 7144fa90..b7393c5f 100644
--- a/dorado/decode/beam_search.cpp
+++ b/dorado/decode/beam_search.cpp
@@ -324,7 +324,7 @@ float beam_search(const T* const scores,
             // 8 fold unrolled version has the small upside that both loads
             // can be done with a single ldp instruction.
             const int kUnroll = 8;
-            for (int i = new_elem_count / kUnroll; i; --i) {
+            for (int i = int(new_elem_count) / kUnroll; i; --i) {
                 // True comparison sets lane bits to 0xffffffff, or -1 in two's complement,
                 // which we subtract to increment our counts.
                 float32x4_t scores_x4_a = vld1q_f32(score_ptr);
diff --git a/dorado/modbase/modbase_encoder.cpp b/dorado/modbase/modbase_encoder.cpp
index 3ccc3b7f..6c28020c 100644
--- a/dorado/modbase/modbase_encoder.cpp
+++ b/dorado/modbase/modbase_encoder.cpp
@@ -146,7 +146,7 @@ inline std::vector<int8_t> encode_kmer_generic(const std::vector<int>& seq,
         for (int i = base_st; i < base_en; ++i) {
             for (size_t kmer_pos = 0; kmer_pos < size_t(kmer_len); ++kmer_pos) {
                 auto base = seq[seq_pos + kmer_pos];
-                uint32_t base_oh = (base == -1) ? 0ul : (1ul << (base << 3));
+                uint32_t base_oh = (base == -1) ? uint32_t{} : (uint32_t{1} << (base << 3));
                 // memcpy will be translated to a single 32 bit write.
                 std::memcpy(output_ptr, &base_oh, sizeof(base_oh));
                 output_ptr += 4;
diff --git a/dorado/nn/MetalCRFModel.cpp b/dorado/nn/MetalCRFModel.cpp
index ebc714d2..f09f1ef4 100644
--- a/dorado/nn/MetalCRFModel.cpp
+++ b/dorado/nn/MetalCRFModel.cpp
@@ -495,8 +495,10 @@ struct MetalBlockImpl : Module {
             lstm_label.back()++;
             command_buffer = m_command_queue->commandBuffer();
 
-            const int kResBufSize = dtype_bytes * kernel_simd_groups * 2 * kTileSize * kTileSize;
-            const int kOutBufSize = dtype_bytes * kernel_simd_groups * kTileSize * kTileSize;
+            const int kResBufSize =
+                    static_cast<int>(dtype_bytes * kernel_simd_groups * 2 * kTileSize * kTileSize);
+            const int kOutBufSize =
+                    static_cast<int>(dtype_bytes * kernel_simd_groups * kTileSize * kTileSize);
             const std::vector<int> tg_buffer_lens{kResBufSize, kOutBufSize};
             for (const auto &args_lstm : m_args_lstm) {
                 const std::vector<MTL::Buffer *> buffers{args_lstm.get(), mat_working_mem.get(),
@@ -650,11 +652,11 @@ class MetalCaller {
                      static_cast<size_t>(m_states) * sizeof(int16_t) +  // Posts
                      static_cast<size_t>(m_states) * sizeof(float));    // Back guides.
             spdlog::debug("decode_buffer_size_per_elem {}", decode_buffer_size_per_elem);
-            const int max_batch_size = std::clamp(
+            const int max_batch_size = static_cast<int>(std::clamp(
                     utils::pad_to(physical_memory / (2 * decode_buffer_size_per_elem),
                                   static_cast<size_t>(MTL_CORE_BATCH_SIZE)),
                     static_cast<size_t>(MTL_CORE_BATCH_SIZE),
-                    static_cast<size_t>(MTL_CORE_BATCH_SIZE * get_mtl_device_core_count()));
+                    static_cast<size_t>(MTL_CORE_BATCH_SIZE * get_mtl_device_core_count())));
             spdlog::debug("max_batch_size {}", max_batch_size);
 
             // Subject to the above memory constraint, impose a minimum batch size
@@ -671,8 +673,8 @@ class MetalCaller {
                                               static_cast<float>(kNumSmallerSizes);
             for (int i = 0; i < kNumSmallerSizes; ++i) {
                 const int test_batch_size =
-                        utils::pad_to(min_batch_size + static_cast<size_t>(i * test_size_increment),
-                                      static_cast<size_t>(MTL_CORE_BATCH_SIZE));
+                        utils::pad_to(min_batch_size + static_cast<int>(i * test_size_increment),
+                                      static_cast<int>(MTL_CORE_BATCH_SIZE));
                 test_batch_sizes.insert(test_batch_size);
             }
 
@@ -684,7 +686,7 @@ class MetalCaller {
 
             // Iterate through batch size candidates to find the most efficient one.
             int best_batch_size = -1;
-            int best_us_per_batch_element = std::numeric_limits<int>::max();
+            long long best_us_per_batch_element = std::numeric_limits<long long>::max();
             for (int batch_size : test_batch_sizes) {
                 spdlog::debug("Trying batch size {}", batch_size);
                 set_chunk_batch_size(model_config, state_dict, benchmark_chunk_size, batch_size);
diff --git a/dorado/nn/Runners.cpp b/dorado/nn/Runners.cpp
index c4787fe0..94ed34f6 100644
--- a/dorado/nn/Runners.cpp
+++ b/dorado/nn/Runners.cpp
@@ -61,7 +61,7 @@ std::pair<std::vector<dorado::Runner>, size_t> create_basecall_runners(
 #if DORADO_GPU_BUILD
 #ifdef __APPLE__
     else if (device == "metal") {
-        auto caller = dorado::create_metal_caller(model_config, chunk_size, batch_size);
+        auto caller = dorado::create_metal_caller(model_config, int(chunk_size), int(batch_size));
         for (size_t i = 0; i < num_gpu_runners; i++) {
             runners.push_back(std::make_shared<dorado::MetalModelRunner>(caller));
         }

From b256cbd2e775d22a7ae9293c4449753164030d6f Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Tue, 12 Dec 2023 11:25:22 +0000
Subject: [PATCH 3/5] [INSTX-3094] Remove inline in .cpp file

The inline keyword doesn't mean that the code should be inlined: it
means that it's OK for multiple instances of this symbol to exist and
that the linker can pick an arbitrary one at link time, and I don't
want to think what that means for symbols in an anonymous namespace.
---
 dorado/modbase/modbase_encoder.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dorado/modbase/modbase_encoder.cpp b/dorado/modbase/modbase_encoder.cpp
index 6c28020c..5a14f916 100644
--- a/dorado/modbase/modbase_encoder.cpp
+++ b/dorado/modbase/modbase_encoder.cpp
@@ -129,12 +129,12 @@ int ModBaseEncoder::compute_sample_pos(int base_pos) const {
 namespace {
 
 // Fallback path for non-AVX / kmer lengths not specifically optimised.
-inline std::vector<int8_t> encode_kmer_generic(const std::vector<int>& seq,
-                                               const std::vector<int>& seq_mappings,
-                                               int bases_before,
-                                               int bases_after,
-                                               int context_samples,
-                                               int kmer_len) {
+std::vector<int8_t> encode_kmer_generic(const std::vector<int>& seq,
+                                        const std::vector<int>& seq_mappings,
+                                        int bases_before,
+                                        int bases_after,
+                                        int context_samples,
+                                        int kmer_len) {
     const size_t seq_len = seq.size() - bases_before - bases_after;
     std::vector<int8_t> output(kmer_len * utils::BaseInfo::NUM_BASES * context_samples);
 

From 867a5c3348e25da6a97d16bbbd43645e05052a0f Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Tue, 12 Dec 2023 11:58:58 +0000
Subject: [PATCH 4/5] [INSTX-3094] Update minimap2 to fix build errors

---
 dorado/3rdparty/ont-minimap2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dorado/3rdparty/ont-minimap2 b/dorado/3rdparty/ont-minimap2
index 0ca16202..ffea5e27 160000
--- a/dorado/3rdparty/ont-minimap2
+++ b/dorado/3rdparty/ont-minimap2
@@ -1 +1 @@
-Subproject commit 0ca1620283f30f98e573cbed5e5a6cd061f9a2f2
+Subproject commit ffea5e27a4b46b00e0d100f68fd06032ad9e8caf

From 3b681931ab970d09f45ebba2f3156c65359095a0 Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Tue, 12 Dec 2023 17:54:50 +0000
Subject: [PATCH 5/5] [INSTX-3094] Bring back autoreconf but only for non-Xcode
 generators

autoreconf fails to find aclocal when running under Xcode, so assume
that the old method works (which it seems to on latest brew).
---
 cmake/Htslib.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/Htslib.cmake b/cmake/Htslib.cmake
index f3141130..caabc1a3 100644
--- a/cmake/Htslib.cmake
+++ b/cmake/Htslib.cmake
@@ -18,6 +18,11 @@ if(NOT TARGET htslib) # lazy include guard
         find_program(MAKE_COMMAND make REQUIRED)
         find_program(AUTOCONF_COMMAND autoconf REQUIRED)
         find_program(AUTOHEADER_COMMAND autoheader REQUIRED)
+        execute_process(COMMAND bash -c "${AUTOCONF_COMMAND} -V | sed 's/.* //; q'"
+            OUTPUT_VARIABLE AUTOCONF_VERS)
+        if (AUTOCONF_VERS VERSION_GREATER_EQUAL 2.70 AND NOT CMAKE_GENERATOR STREQUAL "Xcode")
+            set(AUTOCONF_COMMAND autoreconf --install)
+        endif()
 
         # The Htslib build apparently requires BUILD_IN_SOURCE=1, which is a problem when
         # switching between build targets because htscodecs object files aren't regenerated.