Merge branch 'INSTX-3094-xcode-build' into 'master'
Get Xcode builds working

See merge request machine-learning/dorado!765
blawrence-ont committed Dec 13, 2023
2 parents 00c00f9 + 3b68193 commit 6c984a0
Showing 6 changed files with 30 additions and 27 deletions.
21 changes: 11 additions & 10 deletions cmake/Htslib.cmake
@@ -11,17 +11,18 @@ if(NOT TARGET htslib) # lazy include guard
         "INTERFACE_INCLUDE_DIRECTORIES" ${HTSLIB_DIR})
     target_link_directories(htslib INTERFACE ${HTSLIB_DIR})
 else()
-    message(STATUS "Building htslib")
-    set(HTSLIB_DIR ${DORADO_3RD_PARTY_SOURCE}/htslib CACHE STRING
-        "Path to htslib repo")
-    set(MAKE_COMMAND make)
-    set(AUTOCONF_COMMAND autoconf)
-    execute_process(COMMAND bash -c "autoconf -V | sed 's/.* //; q'"
-        OUTPUT_VARIABLE AUTOCONF_VERS)
-    if(AUTOCONF_VERS VERSION_GREATER_EQUAL 2.70)
+    message(STATUS "Setting up htslib build")
+    set(HTSLIB_DIR ${DORADO_3RD_PARTY_SOURCE}/htslib CACHE STRING "Path to htslib repo")
+    set(htslib_PREFIX ${CMAKE_BINARY_DIR}/3rdparty/htslib)
+
+    find_program(MAKE_COMMAND make REQUIRED)
+    find_program(AUTOCONF_COMMAND autoconf REQUIRED)
+    find_program(AUTOHEADER_COMMAND autoheader REQUIRED)
+    execute_process(COMMAND bash -c "${AUTOCONF_COMMAND} -V | sed 's/.* //; q'"
+        OUTPUT_VARIABLE AUTOCONF_VERS)
+    if (AUTOCONF_VERS VERSION_GREATER_EQUAL 2.70 AND NOT CMAKE_GENERATOR STREQUAL "Xcode")
         set(AUTOCONF_COMMAND autoreconf --install)
     endif()
-    set(htslib_PREFIX ${CMAKE_BINARY_DIR}/3rdparty/htslib)
 
     # The Htslib build apparently requires BUILD_IN_SOURCE=1, which is a problem when
     # switching between build targets because htscodecs object files aren't regenerated.
@@ -42,7 +43,7 @@ if(NOT TARGET htslib) # lazy include guard
         PREFIX ${HTSLIB_BUILD}
         SOURCE_DIR ${HTSLIB_BUILD}/htslib
         BUILD_IN_SOURCE 1
-        CONFIGURE_COMMAND autoheader
+        CONFIGURE_COMMAND ${AUTOHEADER_COMMAND}
         COMMAND ${AUTOCONF_COMMAND}
         COMMAND ./configure --disable-bz2 --disable-lzma --disable-libcurl --disable-s3 --disable-gcs ${CONFIGURE_FLAGS}
         BUILD_COMMAND ${MAKE_COMMAND} install prefix=${htslib_PREFIX}
2 changes: 1 addition & 1 deletion dorado/3rdparty/ont-minimap2
2 changes: 1 addition & 1 deletion dorado/decode/beam_search.cpp
@@ -324,7 +324,7 @@ float beam_search(const T* const scores,
             // 8 fold unrolled version has the small upside that both loads
             // can be done with a single ldp instruction.
             const int kUnroll = 8;
-            for (int i = new_elem_count / kUnroll; i; --i) {
+            for (int i = int(new_elem_count) / kUnroll; i; --i) {
                 // True comparison sets lane bits to 0xffffffff, or -1 in two's complement,
                 // which we subtract to increment our counts.
                 float32x4_t scores_x4_a = vld1q_f32(score_ptr);
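
The only functional change here is the explicit int() narrowing of new_elem_count before the unrolled loop, presumably to make the 64-bit to 32-bit conversion deliberate under the stricter warnings hit when building through Xcode. As a rough, self-contained sketch (not code from this commit; count_above_threshold and its arguments are hypothetical stand-ins for the beam-search buffers), the same unroll-plus-explicit-narrowing pattern looks like this:

// Illustrative sketch only: 'scores' stands in for the beam-search score buffer
// and kUnroll mirrors the 8-fold unroll above.
#include <cstddef>
#include <vector>

int count_above_threshold(const std::vector<float>& scores, float threshold) {
    constexpr int kUnroll = 8;
    const std::size_t new_elem_count = scores.size();
    int count = 0;
    std::size_t idx = 0;
    // Narrow explicitly with int() so the size_t -> int conversion is deliberate
    // rather than implicit (and warning-prone under stricter toolchains).
    for (int i = int(new_elem_count) / kUnroll; i; --i) {
        for (int j = 0; j < kUnroll; ++j) {
            count += (scores[idx + std::size_t(j)] > threshold) ? 1 : 0;
        }
        idx += kUnroll;
    }
    // Scalar tail for the elements left over after the unrolled loop.
    for (; idx < new_elem_count; ++idx) {
        count += (scores[idx] > threshold) ? 1 : 0;
    }
    return count;
}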
14 changes: 7 additions & 7 deletions dorado/modbase/modbase_encoder.cpp
@@ -129,12 +129,12 @@ int ModBaseEncoder::compute_sample_pos(int base_pos) const {
 namespace {
 
 // Fallback path for non-AVX / kmer lengths not specifically optimised.
-inline std::vector<int8_t> encode_kmer_generic(const std::vector<int>& seq,
-                                               const std::vector<int>& seq_mappings,
-                                               int bases_before,
-                                               int bases_after,
-                                               int context_samples,
-                                               int kmer_len) {
+std::vector<int8_t> encode_kmer_generic(const std::vector<int>& seq,
+                                        const std::vector<int>& seq_mappings,
+                                        int bases_before,
+                                        int bases_after,
+                                        int context_samples,
+                                        int kmer_len) {
     const size_t seq_len = seq.size() - bases_before - bases_after;
     std::vector<int8_t> output(kmer_len * utils::BaseInfo::NUM_BASES * context_samples);
 
@@ -146,7 +146,7 @@ inline std::vector<int8_t> encode_kmer_generic(const std::vector<int>& seq,
     for (int i = base_st; i < base_en; ++i) {
         for (size_t kmer_pos = 0; kmer_pos < size_t(kmer_len); ++kmer_pos) {
             auto base = seq[seq_pos + kmer_pos];
-            uint32_t base_oh = (base == -1) ? 0ul : (1ul << (base << 3));
+            uint32_t base_oh = (base == -1) ? uint32_t{} : (uint32_t{1} << (base << 3));
             // memcpy will be translated to a single 32 bit write.
             std::memcpy(output_ptr, &base_oh, sizeof(base_oh));
             output_ptr += 4;
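
The second hunk swaps the 0ul/1ul literals for uint32_t{} and uint32_t{1} so the one-hot word has an explicit 32-bit type (on LP64 targets "unsigned long" is 64 bits, so the old literals widened the expression before it was truncated into the 32-bit store). Below is a minimal sketch of that encoding, not taken from the commit, assuming a little-endian target as the original "single 32 bit write" comment does; encode_base_one_hot is a hypothetical helper:

// Illustrative sketch only: one base becomes four int8_t values, with byte 'base'
// set to 1 and the others 0; base == -1 (unknown) produces all zeros.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

inline void encode_base_one_hot(int base, int8_t* out) {
    const uint32_t base_oh = (base == -1) ? uint32_t{} : (uint32_t{1} << (base << 3));
    std::memcpy(out, &base_oh, sizeof(base_oh));  // one 32-bit store covers all four bytes
}

int main() {
    std::array<int8_t, 4> buf{};
    encode_base_one_hot(2, buf.data());   // third base of the alphabet
    assert(buf[0] == 0 && buf[1] == 0 && buf[2] == 1 && buf[3] == 0);  // little-endian layout
    encode_base_one_hot(-1, buf.data());
    assert(buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0);
    return 0;
}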
16 changes: 9 additions & 7 deletions dorado/nn/MetalCRFModel.cpp
@@ -495,8 +495,10 @@ struct MetalBlockImpl : Module {
             lstm_label.back()++;
             command_buffer = m_command_queue->commandBuffer();
 
-            const int kResBufSize = dtype_bytes * kernel_simd_groups * 2 * kTileSize * kTileSize;
-            const int kOutBufSize = dtype_bytes * kernel_simd_groups * kTileSize * kTileSize;
+            const int kResBufSize =
+                    static_cast<int>(dtype_bytes * kernel_simd_groups * 2 * kTileSize * kTileSize);
+            const int kOutBufSize =
+                    static_cast<int>(dtype_bytes * kernel_simd_groups * kTileSize * kTileSize);
             const std::vector<int> tg_buffer_lens{kResBufSize, kOutBufSize};
             for (const auto &args_lstm : m_args_lstm) {
                 const std::vector<MTL::Buffer *> buffers{args_lstm.get(), mat_working_mem.get(),
@@ -650,11 +652,11 @@ class MetalCaller {
             static_cast<size_t>(m_states) * sizeof(int16_t) +  // Posts
             static_cast<size_t>(m_states) * sizeof(float));    // Back guides.
     spdlog::debug("decode_buffer_size_per_elem {}", decode_buffer_size_per_elem);
-    const int max_batch_size = std::clamp(
+    const int max_batch_size = static_cast<int>(std::clamp(
             utils::pad_to(physical_memory / (2 * decode_buffer_size_per_elem),
                           static_cast<size_t>(MTL_CORE_BATCH_SIZE)),
             static_cast<size_t>(MTL_CORE_BATCH_SIZE),
-            static_cast<size_t>(MTL_CORE_BATCH_SIZE * get_mtl_device_core_count()));
+            static_cast<size_t>(MTL_CORE_BATCH_SIZE * get_mtl_device_core_count())));
     spdlog::debug("max_batch_size {}", max_batch_size);
 
     // Subject to the above memory constraint, impose a minimum batch size
@@ -671,8 +673,8 @@ class MetalCaller {
                     static_cast<float>(kNumSmallerSizes);
     for (int i = 0; i < kNumSmallerSizes; ++i) {
         const int test_batch_size =
-                utils::pad_to(min_batch_size + static_cast<size_t>(i * test_size_increment),
-                              static_cast<size_t>(MTL_CORE_BATCH_SIZE));
+                utils::pad_to(min_batch_size + static_cast<int>(i * test_size_increment),
+                              static_cast<int>(MTL_CORE_BATCH_SIZE));
         test_batch_sizes.insert(test_batch_size);
     }
 
@@ -684,7 +686,7 @@ class MetalCaller {
 
     // Iterate through batch size candidates to find the most efficient one.
     int best_batch_size = -1;
-    int best_us_per_batch_element = std::numeric_limits<int>::max();
+    long long best_us_per_batch_element = std::numeric_limits<long long>::max();
     for (int batch_size : test_batch_sizes) {
        spdlog::debug("Trying batch size {}", batch_size);
        set_chunk_batch_size(model_config, state_dict, benchmark_chunk_size, batch_size);
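
Most of these changes keep the arithmetic in size_t (or widen the timing accumulator to long long) and narrow to int exactly once with static_cast, which is presumably what the Xcode build's conversion warnings push towards. As a rough sketch of the pad-then-clamp batch-size calculation (not the commit's code; pad_to, compute_max_batch_size, and the parameters are hypothetical stand-ins for utils::pad_to and the MetalCaller members):

// Illustrative sketch only: pad_to and compute_max_batch_size are hypothetical
// stand-ins for utils::pad_to and the MetalCaller batch-size logic.
#include <algorithm>
#include <cstddef>

// Round 'value' up to the next multiple of 'multiple' (assumes multiple > 0).
constexpr std::size_t pad_to(std::size_t value, std::size_t multiple) {
    return ((value + multiple - 1) / multiple) * multiple;
}

int compute_max_batch_size(std::size_t physical_memory,
                           std::size_t decode_bytes_per_elem,
                           std::size_t core_batch_size,
                           std::size_t core_count) {
    // Keep everything in size_t while computing, then narrow once, explicitly.
    const std::size_t padded =
            pad_to(physical_memory / (2 * decode_bytes_per_elem), core_batch_size);
    const std::size_t clamped =
            std::clamp(padded, core_batch_size, core_batch_size * core_count);  // assumes core_count >= 1
    return static_cast<int>(clamped);
}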
2 changes: 1 addition & 1 deletion dorado/nn/Runners.cpp
@@ -61,7 +61,7 @@ std::pair<std::vector<dorado::Runner>, size_t> create_basecall_runners(
 #if DORADO_GPU_BUILD
 #ifdef __APPLE__
     else if (device == "metal") {
-        auto caller = dorado::create_metal_caller(model_config, chunk_size, batch_size);
+        auto caller = dorado::create_metal_caller(model_config, int(chunk_size), int(batch_size));
         for (size_t i = 0; i < num_gpu_runners; i++) {
             runners.push_back(std::make_shared<dorado::MetalModelRunner>(caller));
         }
