Skip to content

Commit

Permalink
Merge branch 'jdaw/fix-window-bat-execution' into 'master'
Browse files Browse the repository at this point in the history
fix bug with kstring free on windows in summary tool

See merge request machine-learning/dorado!740
  • Loading branch information
tijyojwad committed Nov 30, 2023
2 parents 16e5b6a + e259c6a commit b0767a6
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 19 deletions.
2 changes: 1 addition & 1 deletion dorado/alignment/Minimap2Aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,4 @@ void Minimap2Aligner::add_tags(bam1_t* record,
bam_aux_append(record, "rl", 'i', sizeof(buf->rep_len), (uint8_t*)&buf->rep_len);
}

} // namespace dorado::alignment
} // namespace dorado::alignment
18 changes: 8 additions & 10 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ std::string convert_nt16_to_str(uint8_t* bseq, size_t slen) {

namespace dorado::utils {

kstring_t allocate_kstring() {
kstring_t str = {0, 0, NULL};
ks_resize(&str, 1'000'000);
return str;
}

void add_rg_hdr(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::vector<std::string>& barcode_kits,
Expand Down Expand Up @@ -134,7 +140,7 @@ std::map<std::string, std::string> get_read_group_info(sam_hdr_t* header, const
throw std::runtime_error("no read groups in file");
}

kstring_t rg = {0, 0, NULL};
kstring_t rg = allocate_kstring();
std::map<std::string, std::string> read_group_info;

for (int i = 0; i < num_read_groups; ++i) {
Expand Down Expand Up @@ -228,15 +234,7 @@ std::map<std::string, std::string> extract_pg_keys_from_hdr(const std::string fi
if (!header) {
throw std::runtime_error("Could not open header from file: " + filename);
}
kstring_t val = {0, 0, NULL};
// The kstring is pre-allocated with 1MB of space to work around a Windows DLL
// cross-heap segmentation fault. The ks_resize/ks_free functions from htslib
// are inline functions. When htslib is shipped as a DLL, some of these functions
// are inlined into the DLL code through other htslib APIs. But those same functions
// also get inlined into dorado code when invoked directly. As a result, it's possible
// that an htslib APIs resizes a string using the DLL code. But when a ks_free
// is attempted on it from dorado, there's cross-heap behavior and a segfault occurs.
ks_resize(&val, 1'000'000);
kstring_t val = allocate_kstring();
for (auto& k : keys) {
auto ret = sam_hdr_find_tag_id(header.get(), "PG", NULL, NULL, k.c_str(), &val);
if (ret != 0) {
Expand Down
16 changes: 16 additions & 0 deletions dorado/utils/bam_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <vector>

struct sam_hdr_t;
struct kstring_t;

namespace dorado::utils {

Expand Down Expand Up @@ -154,4 +155,19 @@ uint32_t ref_pos_consumed(uint32_t n_cigar, const uint32_t* cigar, uint32_t quer
*/
std::string cigar2str(uint32_t n_cigar, const uint32_t* cigar);

/*
* Allocate kstring_t which is already resized to hold 1 MB of data.
*
* This is done to work around a Windows DLL cross-heap segmentation fault.
* The ks_resize/ks_free functions from htslib
* are inline functions. When htslib is shipped as a DLL, some of these functions
* are inlined into the DLL code through other htslib APIs. But those same functions
* also get inlined into dorado code when invoked directly. As a result, it's possible
* that an htslib APIs resizes a string using the DLL code. But when a ks_free
* is attempted on it from dorado, there's cross-heap behavior and a segfault occurs.
*
* @return kstring_t struct
*/
kstring_t allocate_kstring();

} // namespace dorado::utils
8 changes: 1 addition & 7 deletions tests/BamUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,7 @@ class WrappedKString {
kstring_t m_str = KS_INITIALIZE;

public:
WrappedKString() {
// On Windows |sam_hdr_find_tag_id| lives in a DLL and uses a different heap, but
// |ks_free| is inline so when we call it we crash trying to free unknown memory. To
// work around this we resize the kstring to a big value in our code so no resizing
// happens inside the htslib library.
ks_resize(&m_str, 1'000'000);
}
WrappedKString() { m_str = utils::allocate_kstring(); }
~WrappedKString() { ks_free(&m_str); }

kstring_t *get() { return &m_str; }
Expand Down
12 changes: 11 additions & 1 deletion tests/test_simple_basecaller_execution.bat
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,36 @@ set batch=384

echo dorado basecaller test stage
%dorado_bin% download --model %model%
if %errorlevel% neq 0 exit /b %errorlevel%
%dorado_bin% basecaller %model% tests/data/pod5 -b %batch% --emit-fastq > ref.fq
if %errorlevel% neq 0 exit /b %errorlevel%
%dorado_bin% basecaller %model% tests/data/pod5 -b %batch% --modified-bases 5mCG --emit-moves --reference ref.fq --emit-sam > calls.sam
if %errorlevel% neq 0 exit /b %errorlevel%

echo dorado summary test stage
%dorado_bin% summary calls.sam
if %errorlevel% neq 0 exit /b %errorlevel%

echo dorado aligner test stage
%dorado_bin% aligner ref.fq calls.sam > aligned-calls.bam
if %errorlevel% neq 0 exit /b %errorlevel%

echo dorado duplex basespace test stage
%dorado_bin% duplex basespace tests/data/basespace/pairs.bam --threads 1 --pairs tests/data/basespace/pairs.txt > calls.bam
if %errorlevel% neq 0 exit /b %errorlevel%

echo dorado demux test stage
%dorado_bin% demux tests/data/barcode_demux/double_end_variant/EXP-PBC096_BC83.fastq --threads 1 --kit-name EXP-PBC096 --output-dir ./demux
if %errorlevel% neq 0 exit /b %errorlevel%

echo dorado auto model basecaller test stage
%dorado_bin% basecaller %model_speed% tests/data/pod5/dna_r9.4.1_e8/ -b %batch% --emit-fastq > ref.fq
if %errorlevel% neq 0 exit /b %errorlevel%
%dorado_bin% basecaller %model_speed%,5mCG tests/data/pod5/dna_r9.4.1_e8/ -b %batch% --emit-moves --reference ref.fq --emit-sam > calls.sam
if %errorlevel% neq 0 exit /b %errorlevel%

echo dorado auto summary test stage
%dorado_bin% summary calls.sam
if %errorlevel% neq 0 exit /b %errorlevel%

echo finished
echo finished

0 comments on commit b0767a6

Please sign in to comment.