Skip to content

Commit

Permalink
Merge branch 'jdaw/fix-resume' into 'master'
Browse files: browse the repository at this point in the history
[resume] Fix resume loading for split reads

Closes DOR-599

See merge request machine-learning/dorado!879
  • Loading branch information
tijyojwad committed Mar 8, 2024
2 parents 3929003 + 1a41f1c commit b31e5c8
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 4 deletions.
13 changes: 11 additions & 2 deletions dorado/read_pipeline/ResumeLoaderNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,16 @@ void ResumeLoaderNode::copy_completed_reads() {
// Iterate over all reads and write to sink.
try {
while (reader.read()) {
- std::string read_id = bam_get_qname(reader.record);
+ std::string read_id;
+ // If a split read is found, use the parent read id to
+ // resume basecalling since that's the read id found in
+ // the raw dataset.
+ auto pid_tag = bam_aux_get(reader.record.get(), "pi");
+ if (pid_tag) {
+ read_id = std::string(bam_aux2Z(pid_tag));
+ } else {
+ read_id = bam_get_qname(reader.record);
+ }
m_processed_read_ids.insert(read_id);
m_sink.push_message(BamPtr(bam_dup1(reader.record.get())));
if (is_safe_to_log && m_processed_read_ids.size() % 100 == 0) {
Expand All @@ -56,7 +65,7 @@ void ResumeLoaderNode::copy_completed_reads() {
// properly formatted records.
}
std::cerr << "\r";
- spdlog::info("> {} reads found in resume file.", m_processed_read_ids.size());
+ spdlog::info("> {} original read ids found in resume file.", m_processed_read_ids.size());

hts_set_log_level(initial_hts_log_level);
}
Expand Down
5 changes: 3 additions & 2 deletions tests/ResumeLoaderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ namespace fs = std::filesystem;
TEST_CASE(TEST_GROUP) {
std::vector<dorado::Message> messages;
MessageSinkToVector sink(100, messages);
- fs::path aligner_test_dir = fs::path(get_data_dir("aligner_test"));
+ fs::path aligner_test_dir = fs::path(get_data_dir("resume_loader"));
auto sam = aligner_test_dir / "basecall.sam";

dorado::ResumeLoaderNode loader(sink, sam.string());
loader.copy_completed_reads();
sink.terminate(dorado::DefaultFlushOptions());
- CHECK(messages.size() == 1);
+ CHECK(messages.size() == 2);
auto read_ids = loader.get_processed_read_ids();
CHECK(read_ids.count("002bd127-db82-436f-b828-28567c3d505d") == 1);
+ CHECK(read_ids.count("ccccdddd-db82-436f-b828-28567c3d505d") == 1);
}
5 changes: 5 additions & 0 deletions tests/data/resume_loader/basecall.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
@HD VN:1.6 SO:unknown
@PG ID:basecaller PN:dorado VN:0.5.0+5fa4de73+dirty CL:dorado basecaller dna_r9.4.1_e8_hac@v3.3 ./tests/data/pod5 -x cpu --modified-bases 5mCG --emit-sam
@RG ID:a16f403b6a3655419511bf356ce3b40b65abfae4_dna_r9.4.1_e8_hac@v3.3 PU:PAK21298 PM:PAPAP48 DT:2022-04-27T16:47:57.305+00:00 PL:ONT DS:basecall_model=dna_r9.4.1_e8_hac@v3.3 modbase_models=dna_r9.4.1_e8_hac@v3.3_5mCG@v0.1 runid=a16f403b6a3655419511bf356ce3b40b65abfae4 LB:no_sample SM:no_sample
002bd127-db82-436f-b828-28567c3d505d 4 * 0 0 * * 0 0 CCACTTTTTGCCCTTTACGTAAGCACTTGGCATGGCCGCCTTAGCCGCCCCTTGGCTCCCAGCACACATAGCCCATACCAGCCCACCCCTTGGCATGTCCTTATCAGCGCCCTTCCCCGCCCACCCCCCCGCCCCACCCACACCGCCCCGGCCGCTCCCCCACCACCCATAGCCCGCGCGCGTAAAAGCCCGCCCCCCCCCCCCACCCACATGCAGCCCTGTCAAAGCCCGCCCCGTAGGCCCGCGCCCATCAGCCCAGCCCCATTGTGCATGTCCCCCGGCCCGCCCCCCACCCTTCCCCCCCCCACCTGTGCCTGGCCGCCGCCTCCCATCCACCCGCCACCACTTCCCCTGTGCCCCACTTCCAGCCCGGCCCAGCCATCCCCGCCCCCCCCCCCACCACCGCCCCCCGGCCAGCCCCCGCCCCCGCCCAGCCCGCCAGGAAAACATCAAGCCCCATGCCCCCACCCGCTGCCCCGCGCCCCCGCCCAGCCACACCCACCCCGCCCCCACCCGCCCCCACCGGCCATGGCGCAGCCCGGCAGCTTCCCCGGCCACGCCGCCCCCTTCGCAGCCCACCCCGCAGCCCGCCTGCCAGCCCATCCCGCTCCCCCGGCGCCCCAGCCGCCCGCCCACCCCGCGCGCACCCGCCCCGCCCGCTCCCCCCCGCCCACCCGCCCTTTCCTTGCATCTGCTCCCCCCACCCCACCCCCGCGCCCATCGCCCCCCGCCCACCCCCGCGCCCACCCCCTCCCTTCCCCTTCCCACCCACCCACCCCCGCCCCGCCCCACTTGCCCCACCCGCCCCCGCCCCCCCCGCGCGCTCCTCCCCACACACGGCTTCTGACCAGCTTCGCTTTAAGCCCTGCAAGCGCTCCCCACCACTTACGTAAAGCCGGTAGCCAGCGCACCCCGCCCGTCATCGGCCATCCAAGCCGGCCGGCCTTCCCCCGACCGCCCCTTAAAGCGCGCCCCGGCACCGTAAGCCCGGCCACCCGCGCCCCCCCCACCCCAGCCCCCCCCCGGCCCGCCGTGGCCTAAGCCCGGCCCCCGGCCCAGCCTTGCCGGCCTTCTCGGCCGCCGGCGCGCCAGCCCAGCCATCCGCCTAGCCAAGCGCCCCGGCCGGCGCTCCAGGCCGGCCCCGGCCGCCGGCCTTAGCCGTTTCACTTCCCACCGGCCAAAGCCCGGCATCATGCTCCTTACCGTAAGCTTCCCCGGCTTAGCCGGCAAGCGAAAACGCCCGCCCGGCGTAGCGCTCCCCGGCCATGACCGCCGGCCGTAGCCCTTTCCGGCCGGCGGCCCCGGCCATCGGCCAAGCGCACCACCGAAATGGCCCCGTGACCCATCTCGGCCGCTCTACGCAAGCCAAAGCCGGCCCACCCCCGGCCACCCGCCCTTGCTTCCTTAGCCAAACACGCATGCCCACCCAGCGCGCGCTCCACCCCCCCCGCCCGCGCCCTTCGCCCGCCACCCCACCAGCGCACCGCCCGCTGCCTTACCGCTTAGCCCCAAAACGCCCTTCCGGCGCGCTTCCTTAGCCACCCGCTTGCAACTAAGCACCGCCTACCGCTTGCCGCCGACGCTACGGCCATCCCACCGCTTCCTTCCGTAACCTGTCGCCTTACCGCCCGCGCCCGGCCTGCAAGCCAACCGGCCGGCTACCACCCCCGTAGCCCAAAGCCTGCTTGCCCCGGCCGCTGTCCCCGCCCGCCCCCACCCCGGCCAAGCATCATCCTCCTTCGCCCTTTCCTTGCGCTCCCCACCGTAAGCACTGTCATCTTCCATCACCCTTAAACCCAATACTTGCCCCGCTCAAGTTTCGCCATCTCATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGAATTTTAAAAAAAAAAAAAA 
((&'&'++)))'))+'&&%$$%$*--,+'''((&%%%$$$&%$$%$$$&()'''&&*''((%%%%'&(((***&''(())*+--0./.-)'&$$%&&&&$%%%&'&%%%%%%&'((,+&&&&'()(),..-**.//--++'''(&&%&&&(((')*&',,,*+++--*'&&&*,)&%%%')&%$&'))''&())-146321-*2.''&&%$%''((*+*,+))&'*,-,)))),+%%&$$&'&''((()&%%%(,--/,+,+*'%%%$$$#%%$%%+-))&%'(((((,0/,%&&%$%%.+0/-,(%%'$$###$)+-//*,+&'&$$,,,('&$'()-*+)*(&&%%%%+))(%$#$''&%%&&$$%%%&&%'&%%$%'%(+,210//4830(+.010,,'))$(252()'&&'*)))('(((()))((*+*''**'&(&%%%&%$##%%%())*,,)$#$%&.1/**+('&%$$**((())++(&%%&'''*%(&&''(%&)+++('+&%%&%%$%,*''(('((*&$##$&%&%''(+**(((&'''*)'&%(&*&&%$%'('(('%$$%%'(((('()(('&''''&&##$$%)*+*,--*(''''&&&'&%'()+01+'''*,*-014*.--./-,&%&&')*'*/034/--+*('&',)--+&&%'*-/0../-++**(''&%%&'&&%&(),,01410852/0/0,**((---*&&&25<=<655../001.++'&'&*&%,+)'$$%%%%%)(((((1/....---..023333456*)))**'''())047.(('(//,**'(+-.//-,*%&'&&'())*+,+)''&&&$%&)*%$$#$#$$(,*''''&###%&'('&$#%&$$$%&)*)))%$$$###'))*-,*%$%%$#%%$$$'('''*))(((%%%&'$$'('&&%%'%%&&'')'''$$$$'*,,)&%&&*,('+--.+/-''%&%%)(%$%&&%&%%'')'())*'''&()())*--,.//)),+(''(0/-()*'&'&',)'('&$$$+)(%$%&%'((*,-*%%$'(&&&'(*(((&&%&&''%$####%'''&&&%%$&$&&'*(''()&%$%'($'%$$$%$&$$&'()&%$&'''(%%%%$$%&'))*104,..&&%%&&&')&%%%%$$%&%$&'(((())***+(&%&)))&%%$%)$$$#$&%$$%$$$####+*)()*))(%%&&%$&)(&$#%(%$$&%%$$&''&&%&'((&%$$&-'&&*-*))(&%%%*+&+*'&&&$$'''('(*,3.''''&,,(%&''(''(*'&&$$()&$#%'./01**'%&&&%$$%(**&&%%%&'($&$%##%%##$0%$$##$$&'(,*'(%%#%%&+'&%&',(''(&&'%)&%%$$###''&%&%$$%$&$%##$$$##$%%')***''&'**&%)**(+++*(%&*(')*)***0.+***+21111)++++'%%%%%%&$$&%')('&&%%$$&(()**((&&&%%%%&&-''',/((+'%'('))+,-4-*+*(()&%%%%%%(%%$$#$$&&(''(%*++--*&%$%$$$$$$(&%$'&&(%##%&&'())))&&&&&&%&'''&(*--('(('('&%%%&'**()***.((''&&''(*(((%%$%%'(&%%#$###'***41*'&&&*'''''%$$''&&%%%)))(''%%$$##%''&'''''')1-*''&%%&&'(%$%%'('+*&%$$%%'%&&%##*+('''(''&''))+**''&&%('''%&%$%%$$$$&')(*/*&%&&%%%%*(%%$$$$$$$&(('**&%%%%$$$$$$$$#$$#%&&&&&&'&&&&%%%%%%%%%&&&&&&&&&%#$&$#$%$"###""#$$%$$$$$##" qs:i:6 du:f:11.7655 ns:i:47062 ts:i:10 mx:i:2 ch:i:2647 st:Z:2022-04-27T19:56:52.425+00:00 rn:i:12088 
fn:Z:single_na24385.pod5 sm:f:55.9973 sd:f:81.9378 sv:Z:med_mad dx:i:0 RG:Z:a16f403b6a3655419511bf356ce3b40b65abfae4_dna_r9.4.1_e8_hac@v3.3
aaaabbbb-db82-436f-b828-28567c3d505d 4 * 0 0 * * 0 0 CCACTTTTTGCCCTTTACGTAAGCACTTGGCATGGCCGCCTTAGCCGCCCCTTGGCTCCCAGCACACATAGCCCATACCAGCCCACCCCTTGGCATGTCCTTATCAGCGCCCTTCCCCGCCCACCCCCCCGCCCCACCCACACCGCCCCGGCCGCTCCCCCACCACCCATAGCCCGCGCGCGTAAAAGCCCGCCCCCCCCCCCCACCCACATGCAGCCCTGTCAAAGCCCGCCCCGTAGGCCCGCGCCCATCAGCCCAGCCCCATTGTGCATGTCCCCCGGCCCGCCCCCCACCCTTCCCCCCCCCACCTGTGCCTGGCCGCCGCCTCCCATCCACCCGCCACCACTTCCCCTGTGCCCCACTTCCAGCCCGGCCCAGCCATCCCCGCCCCCCCCCCCACCACCGCCCCCCGGCCAGCCCCCGCCCCCGCCCAGCCCGCCAGGAAAACATCAAGCCCCATGCCCCCACCCGCTGCCCCGCGCCCCCGCCCAGCCACACCCACCCCGCCCCCACCCGCCCCCACCGGCCATGGCGCAGCCCGGCAGCTTCCCCGGCCACGCCGCCCCCTTCGCAGCCCACCCCGCAGCCCGCCTGCCAGCCCATCCCGCTCCCCCGGCGCCCCAGCCGCCCGCCCACCCCGCGCGCACCCGCCCCGCCCGCTCCCCCCCGCCCACCCGCCCTTTCCTTGCATCTGCTCCCCCCACCCCACCCCCGCGCCCATCGCCCCCCGCCCACCCCCGCGCCCACCCCCTCCCTTCCCCTTCCCACCCACCCACCCCCGCCCCGCCCCACTTGCCCCACCCGCCCCCGCCCCCCCCGCGCGCTCCTCCCCACACACGGCTTCTGACCAGCTTCGCTTTAAGCCCTGCAAGCGCTCCCCACCACTTACGTAAAGCCGGTAGCCAGCGCACCCCGCCCGTCATCGGCCATCCAAGCCGGCCGGCCTTCCCCCGACCGCCCCTTAAAGCGCGCCCCGGCACCGTAAGCCCGGCCACCCGCGCCCCCCCCACCCCAGCCCCCCCCCGGCCCGCCGTGGCCTAAGCCCGGCCCCCGGCCCAGCCTTGCCGGCCTTCTCGGCCGCCGGCGCGCCAGCCCAGCCATCCGCCTAGCCAAGCGCCCCGGCCGGCGCTCCAGGCCGGCCCCGGCCGCCGGCCTTAGCCGTTTCACTTCCCACCGGCCAAAGCCCGGCATCATGCTCCTTACCGTAAGCTTCCCCGGCTTAGCCGGCAAGCGAAAACGCCCGCCCGGCGTAGCGCTCCCCGGCCATGACCGCCGGCCGTAGCCCTTTCCGGCCGGCGGCCCCGGCCATCGGCCAAGCGCACCACCGAAATGGCCCCGTGACCCATCTCGGCCGCTCTACGCAAGCCAAAGCCGGCCCACCCCCGGCCACCCGCCCTTGCTTCCTTAGCCAAACACGCATGCCCACCCAGCGCGCGCTCCACCCCCCCCGCCCGCGCCCTTCGCCCGCCACCCCACCAGCGCACCGCCCGCTGCCTTACCGCTTAGCCCCAAAACGCCCTTCCGGCGCGCTTCCTTAGCCACCCGCTTGCAACTAAGCACCGCCTACCGCTTGCCGCCGACGCTACGGCCATCCCACCGCTTCCTTCCGTAACCTGTCGCCTTACCGCCCGCGCCCGGCCTGCAAGCCAACCGGCCGGCTACCACCCCCGTAGCCCAAAGCCTGCTTGCCCCGGCCGCTGTCCCCGCCCGCCCCCACCCCGGCCAAGCATCATCCTCCTTCGCCCTTTCCTTGCGCTCCCCACCGTAAGCACTGTCATCTTCCATCACCCTTAAACCCAATACTTGCCCCGCTCAAGTTTCGCCATCTCATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGGAATTTTAAAAAAAAAAAAAA 
((&'&'++)))'))+'&&%$$%$*--,+'''((&%%%$$$&%$$%$$$&()'''&&*''((%%%%'&(((***&''(())*+--0./.-)'&$$%&&&&$%%%&'&%%%%%%&'((,+&&&&'()(),..-**.//--++'''(&&%&&&(((')*&',,,*+++--*'&&&*,)&%%%')&%$&'))''&())-146321-*2.''&&%$%''((*+*,+))&'*,-,)))),+%%&$$&'&''((()&%%%(,--/,+,+*'%%%$$$#%%$%%+-))&%'(((((,0/,%&&%$%%.+0/-,(%%'$$###$)+-//*,+&'&$$,,,('&$'()-*+)*(&&%%%%+))(%$#$''&%%&&$$%%%&&%'&%%$%'%(+,210//4830(+.010,,'))$(252()'&&'*)))('(((()))((*+*''**'&(&%%%&%$##%%%())*,,)$#$%&.1/**+('&%$$**((())++(&%%&'''*%(&&''(%&)+++('+&%%&%%$%,*''(('((*&$##$&%&%''(+**(((&'''*)'&%(&*&&%$%'('(('%$$%%'(((('()(('&''''&&##$$%)*+*,--*(''''&&&'&%'()+01+'''*,*-014*.--./-,&%&&')*'*/034/--+*('&',)--+&&%'*-/0../-++**(''&%%&'&&%&(),,01410852/0/0,**((---*&&&25<=<655../001.++'&'&*&%,+)'$$%%%%%)(((((1/....---..023333456*)))**'''())047.(('(//,**'(+-.//-,*%&'&&'())*+,+)''&&&$%&)*%$$#$#$$(,*''''&###%&'('&$#%&$$$%&)*)))%$$$###'))*-,*%$%%$#%%$$$'('''*))(((%%%&'$$'('&&%%'%%&&'')'''$$$$'*,,)&%&&*,('+--.+/-''%&%%)(%$%&&%&%%'')'())*'''&()())*--,.//)),+(''(0/-()*'&'&',)'('&$$$+)(%$%&%'((*,-*%%$'(&&&'(*(((&&%&&''%$####%'''&&&%%$&$&&'*(''()&%$%'($'%$$$%$&$$&'()&%$&'''(%%%%$$%&'))*104,..&&%%&&&')&%%%%$$%&%$&'(((())***+(&%&)))&%%$%)$$$#$&%$$%$$$####+*)()*))(%%&&%$&)(&$#%(%$$&%%$$&''&&%&'((&%$$&-'&&*-*))(&%%%*+&+*'&&&$$'''('(*,3.''''&,,(%&''(''(*'&&$$()&$#%'./01**'%&&&%$$%(**&&%%%&'($&$%##%%##$0%$$##$$&'(,*'(%%#%%&+'&%&',(''(&&'%)&%%$$###''&%&%$$%$&$%##$$$##$%%')***''&'**&%)**(+++*(%&*(')*)***0.+***+21111)++++'%%%%%%&$$&%')('&&%%$$&(()**((&&&%%%%&&-''',/((+'%'('))+,-4-*+*(()&%%%%%%(%%$$#$$&&(''(%*++--*&%$%$$$$$$(&%$'&&(%##%&&'())))&&&&&&%&'''&(*--('(('('&%%%&'**()***.((''&&''(*(((%%$%%'(&%%#$###'***41*'&&&*'''''%$$''&&%%%)))(''%%$$##%''&'''''')1-*''&%%&&'(%$%%'('+*&%$$%%'%&&%##*+('''(''&''))+**''&&%('''%&%$%%$$$$&')(*/*&%&&%%%%*(%%$$$$$$$&(('**&%%%%$$$$$$$$#$$#%&&&&&&'&&&&%%%%%%%%%&&&&&&&&&%#$&$#$%$"###""#$$%$$$$$##" qs:i:6 du:f:11.7655 ns:i:47062 ts:i:10 mx:i:2 ch:i:2647 st:Z:2022-04-27T19:56:52.425+00:00 rn:i:12088 
fn:Z:single_na24385.pod5 sm:f:55.9973 sd:f:81.9378 sv:Z:med_mad dx:i:0 RG:Z:a16f403b6a3655419511bf356ce3b40b65abfae4_dna_r9.4.1_e8_hac@v3.3 pi:Z:ccccdddd-db82-436f-b828-28567c3d505d

0 comments on commit b31e5c8

Please sign in to comment.