From 5f24cb02249c5ac637882161e4e704f3eb2a37ce Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Tue, 10 Oct 2023 17:44:53 +0100 Subject: [PATCH 01/20] [DOR-327] Support for Twist barcodes Add experimental support for Twist barcodes, which have different sequences for the front and rear sequences of the same barcode. Note this is not different flank sequences, but the actual barcode itself. i.e. the barcode is formed of 2 unique 10bp strings one ligating to the front and another ligating to the rear. --- dorado/utils/barcode_kits.cpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 47738ed8..0cfbace2 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -331,6 +331,17 @@ const std::unordered_map kit_info_map = { }, {}, }}, + {"TWIST-PGX", + {"PGx", + true, + true, + "AATGATACGGCGACCACCGAGATCTACAC", + "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", + "ATCTCGTATGCCGTCTTCTGCTTG", + {"A02_01", "B02_02", "C02_03", "D02_04", "E02_05", "F02_06", "G02_07", "H02_08"}, + {"AA02_01", "BB02_02", "CC02_03", "DD02_04", "EE02_05", "FF02_06", "GG02_07", + "HH02_08"}}}, }; const std::unordered_map barcodes = { @@ -560,6 +571,23 @@ const std::unordered_map barcodes = { {"NB94", "GATTGTCCTCAAACTGCCACCTAC"}, {"NB95", "CCTGTCTGGAAGAAGAATGGACTT"}, {"NB96", "CTGAACGGTCATAGAGTCCACCAT"}, + // Twist barcodes + {"A02_01", "CCGACTTAGT"}, + {"AA02_01", "AATAGCCTCA"}, + {"B02_02", "TTCTGCATCG"}, + {"BB02_02", "CTGCAATCGG"}, + {"C02_03", "GGAAGTGCCA"}, + {"CC02_03", "CCTGAGTTAT"}, + {"D02_04", "AGATTCAACC"}, + {"DD02_04", "GACGTCCAGA"}, + {"E02_05", "TTCAGGAGAT"}, + {"EE02_05", "GAATAATCGG"}, + {"F02_06", "AAGGCGTCTG"}, + {"FF02_06", "CGGAGTGTGT"}, + {"G02_07", "ACGCTTGACA"}, + {"GG02_07", "TTACCGACCG"}, + {"H02_08", "CATGAAGTGA"}, + {"HH02_08", "AGTGTTCGCC"}, }; } // namespace From 26b5fdd3ecb86c6691c549038e0c99b22d87c634 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:38:49 +0100 Subject: [PATCH 02/20] add well 04 kits --- dorado/utils/barcode_kits.cpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 0cfbace2..c21fbb69 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -331,7 +331,7 @@ const std::unordered_map kit_info_map = { }, {}, }}, - {"TWIST-PGX", + {"TWIST-PGX02", {"PGx", true, true, @@ -342,6 +342,17 @@ const std::unordered_map kit_info_map = { {"A02_01", "B02_02", "C02_03", "D02_04", "E02_05", "F02_06", "G02_07", "H02_08"}, {"AA02_01", "BB02_02", "CC02_03", "DD02_04", "EE02_05", "FF02_06", "GG02_07", "HH02_08"}}}, + {"TWIST-PGX04", + {"PGx", + true, + true, + "AATGATACGGCGACCACCGAGATCTACAC", + "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", + "ATCTCGTATGCCGTCTTCTGCTTG", + {"A04_01", "B04_02", "C04_03", "D04_04", "E04_05", "F04_06", "G04_07", "H04_08"}, + {"AA04_01", "BB04_02", "CC04_03", "DD04_04", "EE04_05", "FF04_06", "GG04_07", + "HH04_08"}}}, }; const std::unordered_map barcodes = { @@ -588,6 +599,22 @@ const std::unordered_map barcodes = { {"GG02_07", "TTACCGACCG"}, {"H02_08", "CATGAAGTGA"}, {"HH02_08", "AGTGTTCGCC"}, + {"A04_01", "TTACTTACCG"}, + {"AA04_01", "TGTTCCTAGA"}, + {"B04_02", "CTATGCCTTA"}, + {"BB04_02", "CTCTCGAGGT"}, + {"C04_03", "GGAAGGTACG"}, + {"CC04_03", "CTGTACGGTA"}, + {"D04_04", "GAGGAGACGT"}, + {"DD04_04", "CTTATGGCAA"}, + {"E04_05", "TATCCTGACG"}, + {"EE04_05", "TCCGCATAGC"}, + {"F04_06", "TATCCTGACG"}, + {"FF04_06", "GCAAGCACCT"}, + {"G04_07", "GAAGACCGCT"}, + {"GG04_07", "GCCTGTCCTA"}, + {"H04_08", "CAACGTGGAC"}, + {"HH04_08", "ACTGTCTATC"}, }; } // namespace From 6efb973ef5709af4586364e0e2f70d33d3099bd3 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 1 Nov 2023 20:24:30 +0000 Subject: [PATCH 03/20] add all TWIST barcodes to single kit --- dorado/utils/barcode_kits.cpp | 315 ++++++++++++++++++++++++++-------- 1 file changed, 241 insertions(+), 74 deletions(-) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index c21fbb69..8180b8b6 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -309,50 +309,57 @@ const std::unordered_map kit_info_map = { }}, // VMK4 {"VSK-VMK004", + {"VMK4", + false, + false, + RBK4_FRONT, + RBK4_REAR, + "", + "", + {"BC01", "BC02", "BC03", "BC04", "BC05", "BC06", "BC07", "BC08", "BC09", "BC10"}, + {}}}, + {"TWIST-ALL", { - "VMK4", - false, - false, - RBK4_FRONT, - RBK4_REAR, - "", - "", - { - "BC01", - "BC02", - "BC03", - "BC04", - "BC05", - "BC06", - "BC07", - "BC08", - "BC09", - "BC10", - }, - {}, + "PGx", + true, + true, + "AATGATACGGCGACCACCGAGATCTACAC", + "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", + "ATCTCGTATGCCGTCTTCTGCTTG", + {"AA01F_01", "AB01F_02", "AC01F_03", "AD01F_04", "AE01F_05", "AF01F_06", + "AG01F_07", "AH01F_08", "AA02F_09", "AB02F_10", "AC02F_11", "AD02F_12", + "AE02F_13", "AF02F_14", "AG02F_15", "AH02F_16", "AA03F_17", "AB03F_18", + "AC03F_19", "AD03F_20", "AE03F_21", "AF03F_22", "AG03F_23", "AH03F_24", + "AA04F_25", "AB04F_26", "AC04F_27", "AD04F_28", "AE04F_29", "AF04F_30", + "AG04F_31", "AH04F_32", "AA05F_33", "AB05F_34", "AC05F_35", "AD05F_36", + "AE05F_37", "AF05F_38", "AG05F_39", "AH05F_40", "AA06F_41", "AB06F_42", + "AC06F_43", "AD06F_44", "AE06F_45", "AF06F_46", "AG06F_47", "AH06F_48", + "AA07F_49", "AB07F_50", "AC07F_51", "AD07F_52", "AE07F_53", "AF07F_54", + "AG07F_55", "AH07F_56", "AA08F_57", "AB08F_58", "AC08F_59", "AD08F_60", + "AE08F_61", "AF08F_62", "AG08F_63", "AH08F_64", "AA09F_65", "AB09F_66", + "AC09F_67", "AD09F_68", "AE09F_69", "AF09F_70", "AG09F_71", "AH09F_72", + "AA10F_73", "AB10F_74", "AC10F_75", "AD10F_76", "AE10F_77", "AF10F_78", + "AG10F_79", "AH10F_80", "AA11F_81", "AB11F_82", "AC11F_83", "AD11F_84", + "AE11F_85", "AF11F_86", "AG11F_87", "AH11F_88", "AA12F_89", "AB12F_90", + "AC12F_91", "AD12F_92", "AE12F_93", "AF12F_94", "AG12F_95", "AH12F_96"}, + {"AA01R_01", "AB01R_02", "AC01R_03", "AD01R_04", "AE01R_05", "AF01R_06", + "AG01R_07", "AH01R_08", "AA02R_09", "AB02R_10", "AC02R_11", "AD02R_12", + "AE02R_13", "AF02R_14", "AG02R_15", "AH02R_16", "AA03R_17", "AB03R_18", + "AC03R_19", "AD03R_20", "AE03R_21", "AF03R_22", "AG03R_23", "AH03R_24", + "AA04R_25", "AB04R_26", "AC04R_27", "AD04R_28", "AE04R_29", "AF04R_30", + "AG04R_31", "AH04R_32", "AA05R_33", "AB05R_34", "AC05R_35", "AD05R_36", + "AE05R_37", "AF05R_38", "AG05R_39", "AH05R_40", "AA06R_41", "AB06R_42", + "AC06R_43", "AD06R_44", "AE06R_45", "AF06R_46", "AG06R_47", "AH06R_48", + "AA07R_49", "AB07R_50", "AC07R_51", "AD07R_52", "AE07R_53", "AF07R_54", + "AG07R_55", "AH07R_56", "AA08R_57", "AB08R_58", "AC08R_59", "AD08R_60", + "AE08R_61", "AF08R_62", "AG08R_63", "AH08R_64", "AA09R_65", "AB09R_66", + "AC09R_67", "AD09R_68", "AE09R_69", "AF09R_70", "AG09R_71", "AH09R_72", + "AA10R_73", "AB10R_74", "AC10R_75", "AD10R_76", "AE10R_77", "AF10R_78", + "AG10R_79", "AH10R_80", "AA11R_81", "AB11R_82", "AC11R_83", "AD11R_84", + "AE11R_85", "AF11R_86", "AG11R_87", "AH11R_88", "AA12R_89", "AB12R_90", + "AC12R_91", "AD12R_92", "AE12R_93", "AF12R_94", "AG12R_95", "AH12R_96"}, }}, - {"TWIST-PGX02", - {"PGx", - true, - true, - "AATGATACGGCGACCACCGAGATCTACAC", - "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", - "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", - "ATCTCGTATGCCGTCTTCTGCTTG", - {"A02_01", "B02_02", "C02_03", "D02_04", "E02_05", "F02_06", "G02_07", "H02_08"}, - {"AA02_01", "BB02_02", "CC02_03", "DD02_04", "EE02_05", "FF02_06", "GG02_07", - "HH02_08"}}}, - {"TWIST-PGX04", - {"PGx", - true, - true, - "AATGATACGGCGACCACCGAGATCTACAC", - "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", - "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", - "ATCTCGTATGCCGTCTTCTGCTTG", - {"A04_01", "B04_02", "C04_03", "D04_04", "E04_05", "F04_06", "G04_07", "H04_08"}, - {"AA04_01", "BB04_02", "CC04_03", "DD04_04", "EE04_05", "FF04_06", "GG04_07", - "HH04_08"}}}, }; const std::unordered_map barcodes = { @@ -583,38 +590,198 @@ const std::unordered_map barcodes = { {"NB95", "CCTGTCTGGAAGAAGAATGGACTT"}, {"NB96", "CTGAACGGTCATAGAGTCCACCAT"}, // Twist barcodes - {"A02_01", "CCGACTTAGT"}, - {"AA02_01", "AATAGCCTCA"}, - {"B02_02", "TTCTGCATCG"}, - {"BB02_02", "CTGCAATCGG"}, - {"C02_03", "GGAAGTGCCA"}, - {"CC02_03", "CCTGAGTTAT"}, - {"D02_04", "AGATTCAACC"}, - {"DD02_04", "GACGTCCAGA"}, - {"E02_05", "TTCAGGAGAT"}, - {"EE02_05", "GAATAATCGG"}, - {"F02_06", "AAGGCGTCTG"}, - {"FF02_06", "CGGAGTGTGT"}, - {"G02_07", "ACGCTTGACA"}, - {"GG02_07", "TTACCGACCG"}, - {"H02_08", "CATGAAGTGA"}, - {"HH02_08", "AGTGTTCGCC"}, - {"A04_01", "TTACTTACCG"}, - {"AA04_01", "TGTTCCTAGA"}, - {"B04_02", "CTATGCCTTA"}, - {"BB04_02", "CTCTCGAGGT"}, - {"C04_03", "GGAAGGTACG"}, - {"CC04_03", "CTGTACGGTA"}, - {"D04_04", "GAGGAGACGT"}, - {"DD04_04", "CTTATGGCAA"}, - {"E04_05", "TATCCTGACG"}, - {"EE04_05", "TCCGCATAGC"}, - {"F04_06", "TATCCTGACG"}, - {"FF04_06", "GCAAGCACCT"}, - {"G04_07", "GAAGACCGCT"}, - {"GG04_07", "GCCTGTCCTA"}, - {"H04_08", "CAACGTGGAC"}, - {"HH04_08", "ACTGTCTATC"}, + {"AA01F_01", "CCAATATTCG"}, + {"AA01R_01", "TATCTTCAGC"}, + {"AB01F_02", "CGCAGACAAC"}, + {"AB01R_02", "TGCACGGATA"}, + {"AC01F_03", "TCGGAGCAGA"}, + {"AC01R_03", "GGTTGATAGA"}, + {"AD01F_04", "GAGTCCGTAG"}, + {"AD01R_04", "ACTCCTGCCT"}, + {"AE01F_05", "ATGTTCACGT"}, + {"AE01R_05", "CCGATAGTCG"}, + {"AF01F_06", "TTCGATGGTT"}, + {"AF01R_06", "CAAGATCGAA"}, + {"AG01F_07", "TATCCGTGCA"}, + {"AG01R_07", "AGGCTCCTTC"}, + {"AH01F_08", "AAGCGCAGAG"}, + {"AH01R_08", "ATACGGATAG"}, + {"AA02F_09", "CCGACTTAGT"}, + {"AA02R_09", "AATAGCCTCA"}, + {"AB02F_10", "TTCTGCATCG"}, + {"AB02R_10", "CTGCAATCGG"}, + {"AC02F_11", "GGAAGTGCCA"}, + {"AC02R_11", "CCTGAGTTAT"}, + {"AD02F_12", "AGATTCAACC"}, + {"AD02R_12", "GACGTCCAGA"}, + {"AE02F_13", "TTCAGGAGAT"}, + {"AE02R_13", "GAATAATCGG"}, + {"AF02F_14", "AAGGCGTCTG"}, + {"AF02R_14", "CGGAGTGTGT"}, + {"AG02F_15", "ACGCTTGACA"}, + {"AG02R_15", "TTACCGACCG"}, + {"AH02F_16", "CATGAAGTGA"}, + {"AH02R_16", "AGTGTTCGCC"}, + {"AA03F_17", "TTACGACCTG"}, + {"AA03R_17", "CTACGTTCTT"}, + {"AB03F_18", "ATGCAAGCCG"}, + {"AB03R_18", "TCGACACGAA"}, + {"AC03F_19", "CTCCGTATAC"}, + {"AC03R_19", "CCGATAACTT"}, + {"AD03F_20", "GAATCTGGTC"}, + {"AD03R_20", "TTGGACATCG"}, + {"AE03F_21", "CGGTCGGTAA"}, + {"AE03R_21", "AACGTTGAGA"}, + {"AF03F_22", "TCTGCTAATG"}, + {"AF03R_22", "GGCCAGTGAA"}, + {"AG03F_23", "CTCTTATTCG"}, + {"AG03R_23", "ATGTCTCCGG"}, + {"AH03F_24", "CACCTCTAGC"}, + {"AH03R_24", "GAAGGCGTTC"}, + {"AA04F_25", "TTACTTACCG"}, + {"AA04R_25", "TGTTCCTAGA"}, + {"AB04F_26", "CTATGCCTTA"}, + {"AB04R_26", "CTCTCGAGGT"}, + {"AC04F_27", "GGAAGGTACG"}, + {"AC04R_27", "CTGTACGGTA"}, + {"AD04F_28", "GAGGAGACGT"}, + {"AD04R_28", "CTTATGGCAA"}, + {"AE04F_29", "ACGCAAGGCA"}, + {"AE04R_29", "TCCGCATAGC"}, + {"AF04F_30", "TATCCTGACG"}, + {"AF04R_30", "GCAAGCACCT"}, + {"AG04F_31", "GAAGACCGCT"}, + {"AG04R_31", "GCCTGTCCTA"}, + {"AH04F_32", "CAACGTGGAC"}, + {"AH04R_32", "ACTGTCTATC"}, + {"AA05F_33", "TAAGTGCTCG"}, + {"AA05R_33", "CGTCCATGTA"}, + {"AB05F_34", "CACATCGTAG"}, + {"AB05R_34", "CTAACTGCAA"}, + {"AC05F_35", "ACTACCGAGG"}, + {"AC05R_35", "TGCTTGTGGT"}, + {"AD05F_36", "GATGTGTTCT"}, + {"AD05R_36", "TGTAAGCACA"}, + {"AE05F_37", "AAGTGTCGTA"}, + {"AE05R_37", "CTCGTTGCGT"}, + {"AF05F_38", "GGAGAACCAC"}, + {"AF05R_38", "GCTAGAGGTG"}, + {"AG05F_39", "TGTACGAACT"}, + {"AG05R_39", "AAGCGGAGAA"}, + {"AH05F_40", "GGATGAGTGC"}, + {"AH05R_40", "AATGACGCTG"}, + {"AA06F_41", "TAGTAGGACA"}, + {"AA06R_41", "TTGGTACGCG"}, + {"AB06F_42", "ACGCCTCGTT"}, + {"AB06R_42", "TGAAGGTGAA"}, + {"AC06F_43", "CACCGCTGTT"}, + {"AC06R_43", "GTAGTGGCTT"}, + {"AD06F_44", "TCTATAGCGG"}, + {"AD06R_44", "CGTAACAGAA"}, + {"AE06F_45", "CCGATGGACA"}, + {"AE06R_45", "AAGGCCATAA"}, + {"AF06F_46", "TTCAACATGC"}, + {"AF06R_46", "TTCATAGACC"}, + {"AG06F_47", "GGAGTAACGC"}, + {"AG06R_47", "CCAACTCCGA"}, + {"AH06F_48", "AGCCTTAGCG"}, + {"AH06R_48", "CACGAGTATG"}, + {"AA07F_49", "TTACCTCAGT"}, + {"AA07R_49", "CCGCTACCAA"}, + {"AB07F_50", "CAGGCATTGT"}, + {"AB07R_50", "CTGAACCTCC"}, + {"AC07F_51", "GTGTTCCACG"}, + {"AC07R_51", "GGCCTTGTTA"}, + {"AD07F_52", "TTGATCCGCC"}, + {"AD07R_52", "TTAACGCAGA"}, + {"AE07F_53", "GGAGGCTGAT"}, + {"AE07R_53", "AGGTAGTGCG"}, + {"AF07F_54", "AACGTGACAA"}, + {"AF07R_54", "CGTGTAACTT"}, + {"AG07F_55", "CACAAGCTCC"}, + {"AG07R_55", "ACTTGTGACG"}, + {"AH07F_56", "CCGTGTTGTC"}, + {"AH07R_56", "CCATGCGTTG"}, + {"AA08F_57", "TTGAGCCAGC"}, + {"AA08R_57", "CCTTGTAGCG"}, + {"AB08F_58", "GCGTTACAGA"}, + {"AB08R_58", "ACATACGTGA"}, + {"AC08F_59", "TCCAGACATT"}, + {"AC08R_59", "CTTGATATCC"}, + {"AD08F_60", "TCGAACTCTT"}, + {"AD08R_60", "CAGCCGATGT"}, + {"AE08F_61", "ACCTTCTCGG"}, + {"AE08R_61", "TCATGCGCTA"}, + {"AF08F_62", "AGACGCCAAC"}, + {"AF08R_62", "ACTCCGTCCA"}, + {"AG08F_63", "CAACCGTAAT"}, + {"AG08R_63", "GACAGCCTTG"}, + {"AH08F_64", "TTATGCGTTG"}, + {"AH08R_64", "CGGTTATCTG"}, + {"AA09F_65", "CTATGAGAAC"}, + {"AA09R_65", "TACTCCACGG"}, + {"AB09F_66", "AAGTTACACG"}, + {"AB09R_66", "ACTTCCGGCA"}, + {"AC09F_67", "GCAATGTGAG"}, + {"AC09R_67", "GTGAAGCTGC"}, + {"AD09F_68", "CGAAGTCGCA"}, + {"AD09R_68", "TTGCTCTTCT"}, + {"AE09F_69", "CCTGATTCAA"}, + {"AE09R_69", "AACGCACGTA"}, + {"AF09F_70", "TAGAACGTGC"}, + {"AF09R_70", "TTACTGCAGG"}, + {"AG09F_71", "TTCGCAAGGT"}, + {"AG09R_71", "CCAGTTGAGG"}, + {"AH09F_72", "TTAATGCCGA"}, + {"AH09R_72", "TGTGCGTTAA"}, + {"AA10F_73", "AGAACAGAGT"}, + {"AA10R_73", "ACTAGTGCTT"}, + {"AB10F_74", "CCATCTGTTC"}, + {"AB10R_74", "CGTGGAACAC"}, + {"AC10F_75", "TTCGTAGGTG"}, + {"AC10R_75", "ATGGAAGTGG"}, + {"AD10F_76", "GCACGGTACA"}, + {"AD10R_76", "TGAGATCACA"}, + {"AE10F_77", "TGTCAAGAGG"}, + {"AE10R_77", "GTCCTTGGTG"}, + {"AF10F_78", "TCTAAGGTAC"}, + {"AF10R_78", "GAGCGTGGAA"}, + {"AG10F_79", "GAACGGAGAC"}, + {"AG10R_79", "CACACGCTGT"}, + {"AH10F_80", "CGCTACCATC"}, + {"AH10R_80", "TGGTTGTACA"}, + {"AA11F_81", "TTACGGTAAC"}, + {"AA11R_81", "ATCACTCACA"}, + {"AB11F_82", "TTCAGATGGA"}, + {"AB11R_82", "CGGAGGTAGA"}, + {"AC11F_83", "TAGCATCTGT"}, + {"AC11R_83", "GAGTTGACAA"}, + {"AD11F_84", "GGACGAGATC"}, + {"AD11R_84", "GCCGAACTTG"}, + {"AE11F_85", "AGGTTCTGTT"}, + {"AE11R_85", "AGGCCTCACA"}, + {"AF11F_86", "CATACTCGTG"}, + {"AF11R_86", "TCTCTGTTAG"}, + {"AG11F_87", "CCGGATACCA"}, + {"AG11R_87", "TCCGACGATT"}, + {"AH11F_88", "ATGTCCACCG"}, + {"AH11R_88", "AGGCTATGTT"}, + {"AA12F_89", "CACCAAGTGG"}, + {"AA12R_89", "CGTTCTCTTG"}, + {"AB12F_90", "TTGAGTACAC"}, + {"AB12R_90", "TTGTCTATGG"}, + {"AC12F_91", "CGGTTCCGTA"}, + {"AC12R_91", "GATGGATACA"}, + {"AD12F_92", "GGAGGTCCTA"}, + {"AD12R_92", "CACTTAGGCG"}, + {"AE12F_93", "CCTGCTTGGA"}, + {"AE12R_93", "ACACTGGCTA"}, + {"AF12F_94", "TTCACGTCAG"}, + {"AF12R_94", "ATCGCCACTG"}, + {"AG12F_95", "AACATAGCCT"}, + {"AG12R_95", "CTGACGTGAA"}, + {"AH12F_96", "TGACATAGTC"}, + {"AH12R_96", "TCAATCGTCT"}, }; } // namespace From db36e998c117406689f29f5148d13dddf022c4ae Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 13 Dec 2023 22:09:34 +0000 Subject: [PATCH 04/20] only capture numbers from rear of string for normalized bc name --- dorado/utils/barcode_kits.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 8180b8b6..4b6e9d34 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -815,12 +815,15 @@ std::string barcode_kits_list_str() { std::string normalize_barcode_name(const std::string& barcode_name) { std::string digits = ""; - for (const auto& c : barcode_name) { - if (std::isdigit(static_cast(c))) { - digits += c; + for (auto rit = barcode_name.rbegin(); rit != barcode_name.rend(); ++rit) { + if (std::isdigit(static_cast(*rit))) { + digits += *rit; + } else { + break; } } + std::reverse(digits.begin(), digits.end()); return "barcode" + digits; } From 2be9bd11173dc041f60ba50cc9bcee0611f33ae9 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 13 Dec 2023 23:23:27 +0000 Subject: [PATCH 05/20] update TWIST bc sequences --- dorado/utils/barcode_kits.cpp | 415 ++++++++++++++++++---------------- 1 file changed, 219 insertions(+), 196 deletions(-) diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index 4b6e9d34..d24cc63f 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -323,10 +323,10 @@ const std::unordered_map kit_info_map = { "PGx", true, true, - "AATGATACGGCGACCACCGAGATCTACAC", - "ACACTCTTTCCCTACACGACGCTCTTCCGATCT", - "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", - "ATCTCGTATGCCGTCTTCTGCTTG", + "CATACGAGAT", + "GTGACTGGAG", + "AGATCTACAC", + "ACACTCTTTC", {"AA01F_01", "AB01F_02", "AC01F_03", "AD01F_04", "AE01F_05", "AF01F_06", "AG01F_07", "AH01F_08", "AA02F_09", "AB02F_10", "AC02F_11", "AD02F_12", "AE02F_13", "AF02F_14", "AG02F_15", "AH02F_16", "AA03F_17", "AB03F_18", @@ -360,6 +360,29 @@ const std::unordered_map kit_info_map = { "AE11R_85", "AF11R_86", "AG11R_87", "AH11R_88", "AA12R_89", "AB12R_90", "AC12R_91", "AD12R_92", "AE12R_93", "AF12R_94", "AG12R_95", "AH12R_96"}, }}, + {"TWIST-FRONT", + {"PGx", + false, + false, + "CATACGAGAT", + "GTGACTGGAG", + "", + "", + {"AA01F_01", "AB01F_02", "AC01F_03", "AD01F_04", "AE01F_05", "AF01F_06", "AG01F_07", + "AH01F_08", "AA02F_09", "AB02F_10", "AC02F_11", "AD02F_12", "AE02F_13", "AF02F_14", + "AG02F_15", "AH02F_16", "AA03F_17", "AB03F_18", "AC03F_19", "AD03F_20", "AE03F_21", + "AF03F_22", "AG03F_23", "AH03F_24", "AA04F_25", "AB04F_26", "AC04F_27", "AD04F_28", + "AE04F_29", "AF04F_30", "AG04F_31", "AH04F_32", "AA05F_33", "AB05F_34", "AC05F_35", + "AD05F_36", "AE05F_37", "AF05F_38", "AG05F_39", "AH05F_40", "AA06F_41", "AB06F_42", + "AC06F_43", "AD06F_44", "AE06F_45", "AF06F_46", "AG06F_47", "AH06F_48", "AA07F_49", + "AB07F_50", "AC07F_51", "AD07F_52", "AE07F_53", "AF07F_54", "AG07F_55", "AH07F_56", + "AA08F_57", "AB08F_58", "AC08F_59", "AD08F_60", "AE08F_61", "AF08F_62", "AG08F_63", + "AH08F_64", "AA09F_65", "AB09F_66", "AC09F_67", "AD09F_68", "AE09F_69", "AF09F_70", + "AG09F_71", "AH09F_72", "AA10F_73", "AB10F_74", "AC10F_75", "AD10F_76", "AE10F_77", + "AF10F_78", "AG10F_79", "AH10F_80", "AA11F_81", "AB11F_82", "AC11F_83", "AD11F_84", + "AE11F_85", "AF11F_86", "AG11F_87", "AH11F_88", "AA12F_89", "AB12F_90", "AC12F_91", + "AD12F_92", "AE12F_93", "AF12F_94", "AG12F_95", "AH12F_96"}, + {}}}, }; const std::unordered_map barcodes = { @@ -590,198 +613,198 @@ const std::unordered_map barcodes = { {"NB95", "CCTGTCTGGAAGAAGAATGGACTT"}, {"NB96", "CTGAACGGTCATAGAGTCCACCAT"}, // Twist barcodes - {"AA01F_01", "CCAATATTCG"}, - {"AA01R_01", "TATCTTCAGC"}, - {"AB01F_02", "CGCAGACAAC"}, - {"AB01R_02", "TGCACGGATA"}, - {"AC01F_03", "TCGGAGCAGA"}, - {"AC01R_03", "GGTTGATAGA"}, - {"AD01F_04", "GAGTCCGTAG"}, - {"AD01R_04", "ACTCCTGCCT"}, - {"AE01F_05", "ATGTTCACGT"}, - {"AE01R_05", "CCGATAGTCG"}, - {"AF01F_06", "TTCGATGGTT"}, - {"AF01R_06", "CAAGATCGAA"}, - {"AG01F_07", "TATCCGTGCA"}, - {"AG01R_07", "AGGCTCCTTC"}, - {"AH01F_08", "AAGCGCAGAG"}, - {"AH01R_08", "ATACGGATAG"}, - {"AA02F_09", "CCGACTTAGT"}, - {"AA02R_09", "AATAGCCTCA"}, - {"AB02F_10", "TTCTGCATCG"}, - {"AB02R_10", "CTGCAATCGG"}, - {"AC02F_11", "GGAAGTGCCA"}, - {"AC02R_11", "CCTGAGTTAT"}, - {"AD02F_12", "AGATTCAACC"}, - {"AD02R_12", "GACGTCCAGA"}, - {"AE02F_13", "TTCAGGAGAT"}, - {"AE02R_13", "GAATAATCGG"}, - {"AF02F_14", "AAGGCGTCTG"}, - {"AF02R_14", "CGGAGTGTGT"}, - {"AG02F_15", "ACGCTTGACA"}, - {"AG02R_15", "TTACCGACCG"}, - {"AH02F_16", "CATGAAGTGA"}, - {"AH02R_16", "AGTGTTCGCC"}, - {"AA03F_17", "TTACGACCTG"}, - {"AA03R_17", "CTACGTTCTT"}, - {"AB03F_18", "ATGCAAGCCG"}, - {"AB03R_18", "TCGACACGAA"}, - {"AC03F_19", "CTCCGTATAC"}, - {"AC03R_19", "CCGATAACTT"}, - {"AD03F_20", "GAATCTGGTC"}, - {"AD03R_20", "TTGGACATCG"}, - {"AE03F_21", "CGGTCGGTAA"}, - {"AE03R_21", "AACGTTGAGA"}, - {"AF03F_22", "TCTGCTAATG"}, - {"AF03R_22", "GGCCAGTGAA"}, - {"AG03F_23", "CTCTTATTCG"}, - {"AG03R_23", "ATGTCTCCGG"}, - {"AH03F_24", "CACCTCTAGC"}, - {"AH03R_24", "GAAGGCGTTC"}, - {"AA04F_25", "TTACTTACCG"}, - {"AA04R_25", "TGTTCCTAGA"}, - {"AB04F_26", "CTATGCCTTA"}, - {"AB04R_26", "CTCTCGAGGT"}, - {"AC04F_27", "GGAAGGTACG"}, - {"AC04R_27", "CTGTACGGTA"}, - {"AD04F_28", "GAGGAGACGT"}, - {"AD04R_28", "CTTATGGCAA"}, - {"AE04F_29", "ACGCAAGGCA"}, - {"AE04R_29", "TCCGCATAGC"}, - {"AF04F_30", "TATCCTGACG"}, - {"AF04R_30", "GCAAGCACCT"}, - {"AG04F_31", "GAAGACCGCT"}, - {"AG04R_31", "GCCTGTCCTA"}, - {"AH04F_32", "CAACGTGGAC"}, - {"AH04R_32", "ACTGTCTATC"}, - {"AA05F_33", "TAAGTGCTCG"}, - {"AA05R_33", "CGTCCATGTA"}, - {"AB05F_34", "CACATCGTAG"}, - {"AB05R_34", "CTAACTGCAA"}, - {"AC05F_35", "ACTACCGAGG"}, - {"AC05R_35", "TGCTTGTGGT"}, - {"AD05F_36", "GATGTGTTCT"}, - {"AD05R_36", "TGTAAGCACA"}, - {"AE05F_37", "AAGTGTCGTA"}, - {"AE05R_37", "CTCGTTGCGT"}, - {"AF05F_38", "GGAGAACCAC"}, - {"AF05R_38", "GCTAGAGGTG"}, - {"AG05F_39", "TGTACGAACT"}, - {"AG05R_39", "AAGCGGAGAA"}, - {"AH05F_40", "GGATGAGTGC"}, - {"AH05R_40", "AATGACGCTG"}, - {"AA06F_41", "TAGTAGGACA"}, - {"AA06R_41", "TTGGTACGCG"}, - {"AB06F_42", "ACGCCTCGTT"}, - {"AB06R_42", "TGAAGGTGAA"}, - {"AC06F_43", "CACCGCTGTT"}, - {"AC06R_43", "GTAGTGGCTT"}, - {"AD06F_44", "TCTATAGCGG"}, - {"AD06R_44", "CGTAACAGAA"}, - {"AE06F_45", "CCGATGGACA"}, - {"AE06R_45", "AAGGCCATAA"}, - {"AF06F_46", "TTCAACATGC"}, - {"AF06R_46", "TTCATAGACC"}, - {"AG06F_47", "GGAGTAACGC"}, - {"AG06R_47", "CCAACTCCGA"}, - {"AH06F_48", "AGCCTTAGCG"}, - {"AH06R_48", "CACGAGTATG"}, - {"AA07F_49", "TTACCTCAGT"}, - {"AA07R_49", "CCGCTACCAA"}, - {"AB07F_50", "CAGGCATTGT"}, - {"AB07R_50", "CTGAACCTCC"}, - {"AC07F_51", "GTGTTCCACG"}, - {"AC07R_51", "GGCCTTGTTA"}, - {"AD07F_52", "TTGATCCGCC"}, - {"AD07R_52", "TTAACGCAGA"}, - {"AE07F_53", "GGAGGCTGAT"}, - {"AE07R_53", "AGGTAGTGCG"}, - {"AF07F_54", "AACGTGACAA"}, - {"AF07R_54", "CGTGTAACTT"}, - {"AG07F_55", "CACAAGCTCC"}, - {"AG07R_55", "ACTTGTGACG"}, - {"AH07F_56", "CCGTGTTGTC"}, - {"AH07R_56", "CCATGCGTTG"}, - {"AA08F_57", "TTGAGCCAGC"}, - {"AA08R_57", "CCTTGTAGCG"}, - {"AB08F_58", "GCGTTACAGA"}, - {"AB08R_58", "ACATACGTGA"}, - {"AC08F_59", "TCCAGACATT"}, - {"AC08R_59", "CTTGATATCC"}, - {"AD08F_60", "TCGAACTCTT"}, - {"AD08R_60", "CAGCCGATGT"}, - {"AE08F_61", "ACCTTCTCGG"}, - {"AE08R_61", "TCATGCGCTA"}, - {"AF08F_62", "AGACGCCAAC"}, - {"AF08R_62", "ACTCCGTCCA"}, - {"AG08F_63", "CAACCGTAAT"}, - {"AG08R_63", "GACAGCCTTG"}, - {"AH08F_64", "TTATGCGTTG"}, - {"AH08R_64", "CGGTTATCTG"}, - {"AA09F_65", "CTATGAGAAC"}, - {"AA09R_65", "TACTCCACGG"}, - {"AB09F_66", "AAGTTACACG"}, - {"AB09R_66", "ACTTCCGGCA"}, - {"AC09F_67", "GCAATGTGAG"}, - {"AC09R_67", "GTGAAGCTGC"}, - {"AD09F_68", "CGAAGTCGCA"}, - {"AD09R_68", "TTGCTCTTCT"}, - {"AE09F_69", "CCTGATTCAA"}, - {"AE09R_69", "AACGCACGTA"}, - {"AF09F_70", "TAGAACGTGC"}, - {"AF09R_70", "TTACTGCAGG"}, - {"AG09F_71", "TTCGCAAGGT"}, - {"AG09R_71", "CCAGTTGAGG"}, - {"AH09F_72", "TTAATGCCGA"}, - {"AH09R_72", "TGTGCGTTAA"}, - {"AA10F_73", "AGAACAGAGT"}, - {"AA10R_73", "ACTAGTGCTT"}, - {"AB10F_74", "CCATCTGTTC"}, - {"AB10R_74", "CGTGGAACAC"}, - {"AC10F_75", "TTCGTAGGTG"}, - {"AC10R_75", "ATGGAAGTGG"}, - {"AD10F_76", "GCACGGTACA"}, - {"AD10R_76", "TGAGATCACA"}, - {"AE10F_77", "TGTCAAGAGG"}, - {"AE10R_77", "GTCCTTGGTG"}, - {"AF10F_78", "TCTAAGGTAC"}, - {"AF10R_78", "GAGCGTGGAA"}, - {"AG10F_79", "GAACGGAGAC"}, - {"AG10R_79", "CACACGCTGT"}, - {"AH10F_80", "CGCTACCATC"}, - {"AH10R_80", "TGGTTGTACA"}, - {"AA11F_81", "TTACGGTAAC"}, - {"AA11R_81", "ATCACTCACA"}, - {"AB11F_82", "TTCAGATGGA"}, - {"AB11R_82", "CGGAGGTAGA"}, - {"AC11F_83", "TAGCATCTGT"}, - {"AC11R_83", "GAGTTGACAA"}, - {"AD11F_84", "GGACGAGATC"}, - {"AD11R_84", "GCCGAACTTG"}, - {"AE11F_85", "AGGTTCTGTT"}, - {"AE11R_85", "AGGCCTCACA"}, - {"AF11F_86", "CATACTCGTG"}, - {"AF11R_86", "TCTCTGTTAG"}, - {"AG11F_87", "CCGGATACCA"}, - {"AG11R_87", "TCCGACGATT"}, - {"AH11F_88", "ATGTCCACCG"}, - {"AH11R_88", "AGGCTATGTT"}, - {"AA12F_89", "CACCAAGTGG"}, - {"AA12R_89", "CGTTCTCTTG"}, - {"AB12F_90", "TTGAGTACAC"}, - {"AB12R_90", "TTGTCTATGG"}, - {"AC12F_91", "CGGTTCCGTA"}, - {"AC12R_91", "GATGGATACA"}, - {"AD12F_92", "GGAGGTCCTA"}, - {"AD12R_92", "CACTTAGGCG"}, - {"AE12F_93", "CCTGCTTGGA"}, - {"AE12R_93", "ACACTGGCTA"}, - {"AF12F_94", "TTCACGTCAG"}, - {"AF12R_94", "ATCGCCACTG"}, - {"AG12F_95", "AACATAGCCT"}, - {"AG12R_95", "CTGACGTGAA"}, - {"AH12F_96", "TGACATAGTC"}, - {"AH12R_96", "TCAATCGTCT"}, + {"AA01F_01", "GCTGAAGATA"}, + {"AA01R_01", "CCAATATTCG"}, + {"AB01F_02", "TATCCGTGCA"}, + {"AB01R_02", "CGCAGACAAC"}, + {"AC01F_03", "TCTATCAACC"}, + {"AC01R_03", "TCGGAGCAGA"}, + {"AD01F_04", "AGGCAGGAGT"}, + {"AD01R_04", "GAGTCCGTAG"}, + {"AE01F_05", "CGACTATCGG"}, + {"AE01R_05", "ATGTTCACGT"}, + {"AF01F_06", "TTCGATCTTG"}, + {"AF01R_06", "TTCGATGGTT"}, + {"AG01F_07", "GAAGGAGCCT"}, + {"AG01R_07", "TATCCGTGCA"}, + {"AH01F_08", "CTATCCGTAT"}, + {"AH01R_08", "AAGCGCAGAG"}, + {"AA02F_09", "TGAGGCTATT"}, + {"AA02R_09", "CCGACTTAGT"}, + {"AB02F_10", "CCGATTGCAG"}, + {"AB02R_10", "TTCTGCATCG"}, + {"AC02F_11", "ATAACTCAGG"}, + {"AC02R_11", "GGAAGTGCCA"}, + {"AD02F_12", "TCTGGACGTC"}, + {"AD02R_12", "AGATTCAACC"}, + {"AE02F_13", "CCGATTATTC"}, + {"AE02R_13", "TTCAGGAGAT"}, + {"AF02F_14", "CGGAGTGTGT"}, + {"AF02R_14", "AAGGCGTCTG"}, + {"AG02F_15", "CGGTCGGTAA"}, + {"AG02R_15", "ACGCTTGACA"}, + {"AH02F_16", "GGCGAACACT"}, + {"AH02R_16", "CATGAAGTGA"}, + {"AA03F_17", "AAGAACGTAG"}, + {"AA03R_17", "TTACGACCTG"}, + {"AB03F_18", "TTCGTGTCGA"}, + {"AB03R_18", "ATGCAAGCCG"}, + {"AC03F_19", "AAGTTATCGG"}, + {"AC03R_19", "CTCCGTATAC"}, + {"AD03F_20", "CGATGTCCAA"}, + {"AD03R_20", "GAATCTGGTC"}, + {"AE03F_21", "TCTCAACGTT"}, + {"AE03R_21", "CGGTCGGTAA"}, + {"AF03F_22", "TTCACTGGCC"}, + {"AF03R_22", "TCTGCTAATG"}, + {"AG03F_23", "CCGGAGACAT"}, + {"AG03R_23", "CTCTTATTCG"}, + {"AH03F_24", "GAACGCCTTC"}, + {"AH03R_24", "CACCTCTAGC"}, + {"AA04F_25", "TCTAGGAACA"}, + {"AA04R_25", "TTACTTACCG"}, + {"AB04F_26", "ACCTCGAGAG"}, + {"AB04R_26", "CTATGCCTTA"}, + {"AC04F_27", "TACCGTACAG"}, + {"AC04R_27", "GGAAGGTACG"}, + {"AD04F_28", "TTGCCATAAG"}, + {"AD04R_28", "GAGGAGACGT"}, + {"AE04F_29", "GCTATGCGGA"}, + {"AE04R_29", "ACGCAAGGCA"}, + {"AF04F_30", "AGGTGCTTGC"}, + {"AF04R_30", "TATCCTGACG"}, + {"AG04F_31", "TAGGACAGGC"}, + {"AG04R_31", "GAAGACCGCT"}, + {"AH04F_32", "GATAGACAGT"}, + {"AH04R_32", "CAACGTGGAC"}, + {"AA05F_33", "TACATGGACG"}, + {"AA05R_33", "TAAGTGCTCG"}, + {"AB05F_34", "TTGCAGTTAG"}, + {"AB05R_34", "CACATCGTAG"}, + {"AC05F_35", "ACCACAAGCA"}, + {"AC05R_35", "ACTACCGAGG"}, + {"AD05F_36", "TGTGCTTACA"}, + {"AD05R_36", "GATGTGTTCT"}, + {"AE05F_37", "ACGCAACGAG"}, + {"AE05R_37", "AAGTGTCGTA"}, + {"AF05F_38", "CACCTCTAGC"}, + {"AF05R_38", "GGAGAACCAC"}, + {"AG05F_39", "TTCTCCGCTT"}, + {"AG05R_39", "TGTACGAACT"}, + {"AH05F_40", "CAGCGTCATT"}, + {"AH05R_40", "GGATGAGTGC"}, + {"AA06F_41", "CGCGTACCAA"}, + {"AA06R_41", "TAGTAGGACA"}, + {"AB06F_42", "TTCACCTTCA"}, + {"AB06R_42", "ACGCCTCGTT"}, + {"AC06F_43", "AAGCCACTAC"}, + {"AC06R_43", "CACCGCTGTT"}, + {"AD06F_44", "TTCTGTTACG"}, + {"AD06R_44", "TCTATAGCGG"}, + {"AE06F_45", "TTATGGCCTT"}, + {"AE06R_45", "CCGATGGACA"}, + {"AF06F_46", "GGTCTATGAA"}, + {"AF06R_46", "TTCAACATGC"}, + {"AG06F_47", "TCGGAGTTGG"}, + {"AG06R_47", "GGAGTAACGC"}, + {"AH06F_48", "CATACTCGTG"}, + {"AH06R_48", "AGCCTTAGCG"}, + {"AA07F_49", "TTGGTAGCGG"}, + {"AA07R_49", "TTACCTCAGT"}, + {"AB07F_50", "GGAGGTTCAG"}, + {"AB07R_50", "CAGGCATTGT"}, + {"AC07F_51", "TAACAAGGCC"}, + {"AC07R_51", "GTGTTCCACG"}, + {"AD07F_52", "TCTGCGTTAA"}, + {"AD07R_52", "TTGATCCGCC"}, + {"AE07F_53", "CGCACTACCT"}, + {"AE07R_53", "GGAGGCTGAT"}, + {"AF07F_54", "AAGTTACACG"}, + {"AF07R_54", "AACGTGACAA"}, + {"AG07F_55", "CGTCACAAGT"}, + {"AG07R_55", "CACAAGCTCC"}, + {"AH07F_56", "CAACGCATGG"}, + {"AH07R_56", "CCGTGTTGTC"}, + {"AA08F_57", "CGCTACAAGG"}, + {"AA08R_57", "TTGAGCCAGC"}, + {"AB08F_58", "TCACGTATGT"}, + {"AB08R_58", "GCGTTACAGA"}, + {"AC08F_59", "GGATATCAAG"}, + {"AC08R_59", "TCCAGACATT"}, + {"AD08F_60", "ACATCGGCTG"}, + {"AD08R_60", "TCGAACTCTT"}, + {"AE08F_61", "TAGCGCATGA"}, + {"AE08R_61", "ACCTTCTCGG"}, + {"AF08F_62", "TGGACGGAGT"}, + {"AF08R_62", "AGACGCCAAC"}, + {"AG08F_63", "CAAGGCTGTC"}, + {"AG08R_63", "CAACCGTAAT"}, + {"AH08F_64", "CAGATAACCG"}, + {"AH08R_64", "TTATGCGTTG"}, + {"AA09F_65", "CCGTGGAGTA"}, + {"AA09R_65", "CTATGAGAAC"}, + {"AB09F_66", "TGCCGGAAGT"}, + {"AB09R_66", "AAGTTACACG"}, + {"AC09F_67", "GCAGCTTCAC"}, + {"AC09R_67", "GCAATGTGAG"}, + {"AD09F_68", "AGAAGAGCAA"}, + {"AD09R_68", "CGAAGTCGCA"}, + {"AE09F_69", "TACGTGCGTT"}, + {"AE09R_69", "CCTGATTCAA"}, + {"AF09F_70", "CCTGCAGTAA"}, + {"AF09R_70", "TAGAACGTGC"}, + {"AG09F_71", "CCTCAACTGG"}, + {"AG09R_71", "TTCGCAAGGT"}, + {"AH09F_72", "TTAACGCACA"}, + {"AH09R_72", "TTAATGCCGA"}, + {"AA10F_73", "AAGCACTAGT"}, + {"AA10R_73", "AGAACAGAGT"}, + {"AB10F_74", "GTGTTCCACG"}, + {"AB10R_74", "CCATCTGTTC"}, + {"AC10F_75", "CCACTTCCAT"}, + {"AC10R_75", "TTCGTAGGTG"}, + {"AD10F_76", "TGTGATCTCA"}, + {"AD10R_76", "GCACGGTACA"}, + {"AE10F_77", "CACCAAGGAC"}, + {"AE10R_77", "TGTCAAGAGG"}, + {"AF10F_78", "TTCCACGCTC"}, + {"AF10R_78", "TCTAAGGTAC"}, + {"AG10F_79", "ACAGCGTGTG"}, + {"AG10R_79", "GAACGGAGAC"}, + {"AH10F_80", "TGTACAACCA"}, + {"AH10R_80", "GAACGGAGAC"}, + {"AA11F_81", "TGTGAGTGAT"}, + {"AA11R_81", "TTACGGTAAC"}, + {"AB11F_82", "TCTACCTCCG"}, + {"AB11R_82", "TTCAGATGGA"}, + {"AC11F_83", "TTGTCAACTC"}, + {"AC11R_83", "TAGCATCTGT"}, + {"AD11F_84", "CAAGTTCGGC"}, + {"AD11R_84", "GGACGAGATC"}, + {"AE11F_85", "TGTGAGGCCT"}, + {"AE11R_85", "AGGTTCTGTT"}, + {"AF11F_86", "CTAACAGAGA"}, + {"AF11R_86", "CATACTCGTG"}, + {"AG11F_87", "AATCGTCGGA"}, + {"AG11R_87", "CCGGATACCA"}, + {"AH11F_88", "AACATAGCCT"}, + {"AH11R_88", "ATGTCCACCG"}, + {"AA12F_89", "CAAGAGAACG"}, + {"AA12R_89", "CACCAAGTGG"}, + {"AB12F_90", "CCATAGACAA"}, + {"AB12R_90", "TTGAGTACAC"}, + {"AC12F_91", "TGTATCCATC"}, + {"AC12R_91", "CGGTTCCGTA"}, + {"AD12F_92", "CGCCTAAGTG"}, + {"AD12R_92", "GGAGGTCCTA"}, + {"AE12F_93", "TAGCCAGTGT"}, + {"AE12R_93", "CCTGCTTGGA"}, + {"AF12F_94", "CAGTGGCGAT"}, + {"AF12R_94", "TTCACGTCAG"}, + {"AG12F_95", "TTCACGTCAG"}, + {"AG12R_95", "AACATAGCCT"}, + {"AH12F_96", "AGACGATTGA"}, + {"AH12R_96", "TGACATAGTC"}, }; } // namespace From e33d5c76c8e395de1c9028aecb7801af301a70c5 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Thu, 21 Dec 2023 18:52:47 +0000 Subject: [PATCH 06/20] support only parsing scores from custom kit file --- dorado/demux/BarcodeClassifier.cpp | 16 +++++----- dorado/demux/BarcodeClassifierSelector.cpp | 20 ++++++++----- dorado/demux/parse_custom_kit.cpp | 1 + .../read_pipeline/BarcodeClassifierNode.cpp | 29 ++++++++++++++++++- dorado/read_pipeline/BarcodeClassifierNode.h | 3 ++ dorado/utils/types.cpp | 5 ---- 6 files changed, 53 insertions(+), 21 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index ef453d81..11ae4f31 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -119,7 +119,9 @@ std::unordered_map process_custom_ki std::unordered_map kit_map; if (custom_kit) { auto [kit_name, kit_info] = demux::parse_custom_arrangement(*custom_kit); - kit_map[kit_name] = kit_info; + if (!kit_name.empty()) { + kit_map[kit_name] = kit_info; + } } return kit_map; } @@ -208,17 +210,15 @@ std::vector BarcodeClassifier::generate_ const std::vector& kit_names) { std::vector candidates_list; - const auto& kit_info_map = barcode_kits::get_kit_infos(); - std::vector final_kit_names; if (!m_custom_kit.empty()) { for (auto& [kit_name, _] : m_custom_kit) { final_kit_names.push_back(kit_name); } } else if (kit_names.empty()) { - for (auto& [kit_name, _] : kit_info_map) { - final_kit_names.push_back(kit_name); - } + throw std::runtime_error( + "Either custom kit must include kit arrangement or a kit name needs to be passed " + "in."); } else { final_kit_names = kit_names; } @@ -613,11 +613,11 @@ BarcodeScoreResult BarcodeClassifier::find_best_barcode( auto best_bottom_score = std::max_element( scores.begin(), scores.end(), [](const auto& l, const auto& r) { return l.bottom_score < r.bottom_score; }); - spdlog::trace("Check double ends: top bc {}, bottom bc {}", best_top_score->barcode_name, - best_bottom_score->barcode_name); if ((best_top_score->score > m_scoring_params.min_soft_barcode_threshold) && (best_bottom_score->score > m_scoring_params.min_soft_barcode_threshold) && (best_top_score->barcode_name != best_bottom_score->barcode_name)) { + spdlog::trace("Two ends confidently predict different BCs: top bc {}, bottom bc {}", + best_top_score->barcode_name, best_bottom_score->barcode_name); return UNCLASSIFIED; } } diff --git a/dorado/demux/BarcodeClassifierSelector.cpp b/dorado/demux/BarcodeClassifierSelector.cpp index 137e2555..65df894c 100644 --- a/dorado/demux/BarcodeClassifierSelector.cpp +++ b/dorado/demux/BarcodeClassifierSelector.cpp @@ -11,15 +11,21 @@ namespace dorado::demux { std::shared_ptr BarcodeClassifierSelector::get_barcoder( const BarcodingInfo& barcode_kit_info) { - assert(!barcode_kit_info.kit_name.empty()); + if (barcode_kit_info.kit_name.empty() && !barcode_kit_info.custom_kit.has_value()) { + throw std::runtime_error("Either kit name or custom kit file must be specified!"); + } + const auto kit_id = barcode_kit_info.kit_name.empty() ? *barcode_kit_info.custom_kit + : barcode_kit_info.kit_name; std::lock_guard lock(m_mutex); - if (!m_barcoder_lut.count(barcode_kit_info.kit_name)) { - m_barcoder_lut.emplace(barcode_kit_info.kit_name, - std::make_shared( - std::vector{barcode_kit_info.kit_name}, - barcode_kit_info.custom_kit, barcode_kit_info.custom_seqs)); + if (!m_barcoder_lut.count(kit_id)) { + m_barcoder_lut.emplace( + kit_id, std::make_shared( + barcode_kit_info.kit_name.empty() + ? std::vector{} + : std::vector{barcode_kit_info.kit_name}, + barcode_kit_info.custom_kit, barcode_kit_info.custom_seqs)); } - return m_barcoder_lut.at(barcode_kit_info.kit_name); + return m_barcoder_lut.at(kit_id); } } // namespace dorado::demux diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index e1544342..13740a24 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -41,6 +41,7 @@ std::pair parse_custom_arrangement( const toml::value config_toml = toml::parse(arrangement_file); if (!config_toml.contains("arrangement")) { + return {"", barcode_kits::KitInfo{}}; throw std::runtime_error( "Custom barcode arrangement file must have [arrangement] section."); } diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 798cb291..4e26e002 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -86,7 +86,33 @@ void BarcodeClassifierNode::restart() { start_threads(); } -BarcodeClassifierNode::~BarcodeClassifierNode() { terminate_impl(); } +BarcodeClassifierNode::~BarcodeClassifierNode() { + terminate_impl(); + static bool done = false; + if (!done) { + done = !done; + spdlog::info("Barcode distribution :"); + size_t unclassified = 0; + size_t fp = 0; + size_t total = 0; + for (const auto& [k, v] : bc_count) { + std::unordered_set tps = {"TWIST-ALL_barcode33", "TWIST-ALL_barcode34", + "TWIST-ALL_barcode35", "TWIST-ALL_barcode36", + "TWIST-ALL_barcode37", "TWIST-ALL_barcode38", + "TWIST-ALL_barcode39", "TWIST-ALL_barcode40", + "TWIST-ALL_barcode41", "TWIST-ALL_barcode42"}; + spdlog::info("{} : {}", k, v); + total += v; + if (k == "unclassified") { + unclassified += v; + } else if (tps.find(k) == tps.end()) { + fp += v; + } + } + spdlog::info("Unclassified rate {}", float(unclassified) / total); + spdlog::info("False Positive rate {}", float(fp) / total); + } +} void BarcodeClassifierNode::worker_thread() { Message message; @@ -134,6 +160,7 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, m_default_barcoding_info->allowed_barcodes); auto bc = generate_barcode_string(bc_res); + bc_count[bc]++; bam_aux_append(irecord, "BC", 'Z', int(bc.length() + 1), (uint8_t*)bc.c_str()); m_num_records++; diff --git a/dorado/read_pipeline/BarcodeClassifierNode.h b/dorado/read_pipeline/BarcodeClassifierNode.h index 68d1b7dc..5e69a6bc 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.h +++ b/dorado/read_pipeline/BarcodeClassifierNode.h @@ -5,6 +5,7 @@ #include "utils/types.h" #include +#include #include #include #include @@ -46,6 +47,8 @@ class BarcodeClassifierNode : public MessageSink { void barcode(SimplexRead& read); void terminate_impl(); + + std::map> bc_count; }; } // namespace dorado diff --git a/dorado/utils/types.cpp b/dorado/utils/types.cpp index e3a47f5c..e7ac3bab 100644 --- a/dorado/utils/types.cpp +++ b/dorado/utils/types.cpp @@ -22,11 +22,6 @@ std::shared_ptr create_barcoding_info( std::string kit_name = ""; if (!kit_names.empty()) { kit_name = kit_names[0]; - } else if (custom_kit.has_value()) { - kit_name = *custom_kit; - } else { - throw std::runtime_error( - "Neither kit name nor custom kit path was specified for BarcodeingInfo creation."); } spdlog::debug("Creating barcoding info for kit: {}", kit_name); auto result = From e1f436d0065645952be04f38b2f2918e45d59c65 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Tue, 2 Jan 2024 21:42:52 +0000 Subject: [PATCH 07/20] add test for score only toml file along with - better error handling - moving more debug info to trace level - fix input read counting --- dorado/demux/BarcodeClassifier.cpp | 8 ++++-- dorado/demux/parse_custom_kit.cpp | 1 - .../read_pipeline/BarcodeClassifierNode.cpp | 28 ++++++++----------- dorado/read_pipeline/BarcodeClassifierNode.h | 7 +++-- dorado/read_pipeline/HtsReader.cpp | 3 +- dorado/utils/barcode_kits.cpp | 24 +--------------- tests/BarcodeClassifierTest.cpp | 12 ++++++++ 7 files changed, 38 insertions(+), 45 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index 11ae4f31..a5a072dc 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -118,9 +118,13 @@ std::unordered_map process_custom_ki const std::optional& custom_kit) { std::unordered_map kit_map; if (custom_kit) { - auto [kit_name, kit_info] = demux::parse_custom_arrangement(*custom_kit); - if (!kit_name.empty()) { + try { + auto [kit_name, kit_info] = demux::parse_custom_arrangement(*custom_kit); kit_map[kit_name] = kit_info; + } catch (const std::runtime_error& e) { + // This implies no arrangement information was found. This is okay + // since the custom kit may only have scoring information. + (void)e; } } return kit_map; diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index 13740a24..e1544342 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -41,7 +41,6 @@ std::pair parse_custom_arrangement( const toml::value config_toml = toml::parse(arrangement_file); if (!config_toml.contains("arrangement")) { - return {"", barcode_kits::KitInfo{}}; throw std::runtime_error( "Custom barcode arrangement file must have [arrangement] section."); } diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 4e26e002..fc047775 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -55,7 +55,7 @@ BarcodeClassifierNode::BarcodeClassifierNode(int threads, if (m_default_barcoding_info->kit_name.empty()) { spdlog::debug("Barcode with new kit from {}", *m_default_barcoding_info->custom_kit); } else { - spdlog::info("Barcode for {}", m_default_barcoding_info->kit_name); + spdlog::debug("Barcode for {}", m_default_barcoding_info->kit_name); } start_threads(); } @@ -88,29 +88,22 @@ void BarcodeClassifierNode::restart() { BarcodeClassifierNode::~BarcodeClassifierNode() { terminate_impl(); + // Report how many reads were classified into each + // barcode. static bool done = false; - if (!done) { + if (!done && (spdlog::get_level() <= spdlog::level::debug)) { done = !done; - spdlog::info("Barcode distribution :"); + spdlog::debug("Barcode distribution :"); size_t unclassified = 0; - size_t fp = 0; size_t total = 0; - for (const auto& [k, v] : bc_count) { - std::unordered_set tps = {"TWIST-ALL_barcode33", "TWIST-ALL_barcode34", - "TWIST-ALL_barcode35", "TWIST-ALL_barcode36", - "TWIST-ALL_barcode37", "TWIST-ALL_barcode38", - "TWIST-ALL_barcode39", "TWIST-ALL_barcode40", - "TWIST-ALL_barcode41", "TWIST-ALL_barcode42"}; - spdlog::info("{} : {}", k, v); + for (const auto& [k, v] : m_barcode_count) { + spdlog::debug("{} : {}", k, v); total += v; if (k == "unclassified") { unclassified += v; - } else if (tps.find(k) == tps.end()) { - fp += v; } } - spdlog::info("Unclassified rate {}", float(unclassified) / total); - spdlog::info("False Positive rate {}", float(fp) / total); + spdlog::debug("Classified rate {}%", (1.f - float(unclassified) / total) * 100.f); } } @@ -160,9 +153,12 @@ void BarcodeClassifierNode::barcode(BamPtr& read) { auto bc_res = barcoder->barcode(seq, m_default_barcoding_info->barcode_both_ends, m_default_barcoding_info->allowed_barcodes); auto bc = generate_barcode_string(bc_res); - bc_count[bc]++; bam_aux_append(irecord, "BC", 'Z', int(bc.length() + 1), (uint8_t*)bc.c_str()); m_num_records++; + { + std::lock_guard lock(m_barcode_count_mutex); + m_barcode_count[bc]++; + } if (m_default_barcoding_info->trim) { int seqlen = irecord->core.l_qseq; diff --git a/dorado/read_pipeline/BarcodeClassifierNode.h b/dorado/read_pipeline/BarcodeClassifierNode.h index 5e69a6bc..2955aa52 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.h +++ b/dorado/read_pipeline/BarcodeClassifierNode.h @@ -4,9 +4,9 @@ #include "utils/stats.h" #include "utils/types.h" -#include #include #include +#include #include #include #include @@ -48,7 +48,10 @@ class BarcodeClassifierNode : public MessageSink { void terminate_impl(); - std::map> bc_count; + // Track how mant reads were classified as each barcode for debugging + // purposes. + std::map> m_barcode_count; + std::mutex m_barcode_count_mutex; }; } // namespace dorado diff --git a/dorado/read_pipeline/HtsReader.cpp b/dorado/read_pipeline/HtsReader.cpp index d1d5e8e2..e3b87b54 100644 --- a/dorado/read_pipeline/HtsReader.cpp +++ b/dorado/read_pipeline/HtsReader.cpp @@ -55,7 +55,8 @@ void HtsReader::read(Pipeline& pipeline, int max_reads) { } } pipeline.push_message(BamPtr(bam_dup1(record.get()))); - if (max_reads > 0 && ++num_reads >= max_reads) { + ++num_reads; + if (max_reads > 0 && num_reads >= max_reads) { break; } if (num_reads % 50000 == 0) { diff --git a/dorado/utils/barcode_kits.cpp b/dorado/utils/barcode_kits.cpp index d24cc63f..d7155667 100644 --- a/dorado/utils/barcode_kits.cpp +++ b/dorado/utils/barcode_kits.cpp @@ -360,29 +360,6 @@ const std::unordered_map kit_info_map = { "AE11R_85", "AF11R_86", "AG11R_87", "AH11R_88", "AA12R_89", "AB12R_90", "AC12R_91", "AD12R_92", "AE12R_93", "AF12R_94", "AG12R_95", "AH12R_96"}, }}, - {"TWIST-FRONT", - {"PGx", - false, - false, - "CATACGAGAT", - "GTGACTGGAG", - "", - "", - {"AA01F_01", "AB01F_02", "AC01F_03", "AD01F_04", "AE01F_05", "AF01F_06", "AG01F_07", - "AH01F_08", "AA02F_09", "AB02F_10", "AC02F_11", "AD02F_12", "AE02F_13", "AF02F_14", - "AG02F_15", "AH02F_16", "AA03F_17", "AB03F_18", "AC03F_19", "AD03F_20", "AE03F_21", - "AF03F_22", "AG03F_23", "AH03F_24", "AA04F_25", "AB04F_26", "AC04F_27", "AD04F_28", - "AE04F_29", "AF04F_30", "AG04F_31", "AH04F_32", "AA05F_33", "AB05F_34", "AC05F_35", - "AD05F_36", "AE05F_37", "AF05F_38", "AG05F_39", "AH05F_40", "AA06F_41", "AB06F_42", - "AC06F_43", "AD06F_44", "AE06F_45", "AF06F_46", "AG06F_47", "AH06F_48", "AA07F_49", - "AB07F_50", "AC07F_51", "AD07F_52", "AE07F_53", "AF07F_54", "AG07F_55", "AH07F_56", - "AA08F_57", "AB08F_58", "AC08F_59", "AD08F_60", "AE08F_61", "AF08F_62", "AG08F_63", - "AH08F_64", "AA09F_65", "AB09F_66", "AC09F_67", "AD09F_68", "AE09F_69", "AF09F_70", - "AG09F_71", "AH09F_72", "AA10F_73", "AB10F_74", "AC10F_75", "AD10F_76", "AE10F_77", - "AF10F_78", "AG10F_79", "AH10F_80", "AA11F_81", "AB11F_82", "AC11F_83", "AD11F_84", - "AE11F_85", "AF11F_86", "AG11F_87", "AH11F_88", "AA12F_89", "AB12F_90", "AC12F_91", - "AD12F_92", "AE12F_93", "AF12F_94", "AG12F_95", "AH12F_96"}, - {}}}, }; const std::unordered_map barcodes = { @@ -838,6 +815,7 @@ std::string barcode_kits_list_str() { std::string normalize_barcode_name(const std::string& barcode_name) { std::string digits = ""; + // Normalize using only the digits at the end of the barcode name. for (auto rit = barcode_name.rbegin(); rit != barcode_name.rend(); ++rit) { if (std::isdigit(static_cast(*rit))) { digits += *rit; diff --git a/tests/BarcodeClassifierTest.cpp b/tests/BarcodeClassifierTest.cpp index c28c2d3b..9fc196cd 100644 --- a/tests/BarcodeClassifierTest.cpp +++ b/tests/BarcodeClassifierTest.cpp @@ -417,3 +417,15 @@ TEST_CASE("BarcodeClassifier: test custom kit with double ended barcode", TEST_G } } } + +TEST_CASE( + "BarcodeClassifier: Fail if no kit name is passed and custom kit doesn't contain " + "arrangement", + TEST_GROUP) { + const fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); + const auto kit_file = data_dir / "scoring_params.toml"; + + CHECK_THROWS_WITH( + demux::BarcodeClassifier({}, kit_file.string(), std::nullopt), + "Either custom kit must include kit arrangement or a kit name needs to be passed in."); +} From a67f8d0fc277ac6efa8e86dcbc156c4037bddbb2 Mon Sep 17 00:00:00 2001 From: Joyjit Daw <1127155+tijyojwad@users.noreply.github.com> Date: Wed, 3 Jan 2024 13:34:27 +0000 Subject: [PATCH 08/20] address review comments --- dorado/demux/BarcodeClassifier.cpp | 9 +++------ dorado/demux/parse_custom_kit.cpp | 7 +++---- dorado/demux/parse_custom_kit.h | 3 ++- dorado/read_pipeline/BarcodeClassifierNode.cpp | 10 +++++----- dorado/read_pipeline/BarcodeClassifierNode.h | 5 +++-- tests/CustomBarcodeParserTest.cpp | 8 ++++---- 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/dorado/demux/BarcodeClassifier.cpp b/dorado/demux/BarcodeClassifier.cpp index a5a072dc..de6af0bc 100644 --- a/dorado/demux/BarcodeClassifier.cpp +++ b/dorado/demux/BarcodeClassifier.cpp @@ -118,13 +118,10 @@ std::unordered_map process_custom_ki const std::optional& custom_kit) { std::unordered_map kit_map; if (custom_kit) { - try { - auto [kit_name, kit_info] = demux::parse_custom_arrangement(*custom_kit); + auto custom_arrangement = demux::parse_custom_arrangement(*custom_kit); + if (custom_arrangement) { + const auto& [kit_name, kit_info] = *custom_arrangement; kit_map[kit_name] = kit_info; - } catch (const std::runtime_error& e) { - // This implies no arrangement information was found. This is okay - // since the custom kit may only have scoring information. - (void)e; } } return kit_map; diff --git a/dorado/demux/parse_custom_kit.cpp b/dorado/demux/parse_custom_kit.cpp index e1544342..5895dd0c 100644 --- a/dorado/demux/parse_custom_kit.cpp +++ b/dorado/demux/parse_custom_kit.cpp @@ -36,13 +36,12 @@ bool check_normalized_id_pattern(const std::string& pattern) { return true; } -std::pair parse_custom_arrangement( +std::optional> parse_custom_arrangement( const std::string& arrangement_file) { const toml::value config_toml = toml::parse(arrangement_file); if (!config_toml.contains("arrangement")) { - throw std::runtime_error( - "Custom barcode arrangement file must have [arrangement] section."); + return std::nullopt; } barcode_kits::KitInfo new_kit; @@ -108,7 +107,7 @@ std::pair parse_custom_arrangement( (barcode1_pattern != barcode2_pattern); } - return {kit_name, new_kit}; + return std::make_pair(kit_name, new_kit); } BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file) { diff --git a/dorado/demux/parse_custom_kit.h b/dorado/demux/parse_custom_kit.h index d69e722b..2eeaf833 100644 --- a/dorado/demux/parse_custom_kit.h +++ b/dorado/demux/parse_custom_kit.h @@ -2,6 +2,7 @@ #include "utils/barcode_kits.h" +#include #include #include @@ -15,7 +16,7 @@ struct BarcodeKitScoringParams { float min_barcode_score_dist = 0.25f; }; -std::pair parse_custom_arrangement( +std::optional> parse_custom_arrangement( const std::string& arrangement_file); std::unordered_map parse_custom_barcode_sequences( diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index fc047775..6bbc33f1 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -96,11 +96,11 @@ BarcodeClassifierNode::~BarcodeClassifierNode() { spdlog::debug("Barcode distribution :"); size_t unclassified = 0; size_t total = 0; - for (const auto& [k, v] : m_barcode_count) { - spdlog::debug("{} : {}", k, v); - total += v; - if (k == "unclassified") { - unclassified += v; + for (const auto& [bc_name, bc_count] : m_barcode_count) { + spdlog::debug("{} : {}", bc_name, bc_count); + total += bc_count; + if (bc_name == "unclassified") { + unclassified += bc_count; } } spdlog::debug("Classified rate {}%", (1.f - float(unclassified) / total) * 100.f); diff --git a/dorado/read_pipeline/BarcodeClassifierNode.h b/dorado/read_pipeline/BarcodeClassifierNode.h index 2955aa52..618e1a0e 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.h +++ b/dorado/read_pipeline/BarcodeClassifierNode.h @@ -4,6 +4,7 @@ #include "utils/stats.h" #include "utils/types.h" +#include #include #include #include @@ -48,9 +49,9 @@ class BarcodeClassifierNode : public MessageSink { void terminate_impl(); - // Track how mant reads were classified as each barcode for debugging + // Track how many reads were classified as each barcode for debugging // purposes. - std::map> m_barcode_count; + std::map m_barcode_count; std::mutex m_barcode_count_mutex; }; diff --git a/tests/CustomBarcodeParserTest.cpp b/tests/CustomBarcodeParserTest.cpp index 6e27597a..8cca792d 100644 --- a/tests/CustomBarcodeParserTest.cpp +++ b/tests/CustomBarcodeParserTest.cpp @@ -9,7 +9,7 @@ TEST_CASE("Parse custom single ended barcode arrangement", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_single_ended.toml"; - auto [kit_name, kit_info] = dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_single_ended"); CHECK(kit_info.barcodes.size() == 4); @@ -29,7 +29,7 @@ TEST_CASE("Parse double ended barcode arrangement", "[barcode_demux]") { fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_double_ended.toml"; - auto [kit_name, kit_info] = dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_double_ended"); CHECK(kit_info.barcodes.size() == 24); @@ -49,7 +49,7 @@ TEST_CASE("Parse double ended barcode arrangement with different flanks", "[barc fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_ends_different_flanks.toml"; - auto [kit_name, kit_info] = dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_ends_different_flanks"); CHECK(kit_info.barcodes.size() == 96); @@ -69,7 +69,7 @@ TEST_CASE("Parse double ended barcode arrangement with different barcodes", "[ba fs::path data_dir = fs::path(get_data_dir("barcode_demux/custom_barcodes")); const auto test_file = data_dir / "test_kit_ends_different_barcodes.toml"; - auto [kit_name, kit_info] = dorado::demux::parse_custom_arrangement(test_file.string()); + auto [kit_name, kit_info] = *dorado::demux::parse_custom_arrangement(test_file.string()); CHECK(kit_name == "test_kit_ends_different_barcodes"); CHECK(kit_info.barcodes.size() == 12); From 07fbabcf04535608af6e71a499addce9f3055048 Mon Sep 17 00:00:00 2001 From: Mark Bicknell Date: Thu, 4 Jan 2024 14:42:56 +0000 Subject: [PATCH 09/20] INSTX-3348 Improved error reporting when the device string is invalid for CUDA devices, and added a get_cuda_device_info function to inspect the devices on the host. --- dorado/utils/cuda_utils.cpp | 67 +++++++++++++++++++++++++++++++++++-- dorado/utils/cuda_utils.h | 17 +++++++++- 2 files changed, 81 insertions(+), 3 deletions(-) diff --git a/dorado/utils/cuda_utils.cpp b/dorado/utils/cuda_utils.cpp index 55b7a66b..b0d9439d 100644 --- a/dorado/utils/cuda_utils.cpp +++ b/dorado/utils/cuda_utils.cpp @@ -121,17 +121,26 @@ std::vector parse_cuda_device_string(std::string device_string) { std::regex e("[0-9]+"); std::smatch m; + auto num_devices = torch::cuda::device_count(); if (device_string.substr(0, 5) != "cuda:") { return devices; // empty vector; } else if (device_string == "cuda:all" || device_string == "cuda:auto") { - auto num_devices = torch::cuda::device_count(); + if (num_devices == 0) { + throw std::runtime_error("device string set to " + device_string + " but no CUDA devices available."); + } for (size_t i = 0; i < num_devices; i++) { devices.push_back("cuda:" + std::to_string(i)); } } else { while (std::regex_search(device_string, m, e)) { for (auto x : m) { - devices.push_back("cuda:" + x.str()); + std::string device_id = x.str(); + int device_idx = std::stoi(device_id); + if (device_idx >= num_devices || device_idx < 0) { + throw std::runtime_error("Invalid CUDA device index \"" + device_id + "\" from device string " + device_string + + ", there are " + std::to_string(num_devices) + " visible CUDA devices."); + } + devices.push_back("cuda:" + device_id); } device_string = m.suffix().str(); } @@ -140,6 +149,60 @@ std::vector parse_cuda_device_string(std::string device_string) { return devices; } +std::vector get_cuda_device_info(std::string device_string) { + std::vector results; + std::regex e("[0-9]+"); + std::smatch m; + auto num_devices = torch::cuda::device_count(); + + // Get the set of ids that are in use according to the device_string + std::set device_ids; + if (device_string.substr(0, 5) != "cuda:") { + // Nothing to add to device_ids + } + else if (device_string == "cuda:all" || device_string == "cuda:auto") { + if (num_devices == 0) { + throw std::runtime_error("device string set to " + device_string + " but no CUDA devices available."); + } + for (int i = 0; i < int(num_devices); i++) { + device_ids.insert(i); + } + } + else { + while (std::regex_search(device_string, m, e)) { + for (auto x : m) { + std::string device_id = x.str(); + int device_idx = std::stoi(device_id); + if (device_idx >= num_devices || device_idx < 0) { + throw std::runtime_error("Invalid CUDA device index \"" + device_id + "\" from device string " + device_string + + ", there are " + std::to_string(num_devices) + " visible CUDA devices."); + } + device_ids.insert(device_idx); + } + device_string = m.suffix().str(); + } + } + + // Now inspect all the devices on the host to create the CUDADeviceInfo + for (int device_id = 0; device_id < int(num_devices); device_id++) { + CUDADeviceInfo device_info; + device_info.device_id = device_id; + + cudaSetDevice(device_id); + cudaMemGetInfo(&device_info.free_mem, &device_info.total_mem); + cudaDeviceGetAttribute(&device_info.compute_cap_major, cudaDevAttrComputeCapabilityMajor, device_id); + cudaDeviceGetAttribute(&device_info.compute_cap_minor, cudaDevAttrComputeCapabilityMinor, device_id); + cudaGetDeviceProperties(&device_info.device_properties, device_id); + + device_info.in_use = device_ids.find(device_id) != device_ids.end(); + + results.push_back(device_info); + } + + return results; +} + + std::unique_lock acquire_gpu_lock(int gpu_index, bool use_lock) { static std::vector gpu_mutexes(torch::cuda::device_count()); diff --git a/dorado/utils/cuda_utils.h b/dorado/utils/cuda_utils.h index 1c4f7999..3393126d 100644 --- a/dorado/utils/cuda_utils.h +++ b/dorado/utils/cuda_utils.h @@ -2,6 +2,8 @@ #include +#include + #include #include #include @@ -16,9 +18,22 @@ namespace dorado::utils { std::unique_lock acquire_gpu_lock(int gpu_index, bool use_lock); // Given a string representing cuda devices (e.g "cuda:0,1,3") returns a vector of strings, one for -// each device (e.g ["cuda:0", "cuda:2", ..., "cuda:7"] +// each device (e.g ["cuda:0", "cuda:2", ..., "cuda:7"]. This function will validate that the device IDs +// exist and will raise an exception if there is any issue with the string. std::vector parse_cuda_device_string(std::string device_string); +struct CUDADeviceInfo { + size_t free_mem, total_mem; + int device_id; + int compute_cap_major, compute_cap_minor; + cudaDeviceProp device_properties; + bool in_use; +}; + +// Given a string representing cuda devices (e.g "cuda:0,1,3") returns a vector of CUDADeviceInfo for all +// visible devices on the host machine, with information on whether they are in use or not. +std::vector get_cuda_device_info(std::string device_string); + // Reports the amount of available memory (in bytes) for a given device. size_t available_memory(torch::Device device); From 93ef341f5ff10cf40f24f4862de96699b2a07ee7 Mon Sep 17 00:00:00 2001 From: Mark Bicknell Date: Thu, 4 Jan 2024 14:58:00 +0000 Subject: [PATCH 10/20] Clang format fix. --- dorado/utils/cuda_utils.cpp | 33 +++++++++++++++++++-------------- dorado/utils/cuda_utils.h | 3 +-- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/dorado/utils/cuda_utils.cpp b/dorado/utils/cuda_utils.cpp index b0d9439d..21ecfcd5 100644 --- a/dorado/utils/cuda_utils.cpp +++ b/dorado/utils/cuda_utils.cpp @@ -126,7 +126,8 @@ std::vector parse_cuda_device_string(std::string device_string) { return devices; // empty vector; } else if (device_string == "cuda:all" || device_string == "cuda:auto") { if (num_devices == 0) { - throw std::runtime_error("device string set to " + device_string + " but no CUDA devices available."); + throw std::runtime_error("device string set to " + device_string + + " but no CUDA devices available."); } for (size_t i = 0; i < num_devices; i++) { devices.push_back("cuda:" + std::to_string(i)); @@ -137,8 +138,10 @@ std::vector parse_cuda_device_string(std::string device_string) { std::string device_id = x.str(); int device_idx = std::stoi(device_id); if (device_idx >= num_devices || device_idx < 0) { - throw std::runtime_error("Invalid CUDA device index \"" + device_id + "\" from device string " + device_string + - ", there are " + std::to_string(num_devices) + " visible CUDA devices."); + throw std::runtime_error("Invalid CUDA device index \"" + device_id + + "\" from device string " + device_string + + ", there are " + std::to_string(num_devices) + + " visible CUDA devices."); } devices.push_back("cuda:" + device_id); } @@ -159,23 +162,24 @@ std::vector get_cuda_device_info(std::string device_string) { std::set device_ids; if (device_string.substr(0, 5) != "cuda:") { // Nothing to add to device_ids - } - else if (device_string == "cuda:all" || device_string == "cuda:auto") { + } else if (device_string == "cuda:all" || device_string == "cuda:auto") { if (num_devices == 0) { - throw std::runtime_error("device string set to " + device_string + " but no CUDA devices available."); + throw std::runtime_error("device string set to " + device_string + + " but no CUDA devices available."); } for (int i = 0; i < int(num_devices); i++) { device_ids.insert(i); } - } - else { + } else { while (std::regex_search(device_string, m, e)) { for (auto x : m) { std::string device_id = x.str(); int device_idx = std::stoi(device_id); if (device_idx >= num_devices || device_idx < 0) { - throw std::runtime_error("Invalid CUDA device index \"" + device_id + "\" from device string " + device_string + - ", there are " + std::to_string(num_devices) + " visible CUDA devices."); + throw std::runtime_error("Invalid CUDA device index \"" + device_id + + "\" from device string " + device_string + + ", there are " + std::to_string(num_devices) + + " visible CUDA devices."); } device_ids.insert(device_idx); } @@ -187,11 +191,13 @@ std::vector get_cuda_device_info(std::string device_string) { for (int device_id = 0; device_id < int(num_devices); device_id++) { CUDADeviceInfo device_info; device_info.device_id = device_id; - + cudaSetDevice(device_id); cudaMemGetInfo(&device_info.free_mem, &device_info.total_mem); - cudaDeviceGetAttribute(&device_info.compute_cap_major, cudaDevAttrComputeCapabilityMajor, device_id); - cudaDeviceGetAttribute(&device_info.compute_cap_minor, cudaDevAttrComputeCapabilityMinor, device_id); + cudaDeviceGetAttribute(&device_info.compute_cap_major, cudaDevAttrComputeCapabilityMajor, + device_id); + cudaDeviceGetAttribute(&device_info.compute_cap_minor, cudaDevAttrComputeCapabilityMinor, + device_id); cudaGetDeviceProperties(&device_info.device_properties, device_id); device_info.in_use = device_ids.find(device_id) != device_ids.end(); @@ -202,7 +208,6 @@ std::vector get_cuda_device_info(std::string device_string) { return results; } - std::unique_lock acquire_gpu_lock(int gpu_index, bool use_lock) { static std::vector gpu_mutexes(torch::cuda::device_count()); diff --git a/dorado/utils/cuda_utils.h b/dorado/utils/cuda_utils.h index 3393126d..33dcf30d 100644 --- a/dorado/utils/cuda_utils.h +++ b/dorado/utils/cuda_utils.h @@ -1,8 +1,7 @@ #pragma once -#include - #include +#include #include #include From 56082f24404561507c773dc0401f994f2049fd29 Mon Sep 17 00:00:00 2001 From: Mark Bicknell Date: Thu, 4 Jan 2024 15:42:42 +0000 Subject: [PATCH 11/20] Linux build fix for int comparison warning. --- dorado/utils/cuda_utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dorado/utils/cuda_utils.cpp b/dorado/utils/cuda_utils.cpp index 21ecfcd5..91d5a1fe 100644 --- a/dorado/utils/cuda_utils.cpp +++ b/dorado/utils/cuda_utils.cpp @@ -137,7 +137,7 @@ std::vector parse_cuda_device_string(std::string device_string) { for (auto x : m) { std::string device_id = x.str(); int device_idx = std::stoi(device_id); - if (device_idx >= num_devices || device_idx < 0) { + if (device_idx >= int(num_devices) || device_idx < 0) { throw std::runtime_error("Invalid CUDA device index \"" + device_id + "\" from device string " + device_string + ", there are " + std::to_string(num_devices) + @@ -175,7 +175,7 @@ std::vector get_cuda_device_info(std::string device_string) { for (auto x : m) { std::string device_id = x.str(); int device_idx = std::stoi(device_id); - if (device_idx >= num_devices || device_idx < 0) { + if (device_idx >= int(num_devices) || device_idx < 0) { throw std::runtime_error("Invalid CUDA device index \"" + device_id + "\" from device string " + device_string + ", there are " + std::to_string(num_devices) + From 0231f202e064f2fb0ddb9e28586669ebccac8dc9 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Thu, 4 Jan 2024 23:12:33 -0500 Subject: [PATCH 12/20] move barcode stats reporting to progress tracker --- .../read_pipeline/BarcodeClassifierNode.cpp | 27 +++++------------- dorado/read_pipeline/ProgressTracker.h | 28 +++++++++++++++++++ 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/dorado/read_pipeline/BarcodeClassifierNode.cpp b/dorado/read_pipeline/BarcodeClassifierNode.cpp index 6bbc33f1..b202a742 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.cpp +++ b/dorado/read_pipeline/BarcodeClassifierNode.cpp @@ -86,26 +86,7 @@ void BarcodeClassifierNode::restart() { start_threads(); } -BarcodeClassifierNode::~BarcodeClassifierNode() { - terminate_impl(); - // Report how many reads were classified into each - // barcode. - static bool done = false; - if (!done && (spdlog::get_level() <= spdlog::level::debug)) { - done = !done; - spdlog::debug("Barcode distribution :"); - size_t unclassified = 0; - size_t total = 0; - for (const auto& [bc_name, bc_count] : m_barcode_count) { - spdlog::debug("{} : {}", bc_name, bc_count); - total += bc_count; - if (bc_name == "unclassified") { - unclassified += bc_count; - } - } - spdlog::debug("Classified rate {}%", (1.f - float(unclassified) / total) * 100.f); - } -} +BarcodeClassifierNode::~BarcodeClassifierNode() { terminate_impl(); } void BarcodeClassifierNode::worker_thread() { Message message; @@ -194,6 +175,12 @@ void BarcodeClassifierNode::barcode(SimplexRead& read) { stats::NamedStats BarcodeClassifierNode::sample_stats() const { auto stats = stats::from_obj(m_work_queue); stats["num_barcodes_demuxed"] = m_num_records.load(); + { + for (const auto& [bc_name, bc_count] : m_barcode_count) { + std::string key = "bc." + bc_name; + stats[key] = static_cast(bc_count); + } + } return stats; } diff --git a/dorado/read_pipeline/ProgressTracker.h b/dorado/read_pipeline/ProgressTracker.h index 822b5692..c31add91 100644 --- a/dorado/read_pipeline/ProgressTracker.h +++ b/dorado/read_pipeline/ProgressTracker.h @@ -69,6 +69,21 @@ class ProgressTracker { rate_str << std::scientific << m_num_barcodes_demuxed / (duration / 1000.0); spdlog::info("> {} reads demuxed @ classifications/s: {}", m_num_barcodes_demuxed, rate_str.str()); + // Report how many reads were classified into each + // barcode. + if (spdlog::get_level() <= spdlog::level::debug) { + spdlog::debug("Barcode distribution :"); + size_t unclassified = 0; + size_t total = 0; + for (const auto& [bc_name, bc_count] : m_barcode_count) { + spdlog::debug("{} : {}", bc_name, bc_count); + total += bc_count; + if (bc_name == "unclassified") { + unclassified += bc_count; + } + } + spdlog::debug("Classified rate {}%", (1.f - float(unclassified) / total) * 100.f); + } } } @@ -133,6 +148,17 @@ class ProgressTracker { std::cerr << "\r> Output records written: " << m_num_simplex_reads_written; std::cerr << "\r"; } + + // Collect per barcode stats. + if (m_num_barcodes_demuxed > 0 && (spdlog::get_level() <= spdlog::level::debug)) { + for (const auto& [stat, val] : stats) { + const std::string prefix = "BarcodeClassifierNode.bc."; + if (stat.find(prefix) != std::string::npos) { + auto bc_name = stat.substr(prefix.length()); + m_barcode_count[bc_name] = static_cast(val); + } + } + } } private: @@ -150,6 +176,8 @@ class ProgressTracker { int m_num_reads_expected; + std::map m_barcode_count; + std::chrono::time_point m_initialization_time; std::chrono::time_point m_end_time; From be1846ce2d792b4bd7b223ed9f71b9cf3a1f86f6 Mon Sep 17 00:00:00 2001 From: Mark Bicknell Date: Fri, 5 Jan 2024 09:38:40 +0000 Subject: [PATCH 13/20] Disabled exception on cuda:all being used when there's no valid devices. --- dorado/utils/cuda_utils.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dorado/utils/cuda_utils.cpp b/dorado/utils/cuda_utils.cpp index 91d5a1fe..649ce7e9 100644 --- a/dorado/utils/cuda_utils.cpp +++ b/dorado/utils/cuda_utils.cpp @@ -125,10 +125,6 @@ std::vector parse_cuda_device_string(std::string device_string) { if (device_string.substr(0, 5) != "cuda:") { return devices; // empty vector; } else if (device_string == "cuda:all" || device_string == "cuda:auto") { - if (num_devices == 0) { - throw std::runtime_error("device string set to " + device_string + - " but no CUDA devices available."); - } for (size_t i = 0; i < num_devices; i++) { devices.push_back("cuda:" + std::to_string(i)); } From 931e060432e05f28b1cdf9a51ebfee42dc4cb71f Mon Sep 17 00:00:00 2001 From: Kevin Dolan Date: Fri, 5 Jan 2024 14:38:28 +0000 Subject: [PATCH 14/20] Added adapter/primer trim interval to read message. --- dorado/read_pipeline/AdapterDetectorNode.cpp | 1 + dorado/read_pipeline/ReadPipeline.h | 1 + 2 files changed, 2 insertions(+) diff --git a/dorado/read_pipeline/AdapterDetectorNode.cpp b/dorado/read_pipeline/AdapterDetectorNode.cpp index d2bb89ca..512a0254 100644 --- a/dorado/read_pipeline/AdapterDetectorNode.cpp +++ b/dorado/read_pipeline/AdapterDetectorNode.cpp @@ -136,6 +136,7 @@ void AdapterDetectorNode::process_read(SimplexRead& read) { return; } Trimmer::trim_sequence(read, trim_interval); + read.read_common.adapter_trim_interval = trim_interval; } m_num_records++; } diff --git a/dorado/read_pipeline/ReadPipeline.h b/dorado/read_pipeline/ReadPipeline.h index 4ec757a7..568fd2c7 100644 --- a/dorado/read_pipeline/ReadPipeline.h +++ b/dorado/read_pipeline/ReadPipeline.h @@ -58,6 +58,7 @@ class ReadCommon { std::shared_ptr barcoding_info; std::shared_ptr barcoding_result; std::size_t pre_trim_seq_length{}; + std::pair adapter_trim_interval{}; std::pair barcode_trim_interval{}; std::string alignment_string{}; From 0032b37831dd3324d495e880cea4d0f6c72fff43 Mon Sep 17 00:00:00 2001 From: Joyjit Daw Date: Fri, 5 Jan 2024 11:09:40 -0500 Subject: [PATCH 15/20] use utils::starts_with --- dorado/read_pipeline/BarcodeClassifierNode.h | 2 +- dorado/read_pipeline/ProgressTracker.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dorado/read_pipeline/BarcodeClassifierNode.h b/dorado/read_pipeline/BarcodeClassifierNode.h index 618e1a0e..c3010acc 100644 --- a/dorado/read_pipeline/BarcodeClassifierNode.h +++ b/dorado/read_pipeline/BarcodeClassifierNode.h @@ -51,7 +51,7 @@ class BarcodeClassifierNode : public MessageSink { // Track how many reads were classified as each barcode for debugging // purposes. - std::map m_barcode_count; + std::map> m_barcode_count; std::mutex m_barcode_count_mutex; }; diff --git a/dorado/read_pipeline/ProgressTracker.h b/dorado/read_pipeline/ProgressTracker.h index c31add91..df995438 100644 --- a/dorado/read_pipeline/ProgressTracker.h +++ b/dorado/read_pipeline/ProgressTracker.h @@ -1,6 +1,7 @@ #pragma once #include "utils/stats.h" +#include "utils/string_utils.h" #include "utils/tty_utils.h" #ifdef WIN32 @@ -153,7 +154,7 @@ class ProgressTracker { if (m_num_barcodes_demuxed > 0 && (spdlog::get_level() <= spdlog::level::debug)) { for (const auto& [stat, val] : stats) { const std::string prefix = "BarcodeClassifierNode.bc."; - if (stat.find(prefix) != std::string::npos) { + if (utils::starts_with(stat, prefix)) { auto bc_name = stat.substr(prefix.length()); m_barcode_count[bc_name] = static_cast(val); } From 7a0ff1397ca5f7f017f8f468ba48b4116d27713c Mon Sep 17 00:00:00 2001 From: Richard Harris Date: Mon, 8 Jan 2024 18:56:22 +0000 Subject: [PATCH 16/20] DOR-517 only require standardisation params if active --- dorado/basecall/CRFModelConfig.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dorado/basecall/CRFModelConfig.cpp b/dorado/basecall/CRFModelConfig.cpp index 47e6fc60..35d7ae12 100644 --- a/dorado/basecall/CRFModelConfig.cpp +++ b/dorado/basecall/CRFModelConfig.cpp @@ -132,8 +132,10 @@ SignalNormalisationParams parse_signal_normalisation_params(const toml::value &c if (config_toml.contains("standardisation")) { const auto &norm = toml::find(config_toml, "standardisation"); params.standarisation.standardise = toml::find(norm, "standardise") > 0; - params.standarisation.mean = toml::find(norm, "mean"); - params.standarisation.stdev = toml::find(norm, "stdev"); + if (params.standarisation.standardise) { + params.standarisation.mean = toml::find(norm, "mean"); + params.standarisation.stdev = toml::find(norm, "stdev"); + } if (params.standarisation.standardise && params.strategy != ScalingStrategy::PA) { throw std::runtime_error( From 9f4522054c9a3d4e07fddfc637a983196c07073f Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 9 Jan 2024 16:06:23 +0000 Subject: [PATCH 17/20] Inherit barcoding_info and adapter_info from parent read when splitting --- dorado/splitter/splitter_utils.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dorado/splitter/splitter_utils.cpp b/dorado/splitter/splitter_utils.cpp index 838e9ae1..63827882 100644 --- a/dorado/splitter/splitter_utils.cpp +++ b/dorado/splitter/splitter_utils.cpp @@ -80,6 +80,9 @@ SimplexReadPtr subread(const SimplexRead& read, subread->read_common.get_raw_data_samples()); } + subread->read_common.barcoding_info = read.read_common.barcoding_info; + subread->read_common.adapter_info = read.read_common.adapter_info; + // Initialize the subreads previous and next reads with the parent's ids. // These are updated at the end when all subreads are available. subread->prev_read = read.prev_read; From a074db0e5d546c8ccf9119db051da9bd45ee7f5e Mon Sep 17 00:00:00 2001 From: Steve Malton Date: Tue, 9 Jan 2024 16:26:00 +0000 Subject: [PATCH 18/20] Move read info copies into existing util func --- dorado/read_pipeline/read_utils.cpp | 6 ++++++ dorado/splitter/splitter_utils.cpp | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dorado/read_pipeline/read_utils.cpp b/dorado/read_pipeline/read_utils.cpp index 37ff8840..793cb5d9 100644 --- a/dorado/read_pipeline/read_utils.cpp +++ b/dorado/read_pipeline/read_utils.cpp @@ -37,6 +37,12 @@ SimplexReadPtr shallow_copy_read(const SimplexRead& read) { copy->end_sample = read.end_sample; copy->run_acquisition_start_time_ms = read.run_acquisition_start_time_ms; copy->read_common.is_duplex = read.read_common.is_duplex; + + copy->read_common.read_tag = read.read_common.read_tag; + copy->read_common.client_info = read.read_common.client_info; + copy->read_common.barcoding_info = read.read_common.barcoding_info; + copy->read_common.adapter_info = read.read_common.adapter_info; + return copy; } diff --git a/dorado/splitter/splitter_utils.cpp b/dorado/splitter/splitter_utils.cpp index 63827882..6dc805c3 100644 --- a/dorado/splitter/splitter_utils.cpp +++ b/dorado/splitter/splitter_utils.cpp @@ -37,8 +37,6 @@ SimplexReadPtr subread(const SimplexRead& read, auto subread = utils::shallow_copy_read(read); - subread->read_common.read_tag = read.read_common.read_tag; - subread->read_common.client_info = read.read_common.client_info; subread->read_common.raw_data = subread->read_common.raw_data.index( {at::indexing::Slice(signal_range.first, signal_range.second)}); subread->read_common.attributes.read_number = -1; @@ -80,9 +78,6 @@ SimplexReadPtr subread(const SimplexRead& read, subread->read_common.get_raw_data_samples()); } - subread->read_common.barcoding_info = read.read_common.barcoding_info; - subread->read_common.adapter_info = read.read_common.adapter_info; - // Initialize the subreads previous and next reads with the parent's ids. // These are updated at the end when all subreads are available. subread->prev_read = read.prev_read; From cf1bf3aa63885940b899f35468143e5951d5e657 Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Wed, 10 Jan 2024 11:02:16 +0000 Subject: [PATCH 19/20] Fixed alignment to include the rl:i tag when processing ReadCommon messages. Added utest for comparing output of alignment from BamPtr and ReadCommon messages --- dorado/alignment/Minimap2Aligner.cpp | 2 +- tests/AlignerTest.cpp | 331 ++++++++++++++++++--------- 2 files changed, 224 insertions(+), 109 deletions(-) diff --git a/dorado/alignment/Minimap2Aligner.cpp b/dorado/alignment/Minimap2Aligner.cpp index 217625e5..ebb50338 100644 --- a/dorado/alignment/Minimap2Aligner.cpp +++ b/dorado/alignment/Minimap2Aligner.cpp @@ -264,7 +264,7 @@ void Minimap2Aligner::align(dorado::ReadCommon& read_common, mm_tbuf_t* buffer) for (int reg_idx{0}; reg_idx < n_regs; ++reg_idx) { kstring_t alignment_line{0, 0, nullptr}; mm_write_sam3(&alignment_line, m_minimap_index->index(), &query, 0, reg_idx, 1, &n_regs, - ®s, NULL, MM_F_OUT_MD, -1); + ®s, NULL, MM_F_OUT_MD, buffer->rep_len); alignment_string += std::string(alignment_line.s, alignment_line.l) + "\n"; free(alignment_line.s); free(regs[reg_idx].p); diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index b0e81026..6e49d058 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -4,15 +4,19 @@ #include "read_pipeline/AlignerNode.h" #include "read_pipeline/ClientInfo.h" #include "read_pipeline/HtsReader.h" +#include "utils/PostCondition.h" #include "utils/bam_utils.h" #include "utils/sequence_utils.h" +#include "utils/string_utils.h" #include #include #include #include +#include #include +#include #include #define TEST_GROUP "[bam_utils][aligner]" @@ -34,28 +38,103 @@ class TestClientInfo : public dorado::ClientInfo { bool is_disconnected() const override { return false; } }; -template -std::unique_ptr create_pipeline(std::vector& output_messages, - Args&&... args) { - dorado::PipelineDescriptor pipeline_desc; - auto sink = pipeline_desc.add_node({}, 100, output_messages); - pipeline_desc.add_node({sink}, args...); - return dorado::Pipeline::create(std::move(pipeline_desc), nullptr); +std::unordered_map get_tags_from_sam_line_fields( + const std::vector& fields) { + // returns tags TAG:TYPE:VALUE as kvps of TAG:TYPE to VALUE + std::unordered_map result{}; + // sam line is 11 fields plus tags + CHECK(fields.size() >= 11); + for (std::size_t field_index{11}; field_index < fields.size(); ++field_index) { + const auto& field = fields[field_index]; + auto tokens = dorado::utils::split(field, ':'); + CHECK(tokens.size() == 3); + result[tokens[0] + ":" + tokens[1]] = tokens[2]; + } + return result; } -template -std::vector RunAlignmentPipeline(dorado::HtsReader& reader, Args&&... args) { - std::vector messages; - auto index_file_access = std::make_shared(); - auto pipeline = create_pipeline(messages, index_file_access, args...); - reader.read(*pipeline, 100); - pipeline.reset(); - return ConvertMessages(std::move(messages)); -} +class AlignerNodeTestFixture { +private: + std::vector output_messages{}; + +protected: + std::unique_ptr pipeline; + dorado::NodeHandle aligner_node_handle; + + template + void create_pipeline(Args&&... args) { + dorado::PipelineDescriptor pipeline_desc; + auto sink = pipeline_desc.add_node({}, 100, output_messages); + aligner_node_handle = pipeline_desc.add_node({sink}, args...); + pipeline = dorado::Pipeline::create(std::move(pipeline_desc), nullptr); + } + + template + std::vector RunPipelineWithBamMessages(dorado::HtsReader& reader, + Args&&... args) { + auto index_file_access = std::make_shared(); + create_pipeline(index_file_access, args...); + reader.read(*pipeline, 100); + pipeline->terminate({}); + return ConvertMessages(std::move(output_messages)); + } + + template > + MessageTypePtr RunPipelineForRead(const dorado::AlignmentInfo& loaded_align_info, + const dorado::AlignmentInfo& client_align_info, + std::string read_id, + std::string sequence) { + auto index_file_access = std::make_shared(); + CHECK(index_file_access->load_index(loaded_align_info.reference_file, + loaded_align_info.minimap_options, + 2) == dorado::alignment::IndexLoadResult::success); + create_pipeline(index_file_access, 2); + + dorado::ReadCommon read_common{}; + read_common.client_info = std::make_shared(client_align_info); + read_common.read_id = std::move(read_id); + read_common.seq = std::move(sequence); + + auto read = std::make_unique(); + read->read_common = std::move(read_common); + + pipeline->push_message(std::move(read)); + pipeline->terminate({}); + + CHECK((output_messages.size() == 1 && + std::holds_alternative(output_messages[0]))); + + return std::get(std::move(output_messages[0])); + } + + void RunPipelineForReadCommonMessage(const dorado::AlignmentInfo& align_info, + dorado::Message&& message) { + auto index_file_access = std::make_shared(); + CHECK(index_file_access->load_index(align_info.reference_file, align_info.minimap_options, + 2) == dorado::alignment::IndexLoadResult::success); + + create_pipeline(index_file_access, 2); + pipeline->push_message(std::move(message)); + pipeline->terminate({}); + } + + std::string get_sam_line_from_bam(dorado::BamPtr bam_record) { + dorado::SamHdrPtr header(sam_hdr_init()); + const auto& aligner_ref = + dynamic_cast(pipeline->get_node_ref(aligner_node_handle)); + dorado::utils::add_sq_hdr(header.get(), aligner_ref.get_sequence_records_for_header()); + + auto sam_line_buffer = dorado::utils::allocate_kstring(); + dorado::utils::PostCondition free_buffer([&sam_line_buffer] { ks_free(&sam_line_buffer); }); + CHECK(sam_format1(header.get(), bam_record.get(), &sam_line_buffer) >= 0); + + return std::string(ks_str(&sam_line_buffer), ks_len(&sam_line_buffer)); + } +}; } // namespace -TEST_CASE("AlignerTest: Check standard alignment", TEST_GROUP) { +TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check standard alignment", TEST_GROUP) { using Catch::Matchers::Contains; fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); @@ -66,7 +145,7 @@ TEST_CASE("AlignerTest: Check standard alignment", TEST_GROUP) { options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); REQUIRE(bam_records.size() == 1); bam1_t* rec = bam_records[0].get(); @@ -90,7 +169,7 @@ TEST_CASE("AlignerTest: Check standard alignment", TEST_GROUP) { } } -TEST_CASE("AlignerTest: Check supplementary alignment", TEST_GROUP) { +TEST_CASE_METHOD(AlignerNodeTestFixture, "AlignerTest: Check supplementary alignment", TEST_GROUP) { using Catch::Matchers::Contains; fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); @@ -101,7 +180,7 @@ TEST_CASE("AlignerTest: Check supplementary alignment", TEST_GROUP) { options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); REQUIRE(bam_records.size() == 2); // Check first alignment is primary. @@ -127,7 +206,9 @@ TEST_CASE("AlignerTest: Check supplementary alignment", TEST_GROUP) { } } -TEST_CASE("AlignerTest: Check reverse complement alignment", TEST_GROUP) { +TEST_CASE_METHOD(AlignerNodeTestFixture, + "AlignerTest: Check reverse complement alignment", + TEST_GROUP) { using Catch::Matchers::Contains; fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); @@ -138,7 +219,7 @@ TEST_CASE("AlignerTest: Check reverse complement alignment", TEST_GROUP) { options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); REQUIRE(bam_records.size() == 1); bam1_t* rec = bam_records[0].get(); @@ -159,7 +240,9 @@ TEST_CASE("AlignerTest: Check reverse complement alignment", TEST_GROUP) { CHECK(orig_qual == aligned_qual); } -TEST_CASE("AlignerTest: Check dorado tags are retained", TEST_GROUP) { +TEST_CASE_METHOD(AlignerNodeTestFixture, + "AlignerTest: Check dorado tags are retained", + TEST_GROUP) { using Catch::Matchers::Contains; fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); @@ -170,7 +253,7 @@ TEST_CASE("AlignerTest: Check dorado tags are retained", TEST_GROUP) { options.kmer_size = options.window_size = 15; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); REQUIRE(bam_records.size() == 1); bam1_t* rec = bam_records[0].get(); @@ -184,7 +267,9 @@ TEST_CASE("AlignerTest: Check dorado tags are retained", TEST_GROUP) { } } -TEST_CASE("AlignerTest: Check modbase tags are removed for secondary alignments", TEST_GROUP) { +TEST_CASE_METHOD(AlignerNodeTestFixture, + "AlignerTest: Check modbase tags are removed for secondary alignments", + TEST_GROUP) { using Catch::Matchers::Contains; fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); @@ -196,7 +281,7 @@ TEST_CASE("AlignerTest: Check modbase tags are removed for secondary alignments" options.index_batch_size = 1'000'000'000ull; options.soft_clipping = GENERATE(true, false); dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 10); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 10); REQUIRE(bam_records.size() == 3); bam1_t* primary_rec = bam_records[0].get(); @@ -227,7 +312,9 @@ TEST_CASE("AlignerTest: Check modbase tags are removed for secondary alignments" } } -TEST_CASE("AlignerTest: Verify impact of updated aligner args", TEST_GROUP) { +TEST_CASE_METHOD(AlignerNodeTestFixture, + "AlignerTest: Verify impact of updated aligner args", + TEST_GROUP) { fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); auto ref = aligner_test_dir / "target.fq"; auto query = aligner_test_dir / "query.fa"; @@ -238,7 +325,7 @@ TEST_CASE("AlignerTest: Verify impact of updated aligner args", TEST_GROUP) { options.kmer_size = options.window_size = 28; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 2); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 2); CHECK(bam_records.size() == 2); // Generates 2 alignments. } @@ -248,7 +335,7 @@ TEST_CASE("AlignerTest: Verify impact of updated aligner args", TEST_GROUP) { options.kmer_size = options.window_size = 5; options.index_batch_size = 1'000'000'000ull; dorado::HtsReader reader(query.string(), std::nullopt); - auto bam_records = RunAlignmentPipeline(reader, ref.string(), options, 2); + auto bam_records = RunPipelineWithBamMessages(reader, ref.string(), options, 2); CHECK(bam_records.size() == 1); // Generates 1 alignment. } } @@ -264,93 +351,56 @@ TEST_CASE("AlignerTest: Check AlignerNode crashes if multi index encountered", T CHECK_THROWS(dorado::AlignerNode(index_file_access, ref.string(), options, 1)); } -SCENARIO("AlignerNode push SimplexRead", TEST_GROUP) { +SCENARIO_METHOD(AlignerNodeTestFixture, "AlignerNode push SimplexRead", TEST_GROUP) { GIVEN("AlgnerNode constructed with populated index file collection") { const std::string read_id{"aligner_node_test"}; fs::path aligner_test_dir{get_aligner_data_dir()}; auto ref = aligner_test_dir / "target.fq"; - dorado::AlignmentInfo loaded_align_info{}; - loaded_align_info.minimap_options = dorado::alignment::dflt_options; - loaded_align_info.reference_file = ref.string(); - auto index_file_access = std::make_shared(); - CHECK(index_file_access->load_index(loaded_align_info.reference_file, - loaded_align_info.minimap_options, - 2) == dorado::alignment::IndexLoadResult::success); - std::vector messages; - auto pipeline = create_pipeline(messages, index_file_access, 2); + const dorado::AlignmentInfo empty_align_info{}; + dorado::AlignmentInfo align_info{}; + align_info.minimap_options = dorado::alignment::dflt_options; + align_info.reference_file = ref.string(); + //auto index_file_access = std::make_shared(); + //CHECK(index_file_access->load_index(align_info.reference_file, align_info.minimap_options, + // 2) == dorado::alignment::IndexLoadResult::success); + //create_pipeline(index_file_access, 2); dorado::ReadCommon read_common{}; read_common.read_id = read_id; AND_GIVEN("client with no alignment requirements") { - auto client_without_align = std::make_shared(dorado::AlignmentInfo{}); - read_common.client_info = client_without_align; - WHEN("push simplex read to pipeline") { - read_common.seq = "ACGTACGTACGTACGT"; - auto simplex_read = std::make_unique(); - simplex_read->read_common = std::move(read_common); - simplex_read->read_common.seq = "ACGTACGTACGTACGT"; - - pipeline->push_message(std::move(simplex_read)); - pipeline.reset(); - - THEN("Single simplex read is output") { - REQUIRE((messages.size() == 1 && - std::holds_alternative(messages[0]))); - } + auto simplex_read = RunPipelineForRead( + align_info, empty_align_info, read_id, "ACGTACGTACGTACGT"); THEN("Output simplex read has empty alignment_string") { - simplex_read = std::get(std::move(messages[0])); REQUIRE(simplex_read->read_common.alignment_string.empty()); } } WHEN("push duplex read to pipeline") { - read_common.seq = "ACGTACGTACGTACGT"; - auto duplex_read = std::make_unique(); - duplex_read->read_common = std::move(read_common); - - pipeline->push_message(std::move(duplex_read)); - pipeline.reset(); - - THEN("Single duplex read is output") { - REQUIRE((messages.size() == 1 && - std::holds_alternative(messages[0]))); - } - + auto duplex_read = RunPipelineForRead( + align_info, empty_align_info, read_id, "ACGTACGTACGTACGT"); THEN("Output duplex read has empty alignment_string") { - duplex_read = std::get(std::move(messages[0])); REQUIRE(duplex_read->read_common.alignment_string.empty()); } } } AND_GIVEN("client requiring alignment") { - auto client_requiring_alignment = std::make_shared(loaded_align_info); + auto client_requiring_alignment = std::make_shared(align_info); read_common.client_info = client_requiring_alignment; WHEN("push simplex read with no alignment matches to pipeline") { - read_common.seq = "ACGTACGTACGTACGT"; - auto simplex_read = std::make_unique(); - simplex_read->read_common = std::move(read_common); - - pipeline->push_message(std::move(simplex_read)); - pipeline.reset(); - - THEN("Single simplex read is output") { - REQUIRE((messages.size() == 1 && - std::holds_alternative(messages[0]))); - } + auto simplex_read = RunPipelineForRead( + align_info, align_info, read_id, "ACGTACGTACGTACGT"); THEN("Output simplex read has alignment_string populated") { - simplex_read = std::get(std::move(messages[0])); REQUIRE_FALSE(simplex_read->read_common.alignment_string.empty()); } THEN("Output simplex read has alignment_string containing unmapped sam line") { - simplex_read = std::get(std::move(messages[0])); const std::string expected{read_id + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; REQUIRE(simplex_read->read_common.alignment_string == expected); @@ -358,25 +408,14 @@ SCENARIO("AlignerNode push SimplexRead", TEST_GROUP) { } WHEN("push duplex read with no alignment matches to pipeline") { - read_common.seq = "ACGTACGTACGTACGT"; - auto duplex_read = std::make_unique(); - duplex_read->read_common = std::move(read_common); - - pipeline->push_message(std::move(duplex_read)); - pipeline.reset(); - - THEN("Single duplex read is output") { - REQUIRE((messages.size() == 1 && - std::holds_alternative(messages[0]))); - } + auto duplex_read = RunPipelineForRead( + align_info, align_info, read_id, "ACGTACGTACGTACGT"); THEN("Output duplex read has alignment_string populated") { - duplex_read = std::get(std::move(messages[0])); REQUIRE_FALSE(duplex_read->read_common.alignment_string.empty()); } THEN("Output duplex read has alignment_string containing unmapped sam line") { - duplex_read = std::get(std::move(messages[0])); const std::string expected{read_id + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; REQUIRE(duplex_read->read_common.alignment_string == expected); @@ -387,16 +426,11 @@ SCENARIO("AlignerNode push SimplexRead", TEST_GROUP) { dorado::HtsReader reader(ref.string(), std::nullopt); reader.read(); auto sequence = dorado::utils::extract_sequence(reader.record.get()); - read_common.seq = dorado::utils::extract_sequence(reader.record.get()); WHEN("pushed as simplex read to pipeline") { - auto simplex_read = std::make_unique(); - simplex_read->read_common = std::move(read_common); + auto simplex_read = RunPipelineForRead( + align_info, align_info, read_id, sequence); - pipeline->push_message(std::move(simplex_read)); - pipeline.reset(); - - simplex_read = std::get(std::move(messages[0])); THEN("Output sam line has read_id as QNAME") { const std::string expected{read_id + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; @@ -411,13 +445,9 @@ SCENARIO("AlignerNode push SimplexRead", TEST_GROUP) { } WHEN("pushed as duplex read to pipeline") { - auto duplex_read = std::make_unique(); - duplex_read->read_common = std::move(read_common); - - pipeline->push_message(std::move(duplex_read)); - pipeline.reset(); + auto duplex_read = RunPipelineForRead( + align_info, align_info, read_id, sequence); - duplex_read = std::get(std::move(messages[0])); THEN("Output sam line has read_id as QNAME") { const std::string expected{read_id + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; @@ -434,3 +464,88 @@ SCENARIO("AlignerNode push SimplexRead", TEST_GROUP) { } } } + +std::pair get_read_id_and_sequence_from_fasta( + const std::string& fasta_file) { + std::ifstream query_input_stream(fasta_file); + + std::string line; + std::getline(query_input_stream, line); + CHECK(!line.empty()); + CHECK(dorado::utils::starts_with(line, ">")); + line = line.substr(1); + auto read_id = line.substr(0, line.find_first_of(" ")); + + std::string sequence; + while (std::getline(query_input_stream, sequence)) { + if (!dorado::utils::starts_with(sequence, ">")) { + break; + } + } + CHECK(!sequence.empty()); + + return {read_id, sequence}; +} + +TEST_CASE_METHOD(AlignerNodeTestFixture, + "AlignerNode compare BamPtr and ReadCommon processing", + TEST_GROUP) { + fs::path aligner_test_dir = fs::path(get_aligner_data_dir()); + auto ref = (aligner_test_dir / "target.fq").string(); + auto query = (aligner_test_dir / "query.fa").string(); + + auto options = dorado::alignment::dflt_options; + options.kmer_size = options.window_size = 5; + options.index_batch_size = 1'000'000'000ull; + + // Get the sam line from BAM pipeline + dorado::HtsReader bam_reader(query, std::nullopt); + auto bam_records = RunPipelineWithBamMessages(bam_reader, ref, options, 2); + CHECK(bam_records.size() == 1); + auto sam_line_from_bam_ptr = get_sam_line_from_bam(std::move(bam_records[0])); + + // Get the sam line from ReadCommon pipeline + auto [read_id, sequence] = get_read_id_and_sequence_from_fasta(query); + dorado::AlignmentInfo align_info{}; + align_info.minimap_options = options; + align_info.reference_file = ref; + auto simplex_read = RunPipelineForRead( + align_info, align_info, std::move(read_id), std::move(sequence)); + auto sam_line_from_read_common = std::move(simplex_read->read_common.alignment_string); + + // Do the comparison checks + CHECK_FALSE(sam_line_from_read_common.empty()); + + if (sam_line_from_read_common.at(sam_line_from_read_common.size() - 1) == '\n') { + sam_line_from_read_common = + sam_line_from_read_common.substr(0, sam_line_from_read_common.size() - 1); + } + + const auto bam_fields = dorado::utils::split(sam_line_from_bam_ptr, '\t'); + const auto read_common_fields = dorado::utils::split(sam_line_from_read_common, '\t'); + CHECK(bam_fields.size() == read_common_fields.size()); + CHECK(bam_fields.size() >= 11); + // first 11 mandatory fields should be identical + for (std::size_t field_index{0}; field_index < 11; ++field_index) { + CHECK(bam_fields[field_index] == read_common_fields[field_index]); + } + + const auto bam_tags = get_tags_from_sam_line_fields(bam_fields); + const auto read_common_tags = get_tags_from_sam_line_fields(read_common_fields); + CHECK(bam_tags.size() == read_common_tags.size()); + for (const auto& [key, value] : bam_tags) { + INFO(key); + CHECK(read_common_tags.count(key) == 1); + // de:f tag compare to 4dp as this is the precision the minimap sam line generation function uses + if (key == "de:f") { + auto bam_value = std::stof(value); + auto read_common_value = std::stof(read_common_tags.at(key)); + float diff = bam_value - read_common_value; + constexpr float TOLERANCE_DP4{1e-04f}; + CHECK(diff < TOLERANCE_DP4); + continue; + } + + CHECK(read_common_tags.at(key) == value); + } +} \ No newline at end of file From eec3d7acf11b7fec97bfdd38a25851b51212bec2 Mon Sep 17 00:00:00 2001 From: Hugh Pendry Date: Wed, 10 Jan 2024 14:39:54 +0000 Subject: [PATCH 20/20] Updates following review --- tests/AlignerTest.cpp | 79 +++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index 6e49d058..79a6ab76 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -55,7 +55,7 @@ std::unordered_map get_tags_from_sam_line_fields( class AlignerNodeTestFixture { private: - std::vector output_messages{}; + std::vector m_output_messages{}; protected: std::unique_ptr pipeline; @@ -64,7 +64,7 @@ class AlignerNodeTestFixture { template void create_pipeline(Args&&... args) { dorado::PipelineDescriptor pipeline_desc; - auto sink = pipeline_desc.add_node({}, 100, output_messages); + auto sink = pipeline_desc.add_node({}, 100, m_output_messages); aligner_node_handle = pipeline_desc.add_node({sink}, args...); pipeline = dorado::Pipeline::create(std::move(pipeline_desc), nullptr); } @@ -76,7 +76,7 @@ class AlignerNodeTestFixture { create_pipeline(index_file_access, args...); reader.read(*pipeline, 100); pipeline->terminate({}); - return ConvertMessages(std::move(output_messages)); + return ConvertMessages(std::move(m_output_messages)); } template > @@ -101,21 +101,10 @@ class AlignerNodeTestFixture { pipeline->push_message(std::move(read)); pipeline->terminate({}); - CHECK((output_messages.size() == 1 && - std::holds_alternative(output_messages[0]))); + CHECK((m_output_messages.size() == 1 && + std::holds_alternative(m_output_messages[0]))); - return std::get(std::move(output_messages[0])); - } - - void RunPipelineForReadCommonMessage(const dorado::AlignmentInfo& align_info, - dorado::Message&& message) { - auto index_file_access = std::make_shared(); - CHECK(index_file_access->load_index(align_info.reference_file, align_info.minimap_options, - 2) == dorado::alignment::IndexLoadResult::success); - - create_pipeline(index_file_access, 2); - pipeline->push_message(std::move(message)); - pipeline->terminate({}); + return std::get(std::move(m_output_messages[0])); } std::string get_sam_line_from_bam(dorado::BamPtr bam_record) { @@ -353,26 +342,21 @@ TEST_CASE("AlignerTest: Check AlignerNode crashes if multi index encountered", T SCENARIO_METHOD(AlignerNodeTestFixture, "AlignerNode push SimplexRead", TEST_GROUP) { GIVEN("AlgnerNode constructed with populated index file collection") { - const std::string read_id{"aligner_node_test"}; + const std::string READ_ID{"aligner_node_test"}; fs::path aligner_test_dir{get_aligner_data_dir()}; auto ref = aligner_test_dir / "target.fq"; - const dorado::AlignmentInfo empty_align_info{}; dorado::AlignmentInfo align_info{}; align_info.minimap_options = dorado::alignment::dflt_options; align_info.reference_file = ref.string(); - //auto index_file_access = std::make_shared(); - //CHECK(index_file_access->load_index(align_info.reference_file, align_info.minimap_options, - // 2) == dorado::alignment::IndexLoadResult::success); - //create_pipeline(index_file_access, 2); - dorado::ReadCommon read_common{}; - read_common.read_id = read_id; + const std::string TEST_SEQUENCE{"ACGTACGTACGTACGT"}; AND_GIVEN("client with no alignment requirements") { + const dorado::AlignmentInfo EMPTY_ALIGN_INFO{}; WHEN("push simplex read to pipeline") { auto simplex_read = RunPipelineForRead( - align_info, empty_align_info, read_id, "ACGTACGTACGTACGT"); + align_info, EMPTY_ALIGN_INFO, READ_ID, TEST_SEQUENCE); THEN("Output simplex read has empty alignment_string") { REQUIRE(simplex_read->read_common.alignment_string.empty()); @@ -381,7 +365,7 @@ SCENARIO_METHOD(AlignerNodeTestFixture, "AlignerNode push SimplexRead", TEST_GRO WHEN("push duplex read to pipeline") { auto duplex_read = RunPipelineForRead( - align_info, empty_align_info, read_id, "ACGTACGTACGTACGT"); + align_info, EMPTY_ALIGN_INFO, READ_ID, TEST_SEQUENCE); THEN("Output duplex read has empty alignment_string") { REQUIRE(duplex_read->read_common.alignment_string.empty()); } @@ -389,34 +373,31 @@ SCENARIO_METHOD(AlignerNodeTestFixture, "AlignerNode push SimplexRead", TEST_GRO } AND_GIVEN("client requiring alignment") { - auto client_requiring_alignment = std::make_shared(align_info); - read_common.client_info = client_requiring_alignment; - WHEN("push simplex read with no alignment matches to pipeline") { - auto simplex_read = RunPipelineForRead( - align_info, align_info, read_id, "ACGTACGTACGTACGT"); + auto simplex_read = RunPipelineForRead(align_info, align_info, + READ_ID, TEST_SEQUENCE); THEN("Output simplex read has alignment_string populated") { REQUIRE_FALSE(simplex_read->read_common.alignment_string.empty()); } THEN("Output simplex read has alignment_string containing unmapped sam line") { - const std::string expected{read_id + + const std::string expected{READ_ID + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; REQUIRE(simplex_read->read_common.alignment_string == expected); } } WHEN("push duplex read with no alignment matches to pipeline") { - auto duplex_read = RunPipelineForRead( - align_info, align_info, read_id, "ACGTACGTACGTACGT"); + auto duplex_read = RunPipelineForRead(align_info, align_info, + READ_ID, TEST_SEQUENCE); THEN("Output duplex read has alignment_string populated") { REQUIRE_FALSE(duplex_read->read_common.alignment_string.empty()); } THEN("Output duplex read has alignment_string containing unmapped sam line") { - const std::string expected{read_id + + const std::string expected{READ_ID + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; REQUIRE(duplex_read->read_common.alignment_string == expected); } @@ -429,13 +410,13 @@ SCENARIO_METHOD(AlignerNodeTestFixture, "AlignerNode push SimplexRead", TEST_GRO WHEN("pushed as simplex read to pipeline") { auto simplex_read = RunPipelineForRead( - align_info, align_info, read_id, sequence); + align_info, align_info, READ_ID, sequence); THEN("Output sam line has read_id as QNAME") { - const std::string expected{read_id + + const std::string expected{READ_ID + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; REQUIRE(simplex_read->read_common.alignment_string.substr( - 0, read_id.size()) == read_id); + 0, READ_ID.size()) == READ_ID); } THEN("Output sam line contains sequence string") { @@ -446,13 +427,13 @@ SCENARIO_METHOD(AlignerNodeTestFixture, "AlignerNode push SimplexRead", TEST_GRO WHEN("pushed as duplex read to pipeline") { auto duplex_read = RunPipelineForRead( - align_info, align_info, read_id, sequence); + align_info, align_info, READ_ID, sequence); THEN("Output sam line has read_id as QNAME") { - const std::string expected{read_id + + const std::string expected{READ_ID + dorado::alignment::UNMAPPED_SAM_LINE_STRIPPED}; REQUIRE(duplex_read->read_common.alignment_string.substr( - 0, read_id.size()) == read_id); + 0, READ_ID.size()) == READ_ID); } THEN("Output sam line contains sequence string") { @@ -533,19 +514,21 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, const auto bam_tags = get_tags_from_sam_line_fields(bam_fields); const auto read_common_tags = get_tags_from_sam_line_fields(read_common_fields); CHECK(bam_tags.size() == read_common_tags.size()); - for (const auto& [key, value] : bam_tags) { + for (const auto& [key, bam_value] : bam_tags) { INFO(key); - CHECK(read_common_tags.count(key) == 1); + auto tag_entry = read_common_tags.find(key); + REQUIRE(tag_entry != read_common_tags.end()); // de:f tag compare to 4dp as this is the precision the minimap sam line generation function uses + const auto& read_common_value = tag_entry->second; if (key == "de:f") { - auto bam_value = std::stof(value); - auto read_common_value = std::stof(read_common_tags.at(key)); - float diff = bam_value - read_common_value; + auto bam_value_as_float = std::stof(bam_value); + auto read_common_value_as_float = std::stof(read_common_value); + float diff = bam_value_as_float - read_common_value_as_float; constexpr float TOLERANCE_DP4{1e-04f}; CHECK(diff < TOLERANCE_DP4); continue; } - CHECK(read_common_tags.at(key) == value); + CHECK(read_common_value == bam_value); } } \ No newline at end of file