From 1c5c1129f72204b69beb0200d3cebb78aa74d0e4 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 21 Jun 2019 16:18:48 -0500 Subject: [PATCH 1/2] fix issue #98, now checking component subtypes for positive and negative kmers being present at the same position added unit tests --- bio_hansel/qc/checks.py | 4 +- bio_hansel/qc/utils.py | 23 ++++++-- tests/data/qc/conflicting_subtypes/fail.tsv | 64 +++++++++++++++++++++ tests/data/qc/conflicting_subtypes/pass.tsv | 63 ++++++++++++++++++++ tests/test_qc_utils.py | 25 +++++++- 5 files changed, 171 insertions(+), 8 deletions(-) create mode 100644 tests/data/qc/conflicting_subtypes/fail.tsv create mode 100644 tests/data/qc/conflicting_subtypes/pass.tsv diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py index 02f9016..e4079ee 100644 --- a/bio_hansel/qc/checks.py +++ b/bio_hansel/qc/checks.py @@ -140,7 +140,7 @@ def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str """ if not st.are_subtypes_consistent: return QC.FAIL, f'Mixed subtypes found: "{"; ".join(sorted(st.inconsistent_subtypes))}".' - conflicting_kmers = get_conflicting_kmers(st, df) + conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input()) if conflicting_kmers is None or conflicting_kmers.shape[0] == 0: return None, None @@ -250,7 +250,7 @@ def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingPar total_subtype_kmers = int(st.n_kmers_matching_subtype_expected) total_subtype_kmers_hits = int(st.n_kmers_matching_subtype) - conflicting_kmers = get_conflicting_kmers(st, df) + conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input()) num_pos_kmers, num_neg_kmers = get_num_pos_neg_kmers(st, df) obs = int(st.n_kmers_matching_all) exp = int(st.n_kmers_matching_all_expected) diff --git a/bio_hansel/qc/utils.py b/bio_hansel/qc/utils.py index 0f2ede8..f8c9c85 100644 --- a/bio_hansel/qc/utils.py +++ b/bio_hansel/qc/utils.py @@ -5,7 +5,21 @@ from ..subtype import Subtype -def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]: +def component_subtypes(subtype: str) -> Iterable[str]: + """Generate component subtypes from a subtype. + + Args: + subtype: Subtype string, e.g. "4.2.1.1" + Yields: + Component subtypes (e.g. for subtype "4.2.1.1", will yield + ['4', '4.2', '4.2.1', '4.2.1.1']) + """ + split_subtype = subtype.split('.') + for i, x in enumerate(split_subtype): + yield '.'.join(split_subtype[:i+1]) + + +def get_conflicting_kmers(subtype: str, df: DataFrame, is_fastq_input: bool = True) -> Optional[DataFrame]: """ Get positive and negative kmers that both are present for a subtype. Find positive and negative kmers for the same refposition/target site in the results `df`. @@ -17,10 +31,9 @@ def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]: Returns: DataFrame of conflicting positive and negative kmers """ - if st.is_fastq_input(): - dfst = df[(df['subtype'] == str(st.subtype)) & (df['is_kmer_freq_okay'])] - else: # fasta files - dfst = df[(df['subtype'] == str(st.subtype))] + dfst = df[(df['subtype'].isin(list(component_subtypes(subtype))))] + if is_fastq_input: + dfst = dfst[dfst['is_kmer_freq_okay']] pos_kmer_positions = dfst[dfst['is_pos_kmer']]['refposition'] neg_kmers = dfst[~dfst['is_pos_kmer']] diff --git a/tests/data/qc/conflicting_subtypes/fail.tsv b/tests/data/qc/conflicting_subtypes/fail.tsv new file mode 100644 index 0000000..ec81f8a --- /dev/null +++ b/tests/data/qc/conflicting_subtypes/fail.tsv @@ -0,0 +1,64 @@ +refposition subtype is_pos_kmer is_kmer_freq_okay +931123 4 True True +62657 4.1 True True +891756 4.1.2 True True +3216553 1.1.1.1 False True +3479545 1.2.1 False True +3470377 1.2.2 False True +3466426 4.6 False True +3388166 4.3.2.1 False True +3273107 3 False True +3021283 1.1.1 False True +3722702 3.1.2 False True +2875883 4.6.2.2 False True +2874344 3.1.2.2 False True +2831482 8.1 False True +2694560 4.4.1.2 False True +2622402 1.1.2 False True +3570528 4.6.2.1 False True +62657 4.1 False True +2411730 4.2 False True +3836274 2.2.1.2 False True +3836739 4.8 False True +3977226 4.3.4 False True +4125058 4.6.2 False True +4151558 4.4.1 False True +4229087 4.1.1.3 False True +4246508 4.4.2 False True +4248115 2.2.1.1 False True +4249732 4.7 False True +4260268 4.6.1 False True +4307886 4.4 False True +4316114 4.3.2 False True +4398141 4.3.4.1 False True +2505085 2.2 False True +1881090 2.1 False True +1882180 8 False True +874787 4.6.1.1 False True +107794 4.1.2.1 False True +346693 2.2.2 False True +355181 4.4.1.1 False True +403364 4.3.3 False True +497491 2 False True +514245 4.1.1 False True +541048 4.1.1.2 False True +615614 4.3.1 False True +615938 1 False True +764995 4.3 False True +783601 4.2.1 False True +797736 2.2.1 False True +1084911 3.1.1 False True +1850119 4.1.1.1 False True +1132368 4.3.4.2 False True +1137518 7 False True +1237818 3.1.2.1 False True +1455780 4.2.2.1 False True +1487796 4.2.2 False True +1491275 1.1.3 False True +1501468 4.6.1.2 False True +1502120 4.3.4.2.1 False True +1719757 4.5 False True +1759252 4.9 False True +1799921 5 False True +1816587 8.2 False True +4404247 1.1 False True diff --git a/tests/data/qc/conflicting_subtypes/pass.tsv b/tests/data/qc/conflicting_subtypes/pass.tsv new file mode 100644 index 0000000..87547e5 --- /dev/null +++ b/tests/data/qc/conflicting_subtypes/pass.tsv @@ -0,0 +1,63 @@ +refposition subtype is_pos_kmer is_kmer_freq_okay +931123 4 True True +62657 4.1 True True +891756 4.1.2 True True +3216553 1.1.1.1 False True +3479545 1.2.1 False True +3470377 1.2.2 False True +3466426 4.6 False True +3388166 4.3.2.1 False True +3273107 3 False True +3021283 1.1.1 False True +3722702 3.1.2 False True +2875883 4.6.2.2 False True +2874344 3.1.2.2 False True +2831482 8.1 False True +2694560 4.4.1.2 False True +2622402 1.1.2 False True +3570528 4.6.2.1 False True +2411730 4.2 False True +3836274 2.2.1.2 False True +3836739 4.8 False True +3977226 4.3.4 False True +4125058 4.6.2 False True +4151558 4.4.1 False True +4229087 4.1.1.3 False True +4246508 4.4.2 False True +4248115 2.2.1.1 False True +4249732 4.7 False True +4260268 4.6.1 False True +4307886 4.4 False True +4316114 4.3.2 False True +4398141 4.3.4.1 False True +2505085 2.2 False True +1881090 2.1 False True +1882180 8 False True +874787 4.6.1.1 False True +107794 4.1.2.1 False True +346693 2.2.2 False True +355181 4.4.1.1 False True +403364 4.3.3 False True +497491 2 False True +514245 4.1.1 False True +541048 4.1.1.2 False True +615614 4.3.1 False True +615938 1 False True +764995 4.3 False True +783601 4.2.1 False True +797736 2.2.1 False True +1084911 3.1.1 False True +1850119 4.1.1.1 False True +1132368 4.3.4.2 False True +1137518 7 False True +1237818 3.1.2.1 False True +1455780 4.2.2.1 False True +1487796 4.2.2 False True +1491275 1.1.3 False True +1501468 4.6.1.2 False True +1502120 4.3.4.2.1 False True +1719757 4.5 False True +1759252 4.9 False True +1799921 5 False True +1816587 8.2 False True +4404247 1.1 False True diff --git a/tests/test_qc_utils.py b/tests/test_qc_utils.py index edd1ff4..0fd1886 100644 --- a/tests/test_qc_utils.py +++ b/tests/test_qc_utils.py @@ -2,7 +2,12 @@ import pandas as pd -from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts + +from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts, component_subtypes, get_conflicting_kmers + + +fail_tsv = 'tests/data/qc/conflicting_subtypes/fail.tsv' +pass_tsv = 'tests/data/qc/conflicting_subtypes/pass.tsv' def test_get_mixed_subtype_kmer_counts(): @@ -22,3 +27,21 @@ def test_get_mixed_subtype_kmer_counts(): assert(int(st_pos_kmers.get('2.1')) == 5) assert(int(st_pos_kmers.get('2.2')) == 3) assert(int(st_pos_kmers.get('0')) == 1) + + +def test_component_subtypes(): + assert list(component_subtypes('4.2.1.1')) == ['4', '4.2', '4.2.1', '4.2.1.1'] + assert list(component_subtypes('1')) == ['1'] + + +def test_get_conflicting_kmers(): + df_pass = pd.read_csv(pass_tsv, sep='\t') + df_pass_result = get_conflicting_kmers('4.1.2', df_pass, True) + assert df_pass_result.shape[0] == 0, 'Must be no conflicting kmers' + df_fail = pd.read_csv(fail_tsv, sep='\t') + df_fail_result = get_conflicting_kmers('4.1.2', df_fail, True) + df_fail_result.reset_index(inplace=True) + assert df_fail_result.shape[0] == 1, 'Must be one conflicting kmer' + assert df_fail_result.refposition[0] == 62657 + assert df_fail_result.subtype[0] == '4.1' + assert df_fail_result.is_pos_kmer[0] == False From fc9f8c7ceac4035f0038288323530a4248312bb7 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 21 Jun 2019 16:29:37 -0500 Subject: [PATCH 2/2] import Iterable --- bio_hansel/qc/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio_hansel/qc/utils.py b/bio_hansel/qc/utils.py index f8c9c85..61542dd 100644 --- a/bio_hansel/qc/utils.py +++ b/bio_hansel/qc/utils.py @@ -1,4 +1,4 @@ -from typing import Tuple, Optional, List, Any, Dict +from typing import Tuple, Optional, List, Any, Dict, Iterable from pandas import DataFrame