Skip to content

Commit

Permalink
Merge pull request #100 from phac-nml/fix/98-mixed-subtype
Browse files Browse the repository at this point in the history
Fix #98 QC check for mixed subtypes
  • Loading branch information
DarianHole committed Jun 24, 2019
2 parents e2905b4 + fc9f8c7 commit 6a9574a
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 9 deletions.
4 changes: 2 additions & 2 deletions bio_hansel/qc/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str
"""
if not st.are_subtypes_consistent:
return QC.FAIL, f'Mixed subtypes found: "{"; ".join(sorted(st.inconsistent_subtypes))}".'
conflicting_kmers = get_conflicting_kmers(st, df)
conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input())
if conflicting_kmers is None or conflicting_kmers.shape[0] == 0:
return None, None

Expand Down Expand Up @@ -250,7 +250,7 @@ def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingPar

total_subtype_kmers = int(st.n_kmers_matching_subtype_expected)
total_subtype_kmers_hits = int(st.n_kmers_matching_subtype)
conflicting_kmers = get_conflicting_kmers(st, df)
conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input())
num_pos_kmers, num_neg_kmers = get_num_pos_neg_kmers(st, df)
obs = int(st.n_kmers_matching_all)
exp = int(st.n_kmers_matching_all_expected)
Expand Down
25 changes: 19 additions & 6 deletions bio_hansel/qc/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
from typing import Tuple, Optional, List, Any, Dict
from typing import Tuple, Optional, List, Any, Dict, Iterable

from pandas import DataFrame

from ..subtype import Subtype


def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]:
def component_subtypes(subtype: str) -> Iterable[str]:
    """Generate the hierarchical component subtypes of a subtype.

    The subtype is split on '.', and each yielded value is the dot-joined
    prefix made of the first 1, 2, ..., n fields, so the last value yielded
    is always `subtype` itself.

    Args:
        subtype: Subtype string, e.g. "4.2.1.1"

    Yields:
        Component subtypes from least to most specific (e.g. for subtype
        "4.2.1.1", yields '4', '4.2', '4.2.1', then '4.2.1.1')
    """
    levels = subtype.split('.')
    # Only the prefix length matters here, so iterate over lengths rather
    # than enumerating the fields and discarding them.
    for depth in range(1, len(levels) + 1):
        yield '.'.join(levels[:depth])


def get_conflicting_kmers(subtype: str, df: DataFrame, is_fastq_input: bool = True) -> Optional[DataFrame]:
""" Get positive and negative kmers that both are present for a subtype.
Find positive and negative kmers for the same refposition/target site in the results `df`.
Expand All @@ -17,10 +31,9 @@ def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]:
Returns:
DataFrame of conflicting positive and negative kmers
"""
if st.is_fastq_input():
dfst = df[(df['subtype'] == str(st.subtype)) & (df['is_kmer_freq_okay'])]
else: # fasta files
dfst = df[(df['subtype'] == str(st.subtype))]
dfst = df[(df['subtype'].isin(list(component_subtypes(subtype))))]
if is_fastq_input:
dfst = dfst[dfst['is_kmer_freq_okay']]

pos_kmer_positions = dfst[dfst['is_pos_kmer']]['refposition']
neg_kmers = dfst[~dfst['is_pos_kmer']]
Expand Down
64 changes: 64 additions & 0 deletions tests/data/qc/conflicting_subtypes/fail.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
refposition subtype is_pos_kmer is_kmer_freq_okay
931123 4 True True
62657 4.1 True True
891756 4.1.2 True True
3216553 1.1.1.1 False True
3479545 1.2.1 False True
3470377 1.2.2 False True
3466426 4.6 False True
3388166 4.3.2.1 False True
3273107 3 False True
3021283 1.1.1 False True
3722702 3.1.2 False True
2875883 4.6.2.2 False True
2874344 3.1.2.2 False True
2831482 8.1 False True
2694560 4.4.1.2 False True
2622402 1.1.2 False True
3570528 4.6.2.1 False True
62657 4.1 False True
2411730 4.2 False True
3836274 2.2.1.2 False True
3836739 4.8 False True
3977226 4.3.4 False True
4125058 4.6.2 False True
4151558 4.4.1 False True
4229087 4.1.1.3 False True
4246508 4.4.2 False True
4248115 2.2.1.1 False True
4249732 4.7 False True
4260268 4.6.1 False True
4307886 4.4 False True
4316114 4.3.2 False True
4398141 4.3.4.1 False True
2505085 2.2 False True
1881090 2.1 False True
1882180 8 False True
874787 4.6.1.1 False True
107794 4.1.2.1 False True
346693 2.2.2 False True
355181 4.4.1.1 False True
403364 4.3.3 False True
497491 2 False True
514245 4.1.1 False True
541048 4.1.1.2 False True
615614 4.3.1 False True
615938 1 False True
764995 4.3 False True
783601 4.2.1 False True
797736 2.2.1 False True
1084911 3.1.1 False True
1850119 4.1.1.1 False True
1132368 4.3.4.2 False True
1137518 7 False True
1237818 3.1.2.1 False True
1455780 4.2.2.1 False True
1487796 4.2.2 False True
1491275 1.1.3 False True
1501468 4.6.1.2 False True
1502120 4.3.4.2.1 False True
1719757 4.5 False True
1759252 4.9 False True
1799921 5 False True
1816587 8.2 False True
4404247 1.1 False True
63 changes: 63 additions & 0 deletions tests/data/qc/conflicting_subtypes/pass.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
refposition subtype is_pos_kmer is_kmer_freq_okay
931123 4 True True
62657 4.1 True True
891756 4.1.2 True True
3216553 1.1.1.1 False True
3479545 1.2.1 False True
3470377 1.2.2 False True
3466426 4.6 False True
3388166 4.3.2.1 False True
3273107 3 False True
3021283 1.1.1 False True
3722702 3.1.2 False True
2875883 4.6.2.2 False True
2874344 3.1.2.2 False True
2831482 8.1 False True
2694560 4.4.1.2 False True
2622402 1.1.2 False True
3570528 4.6.2.1 False True
2411730 4.2 False True
3836274 2.2.1.2 False True
3836739 4.8 False True
3977226 4.3.4 False True
4125058 4.6.2 False True
4151558 4.4.1 False True
4229087 4.1.1.3 False True
4246508 4.4.2 False True
4248115 2.2.1.1 False True
4249732 4.7 False True
4260268 4.6.1 False True
4307886 4.4 False True
4316114 4.3.2 False True
4398141 4.3.4.1 False True
2505085 2.2 False True
1881090 2.1 False True
1882180 8 False True
874787 4.6.1.1 False True
107794 4.1.2.1 False True
346693 2.2.2 False True
355181 4.4.1.1 False True
403364 4.3.3 False True
497491 2 False True
514245 4.1.1 False True
541048 4.1.1.2 False True
615614 4.3.1 False True
615938 1 False True
764995 4.3 False True
783601 4.2.1 False True
797736 2.2.1 False True
1084911 3.1.1 False True
1850119 4.1.1.1 False True
1132368 4.3.4.2 False True
1137518 7 False True
1237818 3.1.2.1 False True
1455780 4.2.2.1 False True
1487796 4.2.2 False True
1491275 1.1.3 False True
1501468 4.6.1.2 False True
1502120 4.3.4.2.1 False True
1719757 4.5 False True
1759252 4.9 False True
1799921 5 False True
1816587 8.2 False True
4404247 1.1 False True
25 changes: 24 additions & 1 deletion tests/test_qc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

import pandas as pd

from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts

from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts, component_subtypes, get_conflicting_kmers


# Fixture tables of kmer results used by test_get_conflicting_kmers.
# The paths are relative, so they resolve against the current working
# directory (presumably the repository root -- confirm with the test runner).
# fail.tsv holds both a positive and a negative kmer for subtype 4.1 at
# refposition 62657; pass.tsv holds only the positive one.
fail_tsv = 'tests/data/qc/conflicting_subtypes/fail.tsv'
pass_tsv = 'tests/data/qc/conflicting_subtypes/pass.tsv'


def test_get_mixed_subtype_kmer_counts():
Expand All @@ -22,3 +27,21 @@ def test_get_mixed_subtype_kmer_counts():
assert(int(st_pos_kmers.get('2.1')) == 5)
assert(int(st_pos_kmers.get('2.2')) == 3)
assert(int(st_pos_kmers.get('0')) == 1)


def test_component_subtypes():
    """Check that component_subtypes yields every prefix level of a subtype, in order."""
    expected_by_subtype = (
        ('4.2.1.1', ['4', '4.2', '4.2.1', '4.2.1.1']),
        ('1', ['1']),
    )
    for subtype, expected in expected_by_subtype:
        assert list(component_subtypes(subtype)) == expected


def test_get_conflicting_kmers():
    """Check conflicting kmer detection against the pass/fail fixture tables.

    Both fixtures are queried for subtype '4.1.2' as FASTQ input. The pass
    table must produce no conflicting kmers; the fail table must produce
    exactly one: the negative kmer for component subtype '4.1' at
    refposition 62657, which shares its position with a positive kmer.
    """
    # Pin `subtype` to str so dotted labels are never inferred as floats,
    # which would make the string membership test miss them.
    read_opts = dict(sep='\t', dtype={'subtype': str})

    df_pass = pd.read_csv(pass_tsv, **read_opts)
    df_pass_result = get_conflicting_kmers('4.1.2', df_pass, True)
    assert df_pass_result is not None, 'Expected a DataFrame, got None'
    assert df_pass_result.shape[0] == 0, 'Must be no conflicting kmers'

    df_fail = pd.read_csv(fail_tsv, **read_opts)
    df_fail_result = get_conflicting_kmers('4.1.2', df_fail, True)
    assert df_fail_result is not None, 'Expected a DataFrame, got None'
    # Renumber rows from 0 without mutating the returned frame in place.
    df_fail_result = df_fail_result.reset_index(drop=True)
    assert df_fail_result.shape[0] == 1, 'Must be one conflicting kmer'
    assert df_fail_result.refposition[0] == 62657
    assert df_fail_result.subtype[0] == '4.1'
    assert not df_fail_result.is_pos_kmer[0]

0 comments on commit 6a9574a

Please sign in to comment.