Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #98 QC check for mixed subtypes #100

Merged
merged 2 commits into from
Jun 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bio_hansel/qc/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str
"""
if not st.are_subtypes_consistent:
return QC.FAIL, f'Mixed subtypes found: "{"; ".join(sorted(st.inconsistent_subtypes))}".'
conflicting_kmers = get_conflicting_kmers(st, df)
conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input())
if conflicting_kmers is None or conflicting_kmers.shape[0] == 0:
return None, None

Expand Down Expand Up @@ -250,7 +250,7 @@ def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingPar

total_subtype_kmers = int(st.n_kmers_matching_subtype_expected)
total_subtype_kmers_hits = int(st.n_kmers_matching_subtype)
conflicting_kmers = get_conflicting_kmers(st, df)
conflicting_kmers = get_conflicting_kmers(st.subtype, df, st.is_fastq_input())
num_pos_kmers, num_neg_kmers = get_num_pos_neg_kmers(st, df)
obs = int(st.n_kmers_matching_all)
exp = int(st.n_kmers_matching_all_expected)
Expand Down
25 changes: 19 additions & 6 deletions bio_hansel/qc/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
from typing import Tuple, Optional, List, Any, Dict
from typing import Tuple, Optional, List, Any, Dict, Iterable

from pandas import DataFrame

from ..subtype import Subtype


def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]:
def component_subtypes(subtype: str) -> Iterable[str]:
    """Generate the hierarchical component subtypes of a subtype.

    The subtype string is split on '.', and every leading run of the
    resulting parts is re-joined with '.', shortest first, so the last
    value yielded is `subtype` itself.

    Args:
        subtype: Subtype string, e.g. "4.2.1.1"
    Yields:
        Component subtypes (e.g. for subtype "4.2.1.1", will yield
        '4', '4.2', '4.2.1', '4.2.1.1' in that order)
    """
    parts = subtype.split('.')
    # Walk over prefix lengths directly; the individual parts are never
    # needed on their own, only the joined prefixes.
    for depth in range(1, len(parts) + 1):
        yield '.'.join(parts[:depth])


def get_conflicting_kmers(subtype: str, df: DataFrame, is_fastq_input: bool = True) -> Optional[DataFrame]:
""" Get positive and negative kmers that both are present for a subtype.

Find positive and negative kmers for the same refposition/target site in the results `df`.
Expand All @@ -17,10 +31,9 @@ def get_conflicting_kmers(st: Subtype, df: DataFrame) -> Optional[DataFrame]:
Returns:
DataFrame of conflicting positive and negative kmers
"""
if st.is_fastq_input():
dfst = df[(df['subtype'] == str(st.subtype)) & (df['is_kmer_freq_okay'])]
else: # fasta files
dfst = df[(df['subtype'] == str(st.subtype))]
dfst = df[(df['subtype'].isin(list(component_subtypes(subtype))))]
if is_fastq_input:
dfst = dfst[dfst['is_kmer_freq_okay']]

pos_kmer_positions = dfst[dfst['is_pos_kmer']]['refposition']
neg_kmers = dfst[~dfst['is_pos_kmer']]
Expand Down
64 changes: 64 additions & 0 deletions tests/data/qc/conflicting_subtypes/fail.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
refposition subtype is_pos_kmer is_kmer_freq_okay
931123 4 True True
62657 4.1 True True
891756 4.1.2 True True
3216553 1.1.1.1 False True
3479545 1.2.1 False True
3470377 1.2.2 False True
3466426 4.6 False True
3388166 4.3.2.1 False True
3273107 3 False True
3021283 1.1.1 False True
3722702 3.1.2 False True
2875883 4.6.2.2 False True
2874344 3.1.2.2 False True
2831482 8.1 False True
2694560 4.4.1.2 False True
2622402 1.1.2 False True
3570528 4.6.2.1 False True
62657 4.1 False True
2411730 4.2 False True
3836274 2.2.1.2 False True
3836739 4.8 False True
3977226 4.3.4 False True
4125058 4.6.2 False True
4151558 4.4.1 False True
4229087 4.1.1.3 False True
4246508 4.4.2 False True
4248115 2.2.1.1 False True
4249732 4.7 False True
4260268 4.6.1 False True
4307886 4.4 False True
4316114 4.3.2 False True
4398141 4.3.4.1 False True
2505085 2.2 False True
1881090 2.1 False True
1882180 8 False True
874787 4.6.1.1 False True
107794 4.1.2.1 False True
346693 2.2.2 False True
355181 4.4.1.1 False True
403364 4.3.3 False True
497491 2 False True
514245 4.1.1 False True
541048 4.1.1.2 False True
615614 4.3.1 False True
615938 1 False True
764995 4.3 False True
783601 4.2.1 False True
797736 2.2.1 False True
1084911 3.1.1 False True
1850119 4.1.1.1 False True
1132368 4.3.4.2 False True
1137518 7 False True
1237818 3.1.2.1 False True
1455780 4.2.2.1 False True
1487796 4.2.2 False True
1491275 1.1.3 False True
1501468 4.6.1.2 False True
1502120 4.3.4.2.1 False True
1719757 4.5 False True
1759252 4.9 False True
1799921 5 False True
1816587 8.2 False True
4404247 1.1 False True
63 changes: 63 additions & 0 deletions tests/data/qc/conflicting_subtypes/pass.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
refposition subtype is_pos_kmer is_kmer_freq_okay
931123 4 True True
62657 4.1 True True
891756 4.1.2 True True
3216553 1.1.1.1 False True
3479545 1.2.1 False True
3470377 1.2.2 False True
3466426 4.6 False True
3388166 4.3.2.1 False True
3273107 3 False True
3021283 1.1.1 False True
3722702 3.1.2 False True
2875883 4.6.2.2 False True
2874344 3.1.2.2 False True
2831482 8.1 False True
2694560 4.4.1.2 False True
2622402 1.1.2 False True
3570528 4.6.2.1 False True
2411730 4.2 False True
3836274 2.2.1.2 False True
3836739 4.8 False True
3977226 4.3.4 False True
4125058 4.6.2 False True
4151558 4.4.1 False True
4229087 4.1.1.3 False True
4246508 4.4.2 False True
4248115 2.2.1.1 False True
4249732 4.7 False True
4260268 4.6.1 False True
4307886 4.4 False True
4316114 4.3.2 False True
4398141 4.3.4.1 False True
2505085 2.2 False True
1881090 2.1 False True
1882180 8 False True
874787 4.6.1.1 False True
107794 4.1.2.1 False True
346693 2.2.2 False True
355181 4.4.1.1 False True
403364 4.3.3 False True
497491 2 False True
514245 4.1.1 False True
541048 4.1.1.2 False True
615614 4.3.1 False True
615938 1 False True
764995 4.3 False True
783601 4.2.1 False True
797736 2.2.1 False True
1084911 3.1.1 False True
1850119 4.1.1.1 False True
1132368 4.3.4.2 False True
1137518 7 False True
1237818 3.1.2.1 False True
1455780 4.2.2.1 False True
1487796 4.2.2 False True
1491275 1.1.3 False True
1501468 4.6.1.2 False True
1502120 4.3.4.2.1 False True
1719757 4.5 False True
1759252 4.9 False True
1799921 5 False True
1816587 8.2 False True
4404247 1.1 False True
25 changes: 24 additions & 1 deletion tests/test_qc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

import pandas as pd

from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts

from bio_hansel.qc.utils import get_mixed_subtype_kmer_counts, component_subtypes, get_conflicting_kmers


fail_tsv = 'tests/data/qc/conflicting_subtypes/fail.tsv'
pass_tsv = 'tests/data/qc/conflicting_subtypes/pass.tsv'


def test_get_mixed_subtype_kmer_counts():
Expand All @@ -22,3 +27,21 @@ def test_get_mixed_subtype_kmer_counts():
assert(int(st_pos_kmers.get('2.1')) == 5)
assert(int(st_pos_kmers.get('2.2')) == 3)
assert(int(st_pos_kmers.get('0')) == 1)


def test_component_subtypes():
    """component_subtypes must expand a subtype into all of its dotted prefixes."""
    # (input subtype, expected expansion) pairs, checked in order.
    cases = (
        ('4.2.1.1', ['4', '4.2', '4.2.1', '4.2.1.1']),
        ('1', ['1']),
    )
    for subtype, expected in cases:
        assert list(component_subtypes(subtype)) == expected


def test_get_conflicting_kmers():
    """Check get_conflicting_kmers for subtype '4.1.2' against the pass and fail fixtures."""
    # Pass fixture: refposition 62657 only carries the positive 4.1 kmer.
    no_conflicts = get_conflicting_kmers('4.1.2', pd.read_csv(pass_tsv, sep='\t'), True)
    assert no_conflicts.shape[0] == 0, 'Must be no conflicting kmers'

    # Fail fixture: refposition 62657 carries both a positive and a negative
    # 4.1 kmer, so exactly that negative kmer row is expected back.
    conflicts = get_conflicting_kmers('4.1.2', pd.read_csv(fail_tsv, sep='\t'), True)
    conflicts = conflicts.reset_index()
    assert conflicts.shape[0] == 1, 'Must be one conflicting kmer'
    assert conflicts.refposition[0] == 62657
    assert conflicts.subtype[0] == '4.1'
    assert not conflicts.is_pos_kmer[0]