Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#13, #14: Quality Checking Module & Technician's Output #17

Merged
merged 38 commits into from
Nov 30, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
3800a5c
1) Added new column for CONFIDENT_IS_SUBTYPE to determine if the subt…
mgopez Oct 16, 2017
dfd873b
Merge branch 'master' into new_output_feature_requests
mgopez Oct 16, 2017
a4e13a1
Changed the min tiles threshold to be >5% of expected tiles.
mgopez Oct 16, 2017
8c25ea6
Changed the way mixed subtypes and min tiles are checked and stored.
mgopez Oct 23, 2017
8e5ccf5
#13: Created new simple output file. Added new argument for the path …
mgopez Oct 24, 2017
7c4b2e3
Merge remote-tracking branch 'origin/new_output_feature_requests' int…
mgopez Oct 24, 2017
6768fb7
Adding checking for null input.
mgopez Oct 25, 2017
46c5194
Fixing test data.
mgopez Nov 4, 2017
1e00b97
Refactoring the quality check module.
mgopez Nov 6, 2017
21083f2
Changing warning messages, fixed dead code.
mgopez Nov 6, 2017
8dc2535
Refactoring structure of quality checking.
mgopez Nov 6, 2017
f6ff944
Re-re-factoring quality check functions, and added new method
mgopez Nov 7, 2017
d7a601b
Adding intuitive messages, fixing bug with no inconsistent subtypes b…
mgopez Nov 8, 2017
acaf53b
Minor text fixes
mgopez Nov 8, 2017
d2adee9
Adding return type.
mgopez Nov 8, 2017
2d79c64
Adding simplier method for checking if the subtype is there.
mgopez Nov 8, 2017
1aa20ce
Skeleton for re-factored QC methods.
mgopez Nov 13, 2017
76937f4
Skeleton for re-factored QC methods.
mgopez Nov 13, 2017
c76bfca
Adding genome coverage for check_missing_tiles.
mgopez Nov 14, 2017
7482582
Created Quality Check module as per Genevieve's requirements.
mgopez Nov 15, 2017
bdd55c5
Adding git ignore.
mgopez Nov 15, 2017
5d58710
Corrected quality checking methods.
mgopez Nov 16, 2017
9639236
text fixes
mgopez Nov 16, 2017
285f868
editing git file
mgopez Nov 16, 2017
e6af08d
Fixed git ignore.
mgopez Nov 16, 2017
2029087
Changing variable names, adding constants for error types.
mgopez Nov 16, 2017
94d616c
Adding Inconsistent Results Error 3B
mgopez Nov 17, 2017
05ed00d
Adding none checking.
mgopez Nov 17, 2017
3991c0b
WIP: Changing doc strings
mgopez Nov 20, 2017
0b40987
Removing record.txt
mgopez Nov 21, 2017
a80bf9b
Changed argument for simple summary to -S
mgopez Nov 21, 2017
9ca4353
Removing nesting within the quality checking methods.
mgopez Nov 21, 2017
43e1914
Change negating.
mgopez Nov 21, 2017
839320f
Added --force, to overwrite existing output files.
mgopez Nov 22, 2017
f7a50db
Added SubtypingParams.
mgopez Nov 22, 2017
671e3db
Fixing formatting, created new tests.
mgopez Nov 22, 2017
746989d
Added new arguments.
mgopez Nov 24, 2017
36a7542
Readding test files.
mgopez Nov 24, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,8 @@ ENV/
# Rope project settings
.ropeproject

# Output files
match_results.tab
results.tab
test.tab

39 changes: 39 additions & 0 deletions bio_hansel/const.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
# -*- coding: utf-8 -*-
from pkg_resources import resource_filename

from bio_hansel import program_name
from bio_hansel.subtyping_params import SubtypingParams

SCHEME_FASTAS = {'heidelberg': {'file': resource_filename(program_name, 'data/heidelberg/tiles.fasta'),
'version': '0.5.0',
'subtyping_params': SubtypingParams(low_coverage_depth_freq=20)},
'enteritidis': {'file': resource_filename(program_name, 'data/enteritidis/tiles.fasta'),
'version': '0.7.0',
'subtyping_params': SubtypingParams(low_coverage_depth_freq=50)}}


FASTA_COLUMNS_TO_REMOVE = '''
pident
Expand All @@ -17,3 +29,30 @@
coverage
is_trunc
'''.strip().split('\n')

# These are present within the subtype module.
SUBTYPE_SUMMARY_COLS = """
sample
scheme
scheme_version
subtype
all_subtypes
tiles_matching_subtype
are_subtypes_consistent
inconsistent_subtypes
n_tiles_matching_all
n_tiles_matching_all_expected
n_tiles_matching_positive
n_tiles_matching_positive_expected
n_tiles_matching_subtype
n_tiles_matching_subtype_expected
file_path
qc_status
qc_message""".strip().split('\n')

SIMPLE_SUMMARY_COLS = """
sample
subtype
qc_status
qc_message
""".strip().split('\n')
38 changes: 32 additions & 6 deletions bio_hansel/kmer_count/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# -*- coding: utf-8 -*-
import re
import shutil
from typing import Tuple, Optional

import attr
import os
from datetime import datetime
import logging
import pandas as pd

from ..utils import exc_exists, run_command, find_inconsistent_subtypes, SCHEME_FASTAS
from ..utils import exc_exists, run_command, find_inconsistent_subtypes
from bio_hansel.const import SCHEME_FASTAS
from ..blast_wrapper.helpers import parse_fasta, revcomp
from ..subtype import Subtype
from ..subtype_stats import subtype_counts
Expand All @@ -31,6 +34,7 @@ class Jellyfisher(object):
jf_file = attr.ib(default=None)
jf_query_tiles_file = attr.ib(default=None)
df_results = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(pd.DataFrame)))
subtype = attr.ib(default=None)

@scheme_subtype_counts.default
def _default_scheme_subtype_counts(self):
Expand Down Expand Up @@ -206,7 +210,7 @@ def parse_query(self):
df['tilename'] = tiles
df['is_pos_tile'] = [not x.startswith('negative') for x in tiles]
df['subtype'] = [y for x, y in df.tilename.str.split('-')]
df['refposition'] = [x for x, y in df.tilename.str.split('-')]
df['refposition'] = [int(x.replace('negative', '')) for x, y in df.tilename.str.split('-')]
df['is_kmer_freq_okay'] = (df.freq >= self.min_kmer_freq) & (df.freq <= self.max_kmer_freq)
logging.info('n=%s k-mers with freq within thresholds of %s and %s',
df.is_kmer_freq_okay.sum(),
Expand All @@ -219,18 +223,21 @@ def parse_query(self):
self.df_results = df
return df

def summary(self):
def summary(self) -> Tuple[Subtype, Optional[pd.DataFrame]]:
if self.df_results is None:
self.parse_query()
df = self.df_results
st = Subtype(sample=self.genome_name, file_path=self._reads_to_str(), scheme=self.scheme, scheme_version=self.scheme_version)
st = Subtype(sample=self.genome_name, file_path=self._reads_to_str(), scheme=self.scheme,
scheme_version=self.scheme_version)
st.scheme_subtype_counts = self.scheme_subtype_counts
self.subtype = st
if df is None or df.shape[0] == 0:
logging.warning('No "%s" subtyping scheme tile matches for "%s"', self.scheme, self.reads)
st.are_subtypes_consistent = False
return st, None
dfgood = df[df.is_kmer_freq_okay]
dfpos = dfgood[dfgood.is_pos_tile]
dfneg = dfgood[~dfgood.is_pos_tile]
logging.debug('dfpos: %s', dfpos)
subtype_lens = dfpos.subtype.apply(len)
max_subtype_strlen = subtype_lens.max()
Expand All @@ -244,20 +251,37 @@ def summary(self):
logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes)
st.n_tiles_matching_all = dfgood.shape[0]
st.n_tiles_matching_positive = dfpos.shape[0]
st.n_tiles_matching_negative = dfneg.shape[0]
st.n_tiles_matching_subtype = dfpos_highest_res.shape[0]
pos_subtypes_str = [x for x in dfpos.subtype.unique()]
pos_subtypes_str.sort(key=lambda x: len(x))
st.all_subtypes = '; '.join(pos_subtypes_str)
subtype_list = [x for x in dfpos_highest_res.subtype.unique()]
st.subtype = '; '.join(subtype_list)
st.n_tiles_matching_all_expected = ';'.join([str(self.scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
st.n_tiles_matching_all_expected = ';'.join(
[str(self.scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
st.n_tiles_matching_positive_expected = ';'.join(
[str(self.scheme_subtype_counts[x].positive_tile_count) for x in subtype_list])
st.n_tiles_matching_subtype_expected = ';'.join([str(self.scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
st.n_tiles_matching_negative_expected = ';'.join(
[str(self.scheme_subtype_counts[x].negative_tile_count) for x in subtype_list])
st.n_tiles_matching_subtype_expected = ';'.join(
[str(self.scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename])

possible_downstream_subtypes = [s for s in self.scheme_subtype_counts
if re.search("^({})(\.)(\d)$".format(re.escape(st.subtype)), s)]
non_present_subtypes = []
if possible_downstream_subtypes:
for subtype in possible_downstream_subtypes:
if subtype not in df['subtype']:
non_present_subtypes.append(subtype)

st.non_present_subtypes = non_present_subtypes

if len(inconsistent_subtypes) > 0:
st.are_subtypes_consistent = False
st.inconsistent_subtypes = inconsistent_subtypes

logging.info(st)
return st, df

Expand All @@ -270,3 +294,5 @@ def __enter__(self):

def __exit__(self, exc_type, exc_val, exc_tb):
self.cleanup()


66 changes: 54 additions & 12 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
import pandas as pd
from collections import defaultdict

from . import program_name, program_desc, __version__
from .subtyper import subtype_fasta, SUBTYPE_SUMMARY_COLS, subtype_reads
from .subtype_stats import subtype_counts
from .utils import genome_name_from_fasta_path, get_scheme_fasta
from bio_hansel import program_name, program_desc, __version__
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please change all imports to from . import ... format.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems when I leave it at .. instead of bio_hansel. PyCharm won't let me run the program. Perhaps this is an error with my setup?

from bio_hansel.const import SUBTYPE_SUMMARY_COLS
from bio_hansel.subtyper import subtype_fasta, subtype_reads
from bio_hansel.subtype_stats import subtype_counts
from bio_hansel.subtyping_params import SubtypingParams
from bio_hansel.utils import genome_name_from_fasta_path, get_scheme_fasta, out_files_exists, get_scheme_params

SCRIPT_NAME = 'hansel'
LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
Expand Down Expand Up @@ -59,14 +61,31 @@ def init_parser():
help='Subtyping summary output path (tab-delimited)')
parser.add_argument('-O', '--output-tile-results',
help='Subtyping tile matching output path (tab-delimited)')
parser.add_argument('-S', '--output-simple-summary',
help='Subtyping simple summary output path')
parser.add_argument('--force',
action='store_true',
help='Force existing output files to be overwritten')
parser.add_argument('--min-kmer-freq',
type=int,
default=10,
help='Min k-mer freq/coverage')
parser.add_argument('--max-kmer-freq',
type=int,
default=200,
help='Max k-mer freq/coverage')
# Changes
parser.add_argument('--low-cov-depth-freq',
type=int,
help='Frequencies below this coverage are considered low coverage')
parser.add_argument('--max-missing-tiles',
type=float,
help='Decimal proportion of maximum allowable missing tiles before being considered an error. (0.0 - 1.0)')
parser.add_argument('--min-ambiguous-tiles',
type=int,
help='Minimum number of missing tiles to be considered an ambiguous result')
parser.add_argument('--max-intermediate-tiles',
type=float,
help='Decimal proportion of maximum allowable missing tiles to be considered an intermediate subtype. (0.0 - 1.0)')
# Changes
parser.add_argument('-t', '--threads',
type=int,
default=1,
Expand Down Expand Up @@ -97,14 +116,31 @@ def main():
init_console_logger(args.verbose)
output_summary_path = args.output_summary
output_tile_results = args.output_tile_results

output_simple_summary_path = args.output_simple_summary
out_files_exists(output_simple_summary_path, args.force)
out_files_exists(output_summary_path, args.force)
out_files_exists(output_tile_results, args.force)
scheme = args.scheme # type: str
scheme_name = args.scheme_name # type: Optional[str]
scheme_fasta = get_scheme_fasta(scheme)
scheme_subtype_counts = subtype_counts(scheme_fasta)
input_genomes = []
reads = []
logging.debug(args)

subtyping_params = get_scheme_params(scheme)
if not subtyping_params:
subtyping_params = SubtypingParams()

if args.low_cov_depth_freq:
subtyping_params.low_coverage_depth_freq = args.low_cov_depth_freq
if args.max_missing_tiles:
subtyping_params.max_perc_missing_tiles = args.max_missing_tiles
if args.min_ambiguous_tiles:
subtyping_params.min_ambiguous_tiles = args.min_ambiguous_tiles
if args.max_intermediate_tiles:
subtyping_params.max_perc_intermediate_tiles = args.max_intermediate_tiles

if args.files:
fastas = [x for x in args.files if re.match(r'^.+\.(fasta|fa|fna)$', x)]
fastqs = [x for x in args.files if re.match(r'^.+\.(fastq|fq)$', x)]
Expand Down Expand Up @@ -158,7 +194,8 @@ def main():
if input_genomes:
if n_threads == 1:
logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes))
outputs = [subtype_fasta(scheme,
outputs = [subtype_fasta(subtyping_params,
scheme,
input_fasta,
genome_name,
tmp_dir=tmp_dir,
Expand All @@ -170,7 +207,8 @@ def main():
logging.info('Initializing thread pool with %s threads', n_threads)
pool = Pool(processes=n_threads)
logging.info('Running analysis asynchronously on %s input genomes', len(input_genomes))
res = [pool.apply_async(subtype_fasta, (scheme,
res = [pool.apply_async(subtype_fasta, (subtyping_params,
scheme,
input_fasta,
genome_name,
tmp_dir,
Expand All @@ -187,13 +225,12 @@ def main():
subtype_results.append(attr.asdict(subtype))

if reads:
outputs = [subtype_reads(scheme=scheme,
outputs = [subtype_reads(subtyping_params,
scheme=scheme,
reads=r,
genome_name=genome_name,
tmp_dir=tmp_dir,
threads=n_threads,
min_kmer_freq=args.min_kmer_freq,
max_kmer_freq=args.max_kmer_freq,
scheme_name=scheme_name,
scheme_subtype_counts=scheme_subtype_counts)
for r, genome_name in reads]
Expand All @@ -207,6 +244,8 @@ def main():
dfsummary = pd.DataFrame(subtype_results)
dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]

df_simple_summary = dfsummary[['sample', 'subtype', 'qc_status', 'qc_message']]

if output_summary_path:
dfsummary.to_csv(output_summary_path, sep='\t', index=None)
logging.info('Wrote subtyping output summary to %s', output_summary_path)
Expand All @@ -216,6 +255,9 @@ def main():
if output_tile_results:
dfall.to_csv(output_tile_results, sep='\t', index=None)

if output_simple_summary_path:
df_simple_summary.to_csv(output_simple_summary_path, sep='\t', index=None)


def collect_fasta_from_dir(input_directory):
input_genomes = []
Expand Down
61 changes: 61 additions & 0 deletions bio_hansel/quality_check/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import List, Callable, Tuple

from pandas import DataFrame

from ..subtyping_params import SubtypingParams
from ..quality_check.quality_check_functions import check_missing_tiles, does_subtype_result_exist, \
check_mixed_subtype, check_intermediate_subtype, is_missing_target_sites, is_missing_downstream_targets
from ..quality_check.const import FAIL_MESSAGE, WARNING_MESSAGE
from ..subtype import Subtype
import logging


QC_FUNCS = \
[
check_missing_tiles,
check_mixed_subtype,
is_missing_target_sites,
is_missing_downstream_targets,
check_intermediate_subtype,
] # type: List[Callable[[Subtype, DataFrame, SubtypingParams], Tuple[str, str]]]


def perform_quality_check(st: Subtype, df: DataFrame, subtyping_params: SubtypingParams):
""" Driver method to call all quality checking functions and handle their responses.
Note:
This is the driver method for the quality check module. Every method within the QC_FUNCS list will be run
with parameters ( SUBTYPE, DATAFRAME ). If a quality check module returns something other than None, then
an Error, or Warning has occured.

Args:
:param st: Subtyping results.
:param df: DataFrame containing subtyping results.

Returns:
None, modifies the subtype with the result.
"""
logging.debug("Performing Quality Checking")
overall_qc_status = 'PASS'
messages = []

if does_subtype_result_exist(st) is False:
logging.warning("QC: Quality checking not run, subtype result did not exist.")
st.qc_status = 'FAIL'
st.qc_message = 'FAIL: Subtype does not exist, quality checking was not run.'
return None

for func in QC_FUNCS:
# Calls run_method to check that the qc function takes a Subtype, returns Tuple[Optional[str], Optional[str]]
status, message = func(st, df, subtyping_params)
if status is None:
# If quality check function passes, move on to the next.
continue
messages.append('{}: {}'.format(status, message))
if status is FAIL_MESSAGE:
overall_qc_status = FAIL_MESSAGE
elif overall_qc_status != FAIL_MESSAGE and status == WARNING_MESSAGE:
overall_qc_status = WARNING_MESSAGE

st.qc_status = overall_qc_status
st.qc_message = ' | '.join(messages)
logging.debug("QC: Finished!")
8 changes: 8 additions & 0 deletions bio_hansel/quality_check/const.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FAIL_MESSAGE = "FAIL"
WARNING_MESSAGE = "WARNING"
# Errors for Hansel
MISSING_TILES_ERROR_1 = "Missing Tiles Error 1"
MIXED_SAMPLE_ERROR_2 = "Mixed Sample Error 2"
AMBIGUOUS_RESULTS_ERROR_3 = "Ambiguous Results Error 3"
NON_CONFIDENT_RESULTS_ERROR_4 = "Non Confident Results Error 4"
INTERMEDIATE_SUBTYPE_WARNING = "Intermediate Subtype Warning"
Loading