Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#13, #14: Quality Checking Module & Technician's Output #17

Merged
merged 38 commits into from
Nov 30, 2017
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
3800a5c
1) Added new column for CONFIDENT_IS_SUBTYPE to determine if the subt…
mgopez Oct 16, 2017
dfd873b
Merge branch 'master' into new_output_feature_requests
mgopez Oct 16, 2017
a4e13a1
Changed the min tiles threshold to be >5% of expected tiles.
mgopez Oct 16, 2017
8c25ea6
Changed the way mixed subtypes and min tiles are checked and stored.
mgopez Oct 23, 2017
8e5ccf5
#13: Created new simple output file. Added new argument for the path …
mgopez Oct 24, 2017
7c4b2e3
Merge remote-tracking branch 'origin/new_output_feature_requests' int…
mgopez Oct 24, 2017
6768fb7
Adding checking for null input.
mgopez Oct 25, 2017
46c5194
Fixing test data.
mgopez Nov 4, 2017
1e00b97
Refactoring the quality check module.
mgopez Nov 6, 2017
21083f2
Changing warning messages, fixed dead code.
mgopez Nov 6, 2017
8dc2535
Refactoring structure of quality checking.
mgopez Nov 6, 2017
f6ff944
Re-re-factoring quality check functions, and added new method
mgopez Nov 7, 2017
d7a601b
Adding intuitive messages, fixing bug with no inconsistent subtypes b…
mgopez Nov 8, 2017
acaf53b
Minor text fixes
mgopez Nov 8, 2017
d2adee9
Adding return type.
mgopez Nov 8, 2017
2d79c64
Adding simplier method for checking if the subtype is there.
mgopez Nov 8, 2017
1aa20ce
Skeleton for re-factored QC methods.
mgopez Nov 13, 2017
76937f4
Skeleton for re-factored QC methods.
mgopez Nov 13, 2017
c76bfca
Adding genome coverage for check_missing_tiles.
mgopez Nov 14, 2017
7482582
Created Quality Check module as per Genevieve's requirements.
mgopez Nov 15, 2017
bdd55c5
Adding git ignore.
mgopez Nov 15, 2017
5d58710
Corrected quality checking methods.
mgopez Nov 16, 2017
9639236
text fixes
mgopez Nov 16, 2017
285f868
editing git file
mgopez Nov 16, 2017
e6af08d
Fixed git ignore.
mgopez Nov 16, 2017
2029087
Changing variable names, adding constants for error types.
mgopez Nov 16, 2017
94d616c
Adding Inconsistent Results Error 3B
mgopez Nov 17, 2017
05ed00d
Adding none checking.
mgopez Nov 17, 2017
3991c0b
WIP: Changing doc strings
mgopez Nov 20, 2017
0b40987
Removing record.txt
mgopez Nov 21, 2017
a80bf9b
Changed argument for simple summary to -S
mgopez Nov 21, 2017
9ca4353
Removing nesting within the quality checking methods.
mgopez Nov 21, 2017
43e1914
Change negating.
mgopez Nov 21, 2017
839320f
Added --force, to overwrite existing output files.
mgopez Nov 22, 2017
f7a50db
Added SubtypingParams.
mgopez Nov 22, 2017
671e3db
Fixing formatting, created new tests.
mgopez Nov 22, 2017
746989d
Added new arguments.
mgopez Nov 24, 2017
36a7542
Readding test files.
mgopez Nov 24, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,14 @@ ENV/
# Rope project settings
.ropeproject

# Output files
match_results.tab
results.tab
test.tab


tests/data/Retro1000data/

tests/data/SRR1696752/

tests/data/SRR3392166/
27 changes: 27 additions & 0 deletions bio_hansel/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,30 @@
coverage
is_trunc
'''.strip().split('\n')

# These are present within the subtype module.
SUBTYPE_SUMMARY_COLS = """
sample
scheme
scheme_version
subtype
all_subtypes
tiles_matching_subtype
are_subtypes_consistent
inconsistent_subtypes
n_tiles_matching_all
n_tiles_matching_all_expected
n_tiles_matching_positive
n_tiles_matching_positive_expected
n_tiles_matching_subtype
n_tiles_matching_subtype_expected
file_path
qc_status
qc_message""".strip().split('\n')

SIMPLE_SUMMARY_COLS = """
sample
subtype
qc_status
qc_message
""".strip().split('\n')
9 changes: 9 additions & 0 deletions bio_hansel/kmer_count/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import logging
import pandas as pd

from ..quality_check import perform_quality_check
from ..utils import exc_exists, run_command, find_inconsistent_subtypes, SCHEME_FASTAS
from ..blast_wrapper.helpers import parse_fasta, revcomp
from ..subtype import Subtype
Expand Down Expand Up @@ -231,6 +232,7 @@ def summary(self):
return st, None
dfgood = df[df.is_kmer_freq_okay]
dfpos = dfgood[dfgood.is_pos_tile]
dfneg = dfgood[~dfgood.is_pos_tile]
logging.debug('dfpos: %s', dfpos)
subtype_lens = dfpos.subtype.apply(len)
max_subtype_strlen = subtype_lens.max()
Expand All @@ -244,6 +246,7 @@ def summary(self):
logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes)
st.n_tiles_matching_all = dfgood.shape[0]
st.n_tiles_matching_positive = dfpos.shape[0]
st.n_tiles_matching_negative = dfneg.shape[0]
st.n_tiles_matching_subtype = dfpos_highest_res.shape[0]
pos_subtypes_str = [x for x in dfpos.subtype.unique()]
pos_subtypes_str.sort(key=lambda x: len(x))
Expand All @@ -253,11 +256,17 @@ def summary(self):
st.n_tiles_matching_all_expected = ';'.join([str(self.scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
st.n_tiles_matching_positive_expected = ';'.join(
[str(self.scheme_subtype_counts[x].positive_tile_count) for x in subtype_list])
st.n_tiles_matching_negative_expected = ';'.join(
[str(self.scheme_subtype_counts[x].negative_tile_count) for x in subtype_list])
st.n_tiles_matching_subtype_expected = ';'.join([str(self.scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename])
st.possible_downstream_subtypes = [s for s in self.scheme_subtype_counts
if s.startswith(tuple(subtype_list)) and s not in subtype_list]

if len(inconsistent_subtypes) > 0:
st.are_subtypes_consistent = False
st.inconsistent_subtypes = inconsistent_subtypes

logging.info(st)
return st, df

Expand Down
61 changes: 53 additions & 8 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
import pandas as pd
from collections import defaultdict

from . import program_name, program_desc, __version__
from .subtyper import subtype_fasta, SUBTYPE_SUMMARY_COLS, subtype_reads
from .subtype_stats import subtype_counts
from .utils import genome_name_from_fasta_path, get_scheme_fasta
from bio_hansel import program_name, program_desc, __version__
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please change all imports to from . import ... format.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems when I leave it at .. instead of bio_hansel. PyCharm won't let me run the program. Perhaps this is an error with my setup?

from bio_hansel.const import SUBTYPE_SUMMARY_COLS
from bio_hansel.subtyper import subtype_fasta, subtype_reads
from bio_hansel.subtype_stats import subtype_counts
from bio_hansel.subtyping_params import SubtypingParams
from bio_hansel.utils import genome_name_from_fasta_path, get_scheme_fasta, out_files_exists

SCRIPT_NAME = 'hansel'
LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
Expand Down Expand Up @@ -59,6 +61,11 @@ def init_parser():
help='Subtyping summary output path (tab-delimited)')
parser.add_argument('-O', '--output-tile-results',
help='Subtyping tile matching output path (tab-delimited)')
parser.add_argument('-S', '--output-simple-summary',
help='Subtyping simple summary output path')
parser.add_argument('--force',
action='store_true',
help='Force existing output files to be overwritten')
parser.add_argument('--min-kmer-freq',
type=int,
default=10,
Expand All @@ -67,6 +74,24 @@ def init_parser():
type=int,
default=200,
help='Max k-mer freq/coverage')
# Changes
parser.add_argument('--low-cov-depth-freq',
type=int,
default=20,
help='Frequencies below this coverage are considered low coverage')
parser.add_argument('--missing-total-tiles-max',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's try to have names of things read like regular english statements and sentences, e.g. --max-missing-tiles

Is this the threshold for the number of missing positive or negative tiles to consider the result to be untrustworthy (i.e. qc_status == FAIL)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the maximum percentage of tiles allowed missing before the result is considered an error.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like I should change the type from int to float if it's a percentage.

type=int,
default=0.05,
help='Value in percentage, the maximum amount of total allowed missing tiles before being considered an error.')
parser.add_argument('--inc-tiles-max',
type=int,
default=3,
help='Minimum number of missing tiles to be considered an inconsistent result')
parser.add_argument('--int-subtype-tiles-max',
type=int,
default=0.05,
help='Value in percentage, the maximum amount of missing tiles to be tolerated to consider the result as an intermediate subtype.')
# Changes
parser.add_argument('-t', '--threads',
type=int,
default=1,
Expand Down Expand Up @@ -97,14 +122,26 @@ def main():
init_console_logger(args.verbose)
output_summary_path = args.output_summary
output_tile_results = args.output_tile_results

output_simple_summary_path = args.output_simple_summary
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right about here we should add some checking to see if there are already files at each of those paths if the user has supplied those output paths and exit with non-zero status code while alerting user that a file already exists at one of these file paths (we don't want users overwriting their important files). We could add a --force opt to allow users to overwrite any files if they know what they are doing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I didn't think about that. I'll implement a warning.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I've added a check to see if output files already exist, and an optional argument --force to overwrite previous output files.

output_force = args.force
scheme = args.scheme # type: str
scheme_name = args.scheme_name # type: Optional[str]
scheme_fasta = get_scheme_fasta(scheme)
scheme_subtype_counts = subtype_counts(scheme_fasta)
input_genomes = []
reads = []
logging.debug(args)
subtyping_params = SubtypingParams(low_coverage_depth_freq=args.low_cov_depth_freq,
missing_total_tiles_max=args.missing_total_tiles_max,
inconsistent_tiles_max=args.inc_tiles_max,
intermediate_subtype_tiles_max=args.int_subtype_tiles_max)

if not output_force:
if out_files_exists(output_summary_path, output_tile_results, output_simple_summary_path):
return 0
else:
logging.info("Previous output files will be over written with --force")

if args.files:
fastas = [x for x in args.files if re.match(r'^.+\.(fasta|fa|fna)$', x)]
fastqs = [x for x in args.files if re.match(r'^.+\.(fastq|fq)$', x)]
Expand Down Expand Up @@ -158,7 +195,8 @@ def main():
if input_genomes:
if n_threads == 1:
logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes))
outputs = [subtype_fasta(scheme,
outputs = [subtype_fasta(subtyping_params,
scheme,
input_fasta,
genome_name,
tmp_dir=tmp_dir,
Expand All @@ -170,7 +208,8 @@ def main():
logging.info('Initializing thread pool with %s threads', n_threads)
pool = Pool(processes=n_threads)
logging.info('Running analysis asynchronously on %s input genomes', len(input_genomes))
res = [pool.apply_async(subtype_fasta, (scheme,
res = [pool.apply_async(subtype_fasta, (subtyping_params,
scheme,
input_fasta,
genome_name,
tmp_dir,
Expand All @@ -187,7 +226,8 @@ def main():
subtype_results.append(attr.asdict(subtype))

if reads:
outputs = [subtype_reads(scheme=scheme,
outputs = [subtype_reads(subtyping_params,
scheme=scheme,
reads=r,
genome_name=genome_name,
tmp_dir=tmp_dir,
Expand All @@ -207,6 +247,8 @@ def main():
dfsummary = pd.DataFrame(subtype_results)
dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]

df_simple_summary = dfsummary[['sample', 'subtype', 'qc_status', 'qc_message']]

if output_summary_path:
dfsummary.to_csv(output_summary_path, sep='\t', index=None)
logging.info('Wrote subtyping output summary to %s', output_summary_path)
Expand All @@ -216,6 +258,9 @@ def main():
if output_tile_results:
dfall.to_csv(output_tile_results, sep='\t', index=None)

if output_simple_summary_path:
df_simple_summary.to_csv(output_simple_summary_path, sep='\t', index=None)


def collect_fasta_from_dir(input_directory):
input_genomes = []
Expand Down
60 changes: 60 additions & 0 deletions bio_hansel/quality_check/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import List, Callable, Tuple

from pandas import DataFrame

from ..subtyping_params import SubtypingParams
from ..quality_check.quality_check_functions import check_missing_tiles, does_subtype_result_exist, \
check_mixed_subtype, check_intermediate_subtype, check_inconsistent_results
from ..quality_check.const import FAIL_MESSAGE, WARNING_MESSAGE
from ..subtype import Subtype
import logging


QC_FUNCS: List[Callable[[Subtype, DataFrame, SubtypingParams], Tuple[str, str]]] = \
[
check_missing_tiles,
check_mixed_subtype,
check_inconsistent_results,
check_intermediate_subtype
]


def perform_quality_check(st: Subtype, df: DataFrame, subtyping_params: SubtypingParams):
""" Driver method to call all quality checking functions and handle their responses.
Note:
This is the driver method for the quality check module. Every method within the QC_FUNCS list will be run
with parameters ( SUBTYPE, DATAFRAME ). If a quality check module returns something other than None, then
an Error, or Warning has occured.

Args:
:param st: Subtyping results.
:param df: DataFrame containing subtyping results.

Returns:
None, modifies the subtype with the result.
"""
logging.debug("Performing Quality Checking")
overall_qc_status = 'PASS'
messages = []

if does_subtype_result_exist(st) is False:
logging.warning("QC: Quality checking not run, subtype result did not exist.")
st.qc_status = 'FAIL'
st.qc_message = 'FAIL: Subtype does not exist, quality checking was not run.'
return None

for func in QC_FUNCS:
# Calls run_method to check that the qc function takes a Subtype, returns Tuple[Optional[str], Optional[str]]
status, message = func(st, df, subtyping_params)
if status is None:
# If quality check function passes, move on to the next.
continue
messages.append('{}: {}'.format(status, message))
if status is FAIL_MESSAGE:
overall_qc_status = FAIL_MESSAGE
elif overall_qc_status != FAIL_MESSAGE and status == WARNING_MESSAGE:
overall_qc_status = WARNING_MESSAGE

st.qc_status = overall_qc_status
st.qc_message = ' | '.join(messages)
logging.debug("QC: Finished!")
9 changes: 9 additions & 0 deletions bio_hansel/quality_check/const.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
FAIL_MESSAGE = "FAIL"
WARNING_MESSAGE = "WARNING"
# Errors for Hansel
MISSING_TILES_ERROR_1A = "Missing Tiles Error 1A"
MISSING_TILES_ERROR_1B = "Missing Tiles Error 1B"
MIXED_SAMPLE_ERROR_2A = "Mixed Sample Error 2A"
INCONSISTENT_RESULTS_ERROR_3A = "Inconsistent Results Error 3A"
INCONSISTENT_RESULTS_ERROR_3B = "Inconsistent Results Error 3B"
INTERMEDIATE_SUBTYPE_WARNING = "Intermediate Subtype Warning"
81 changes: 81 additions & 0 deletions bio_hansel/quality_check/qc_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from typing import Tuple
from ..subtype import Subtype
from pandas import DataFrame, to_numeric, Series


def get_conflicting_tiles(st: Subtype, df: DataFrame) -> list:
""" This method gets positive and negative tiles that both are present for a subtype.
Note:
The purpose of this method is to find positive and negative tiles for the same refposition in the DataFrame.
The method will return a list with the conflicting tiles.

Args:
:param st: Subtyping results.
:param df: DataFrame containing subtyping results.

Returns:
DataFrame containing the conflicting positive and negative tiles.
"""
dfst = df.copy()
dfst['refposition'] = Series(dfst['refposition']).str.replace('negative', '')
dfst['refposition'] = to_numeric(dfst['refposition'], downcast='unsigned', errors='coerce')

if st.subtype:
if 'is_kmer_freq_okay' in df:
dfst = dfst[(dfst['subtype'] == str(st.subtype)) & (dfst['is_kmer_freq_okay'])]
else: # fasta files
dfst = dfst[(dfst['subtype'] == str(st.subtype))]

pos_tile_positions = dfst[dfst['is_pos_tile']]['refposition'].tolist()
neg_tiles = dfst[~dfst['is_pos_tile']]
conflicting_tiles = neg_tiles[neg_tiles['refposition'].isin(pos_tile_positions)]

return conflicting_tiles


def get_num_pos_neg_tiles(st: Subtype, df: DataFrame) -> Tuple[int, int]:
""" This method gets the number of positive and negative tiles.
Note:
The purpose of this method is to find the count of positive and negative tiles, and return them to the
caller.

Args:
:param st: Subtyping results.
:param df: DataFrame containing subtyping results.

Returns:
Tuple[int,int] containing the count of positive and negative tiles.
"""
num_pos_tiles = 0
num_neg_tiles = 0

if st.subtype:
dfst = df[(df['subtype'] == str(st.subtype))]
num_pos_tiles = dfst[dfst['is_pos_tile']].shape[0]
num_neg_tiles = dfst[~dfst['is_pos_tile']].shape[0]

return num_pos_tiles, num_neg_tiles


def possible_subtypes_exist_in_df(st: Subtype, df: DataFrame) -> list:
""" This method checks if the downstream subtypes' tiles are present within the DataFrame
Note:
The purpose of this method is to check if the downstream subtypes' tiles exist within the result.
If they're not present then we know that the result may or not be confident.

Args:
:param st: Subtyping results.
:param df: DataFrame containing subtyping results.

Returns:
list containing the non present subtypes.
"""
non_present_subtypes = []
possible_subtypes = st.possible_downstream_subtypes

if possible_subtypes:
for subtype in possible_subtypes:
if subtype not in df['subtype']:
non_present_subtypes.append(subtype)

return non_present_subtypes
Loading