phac-nml · peterk87 · Nov 30, 2017 · Oct 16, 2017 · Oct 16, 2017 · Oct 16, 2017
diff --git a/.gitignore b/.gitignore
@@ -140,3 +140,8 @@ ENV/
 # Rope project settings
 .ropeproject
 
+# Output files
+match_results.tab
+results.tab
+test.tab
+
diff --git a/bio_hansel/const.py b/bio_hansel/const.py
@@ -1,4 +1,16 @@
 # -*- coding: utf-8 -*-
+from pkg_resources import resource_filename
+
+from bio_hansel import program_name
+from bio_hansel.subtyping_params import SubtypingParams
+
+SCHEME_FASTAS = {'heidelberg': {'file': resource_filename(program_name, 'data/heidelberg/tiles.fasta'),
+                                'version': '0.5.0',
+                                'subtyping_params': SubtypingParams(low_coverage_depth_freq=20)},
+                 'enteritidis': {'file': resource_filename(program_name, 'data/enteritidis/tiles.fasta'),
+                                 'version': '0.7.0',
+                                 'subtyping_params': SubtypingParams(low_coverage_depth_freq=50)}}
+
 
 FASTA_COLUMNS_TO_REMOVE = '''
 pident
@@ -17,3 +29,30 @@
 coverage
 is_trunc
 '''.strip().split('\n')
+
+# These are present within the subtype module.
+SUBTYPE_SUMMARY_COLS = """
+sample
+scheme
+scheme_version
+subtype
+all_subtypes
+tiles_matching_subtype
+are_subtypes_consistent
+inconsistent_subtypes
+n_tiles_matching_all
+n_tiles_matching_all_expected
+n_tiles_matching_positive
+n_tiles_matching_positive_expected
+n_tiles_matching_subtype
+n_tiles_matching_subtype_expected
+file_path
+qc_status
+qc_message""".strip().split('\n')
+
+SIMPLE_SUMMARY_COLS = """
+sample
+subtype
+qc_status
+qc_message
+""".strip().split('\n')
diff --git a/bio_hansel/kmer_count/__init__.py b/bio_hansel/kmer_count/__init__.py
@@ -1,13 +1,16 @@
 # -*- coding: utf-8 -*-
+import re
 import shutil
+from typing import Tuple, Optional
 
 import attr
 import os
 from datetime import datetime
 import logging
 import pandas as pd
 
-from ..utils import exc_exists, run_command, find_inconsistent_subtypes, SCHEME_FASTAS
+from ..utils import exc_exists, run_command, find_inconsistent_subtypes
+from bio_hansel.const import SCHEME_FASTAS
 from ..blast_wrapper.helpers import parse_fasta, revcomp
 from ..subtype import Subtype
 from ..subtype_stats import subtype_counts
@@ -31,6 +34,7 @@ class Jellyfisher(object):
     jf_file = attr.ib(default=None)
     jf_query_tiles_file = attr.ib(default=None)
     df_results = attr.ib(default=None, validator=attr.validators.optional(attr.validators.instance_of(pd.DataFrame)))
+    subtype = attr.ib(default=None)
 
     @scheme_subtype_counts.default
     def _default_scheme_subtype_counts(self):
@@ -206,7 +210,7 @@ def parse_query(self):
         df['tilename'] = tiles
         df['is_pos_tile'] = [not x.startswith('negative') for x in tiles]
         df['subtype'] = [y for x, y in df.tilename.str.split('-')]
-        df['refposition'] = [x for x, y in df.tilename.str.split('-')]
+        df['refposition'] = [int(x.replace('negative', '')) for x, y in df.tilename.str.split('-')]
         df['is_kmer_freq_okay'] = (df.freq >= self.min_kmer_freq) & (df.freq <= self.max_kmer_freq)
         logging.info('n=%s k-mers with freq within thresholds of %s and %s',
                      df.is_kmer_freq_okay.sum(),
@@ -219,18 +223,21 @@ def parse_query(self):
         self.df_results = df
         return df
 
-    def summary(self):
+    def summary(self) -> Tuple[Subtype, Optional[pd.DataFrame]]:
         if self.df_results is None:
             self.parse_query()
         df = self.df_results
-        st = Subtype(sample=self.genome_name, file_path=self._reads_to_str(), scheme=self.scheme, scheme_version=self.scheme_version)
+        st = Subtype(sample=self.genome_name, file_path=self._reads_to_str(), scheme=self.scheme,
+                     scheme_version=self.scheme_version)
+        st.scheme_subtype_counts = self.scheme_subtype_counts
         self.subtype = st
         if df is None or df.shape[0] == 0:
             logging.warning('No "%s" subtyping scheme tile matches for "%s"', self.scheme, self.reads)
             st.are_subtypes_consistent = False
             return st, None
         dfgood = df[df.is_kmer_freq_okay]
         dfpos = dfgood[dfgood.is_pos_tile]
+        dfneg = dfgood[~dfgood.is_pos_tile]
         logging.debug('dfpos: %s', dfpos)
         subtype_lens = dfpos.subtype.apply(len)
         max_subtype_strlen = subtype_lens.max()
@@ -244,20 +251,37 @@ def summary(self):
         logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes)
         st.n_tiles_matching_all = dfgood.shape[0]
         st.n_tiles_matching_positive = dfpos.shape[0]
+        st.n_tiles_matching_negative = dfneg.shape[0]
         st.n_tiles_matching_subtype = dfpos_highest_res.shape[0]
         pos_subtypes_str = [x for x in dfpos.subtype.unique()]
         pos_subtypes_str.sort(key=lambda x: len(x))
         st.all_subtypes = '; '.join(pos_subtypes_str)
         subtype_list = [x for x in dfpos_highest_res.subtype.unique()]
         st.subtype = '; '.join(subtype_list)
-        st.n_tiles_matching_all_expected = ';'.join([str(self.scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
+        st.n_tiles_matching_all_expected = ';'.join(
+            [str(self.scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
         st.n_tiles_matching_positive_expected = ';'.join(
             [str(self.scheme_subtype_counts[x].positive_tile_count) for x in subtype_list])
-        st.n_tiles_matching_subtype_expected = ';'.join([str(self.scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
+        st.n_tiles_matching_negative_expected = ';'.join(
+            [str(self.scheme_subtype_counts[x].negative_tile_count) for x in subtype_list])
+        st.n_tiles_matching_subtype_expected = ';'.join(
+            [str(self.scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
         st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename])
+
+        # Scheme subtypes exactly one level (".<digit>") below the called subtype.
+        possible_downstream_subtypes = [s for s in self.scheme_subtype_counts
+                                        if re.search(r"^({})(\.)(\d)$".format(re.escape(st.subtype)), s)]
+        # `x in Series` tests index labels, not values, so compare against the observed values.
+        observed_subtypes = set(df['subtype'])
+        non_present_subtypes = [s for s in possible_downstream_subtypes
+                                if s not in observed_subtypes]
+
+        st.non_present_subtypes = non_present_subtypes
+
         if len(inconsistent_subtypes) > 0:
             st.are_subtypes_consistent = False
             st.inconsistent_subtypes = inconsistent_subtypes
+
         logging.info(st)
         return st, df
 
@@ -270,3 +294,5 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.cleanup()
+
+
diff --git a/bio_hansel/main.py b/bio_hansel/main.py
@@ -12,10 +12,12 @@
 import pandas as pd
 from collections import defaultdict
 
-from . import program_name, program_desc, __version__
-from .subtyper import subtype_fasta, SUBTYPE_SUMMARY_COLS, subtype_reads
-from .subtype_stats import subtype_counts
-from .utils import genome_name_from_fasta_path, get_scheme_fasta
+from bio_hansel import program_name, program_desc, __version__
+from bio_hansel.const import SUBTYPE_SUMMARY_COLS
+from bio_hansel.subtyper import subtype_fasta, subtype_reads
+from bio_hansel.subtype_stats import subtype_counts
+from bio_hansel.subtyping_params import SubtypingParams
+from bio_hansel.utils import genome_name_from_fasta_path, get_scheme_fasta, out_files_exists, get_scheme_params
 
 SCRIPT_NAME = 'hansel'
 LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
@@ -59,14 +61,31 @@ def init_parser():
                         help='Subtyping summary output path (tab-delimited)')
     parser.add_argument('-O', '--output-tile-results',
                         help='Subtyping tile matching output path (tab-delimited)')
+    parser.add_argument('-S', '--output-simple-summary',
+                        help='Subtyping simple summary output path')
+    parser.add_argument('--force',
+                        action='store_true',
+                        help='Force existing output files to be overwritten')
     parser.add_argument('--min-kmer-freq',
                         type=int,
-                        default=10,
                         help='Min k-mer freq/coverage')
     parser.add_argument('--max-kmer-freq',
                         type=int,
-                        default=200,
                         help='Max k-mer freq/coverage')
+    # Quality-check threshold overrides (no defaults here; unset options leave the scheme parameters untouched)
+    parser.add_argument('--low-cov-depth-freq',
+                        type=int,
+                        help='Frequencies below this coverage are considered low coverage')
+    parser.add_argument('--max-missing-tiles',
+                        type=float,
+                        help='Decimal proportion of maximum allowable missing tiles before being considered an error. (0.0 - 1.0)')
+    parser.add_argument('--min-ambiguous-tiles',
+                        type=int,
+                        help='Minimum number of missing tiles to be considered an ambiguous result')
+    parser.add_argument('--max-intermediate-tiles',
+                        type=float,
+                        help='Decimal proportion of maximum allowable missing tiles to be considered an intermediate subtype. (0.0 - 1.0)')
+    # End of quality-check threshold options
     parser.add_argument('-t', '--threads',
                         type=int,
                         default=1,
@@ -97,14 +116,31 @@ def main():
     init_console_logger(args.verbose)
     output_summary_path = args.output_summary
     output_tile_results = args.output_tile_results
-
+    output_simple_summary_path = args.output_simple_summary
+    out_files_exists(output_simple_summary_path, args.force)
+    out_files_exists(output_summary_path, args.force)
+    out_files_exists(output_tile_results, args.force)
     scheme = args.scheme  # type: str
     scheme_name = args.scheme_name  # type: Optional[str]
     scheme_fasta = get_scheme_fasta(scheme)
     scheme_subtype_counts = subtype_counts(scheme_fasta)
     input_genomes = []
     reads = []
     logging.debug(args)
+
+    subtyping_params = get_scheme_params(scheme)
+    if not subtyping_params:
+        subtyping_params = SubtypingParams()
+
+    if args.low_cov_depth_freq is not None:  # `is not None` so an explicit 0 / 0.0 still overrides
+        subtyping_params.low_coverage_depth_freq = args.low_cov_depth_freq
+    if args.max_missing_tiles is not None:
+        subtyping_params.max_perc_missing_tiles = args.max_missing_tiles
+    if args.min_ambiguous_tiles is not None:
+        subtyping_params.min_ambiguous_tiles = args.min_ambiguous_tiles
+    if args.max_intermediate_tiles is not None:
+        subtyping_params.max_perc_intermediate_tiles = args.max_intermediate_tiles
+
     if args.files:
         fastas = [x for x in args.files if re.match(r'^.+\.(fasta|fa|fna)$', x)]
         fastqs = [x for x in args.files if re.match(r'^.+\.(fastq|fq)$', x)]
@@ -158,7 +194,8 @@ def main():
     if input_genomes:
         if n_threads == 1:
             logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes))
-            outputs = [subtype_fasta(scheme,
+            outputs = [subtype_fasta(subtyping_params,
+                                     scheme,
                                      input_fasta,
                                      genome_name,
                                      tmp_dir=tmp_dir,
@@ -170,7 +207,8 @@ def main():
             logging.info('Initializing thread pool with %s threads', n_threads)
             pool = Pool(processes=n_threads)
             logging.info('Running analysis asynchronously on %s input genomes', len(input_genomes))
-            res = [pool.apply_async(subtype_fasta, (scheme,
+            res = [pool.apply_async(subtype_fasta, (subtyping_params,
+                                                    scheme,
                                                     input_fasta,
                                                     genome_name,
                                                     tmp_dir,
@@ -187,13 +225,12 @@ def main():
             subtype_results.append(attr.asdict(subtype))
 
     if reads:
-        outputs = [subtype_reads(scheme=scheme,
+        outputs = [subtype_reads(subtyping_params,
+                                 scheme=scheme,
                                  reads=r,
                                  genome_name=genome_name,
                                  tmp_dir=tmp_dir,
                                  threads=n_threads,
-                                 min_kmer_freq=args.min_kmer_freq,
-                                 max_kmer_freq=args.max_kmer_freq,
                                  scheme_name=scheme_name,
                                  scheme_subtype_counts=scheme_subtype_counts)
                    for r, genome_name in reads]
@@ -207,6 +244,8 @@ def main():
     dfsummary = pd.DataFrame(subtype_results)
     dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]
 
+    df_simple_summary = dfsummary[['sample', 'subtype', 'qc_status', 'qc_message']]
+
     if output_summary_path:
         dfsummary.to_csv(output_summary_path, sep='\t', index=None)
         logging.info('Wrote subtyping output summary to %s', output_summary_path)
@@ -216,6 +255,9 @@ def main():
     if output_tile_results:
         dfall.to_csv(output_tile_results, sep='\t', index=None)
 
+    if output_simple_summary_path:
+        df_simple_summary.to_csv(output_simple_summary_path, sep='\t', index=None)
+
 
 def collect_fasta_from_dir(input_directory):
     input_genomes = []

diff --git a/bio_hansel/quality_check/__init__.py b/bio_hansel/quality_check/__init__.py
@@ -0,0 +1,61 @@
+from typing import List, Callable, Tuple
+
+from pandas import DataFrame
+
+from ..subtyping_params import SubtypingParams
+from ..quality_check.quality_check_functions import check_missing_tiles, does_subtype_result_exist, \
+    check_mixed_subtype, check_intermediate_subtype, is_missing_target_sites, is_missing_downstream_targets
+from ..quality_check.const import FAIL_MESSAGE, WARNING_MESSAGE
+from ..subtype import Subtype
+import logging
+
+
+# Quality checks called in this order by perform_quality_check as func(st, df, subtyping_params).
+QC_FUNCS = [
+    check_missing_tiles,
+    check_mixed_subtype,
+    is_missing_target_sites,
+    is_missing_downstream_targets,
+    check_intermediate_subtype,
+]  # type: List[Callable[[Subtype, DataFrame, SubtypingParams], Tuple[str, str]]]
+
+
+def perform_quality_check(st: Subtype, df: DataFrame, subtyping_params: SubtypingParams):
+    """Run every quality check in QC_FUNCS and record the combined outcome on the subtype.
+
+    Each function in QC_FUNCS is called as ``func(st, df, subtyping_params)`` and returns a
+    ``(status, message)`` tuple; a status of None means that check passed. Any FAIL makes the
+    overall status FAIL, otherwise any WARNING makes it WARNING, otherwise it stays PASS.
+
+    Args:
+            :param st: Subtyping results; ``qc_status`` and ``qc_message`` are set on it.
+            :param df: DataFrame containing subtyping results.
+            :param subtyping_params: Parameters passed through to each quality check function.
+
+    Returns:
+            None, modifies the subtype with the result.
+    """
+    logging.debug("Performing Quality Checking")
+    overall_qc_status = 'PASS'
+    messages = []
+
+    if does_subtype_result_exist(st) is False:
+        logging.warning("QC: Quality checking not run, subtype result did not exist.")
+        st.qc_status = 'FAIL'
+        st.qc_message = 'FAIL: Subtype does not exist, quality checking was not run.'
+        return None
+
+    for func in QC_FUNCS:
+        status, message = func(st, df, subtyping_params)
+        if status is None:
+            continue  # this check passed; nothing to record
+        messages.append('{}: {}'.format(status, message))
+        # Compare by value: `is` on strings only works when both happen to be the same object.
+        if status == FAIL_MESSAGE:
+            overall_qc_status = FAIL_MESSAGE
+        elif overall_qc_status != FAIL_MESSAGE and status == WARNING_MESSAGE:
+            overall_qc_status = WARNING_MESSAGE
+
+    st.qc_status = overall_qc_status
+    st.qc_message = ' | '.join(messages)
+    logging.debug("QC: Finished!")
diff --git a/bio_hansel/quality_check/const.py b/bio_hansel/quality_check/const.py
@@ -0,0 +1,8 @@
+FAIL_MESSAGE = "FAIL"
+WARNING_MESSAGE = "WARNING"
+# Errors for Hansel
+MISSING_TILES_ERROR_1 = "Missing Tiles Error 1"
+MIXED_SAMPLE_ERROR_2 = "Mixed Sample Error 2"
+AMBIGUOUS_RESULTS_ERROR_3 = "Ambiguous Results Error 3"
+NON_CONFIDENT_RESULTS_ERROR_4 = "Non Confident Results Error 4"
+INTERMEDIATE_SUBTYPE_WARNING = "Intermediate Subtype Warning"