phac-nml · peterk87 · Nov 30, 2017 · Oct 16, 2017 · Oct 16, 2017 · Oct 16, 2017
diff --git a/.gitignore b/.gitignore
@@ -140,3 +140,14 @@ ENV/
 # Rope project settings
 .ropeproject
 
+# Output files
+match_results.tab
+results.tab
+test.tab
+
+
+tests/data/Retro1000data/
+
+tests/data/SRR1696752/
+
+tests/data/SRR3392166/
diff --git a/bio_hansel/const.py b/bio_hansel/const.py
@@ -17,3 +17,30 @@
 coverage
 is_trunc
 '''.strip().split('\n')
+
+# Summary column names: SUBTYPE_SUMMARY_COLS orders the full table in main.py; SIMPLE_SUMMARY_COLS is its 4-column subset.
+SUBTYPE_SUMMARY_COLS = """
+sample
+scheme
+scheme_version
+subtype
+all_subtypes
+tiles_matching_subtype
+are_subtypes_consistent
+inconsistent_subtypes
+n_tiles_matching_all
+n_tiles_matching_all_expected
+n_tiles_matching_positive
+n_tiles_matching_positive_expected
+n_tiles_matching_subtype
+n_tiles_matching_subtype_expected
+file_path
+qc_status
+qc_message""".strip().split('\n')
+
+SIMPLE_SUMMARY_COLS = """
+sample
+subtype
+qc_status
+qc_message
+""".strip().split('\n')
diff --git a/bio_hansel/kmer_count/__init__.py b/bio_hansel/kmer_count/__init__.py
@@ -7,6 +7,7 @@
 import logging
 import pandas as pd
 
+from ..quality_check import perform_quality_check
 from ..utils import exc_exists, run_command, find_inconsistent_subtypes, SCHEME_FASTAS
 from ..blast_wrapper.helpers import parse_fasta, revcomp
 from ..subtype import Subtype
@@ -231,6 +232,7 @@ def summary(self):
             return st, None
         dfgood = df[df.is_kmer_freq_okay]
         dfpos = dfgood[dfgood.is_pos_tile]
+        dfneg = dfgood[~dfgood.is_pos_tile]
         logging.debug('dfpos: %s', dfpos)
         subtype_lens = dfpos.subtype.apply(len)
         max_subtype_strlen = subtype_lens.max()
@@ -244,6 +246,7 @@ def summary(self):
         logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes)
         st.n_tiles_matching_all = dfgood.shape[0]
         st.n_tiles_matching_positive = dfpos.shape[0]
+        st.n_tiles_matching_negative = dfneg.shape[0]
         st.n_tiles_matching_subtype = dfpos_highest_res.shape[0]
         pos_subtypes_str = [x for x in dfpos.subtype.unique()]
         pos_subtypes_str.sort(key=lambda x: len(x))
@@ -253,11 +256,17 @@ def summary(self):
         st.n_tiles_matching_all_expected = ';'.join([str(self.scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
         st.n_tiles_matching_positive_expected = ';'.join(
             [str(self.scheme_subtype_counts[x].positive_tile_count) for x in subtype_list])
+        st.n_tiles_matching_negative_expected = ';'.join(
+            [str(self.scheme_subtype_counts[x].negative_tile_count) for x in subtype_list])
         st.n_tiles_matching_subtype_expected = ';'.join([str(self.scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
         st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename])
+        st.possible_downstream_subtypes = [s for s in self.scheme_subtype_counts
+                                           if s.startswith(tuple(subtype_list)) and s not in subtype_list]
+
         if len(inconsistent_subtypes) > 0:
             st.are_subtypes_consistent = False
             st.inconsistent_subtypes = inconsistent_subtypes
+
         logging.info(st)
         return st, df
 

diff --git a/bio_hansel/main.py b/bio_hansel/main.py
@@ -12,10 +12,12 @@
 import pandas as pd
 from collections import defaultdict
 
-from . import program_name, program_desc, __version__
-from .subtyper import subtype_fasta, SUBTYPE_SUMMARY_COLS, subtype_reads
-from .subtype_stats import subtype_counts
-from .utils import genome_name_from_fasta_path, get_scheme_fasta
+from bio_hansel import program_name, program_desc, __version__
+from bio_hansel.const import SUBTYPE_SUMMARY_COLS
+from bio_hansel.subtyper import subtype_fasta, subtype_reads
+from bio_hansel.subtype_stats import subtype_counts
+from bio_hansel.subtyping_params import SubtypingParams
+from bio_hansel.utils import genome_name_from_fasta_path, get_scheme_fasta, out_files_exists
 
 SCRIPT_NAME = 'hansel'
 LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
@@ -59,6 +61,11 @@ def init_parser():
                         help='Subtyping summary output path (tab-delimited)')
     parser.add_argument('-O', '--output-tile-results',
                         help='Subtyping tile matching output path (tab-delimited)')
+    parser.add_argument('-S', '--output-simple-summary',
+                        help='Subtyping simple summary output path')
+    parser.add_argument('--force',
+                        action='store_true',
+                        help='Force existing output files to be overwritten')
     parser.add_argument('--min-kmer-freq',
                         type=int,
                         default=10,
@@ -67,6 +74,24 @@ def init_parser():
                         type=int,
                         default=200,
                         help='Max k-mer freq/coverage')
+    # Quality-check thresholds; main() forwards these to SubtypingParams.
+    parser.add_argument('--low-cov-depth-freq',
+                        type=int,
+                        default=20,
+                        help='Frequencies below this coverage are considered low coverage')
+    parser.add_argument('--missing-total-tiles-max',
+                        type=float,  # fractional value: type=int would reject input such as 0.1
+                        default=0.05,
+                        help='Value in percentage, the maximum amount of total allowed missing tiles before being considered an error.')
+    parser.add_argument('--inc-tiles-max',
+                        type=int,
+                        default=3,
+                        help='Minimum number of missing tiles to be considered an inconsistent result')
+    parser.add_argument('--int-subtype-tiles-max',
+                        type=float,  # fractional value: type=int would reject input such as 0.1
+                        default=0.05,
+                        help='Value in percentage, the maximum amount of missing tiles to be tolerated to consider the result as an intermediate subtype.')
+    # End of quality-check thresholds
     parser.add_argument('-t', '--threads',
                         type=int,
                         default=1,
@@ -97,14 +122,26 @@ def main():
     init_console_logger(args.verbose)
     output_summary_path = args.output_summary
     output_tile_results = args.output_tile_results
-
+    output_simple_summary_path = args.output_simple_summary
+    output_force = args.force
     scheme = args.scheme  # type: str
     scheme_name = args.scheme_name  # type: Optional[str]
     scheme_fasta = get_scheme_fasta(scheme)
     scheme_subtype_counts = subtype_counts(scheme_fasta)
     input_genomes = []
     reads = []
     logging.debug(args)
+    subtyping_params = SubtypingParams(low_coverage_depth_freq=args.low_cov_depth_freq,
+                                       missing_total_tiles_max=args.missing_total_tiles_max,
+                                       inconsistent_tiles_max=args.inc_tiles_max,
+                                       intermediate_subtype_tiles_max=args.int_subtype_tiles_max)
+
+    if not output_force:
+        if out_files_exists(output_summary_path, output_tile_results, output_simple_summary_path):
+            return 0
+    else:
+        logging.info("Previous output files will be over written with --force")
+
     if args.files:
         fastas = [x for x in args.files if re.match(r'^.+\.(fasta|fa|fna)$', x)]
         fastqs = [x for x in args.files if re.match(r'^.+\.(fastq|fq)$', x)]
@@ -158,7 +195,8 @@ def main():
     if input_genomes:
         if n_threads == 1:
             logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes))
-            outputs = [subtype_fasta(scheme,
+            outputs = [subtype_fasta(subtyping_params,
+                                     scheme,
                                      input_fasta,
                                      genome_name,
                                      tmp_dir=tmp_dir,
@@ -170,7 +208,8 @@ def main():
             logging.info('Initializing thread pool with %s threads', n_threads)
             pool = Pool(processes=n_threads)
             logging.info('Running analysis asynchronously on %s input genomes', len(input_genomes))
-            res = [pool.apply_async(subtype_fasta, (scheme,
+            res = [pool.apply_async(subtype_fasta, (subtyping_params,
+                                                    scheme,
                                                     input_fasta,
                                                     genome_name,
                                                     tmp_dir,
@@ -187,7 +226,8 @@ def main():
             subtype_results.append(attr.asdict(subtype))
 
     if reads:
-        outputs = [subtype_reads(scheme=scheme,
+        outputs = [subtype_reads(subtyping_params,
+                                 scheme=scheme,
                                  reads=r,
                                  genome_name=genome_name,
                                  tmp_dir=tmp_dir,
@@ -207,6 +247,8 @@ def main():
     dfsummary = pd.DataFrame(subtype_results)
     dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]
 
+    df_simple_summary = dfsummary[['sample', 'subtype', 'qc_status', 'qc_message']]
+
     if output_summary_path:
         dfsummary.to_csv(output_summary_path, sep='\t', index=None)
         logging.info('Wrote subtyping output summary to %s', output_summary_path)
@@ -216,6 +258,9 @@ def main():
     if output_tile_results:
         dfall.to_csv(output_tile_results, sep='\t', index=None)
 
+    if output_simple_summary_path:
+        df_simple_summary.to_csv(output_simple_summary_path, sep='\t', index=None)
+
 
 def collect_fasta_from_dir(input_directory):
     input_genomes = []

diff --git a/bio_hansel/quality_check/__init__.py b/bio_hansel/quality_check/__init__.py
@@ -0,0 +1,60 @@
+from typing import List, Callable, Tuple
+
+from pandas import DataFrame
+
+from ..subtyping_params import SubtypingParams
+from ..quality_check.quality_check_functions import check_missing_tiles, does_subtype_result_exist, \
+    check_mixed_subtype, check_intermediate_subtype, check_inconsistent_results
+from ..quality_check.const import FAIL_MESSAGE, WARNING_MESSAGE
+from ..subtype import Subtype
+import logging
+
+
+QC_FUNCS: List[Callable[[Subtype, DataFrame, SubtypingParams], Tuple[str, str]]] = \
+[  # perform_quality_check calls these in list order as func(st, df, subtyping_params)
+    check_missing_tiles,
+    check_mixed_subtype,
+    check_inconsistent_results,
+    check_intermediate_subtype
+]  # NOTE: perform_quality_check treats a returned status of None as "check passed"
+
+
+def perform_quality_check(st: Subtype, df: DataFrame, subtyping_params: SubtypingParams):
+    """ Driver method to call all quality checking functions and handle their responses.
+    Note:
+            Every function in QC_FUNCS is called as func(st, df, subtyping_params) and returns a (status, message)
+            tuple. A status of None means the check passed; any other status is recorded together with its
+            message. The overall status is FAIL if any check failed, else WARNING if any warned, else 'PASS'.
+
+    Args:
+            :param st: Subtyping results.
+            :param df: DataFrame containing subtyping results.
+            :param subtyping_params: Parameters passed through to every quality check function.
+    Returns:
+            None, sets st.qc_status and st.qc_message.
+    """
+    logging.debug("Performing Quality Checking")
+    overall_qc_status = 'PASS'
+    messages = []
+
+    if does_subtype_result_exist(st) is False:
+        logging.warning("QC: Quality checking not run, subtype result did not exist.")
+        st.qc_status = 'FAIL'
+        st.qc_message = 'FAIL: Subtype does not exist, quality checking was not run.'
+        return None
+
+    for func in QC_FUNCS:
+        # Each check returns (status, message); status is None when the check passes.
+        status, message = func(st, df, subtyping_params)
+        if status is None:
+            # If quality check function passes, move on to the next.
+            continue
+        messages.append('{}: {}'.format(status, message))
+        if status == FAIL_MESSAGE:  # compare by value; `is` only worked when the strings shared identity
+            overall_qc_status = FAIL_MESSAGE
+        elif overall_qc_status != FAIL_MESSAGE and status == WARNING_MESSAGE:
+            overall_qc_status = WARNING_MESSAGE
+
+    st.qc_status = overall_qc_status
+    st.qc_message = ' | '.join(messages)
+    logging.debug("QC: Finished!")
diff --git a/bio_hansel/quality_check/const.py b/bio_hansel/quality_check/const.py
@@ -0,0 +1,9 @@
+FAIL_MESSAGE = "FAIL"  # status that makes perform_quality_check report an overall FAIL
+WARNING_MESSAGE = "WARNING"  # status that yields an overall WARNING when no check failed
+# Names of the individual QC errors and warnings (NOTE(review): presumably used by the check functions - confirm)
+MISSING_TILES_ERROR_1A = "Missing Tiles Error 1A"
+MISSING_TILES_ERROR_1B = "Missing Tiles Error 1B"
+MIXED_SAMPLE_ERROR_2A = "Mixed Sample Error 2A"
+INCONSISTENT_RESULTS_ERROR_3A = "Inconsistent Results Error 3A"
+INCONSISTENT_RESULTS_ERROR_3B = "Inconsistent Results Error 3B"
+INTERMEDIATE_SUBTYPE_WARNING = "Intermediate Subtype Warning"
diff --git a/bio_hansel/quality_check/qc_utils.py b/bio_hansel/quality_check/qc_utils.py
@@ -0,0 +1,81 @@
+from typing import Tuple
+from ..subtype import Subtype
+from pandas import DataFrame, to_numeric, Series
+
+
+def get_conflicting_tiles(st: Subtype, df: DataFrame) -> DataFrame:
+    """ Get the negative tiles whose refposition is also matched by a positive tile.
+    Note:
+            The 'negative' marker is stripped from refposition and the rest is converted to a number (NaN when it
+            is not numeric). When st.subtype is set, only tiles of that subtype are compared (and, when the
+            'is_kmer_freq_okay' column exists, only tiles flagged okay). The input DataFrame is not modified.
+    Args:
+            :param st: Subtyping results.
+            :param df: DataFrame containing subtyping results.
+
+    Returns:
+            DataFrame containing the negative tiles that conflict with a positive tile.
+    """
+    dfst = df.copy()
+    # astype(str) keeps the .str accessor usable when refposition was parsed as numbers.
+    dfst['refposition'] = dfst['refposition'].astype(str).str.replace('negative', '')
+    dfst['refposition'] = to_numeric(dfst['refposition'], downcast='unsigned', errors='coerce')
+
+    if st.subtype:
+        if 'is_kmer_freq_okay' in df:
+            dfst = dfst[(dfst['subtype'] == str(st.subtype)) & (dfst['is_kmer_freq_okay'])]
+        else:  # fasta files
+            dfst = dfst[(dfst['subtype'] == str(st.subtype))]
+
+    pos_tile_positions = dfst[dfst['is_pos_tile']]['refposition'].tolist()
+    neg_tiles = dfst[~dfst['is_pos_tile']]
+    conflicting_tiles = neg_tiles[neg_tiles['refposition'].isin(pos_tile_positions)]
+    return conflicting_tiles
+
+
+def get_num_pos_neg_tiles(st: Subtype, df: DataFrame) -> Tuple[int, int]:
+    """ Count the positive and negative tiles that belong to the called subtype.
+    Note:
+            Rows are matched on df['subtype'] == str(st.subtype) and split on the 'is_pos_tile' column.
+            When st.subtype is empty or None, both counts are 0.
+
+    Args:
+            :param st: Subtyping results.
+            :param df: DataFrame containing subtyping results.
+
+    Returns:
+            Tuple[int,int] containing the count of positive and negative tiles.
+    """
+    if not st.subtype:
+        # No subtype was called, so there are no tiles to count.
+        return 0, 0
+
+    subtype_rows = df[df['subtype'] == str(st.subtype)]
+    positive_mask = subtype_rows['is_pos_tile']
+    positive_count = len(subtype_rows[positive_mask])
+    negative_count = len(subtype_rows[~positive_mask])
+    return positive_count, negative_count
+
+
+def possible_subtypes_exist_in_df(st: Subtype, df: DataFrame) -> list:
+    """ Return the possible downstream subtypes that have no tiles in the DataFrame.
+    Note:
+            Each subtype in st.possible_downstream_subtypes is looked up among the values of df['subtype'].
+            (`x in series` tests the index labels of a pandas Series, not its values, hence the explicit set.)
+            If they're not present then we know that the result may or may not be confident.
+
+    Args:
+            :param st: Subtyping results.
+            :param df: DataFrame containing subtyping results.
+
+    Returns:
+            list containing the non present subtypes, in their original order.
+    """
+    possible_subtypes = st.possible_downstream_subtypes
+    if not possible_subtypes:
+        # Nothing downstream to look for (None or empty).
+        return []
+
+    # Build the value set once so each membership test is O(1) and checks values rather than index labels.
+    present_subtypes = set(df['subtype'])
+    return [subtype for subtype in possible_subtypes if subtype not in present_subtypes]