From a64e1e05135423013a8fc359e97d5f484e704fce Mon Sep 17 00:00:00 2001 From: jrober84 Date: Tue, 2 Mar 2021 15:00:19 -0500 Subject: [PATCH 01/10] increased max kmer freq to 10000000 for high coverage amplicon datasets --- bio_hansel/subtyping_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio_hansel/subtyping_params.py b/bio_hansel/subtyping_params.py index 2c9a5fd..96aa947 100644 --- a/bio_hansel/subtyping_params.py +++ b/bio_hansel/subtyping_params.py @@ -10,7 +10,7 @@ class SubtypingParams(object): min_kmer_freq = attr.ib(default=8, validator=attr.validators.instance_of((float, int))) max_kmer_freq = attr.ib(default=10000, validator=attr.validators.instance_of((float, int))) min_coverage_warning = attr.ib(default=20, validator=attr.validators.instance_of((float, int))) - max_degenerate_kmers = attr.ib(default=100000, validator=attr.validators.instance_of(int)) + max_degenerate_kmers = attr.ib(default=10000000, validator=attr.validators.instance_of(int)) @max_perc_missing_kmers.validator def _validate_max_perc_missing_kmers(self, attribute, value): From 6c5b5666ae922271d5b61c34d20981b265b6b7e1 Mon Sep 17 00:00:00 2001 From: jrober84 Date: Tue, 2 Mar 2021 15:01:08 -0500 Subject: [PATCH 02/10] increased max kmer freq to 10000000 for high coverage amplicon datasets --- bio_hansel/subtyping_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio_hansel/subtyping_params.py b/bio_hansel/subtyping_params.py index 96aa947..3ab7d2d 100644 --- a/bio_hansel/subtyping_params.py +++ b/bio_hansel/subtyping_params.py @@ -8,7 +8,7 @@ class SubtypingParams(object): min_ambiguous_kmers = attr.ib(default=3, validator=attr.validators.instance_of(int)) max_perc_intermediate_kmers = attr.ib(default=0.05, validator=attr.validators.instance_of(float)) min_kmer_freq = attr.ib(default=8, validator=attr.validators.instance_of((float, int))) - max_kmer_freq = attr.ib(default=10000, validator=attr.validators.instance_of((float, int))) + max_kmer_freq = attr.ib(default=1000000, validator=attr.validators.instance_of((float, int))) min_coverage_warning = attr.ib(default=20, validator=attr.validators.instance_of((float, int))) max_degenerate_kmers = attr.ib(default=10000000, validator=attr.validators.instance_of(int)) From 0f45df17eb3b2b8040bbc73632fed8f847020852 Mon Sep 17 00:00:00 2001 From: jrober84 Date: Tue, 2 Mar 2021 15:06:24 -0500 Subject: [PATCH 03/10] removed qc_message column from k-mer results because it is highly redundant and adds large amounts of data to batch samples --- bio_hansel/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bio_hansel/main.py b/bio_hansel/main.py index 1096a6e..534e0d5 100644 --- a/bio_hansel/main.py +++ b/bio_hansel/main.py @@ -258,6 +258,8 @@ def main(): if output_kmer_results: if len(dfs) > 0: dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False) + #Error message is redundant accross each of the k-mers + dfall.drop(columns=['qc_message']) dfall = df_field_fillna(dfall) dfall.to_csv(output_kmer_results, **kwargs_for_pd_to_table) logging.info('Kmer results written to "{}".'.format(output_kmer_results)) From 34d8e99e89eca9c8420d8fd4246d70682fcefaca Mon Sep 17 00:00:00 2001 From: jrober84 Date: Tue, 2 Mar 2021 16:27:03 -0500 Subject: [PATCH 04/10] removed qc_message column from k-mer results because it is highly redundant and adds large amounts of data to batch samples --- bio_hansel/main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bio_hansel/main.py b/bio_hansel/main.py index 534e0d5..9852b7c 100644 --- a/bio_hansel/main.py +++ b/bio_hansel/main.py @@ -12,15 +12,15 @@ import attr import pandas as pd -from . import program_desc, __version__, program_name -from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL -from .subtype import Subtype -from .subtype_stats import subtype_counts -from .subtyper import \ +from bio_hansel import program_desc, __version__, program_name +from bio_hansel.const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL +from bio_hansel.subtype import Subtype +from bio_hansel.subtype_stats import subtype_counts +from bio_hansel.subtyper import \ subtype_contigs_samples, \ subtype_reads_samples -from .metadata import read_metadata_table, merge_results_with_metadata -from .utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir, +from bio_hansel.metadata import read_metadata_table, merge_results_with_metadata +from bio_hansel.utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir, group_fastqs, collect_fasta_from_dir, init_subtyping_params, df_field_fillna) SCRIPT_NAME = 'hansel' @@ -259,7 +259,7 @@ def main(): if len(dfs) > 0: dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False) #Error message is redundant accross each of the k-mers - dfall.drop(columns=['qc_message']) + dfall = dfall.drop(columns=['qc_message']) dfall = df_field_fillna(dfall) dfall.to_csv(output_kmer_results, **kwargs_for_pd_to_table) logging.info('Kmer results written to "{}".'.format(output_kmer_results)) From 37afbeaf77a20f082d76e2a5956db048d33e5677 Mon Sep 17 00:00:00 2001 From: jrober84 Date: Tue, 2 Mar 2021 16:38:46 -0500 Subject: [PATCH 05/10] added minimum k-mer fraction as additional filtering option --- bio_hansel/main.py | 3 +++ bio_hansel/subtyper.py | 21 +++++++++++++++++++++ bio_hansel/subtyping_params.py | 1 + bio_hansel/utils.py | 2 ++ 4 files changed, 27 insertions(+) diff --git a/bio_hansel/main.py b/bio_hansel/main.py index 9852b7c..9f34a6c 100644 --- a/bio_hansel/main.py +++ b/bio_hansel/main.py @@ -81,6 +81,9 @@ def init_parser(): parser.add_argument('--min-kmer-freq', type=int, help='Min k-mer freq/coverage') + parser.add_argument('--min-kmer-frac', + type=float, + help='Proportion of k-mer required for detection (0.0 - 1)') parser.add_argument('--max-kmer-freq', type=int, help='Max k-mer freq/coverage') diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index 6140eec..8b026a9 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -228,6 +228,24 @@ def parallel_query_reads(reads: List[Tuple[List[str], str]], outputs = [x.get() for x in res] return outputs +def filter_by_kmer_fraction(df,min_kmer_frac=0.05): + """Filter out noisy kmers from high coverage datasets + + Args: + df: BioHansel k-mer frequence pandas df + min_kmer_frac: float 0 - 1 on the minimum fraction a kmer needs to be to be considered valid + + Returns: + - pd.DataFrame with k-mers which satisfy the min-fraction + """ + position_counts = df['refposition'].value_counts().rename_axis('position').reset_index(name='counts') + valid_indexes = [] + for index,row in df.iterrows(): + frac = row['refposition'] / position_counts.loc[position_counts['position'] == row['refposition'], 'counts'].iloc[0] + if frac > min_kmer_frac: + valid_indexes.append(index) + return df[df.index.isin(valid_indexes)] + def subtype_reads(reads: Union[str, List[str]], genome_name: str, @@ -285,6 +303,9 @@ def subtype_reads(reads: Union[str, List[str]], df['subtype'] = subtypes df['is_pos_kmer'] = ~df.kmername.str.contains('negative') df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq) + #apply a scaled approach for filtering of k-mers required for high coverage amplicon data + df = filter_by_kmer_fraction(df,subtyping_params.min_kmer_frac) + st.avg_kmer_coverage = df['freq'].mean() st, df = process_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts) st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params) diff --git a/bio_hansel/subtyping_params.py b/bio_hansel/subtyping_params.py index 3ab7d2d..699aa51 100644 --- a/bio_hansel/subtyping_params.py +++ b/bio_hansel/subtyping_params.py @@ -8,6 +8,7 @@ class SubtypingParams(object): min_ambiguous_kmers = attr.ib(default=3, validator=attr.validators.instance_of(int)) max_perc_intermediate_kmers = attr.ib(default=0.05, validator=attr.validators.instance_of(float)) min_kmer_freq = attr.ib(default=8, validator=attr.validators.instance_of((float, int))) + min_kmer_frac = attr.ib(default=0.05, validator=attr.validators.instance_of(float)) max_kmer_freq = attr.ib(default=1000000, validator=attr.validators.instance_of((float, int))) min_coverage_warning = attr.ib(default=20, validator=attr.validators.instance_of((float, int))) max_degenerate_kmers = attr.ib(default=10000000, validator=attr.validators.instance_of(int)) diff --git a/bio_hansel/utils.py b/bio_hansel/utils.py index 919cc9c..7d92bc7 100644 --- a/bio_hansel/utils.py +++ b/bio_hansel/utils.py @@ -200,6 +200,8 @@ def init_subtyping_params(args: Optional[Any] = None, subtyping_params.min_coverage_warning = args.low_cov_warning if args.min_kmer_freq: subtyping_params.min_kmer_freq = args.min_kmer_freq + if args.min_kmer_frac: + subtyping_params.min_kmer_frac = args.min_kmer_frac if args.max_kmer_freq: subtyping_params.max_kmer_freq = args.max_kmer_freq if args.max_degenerate_kmers: From cdc7004a9cdfe91e85838d727e5a018a4038627a Mon Sep 17 00:00:00 2001 From: jrober84 Date: Thu, 4 Mar 2021 09:50:06 -0500 Subject: [PATCH 06/10] improved documentation of the new kmer filtering function --- bio_hansel/subtyper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index 8b026a9..ba203da 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -229,7 +229,9 @@ def parallel_query_reads(reads: List[Tuple[List[str], str]], return outputs def filter_by_kmer_fraction(df,min_kmer_frac=0.05): - """Filter out noisy kmers from high coverage datasets + """Filter out low frequency kmers from high coverage datasets which are likely the result of sequencing error + Positions will be variably covered in a dataset so the total number of kmers for each position are summed + and any k-mer which accounts for less than the min_kmer_frac will be removed from the df Args: df: BioHansel k-mer frequence pandas df From cb9f22edd2cc073d9fe183304049ac0a034b2adc Mon Sep 17 00:00:00 2001 From: jrober84 Date: Thu, 4 Mar 2021 10:57:52 -0500 Subject: [PATCH 07/10] fixed issue with using count of position instead of frequency --- bio_hansel/subtyper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index ba203da..f86b44c 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -240,10 +240,10 @@ def filter_by_kmer_fraction(df,min_kmer_frac=0.05): Returns: - pd.DataFrame with k-mers which satisfy the min-fraction """ - position_counts = df['refposition'].value_counts().rename_axis('position').reset_index(name='counts') + position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().reset_index() valid_indexes = [] for index,row in df.iterrows(): - frac = row['refposition'] / position_counts.loc[position_counts['position'] == row['refposition'], 'counts'].iloc[0] + frac = row['freq'] / position_frequencies.loc[position_frequencies['refposition'] == row['refposition'], 'freq'].iloc[0] if frac > min_kmer_frac: valid_indexes.append(index) return df[df.index.isin(valid_indexes)] From a4d5572d863c535b8b17f83e7b45ad2f2d1b6af6 Mon Sep 17 00:00:00 2001 From: jrober84 Date: Thu, 4 Mar 2021 11:23:38 -0500 Subject: [PATCH 08/10] enhanced detailed report with additional information and df no longer being filtered for failed kmers --- bio_hansel/subtyper.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index f86b44c..603c1c6 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -228,25 +228,29 @@ def parallel_query_reads(reads: List[Tuple[List[str], str]], outputs = [x.get() for x in res] return outputs -def filter_by_kmer_fraction(df,min_kmer_frac=0.05): - """Filter out low frequency kmers from high coverage datasets which are likely the result of sequencing error - Positions will be variably covered in a dataset so the total number of kmers for each position are summed - and any k-mer which accounts for less than the min_kmer_frac will be removed from the df +def calc_kmer_fraction(df,min_kmer_frac=0.05): + """Calculate the percentage each k-mer frequence represents for a given position Args: df: BioHansel k-mer frequence pandas df min_kmer_frac: float 0 - 1 on the minimum fraction a kmer needs to be to be considered valid Returns: - - pd.DataFrame with k-mers which satisfy the min-fraction + - pd.DataFrame with k-mers with kmer_fraction column """ position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().reset_index() - valid_indexes = [] + percentages = [] + total_refposition_kmer_frequencies = [] for index,row in df.iterrows(): - frac = row['freq'] / position_frequencies.loc[position_frequencies['refposition'] == row['refposition'], 'freq'].iloc[0] - if frac > min_kmer_frac: - valid_indexes.append(index) - return df[df.index.isin(valid_indexes)] + total_freq = position_frequencies.loc[position_frequencies['refposition'] == row['refposition'], 'freq'].iloc[0] + if total_freq > 0: + percentages.append(row['freq'] / total_freq) + else: + percentages.append(0.0) + total_refposition_kmer_frequencies.append(total_freq) + df['kmer_fraction'] = percentages + df['total_refposition_kmer_frequency'] = total_refposition_kmer_frequencies + return df def subtype_reads(reads: Union[str, List[str]], @@ -306,11 +310,11 @@ def subtype_reads(reads: Union[str, List[str]], df['is_pos_kmer'] = ~df.kmername.str.contains('negative') df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq) #apply a scaled approach for filtering of k-mers required for high coverage amplicon data - df = filter_by_kmer_fraction(df,subtyping_params.min_kmer_frac) - + df = calc_kmer_fraction(df,subtyping_params.min_kmer_frac) + df['is_kmer_fraction_okay'] = df.kmer_fraction >= subtyping_params.min_kmer_frac st.avg_kmer_coverage = df['freq'].mean() - st, df = process_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts) - st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params) + st, filtered_df = process_subtyping_results(st, df[(df.is_kmer_freq_okay & df.is_kmer_fraction_okay)], scheme_subtype_counts) + st.qc_status, st.qc_message = perform_quality_check(st, filtered_df, subtyping_params) df['file_path'] = str(st.file_path) df['sample'] = genome_name df['scheme'] = scheme_name or scheme From 8a1c62c3b9e4f4a51a5e1be2b6075b6f67ec17c5 Mon Sep 17 00:00:00 2001 From: jrober84 Date: Thu, 4 Mar 2021 11:34:15 -0500 Subject: [PATCH 09/10] updated tests with new fields for read kmer reports --- tests/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/__init__.py b/tests/__init__.py index 172afa9..5365b10 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -19,6 +19,7 @@ 'is_pos_kmer', 'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message'] exp_fastq_cols = ['kmername', 'refposition', 'subtype', 'seq', 'freq', 'is_pos_kmer', 'is_kmer_freq_okay', + 'kmer_fraction','total_refposition_kmer_frequency','is_kmer_fraction_okay', 'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message'] From 15aa720251923aefc27e2f9ef3a60666b68009d4 Mon Sep 17 00:00:00 2001 From: jrober84 Date: Thu, 4 Mar 2021 13:23:44 -0500 Subject: [PATCH 10/10] simplified calc kmer fraction function --- bio_hansel/subtyper.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index 603c1c6..1f95041 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -228,28 +228,30 @@ def parallel_query_reads(reads: List[Tuple[List[str], str]], outputs = [x.get() for x in res] return outputs -def calc_kmer_fraction(df,min_kmer_frac=0.05): - """Calculate the percentage each k-mer frequence represents for a given position +def get_kmer_fraction(row): + """Calculate the percentage frequency of a given position + + Args: + df: BioHansel k-mer frequence pandas df row + Returns: + - float of percentage abundance + """ + total_freq = row.total_refposition_kmer_frequency + return row.freq / total_freq if total_freq > 0 else 0.0 + +def calc_kmer_fraction(df): + """Calculate the percentage each k-mer frequency represents for a given position Args: df: BioHansel k-mer frequence pandas df - min_kmer_frac: float 0 - 1 on the minimum fraction a kmer needs to be to be considered valid Returns: - - pd.DataFrame with k-mers with kmer_fraction column + - pd.DataFrame with k-mers with kmer_fraction and total_refposition_kmer_frequency columns added """ - position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().reset_index() - percentages = [] - total_refposition_kmer_frequencies = [] - for index,row in df.iterrows(): - total_freq = position_frequencies.loc[position_frequencies['refposition'] == row['refposition'], 'freq'].iloc[0] - if total_freq > 0: - percentages.append(row['freq'] / total_freq) - else: - percentages.append(0.0) - total_refposition_kmer_frequencies.append(total_freq) - df['kmer_fraction'] = percentages - df['total_refposition_kmer_frequency'] = total_refposition_kmer_frequencies + position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().to_dict() + df['total_refposition_kmer_frequency'] = df.apply(lambda row: position_frequencies['freq'].get(row.refposition, 0), axis=1) + df['kmer_fraction'] = df.apply(get_kmer_fraction, axis=1) + return df @@ -310,7 +312,7 @@ def subtype_reads(reads: Union[str, List[str]], df['is_pos_kmer'] = ~df.kmername.str.contains('negative') df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq) #apply a scaled approach for filtering of k-mers required for high coverage amplicon data - df = calc_kmer_fraction(df,subtyping_params.min_kmer_frac) + df = calc_kmer_fraction(df) df['is_kmer_fraction_okay'] = df.kmer_fraction >= subtyping_params.min_kmer_frac st.avg_kmer_coverage = df['freq'].mean() st, filtered_df = process_subtyping_results(st, df[(df.is_kmer_freq_okay & df.is_kmer_fraction_okay)], scheme_subtype_counts)