Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Covid19 #140

Merged
merged 10 commits into from
Mar 4, 2021
19 changes: 12 additions & 7 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
import attr
import pandas as pd

from . import program_desc, __version__, program_name
from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from .subtype import Subtype
from .subtype_stats import subtype_counts
from .subtyper import \
from bio_hansel import program_desc, __version__, program_name
from bio_hansel.const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from bio_hansel.subtype import Subtype
from bio_hansel.subtype_stats import subtype_counts
from bio_hansel.subtyper import \
subtype_contigs_samples, \
subtype_reads_samples
from .metadata import read_metadata_table, merge_results_with_metadata
from .utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
from bio_hansel.metadata import read_metadata_table, merge_results_with_metadata
from bio_hansel.utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
group_fastqs, collect_fasta_from_dir, init_subtyping_params, df_field_fillna)

SCRIPT_NAME = 'hansel'
Expand Down Expand Up @@ -81,6 +81,9 @@ def init_parser():
parser.add_argument('--min-kmer-freq',
type=int,
help='Min k-mer freq/coverage')
parser.add_argument('--min-kmer-frac',
type=float,
help='Proportion of k-mer required for detection (0.0 - 1)')
parser.add_argument('--max-kmer-freq',
type=int,
help='Max k-mer freq/coverage')
Expand Down Expand Up @@ -258,6 +261,8 @@ def main():
if output_kmer_results:
if len(dfs) > 0:
dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False)
#Error message is redundant accross each of the k-mers
dfall = dfall.drop(columns=['qc_message'])
dfall = df_field_fillna(dfall)
dfall.to_csv(output_kmer_results, **kwargs_for_pd_to_table)
logging.info('Kmer results written to "{}".'.format(output_kmer_results))
Expand Down
33 changes: 31 additions & 2 deletions bio_hansel/subtyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,32 @@ def parallel_query_reads(reads: List[Tuple[List[str], str]],
outputs = [x.get() for x in res]
return outputs

def get_kmer_fraction(row):
"""Calculate the percentage frequency of a given position

Args:
df: BioHansel k-mer frequence pandas df row
Returns:
- float of percentage abundance
"""
total_freq = row.total_refposition_kmer_frequency
return row.freq / total_freq if total_freq > 0 else 0.0

def calc_kmer_fraction(df):
"""Calculate the percentage each k-mer frequency represents for a given position

Args:
df: BioHansel k-mer frequence pandas df

Returns:
- pd.DataFrame with k-mers with kmer_fraction and total_refposition_kmer_frequency columns added
"""
position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().to_dict()
df['total_refposition_kmer_frequency'] = df.apply(lambda row: position_frequencies['freq'].get(row.refposition, 0), axis=1)
df['kmer_fraction'] = df.apply(get_kmer_fraction, axis=1)

return df


def subtype_reads(reads: Union[str, List[str]],
genome_name: str,
Expand Down Expand Up @@ -285,9 +311,12 @@ def subtype_reads(reads: Union[str, List[str]],
df['subtype'] = subtypes
df['is_pos_kmer'] = ~df.kmername.str.contains('negative')
df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq)
#apply a scaled approach for filtering of k-mers required for high coverage amplicon data
df = calc_kmer_fraction(df)
df['is_kmer_fraction_okay'] = df.kmer_fraction >= subtyping_params.min_kmer_frac
st.avg_kmer_coverage = df['freq'].mean()
st, df = process_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts)
st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
st, filtered_df = process_subtyping_results(st, df[(df.is_kmer_freq_okay & df.is_kmer_fraction_okay)], scheme_subtype_counts)
st.qc_status, st.qc_message = perform_quality_check(st, filtered_df, subtyping_params)
df['file_path'] = str(st.file_path)
df['sample'] = genome_name
df['scheme'] = scheme_name or scheme
Expand Down
5 changes: 3 additions & 2 deletions bio_hansel/subtyping_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ class SubtypingParams(object):
min_ambiguous_kmers = attr.ib(default=3, validator=attr.validators.instance_of(int))
max_perc_intermediate_kmers = attr.ib(default=0.05, validator=attr.validators.instance_of(float))
min_kmer_freq = attr.ib(default=8, validator=attr.validators.instance_of((float, int)))
max_kmer_freq = attr.ib(default=10000, validator=attr.validators.instance_of((float, int)))
min_kmer_frac = attr.ib(default=0.05, validator=attr.validators.instance_of(float))
max_kmer_freq = attr.ib(default=1000000, validator=attr.validators.instance_of((float, int)))
min_coverage_warning = attr.ib(default=20, validator=attr.validators.instance_of((float, int)))
max_degenerate_kmers = attr.ib(default=100000, validator=attr.validators.instance_of(int))
max_degenerate_kmers = attr.ib(default=10000000, validator=attr.validators.instance_of(int))

@max_perc_missing_kmers.validator
def _validate_max_perc_missing_kmers(self, attribute, value):
Expand Down
2 changes: 2 additions & 0 deletions bio_hansel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ def init_subtyping_params(args: Optional[Any] = None,
subtyping_params.min_coverage_warning = args.low_cov_warning
if args.min_kmer_freq:
subtyping_params.min_kmer_freq = args.min_kmer_freq
if args.min_kmer_frac:
subtyping_params.min_kmer_frac = args.min_kmer_frac
if args.max_kmer_freq:
subtyping_params.max_kmer_freq = args.max_kmer_freq
if args.max_degenerate_kmers:
Expand Down
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
'is_pos_kmer', 'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message']

exp_fastq_cols = ['kmername', 'refposition', 'subtype', 'seq', 'freq', 'is_pos_kmer', 'is_kmer_freq_okay',
'kmer_fraction','total_refposition_kmer_frequency','is_kmer_fraction_okay',
'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message']


Expand Down