Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Covid19 #140

Merged
merged 10 commits into from
Mar 4, 2021
19 changes: 12 additions & 7 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
import attr
import pandas as pd

from . import program_desc, __version__, program_name
from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from .subtype import Subtype
from .subtype_stats import subtype_counts
from .subtyper import \
from bio_hansel import program_desc, __version__, program_name
from bio_hansel.const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from bio_hansel.subtype import Subtype
from bio_hansel.subtype_stats import subtype_counts
from bio_hansel.subtyper import \
subtype_contigs_samples, \
subtype_reads_samples
from .metadata import read_metadata_table, merge_results_with_metadata
from .utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
from bio_hansel.metadata import read_metadata_table, merge_results_with_metadata
from bio_hansel.utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
group_fastqs, collect_fasta_from_dir, init_subtyping_params, df_field_fillna)

SCRIPT_NAME = 'hansel'
Expand Down Expand Up @@ -81,6 +81,9 @@ def init_parser():
parser.add_argument('--min-kmer-freq',
type=int,
help='Min k-mer freq/coverage')
parser.add_argument('--min-kmer-frac',
type=float,
help='Proportion of k-mer required for detection (0.0 - 1)')
parser.add_argument('--max-kmer-freq',
type=int,
help='Max k-mer freq/coverage')
Expand Down Expand Up @@ -258,6 +261,8 @@ def main():
if output_kmer_results:
if len(dfs) > 0:
dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False)
#Error message is redundant accross each of the k-mers
dfall = dfall.drop(columns=['qc_message'])
dfall = df_field_fillna(dfall)
dfall.to_csv(output_kmer_results, **kwargs_for_pd_to_table)
logging.info('Kmer results written to "{}".'.format(output_kmer_results))
Expand Down
31 changes: 29 additions & 2 deletions bio_hansel/subtyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,30 @@ def parallel_query_reads(reads: List[Tuple[List[str], str]],
outputs = [x.get() for x in res]
return outputs

def calc_kmer_fraction(df,min_kmer_frac=0.05):
"""Calculate the percentage each k-mer frequence represents for a given position

Args:
df: BioHansel k-mer frequence pandas df
min_kmer_frac: float 0 - 1 on the minimum fraction a kmer needs to be to be considered valid

Returns:
- pd.DataFrame with k-mers with kmer_fraction column
"""
position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().reset_index()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be easier to use a dict:

position_frequencies = df[['refposition','freq']].groupby(['refposition']).sum().to_dict()

The dict should have refposition keys and summed frequency values, so getting total_freq would be easier and clearer:

total_freq = position_frequencies[row.refposition]

percentages = []
total_refposition_kmer_frequencies = []
for index,row in df.iterrows():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend using .itertuples() for performance reasons:

for row in df.itertuples():
    refposition = row.refposition # cannot do string based access (row['refposition']) with tuples

You could also look into using apply instead of using a for-loop:

def get_kmer_fraction(row):
    total_freq = position_frequencies.get(row.refposition, 0)
    return row.freq / total_freq if total_freq > 0 else 0.0

df['kmer_fraction'] = df.apply(get_kmer_fraction, axis=1)
df['total_refposition_kmer_frequency'] = df.apply(lambda row: position_frequencies.get(row.refposition, 0), axis=1) 

total_freq = position_frequencies.loc[position_frequencies['refposition'] == row['refposition'], 'freq'].iloc[0]
if total_freq > 0:
percentages.append(row['freq'] / total_freq)
else:
percentages.append(0.0)
total_refposition_kmer_frequencies.append(total_freq)
df['kmer_fraction'] = percentages
df['total_refposition_kmer_frequency'] = total_refposition_kmer_frequencies
return df


def subtype_reads(reads: Union[str, List[str]],
genome_name: str,
Expand Down Expand Up @@ -285,9 +309,12 @@ def subtype_reads(reads: Union[str, List[str]],
df['subtype'] = subtypes
df['is_pos_kmer'] = ~df.kmername.str.contains('negative')
df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq)
#apply a scaled approach for filtering of k-mers required for high coverage amplicon data
df = calc_kmer_fraction(df,subtyping_params.min_kmer_frac)
df['is_kmer_fraction_okay'] = df.kmer_fraction >= subtyping_params.min_kmer_frac
st.avg_kmer_coverage = df['freq'].mean()
st, df = process_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts)
st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
st, filtered_df = process_subtyping_results(st, df[(df.is_kmer_freq_okay & df.is_kmer_fraction_okay)], scheme_subtype_counts)
st.qc_status, st.qc_message = perform_quality_check(st, filtered_df, subtyping_params)
df['file_path'] = str(st.file_path)
df['sample'] = genome_name
df['scheme'] = scheme_name or scheme
Expand Down
5 changes: 3 additions & 2 deletions bio_hansel/subtyping_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ class SubtypingParams(object):
min_ambiguous_kmers = attr.ib(default=3, validator=attr.validators.instance_of(int))
max_perc_intermediate_kmers = attr.ib(default=0.05, validator=attr.validators.instance_of(float))
min_kmer_freq = attr.ib(default=8, validator=attr.validators.instance_of((float, int)))
max_kmer_freq = attr.ib(default=10000, validator=attr.validators.instance_of((float, int)))
min_kmer_frac = attr.ib(default=0.05, validator=attr.validators.instance_of(float))
max_kmer_freq = attr.ib(default=1000000, validator=attr.validators.instance_of((float, int)))
min_coverage_warning = attr.ib(default=20, validator=attr.validators.instance_of((float, int)))
max_degenerate_kmers = attr.ib(default=100000, validator=attr.validators.instance_of(int))
max_degenerate_kmers = attr.ib(default=10000000, validator=attr.validators.instance_of(int))

@max_perc_missing_kmers.validator
def _validate_max_perc_missing_kmers(self, attribute, value):
Expand Down
2 changes: 2 additions & 0 deletions bio_hansel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ def init_subtyping_params(args: Optional[Any] = None,
subtyping_params.min_coverage_warning = args.low_cov_warning
if args.min_kmer_freq:
subtyping_params.min_kmer_freq = args.min_kmer_freq
if args.min_kmer_frac:
subtyping_params.min_kmer_frac = args.min_kmer_frac
if args.max_kmer_freq:
subtyping_params.max_kmer_freq = args.max_kmer_freq
if args.max_degenerate_kmers:
Expand Down
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
'is_pos_kmer', 'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message']

exp_fastq_cols = ['kmername', 'refposition', 'subtype', 'seq', 'freq', 'is_pos_kmer', 'is_kmer_freq_okay',
'kmer_fraction','total_refposition_kmer_frequency','is_kmer_fraction_okay',
'sample', 'file_path', 'scheme', 'scheme_version', 'qc_status', 'qc_message']


Expand Down