Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix metadata NA value issue #120

Merged
merged 1 commit into from
Oct 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 23 additions & 30 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,9 @@
from .subtyper import \
subtype_contigs_samples, \
subtype_reads_samples
from .metadata import read_metadata_table, merge_metadata_with_summary_results
from .utils import \
genome_name_from_fasta_path, \
get_scheme_fasta, \
does_file_exist, \
collect_fastq_from_dir, \
group_fastqs, \
collect_fasta_from_dir, \
init_subtyping_params
from .metadata import read_metadata_table, merge_results_with_metadata
from .utils import (genome_name_from_fasta_path, get_scheme_fasta, does_file_exist, collect_fastq_from_dir,
group_fastqs, collect_fasta_from_dir, init_subtyping_params, df_field_fillna)

SCRIPT_NAME = 'hansel'
LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
Expand Down Expand Up @@ -70,7 +64,7 @@ def init_parser():
help='input fasta file path AND genome name')
parser.add_argument('-D', '--input-directory',
help='directory of input fasta files (.fasta|.fa|.fna) or FASTQ files (paired FASTQ should '
'have same basename with "_\d\.(fastq|fq)" postfix to be automatically paired) '
'have same basename with "_\\d\\.(fastq|fq)" postfix to be automatically paired) '
'(files can be Gzipped)')
parser.add_argument('-o', '--output-summary',
help='Subtyping summary output path (tab-delimited)')
Expand Down Expand Up @@ -126,7 +120,7 @@ def collect_inputs(args: Any) -> Tuple[List[Tuple[str, str]], List[Tuple[List[st
"""Collect all input files for analysis

Sample names are derived from the base filename with no extensions.
Sequencing reads are paired if they share a common filename name without "_\d".
Sequencing reads are paired if they share a common filename name without "_\\d".
Filepaths for contigs and reads files are collected from an input directory if provided.

Args:
Expand Down Expand Up @@ -172,7 +166,7 @@ def collect_inputs(args: Any) -> Tuple[List[Tuple[str, str]], List[Tuple[List[st
continue
filenames = [os.path.basename(y) for y in x]
common_prefix = os.path.commonprefix(filenames)
genome_name = re.sub(r'[\W\_]+$', r'', common_prefix)
genome_name = re.sub(r'[\W_]+$', r'', common_prefix)
if genome_name == '':
genome_name = filenames[0]
reads.append((x, genome_name))
Expand All @@ -192,32 +186,31 @@ def main():
does_file_exist(output_simple_summary_path, args.force)
does_file_exist(output_summary_path, args.force)
does_file_exist(output_kmer_results, args.force)
scheme = args.scheme # type: str
scheme_name = args.scheme_name # type: Optional[str]
scheme: str = args.scheme
scheme_name: Optional[str] = args.scheme_name
scheme_fasta = get_scheme_fasta(scheme)
scheme_subtype_counts = subtype_counts(scheme_fasta)
directory_path = args.input_directory
logging.debug(args)
subtyping_params = init_subtyping_params(args, scheme)
input_contigs, input_reads = collect_inputs(args)
if len(input_contigs) == 0 and len(input_reads) == 0:
raise Exception('No input files specified!')

df_md = None
try:
df_md = read_metadata_table(resource_filename(program_name, f'data/{scheme}/metadata.tsv'))
except Exception:
pass

md_path = resource_filename(program_name, f'data/{scheme}/metadata.tsv')
if os.path.exists(md_path):
df_md = read_metadata_table(md_path)

if args.scheme_metadata:
if df_md is None:
df_md = pd. DataFrame()
df_md = pd.concat([df_md, read_metadata_table(args.scheme_metadata)], axis=1)
df_md = df_md.loc[:, ~df_md.columns.duplicated()]
df_md = read_metadata_table(args.scheme_metadata)
else:
df_md = pd.concat([df_md, read_metadata_table(args.scheme_metadata)], axis=1)
df_md = df_md.loc[:, ~df_md.columns.duplicated()]

n_threads = args.threads

subtype_results: List[Tuple[Subtype, pd.DataFrame]] = [] # type: List[Tuple[Subtype, pd.DataFrame]]
subtype_results: List[Tuple[Subtype, pd.DataFrame]] = []
if len(input_contigs) > 0:
contigs_results = subtype_contigs_samples(input_genomes=input_contigs,
scheme=scheme,
Expand All @@ -237,18 +230,18 @@ def main():
logging.info('Generated %s subtyping results from %s contigs samples', len(reads_results), len(input_reads))
subtype_results += reads_results

dfs: List[pd.DataFrame] = [df for st, df in subtype_results] # type: List[pd.DataFrame]
dfs: List[pd.DataFrame] = [df for st, df in subtype_results]
dfsummary = pd.DataFrame([attr.asdict(st) for st, df in subtype_results])

dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]

if dfsummary['avg_kmer_coverage'].isnull().all():
dfsummary = dfsummary.drop(labels='avg_kmer_coverage', axis=1)

dfsummary['subtype'].fillna(value='#N/A', inplace=True)
dfsummary = df_field_fillna(dfsummary)

if df_md is not None:
dfsummary = merge_metadata_with_summary_results(dfsummary, df_md)
dfsummary = merge_results_with_metadata(dfsummary, df_md)

kwargs_for_pd_to_table = dict(sep='\t', index=None, float_format='%.3f')
kwargs_for_pd_to_json = dict(orient='records')
Expand All @@ -264,8 +257,8 @@ def main():

if output_kmer_results:
if len(dfs) > 0:
dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False) # type: pd.DataFrame
dfall['subtype'].fillna(value='#N/A', inplace=True)
dfall: pd.DataFrame = pd.concat([df.sort_values('is_pos_kmer', ascending=False) for df in dfs], sort=False)
dfall = df_field_fillna(dfall)
dfall.to_csv(output_kmer_results, **kwargs_for_pd_to_table)
logging.info('Kmer results written to "{}".'.format(output_kmer_results))
if args.json:
Expand All @@ -283,7 +276,7 @@ def main():
df_simple_summary = dfsummary[['sample', 'subtype', 'qc_status', 'qc_message']]

if df_md is not None:
df_simple_summary = merge_metadata_with_summary_results(df_simple_summary, df_md)
df_simple_summary = merge_results_with_metadata(df_simple_summary, df_md)

df_simple_summary.to_csv(output_simple_summary_path, **kwargs_for_pd_to_table)
if args.json:
Expand Down
9 changes: 4 additions & 5 deletions bio_hansel/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,16 @@ def read_metadata_table(path: str) -> Optional[pd.DataFrame]:
list(FILE_EXT_TO_PD_READ_FUNC.keys())
))
return None
dfmd = FILE_EXT_TO_PD_READ_FUNC[file_ext](path) # type: pd.DataFrame
dfmd: pd.DataFrame = FILE_EXT_TO_PD_READ_FUNC[file_ext](path)
assert np.any(dfmd.columns == 'subtype'), 'Column with name "subtype" expected in metadata file "{}"'.format(path)
dfmd.subtype.fillna('#N/A', inplace=True)
dfmd.subtype = dfmd.subtype.astype(str)
logging.info('Read scheme metadata file "{}" into DataFrame with shape {}'.format(path, dfmd.shape))
return dfmd


def merge_results_with_metadata(df_results: pd.DataFrame, df_metadata: pd.DataFrame) -> pd.DataFrame:
    """Merge subtype results table with metadata table.

    A left join on the ``subtype`` column so every results row is kept;
    rows whose subtype has no metadata entry get NaN in the metadata columns.
    Callers are expected to have normalized the ``subtype`` column of both
    tables to strings beforehand (e.g. via ``df_field_fillna``).

    Args:
        df_results: Subtyping results table. Must contain a ``subtype`` column.
        df_metadata: Subtype metadata table. Must contain a ``subtype`` column.

    Returns:
        Subtyping results with subtype metadata merged in if metadata is present for subtype results.
    """
    return pd.merge(df_results, df_metadata, how='left', on='subtype')
2 changes: 1 addition & 1 deletion bio_hansel/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def parse_fastq(filepath):
with os.popen('zcat < {}'.format(filepath)) as f:
yield from _parse_fastq(f)
else:
with open(filepath, 'rU') as f:
with open(filepath, 'r') as f:
yield from _parse_fastq(f)


Expand Down
9 changes: 9 additions & 0 deletions bio_hansel/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import re
from collections import defaultdict

import pandas as pd

from .const import SCHEME_FASTAS, REGEX_FASTQ, REGEX_FASTA
from .subtyping_params import SubtypingParams

Expand Down Expand Up @@ -204,3 +206,10 @@ def init_subtyping_params(args: Optional[Any] = None,
subtyping_params.max_degenerate_kmers = args.max_degenerate_kmers

return subtyping_params


def df_field_fillna(df: pd.DataFrame, field: str = 'subtype', na: str = '#N/A') -> pd.DataFrame:
    """Normalize missing values in one column of `df` to a sentinel string.

    Empty strings and NaN/None in column `field` are replaced with `na`, and the
    whole column is cast to ``str`` so downstream merges on the column compare
    string-to-string.

    Note: the original chained ``inplace=True`` calls on ``df[field]`` are
    chained assignment — under pandas copy-on-write they do not modify ``df``
    and raise FutureWarning — so the result is built once and assigned back
    directly.

    Args:
        df: DataFrame to modify; mutated in place and also returned.
        field: Name of the column to normalize (default ``'subtype'``).
        na: Sentinel string for empty/missing values (default ``'#N/A'``).

    Returns:
        The same DataFrame object with column `field` normalized.
    """
    df[field] = df[field].replace('', na).fillna(na).astype(str)
    return df
6 changes: 6 additions & 0 deletions tests/data/subtype-metadata-with-na.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
subtype a b c
1 a1 b1 c1
1.1 a1.1 b1.1 c1.1
2.2.1.1.1.1 a2.2.1.1.1.1 b2.2.1.1.1.1 c2.2.1.1.1.1
2.2.2.2.1.4 a2.2.2.2.1.4 b2.2.2.2.1.4 c2.2.2.2.1.4
#N/A a b c
34 changes: 27 additions & 7 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,39 @@
import pandas as pd
import numpy as np

from bio_hansel.metadata import read_metadata_table, merge_metadata_with_summary_results
from bio_hansel.metadata import read_metadata_table, merge_results_with_metadata
from bio_hansel.utils import df_field_fillna


def test_read_metadata():
    """Merged table must contain every metadata column; for subtypes present in
    the metadata file the merged value is '<column><subtype>' (e.g. subtype 1.1
    -> a=a1.1), while rows with the '#N/A' sentinel subtype get NaN metadata."""
    df_results = df_field_fillna(pd.read_table('tests/data/subtyping-results.tsv'))
    df_md = read_metadata_table('tests/data/subtype-metadata.tsv')
    df_merged = merge_results_with_metadata(df_results, df_md)
    assert np.all(df_md.columns.isin(df_merged.columns))
    for _, row in df_merged.iterrows():
        subtype = row['subtype']
        for col in ('a', 'b', 'c'):
            if subtype == '#N/A':
                # no metadata entry for the sentinel subtype -> left join yields NaN
                assert np.isnan(row[col])
            else:
                assert row[col] == col + subtype


def test_read_metadata_with_na_subtype():
    """Like test_read_metadata, but the metadata file itself contains a '#N/A'
    subtype row (values a/b/c), so results rows with the '#N/A' sentinel subtype
    should pick up that row's metadata instead of NaN.

    This span interleaved removed and added diff lines from the scraped pull
    request; it is reconstructed here as the final (post-merge) version.
    """
    df_results = pd.read_table('tests/data/subtyping-results.tsv')
    df_results = df_field_fillna(df_results)
    df_md = read_metadata_table('tests/data/subtype-metadata-with-na.tsv')
    df_merged = merge_results_with_metadata(df_results, df_md)
    assert np.all(df_md.columns.isin(df_merged.columns))
    for idx, row in df_merged.iterrows():
        # for metadata columns a,b,c, the merged metadata value should be the column name + subtype
        # e.g. for subtype 1.1, a=a1.1; b=b1.1; c=c1.1
        # the '#N/A' metadata row supplies bare column-name values (a, b, c)
        for column in ['a', 'b', 'c']:
            if row['subtype'] != '#N/A':
                assert row[column] == column + row['subtype']
            else:
                assert row[column] == column, f'row={row}'