From 5eb56fb6bb82057b798fbe88b1923d27ab6982e2 Mon Sep 17 00:00:00 2001 From: Vince Reuter Date: Wed, 1 May 2019 17:14:43 -0400 Subject: [PATCH] relocate sequencing functions; release prep --- .gitignore | 5 +- docs/autodoc_build/ubiquerg.md | 335 ------------------------- docs/changelog.md | 8 +- tests/{generic => }/test_collection.py | 0 ubiquerg/__init__.py | 2 +- ubiquerg/ngs.py | 234 ----------------- 6 files changed, 5 insertions(+), 579 deletions(-) delete mode 100644 docs/autodoc_build/ubiquerg.md rename tests/{generic => }/test_collection.py (100%) delete mode 100644 ubiquerg/ngs.py diff --git a/.gitignore b/.gitignore index a1ac07a..fe03df3 100644 --- a/.gitignore +++ b/.gitignore @@ -53,10 +53,6 @@ Thumbs.db # libreoffice lock files: .~lock* -# Default-named test output -microtest/ -open_pipelines/ - # IDE-specific items .idea/ @@ -69,5 +65,6 @@ open_pipelines/ *RESERVE* # Build-related stuff +docs/autodoc_build dist/ ubiquerg.egg-info/ diff --git a/docs/autodoc_build/ubiquerg.md b/docs/autodoc_build/ubiquerg.md deleted file mode 100644 index 380560c..0000000 --- a/docs/autodoc_build/ubiquerg.md +++ /dev/null @@ -1,335 +0,0 @@ -# Package ubiquerg Documentation - -## Class PeekBamResult -PeekBamResult(read_lengths, paired) - - -### paired -Alias for field number 1 -```python -def paired(self) -``` - - - - -### read\_lengths -Alias for field number 0 -```python -def read_lengths(self) -``` - - - - -## Class UnsupportedFiletypeException -Restrict domain of file types. - - -## Class defaultdict -defaultdict(default_factory[, ...]) --> dict with default factory - -The default factory is called without arguments to produce -a new value when a key is not present, in __getitem__ only. -A defaultdict compares equal to a dict with the same items. -All remaining arguments are treated the same as if they were -passed to the dict constructor, including keyword arguments. - - -### count\_fail\_reads -Count the number of reads that failed platform/vendor quality checks. 
-```python -def count_fail_reads(file_name, paired_end, prog_path) -``` - -**Parameters:** - -- `file_name` -- `str`: name/path to file to examine -- `paired_end` -- ``: this parameter is ignored; samtools automaticallycorrectly responds depending on the data in the bamfile; we leave the option here just for consistency, since all the other counting functions require the parameter; this makes it easier to swap counting functions during pipeline development. -- `prog_path` -- `str`: path to main program/tool to use for the counting - - -**Returns:** - -`int`: count of failed reads - - - - -### count\_flag\_reads -Counts the number of reads with the specified flag. -```python -def count_flag_reads(file_name, flag, paired_end, prog_path) -``` - -**Parameters:** - -- `file_name` -- `str`: name/path to file to examine -- `flag` -- `str int |`: SAM flag value to be read -- `paired_end` -- ``: this parameter is ignored; samtools automaticallycorrectly responds depending on the data in the bamfile; we leave the option here just for consistency, since all the other counting functions require the parameter; this makes it easier to swap counting functions during pipeline development. -- `prog_path` -- `str`: path to main program/tool to use for the counting - - -**Returns:** - -`str`: terminal-like text output - - - - -### count\_lines -Uses the command-line utility wc to count the number of lines in a file. - -For MacOS, must strip leading whitespace from wc. -```python -def count_lines(file_name) -``` - -**Parameters:** - -- `file_name` -- `str`: name of file whose lines are to be counted - - -**Returns:** - -`str`: terminal-like text output - - - - -### count\_lines\_zip -Count number of lines in a zipped file. - -This function eses the command-line utility wc to count the number of lines -in a file. For MacOS, strip leading whitespace from wc. 
-```python -def count_lines_zip(file_name) -``` - -**Parameters:** - -- `file_name` -- `str`: path to file in which to count lines - - -**Returns:** - -`str`: terminal-like text output - - - - -### count\_reads -Count reads in a file. - -Paired-end reads count as 2 in this function. -For paired-end reads, this function assumes that the reads are split -into 2 files, so it divides line count by 2 instead of 4. -This will thus give an incorrect result if your paired-end fastq files -are in only a single file (you must divide by 2 again). -```python -def count_reads(file_name, paired_end, prog_path) -``` - -**Parameters:** - -- `file_name` -- `str`: Name/path of file whose reads are to be counted. -- `paired_end` -- `bool`: Whether the file contains paired-end reads. -- `prog_path` -- `str`: path to main program/tool to use for the counting - - -**Returns:** - -`str`: terminal-like text output (if input is SAM/BAM), or actualcount value (if input isn't SAM/BAM) - - - - -### get\_input\_ext -Get the extension of the input_file. - -This function assumes you're using .bam, .fastq/.fq, or .fastq.gz/.fq.gz. -```python -def get_input_ext(input_file) -``` - -**Parameters:** - -- `input_file` -- `str`: name/path of file for which to get extension - - -**Returns:** - -`str`: standardized extension - - -**Raises:** - -- `ubiquerg.ngs.UnsupportedFiletypeException`: if the given file nameor path has an extension that's not supported - - - - -### is\_collection\_like -Determine whether an object is collection-like. -```python -def is_collection_like(c) -``` - -**Parameters:** - -- `c` -- `object`: Object to test as collection - - -**Returns:** - -`bool`: Whether the argument is a (non-string) collection - - - - -### is\_fastq -Determine whether indicated file appears to be in FASTQ format. -```python -def is_fastq(file_name) -``` - -**Parameters:** - -- `file_name` -- `str`: Name/path of file to check as FASTQ. 
- - -**Returns:** - -`bool`: Whether indicated file appears to be in FASTQ format, zippedor unzipped. - - - - -### is\_gzipped\_fastq -Determine whether indicated file appears to be a gzipped FASTQ. -```python -def is_gzipped_fastq(file_name) -``` - -**Parameters:** - -- `file_name` -- `str`: Name/path of file to check as gzipped FASTQ. - - -**Returns:** - -`bool`: Whether indicated file appears to be in gzipped FASTQ format. - - - - -### is\_sam\_or\_bam -Determine whether a file appears to be in a SAM format. -```python -def is_sam_or_bam(file_name) -``` - -**Parameters:** - -- `file_name` -- `str`: Name/path of file to check as SAM-formatted. - - -**Returns:** - -`bool`: Whether file appears to be SAM-formatted - - - - -### is\_unzipped\_fastq -Determine whether indicated file appears to be an unzipped FASTQ. -```python -def is_unzipped_fastq(file_name) -``` - -**Parameters:** - -- `file_name` -- `str`: Name/path of file to check as unzipped FASTQ. - - -**Returns:** - -`bool`: Whether indicated file appears to be in unzipped FASTQ format. - - - - -### namedtuple -Returns a new subclass of tuple with named fields. - ->>> Point = namedtuple('Point', ['x', 'y']) ->>> Point.__doc__ # docstring for the new class -'Point(x, y)' ->>> p = Point(11, y=22) # instantiate with positional args or keywords ->>> p[0] + p[1] # indexable like a plain tuple -33 ->>> x, y = p # unpack like a regular tuple ->>> x, y -(11, 22) ->>> p.x + p.y # fields also accessible by name -33 ->>> d = p._asdict() # convert to a dictionary ->>> d['x'] -11 ->>> Point(**d) # convert from a dictionary -Point(x=11, y=22) ->>> p._replace(x=100) # _replace() is like str.replace() but targets named fields -Point(x=100, y=22) -```python -def namedtuple(typename, field_names, *, verbose=False, rename=False, module=None) -``` - - - - -### peek\_read\_lengths\_and\_paired\_counts\_from\_bam -Counting read lengths and paired reads in a sample from a BAM. 
-```python -def peek_read_lengths_and_paired_counts_from_bam(bam, sample_size) -``` - -**Parameters:** - -- `bam` -- `str`: path to BAM file to examine -- `sample_size` -- `int`: number of reads to look at for estimation - - -**Returns:** - -`defaultdict[int, int], int`: read length observation counts, andnumber of paired reads observed - - -**Raises:** - -- `OSError`: - - - - -### samtools\_view -Run samtools view, with flexible parameters and post-processing. - -This is used to implement the various read counting functions. -```python -def samtools_view(file_name, param, prog_path, postpend='') -``` - -**Parameters:** - -- `file_name` -- `str`: name/path of reads tile to use -- `param` -- `str`: String of parameters to pass to samtools view -- `prog_path` -- `str`: path to the samtools program -- `postpend` -- `str`: String to append to the samtools command;useful to add cut, sort, wc operations to the samtools view output. - - -**Returns:** - -`str`: terminal-like text output - - - diff --git a/docs/changelog.md b/docs/changelog.md index ada8158..9ce72e9 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,10 +1,8 @@ # Changelog -## [0.0.2] -### Added -- Added more utility functions -### Changed -- `check_bam` is renamed to `peek_read_lengths_and_paired_counts_from_bam` +## [0.0.2] - 2019-05-01 +### Changed +- Restrict offerings to most generic functionality ## [0.0.1] - 2019-04-30 - First release version diff --git a/tests/generic/test_collection.py b/tests/test_collection.py similarity index 100% rename from tests/generic/test_collection.py rename to tests/test_collection.py diff --git a/ubiquerg/__init__.py b/ubiquerg/__init__.py index 7e83680..466f849 100644 --- a/ubiquerg/__init__.py +++ b/ubiquerg/__init__.py @@ -1,3 +1,3 @@ """ Package exports """ from .collection import * -from .ngs import * +from ._version import __version__ diff --git a/ubiquerg/ngs.py b/ubiquerg/ngs.py deleted file mode 100644 index fd078c6..0000000 --- a/ubiquerg/ngs.py +++
/dev/null @@ -1,234 +0,0 @@ -""" NGS file utilities """ - -from collections import defaultdict, namedtuple -import os -import subprocess - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - -PeekBamResult = namedtuple("PeekBamResult", ["read_lengths", "paired"]) - - -def count_fail_reads(file_name, paired_end, prog_path): - """ - Count the number of reads that failed platform/vendor quality checks. - - :param str file_name: name/path to file to examine - :param paired_end: this parameter is ignored; samtools automatically - correctly responds depending on the data in the bamfile; we leave the - option here just for consistency, since all the other counting - functions require the parameter; this makes it easier to swap counting - functions during pipeline development. - :param str prog_path: path to main program/tool to use for the counting - :return int: count of failed reads - """ - return int(count_flag_reads(file_name, 512, paired_end, prog_path)) - - -def count_flag_reads(file_name, flag, paired_end, prog_path): - """ - Counts the number of reads with the specified flag. - - :param str file_name: name/path to file to examine - :param str int | flag: SAM flag value to be read - :param paired_end: this parameter is ignored; samtools automatically - correctly responds depending on the data in the bamfile; we leave the - option here just for consistency, since all the other counting - functions require the parameter; this makes it easier to swap counting - functions during pipeline development. - :param str prog_path: path to main program/tool to use for the counting - :return str: terminal-like text output - """ - - param = " -c -f" + str(flag) - if file_name.endswith("sam"): - param += " -S" - return samtools_view(file_name, param=param, prog_path=prog_path) - - -def count_lines(file_name): - """ - Uses the command-line utility wc to count the number of lines in a file. - - For MacOS, must strip leading whitespace from wc. 
- - :param str file_name: name of file whose lines are to be counted - :return str: terminal-like text output - """ - cmd = "wc -l " + file_name + " | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '" - return subprocess.check_output(cmd, shell=True).decode().strip() - - -def count_lines_zip(file_name): - """ - Count number of lines in a zipped file. - - This function eses the command-line utility wc to count the number of lines - in a file. For MacOS, strip leading whitespace from wc. - - :param str file_name: path to file in which to count lines - :return str: terminal-like text output - """ - cmd = "gunzip -c " + file_name + " | wc -l | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '" - return subprocess.check_output(cmd, shell=True).decode().strip() - - -def count_reads(file_name, paired_end, prog_path): - """ - Count reads in a file. - - Paired-end reads count as 2 in this function. - For paired-end reads, this function assumes that the reads are split - into 2 files, so it divides line count by 2 instead of 4. - This will thus give an incorrect result if your paired-end fastq files - are in only a single file (you must divide by 2 again). - - :param str file_name: Name/path of file whose reads are to be counted. - :param bool paired_end: Whether the file contains paired-end reads. - :param str prog_path: path to main program/tool to use for the counting - :return str: terminal-like text output (if input is SAM/BAM), or actual - count value (if input isn't SAM/BAM) - """ - - _, ext = os.path.splitext(file_name) - if not (is_sam_or_bam(file_name) or is_fastq(file_name)): - # TODO: make this an exception and force caller to handle that - # rather than relying on knowledge of possibility of negative value. 
- return -1 - - if is_sam_or_bam(file_name): - param_text = "-c" if ext == ".bam" else "-c -S" - return samtools_view(file_name, param=param_text, prog_path=prog_path) - else: - num_lines = count_lines_zip(file_name) \ - if is_gzipped_fastq(file_name) \ - else count_lines(file_name) - divisor = 2 if paired_end else 4 - return int(num_lines) / divisor - - -def get_input_ext(input_file): - """ - Get the extension of the input_file. - - This function assumes you're using .bam, .fastq/.fq, or .fastq.gz/.fq.gz. - - :param str input_file: name/path of file for which to get extension - :return str: standardized extension - :raise ubiquerg.ngs.UnsupportedFiletypeException: if the given file name - or path has an extension that's not supported - """ - if input_file.endswith(".bam"): - return ".bam" - elif input_file.endswith(".fastq.gz") or input_file.endswith(".fq.gz"): - return ".fastq.gz" - elif input_file.endswith(".fastq") or input_file.endswith(".fq"): - return ".fastq" - errmsg = "'{}'; this pipeline can only deal with .bam, .fastq, " \ - "or .fastq.gz files".format(input_file) - raise UnsupportedFiletypeException(errmsg) - - -def is_fastq(file_name): - """ - Determine whether indicated file appears to be in FASTQ format. - - :param str file_name: Name/path of file to check as FASTQ. - :return bool: Whether indicated file appears to be in FASTQ format, zipped - or unzipped. - """ - return is_unzipped_fastq(file_name) or is_gzipped_fastq(file_name) - - -def is_gzipped_fastq(file_name): - """ - Determine whether indicated file appears to be a gzipped FASTQ. - - :param str file_name: Name/path of file to check as gzipped FASTQ. - :return bool: Whether indicated file appears to be in gzipped FASTQ format. - """ - _, ext = os.path.splitext(file_name) - return file_name.endswith(".fastq.gz") or file_name.endswith(".fq.gz") - - -def is_sam_or_bam(file_name): - """ - Determine whether a file appears to be in a SAM format. 
- - :param str file_name: Name/path of file to check as SAM-formatted. - :return bool: Whether file appears to be SAM-formatted - """ - _, ext = os.path.splitext(file_name) - return ext in [".bam", ".sam"] - - -def is_unzipped_fastq(file_name): - """ - Determine whether indicated file appears to be an unzipped FASTQ. - - :param str file_name: Name/path of file to check as unzipped FASTQ. - :return bool: Whether indicated file appears to be in unzipped FASTQ format. - """ - _, ext = os.path.splitext(file_name) - return ext in [".fastq", ".fq"] - - -def peek_read_lengths_and_paired_counts_from_bam(bam, sample_size): - """ - Counting read lengths and paired reads in a sample from a BAM. - - :param str bam: path to BAM file to examine - :param int sample_size: number of reads to look at for estimation - :return defaultdict[int, int], int: read length observation counts, and - number of paired reads observed - :raise OSError: - """ - try: - p = subprocess.Popen(['samtools', 'view', bam], stdout=subprocess.PIPE) - # Count paired alignments - paired = 0 - read_lengths = defaultdict(int) - for _ in range(sample_size): - line = p.stdout.readline().decode().split("\t") - flag = int(line[1]) - read_lengths[len(line[9])] += 1 - if 1 & flag: # check decimal flag contains 1 (paired) - paired += 1 - p.kill() - except OSError: - reason = "Note (samtools not in path): For NGS inputs, " \ - "pep needs samtools to auto-populate " \ - "'read_length' and 'read_type' attributes; " \ - "these attributes were not populated." - raise OSError(reason) - - return PeekBamResult(read_lengths, paired) - - -def samtools_view(file_name, param, prog_path, postpend=""): - """ - Run samtools view, with flexible parameters and post-processing. - - This is used to implement the various read counting functions. 
- - :param str file_name: name/path of reads tile to use - :param str param: String of parameters to pass to samtools view - :param str prog_path: path to the samtools program - :param str postpend: String to append to the samtools command; - useful to add cut, sort, wc operations to the samtools view output. - :return str: terminal-like text output - """ - cmd = "{prog} view {opts} {f} {extra}".format( - prog=prog_path, opts=param, f=file_name, extra=postpend) - # in python 3, check_output returns a byte string which causes issues. - # with python 3.6 we could use argument: "encoding='UTF-8'"" - return subprocess.check_output(cmd, shell=True).decode().strip() - - -class UnsupportedFiletypeException(Exception): - """ Restrict domain of file types. """ - # Use superclass ctor to allow file name/path or extension to pass - # through as the message for why this error is occurring. - pass