Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge devel into master #187

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions kb_python/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
GENE_NAME = 'gene'
FEATURE_NAME = 'feature'
TRANSCRIPT_NAME = 'transcript'
GENOMEBAM_FILENAME = 'pseudoalignments.bam'
GENOMEBAM_INDEX_FILENAME = 'pseudoalignments.bam.bai'

UNFILTERED_COUNTS_DIR = 'counts_unfiltered'
FILTERED_COUNTS_DIR = 'counts_filtered'
Expand Down
71 changes: 70 additions & 1 deletion kb_python/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
FLENS_FILENAME,
GENE_NAME,
GENES_FILENAME,
GENOMEBAM_FILENAME,
GENOMEBAM_INDEX_FILENAME,
INSPECT_FILENAME,
INSPECT_INTERNAL_FILENAME,
INSPECT_UMI_FILENAME,
Expand Down Expand Up @@ -90,7 +92,10 @@ def kallisto_bus(
n: bool = False,
k: bool = False,
paired: bool = False,
genomebam: bool = False,
strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None,
gtf_path: Optional[str] = None,
chromosomes_path: Optional[str] = None,
) -> Dict[str, str]:
"""Runs `kallisto bus`.

Expand All @@ -106,7 +111,13 @@ def kallisto_bus(
defaults to `False`
paired: Whether or not to supply the `--paired` flag, only used for
bulk and smartseq2 samples, defaults to `False`
genomebam: Project pseudoalignments to genome sorted BAM file, defaults to
`False`
strand: Strandedness, defaults to `None`
gtf_path: GTF file for transcriptome information (required for --genomebam),
defaults to `None`
chromosomes_path: Tab separated file with chromosome names and lengths
(optional for --genomebam, but recommended), defaults to `None`

Returns:
Dictionary containing paths to generated files
Expand Down Expand Up @@ -137,6 +148,16 @@ def kallisto_bus(
if paired:
command += ['--paired']
results['flens'] = os.path.join(out_dir, FLENS_FILENAME)
if genomebam:
command += ['--genomebam']
if gtf_path is not None:
command += ['-g', gtf_path]
if chromosomes_path is not None:
command += ['-c', chromosomes_path]
results['genomebam'] = os.path.join(out_dir, GENOMEBAM_FILENAME)
results['genomebam_index'] = os.path.join(
out_dir, GENOMEBAM_INDEX_FILENAME
)
if strand == 'unstranded':
command += ['--unstranded']
elif strand == 'forward':
Expand Down Expand Up @@ -955,9 +976,12 @@ def count(
fragment_l: Optional[int] = None,
fragment_s: Optional[int] = None,
paired: bool = False,
genomebam: bool = False,
strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None,
umi_gene: bool = False,
em: bool = False,
gtf_path: Optional[str] = None,
chromosomes_path: Optional[str] = None,
) -> Dict[str, Union[str, Dict[str, str]]]:
"""Generates count matrices for single-cell RNA seq.

Expand Down Expand Up @@ -998,11 +1022,17 @@ def count(
fragment_s: Standard deviation of fragment lengths, defaults to `None`
paired: Whether the fastqs are paired. Has no effect when a single
batch file is provided. Defaults to `False`
genomebam: Project pseudoalignments to genome sorted BAM file, defaults to
`False`
strand: Strandedness, defaults to `None`
umi_gene: Whether to perform gene-level UMI collapsing, defaults to
`False`
em: Whether to estimate gene abundances using EM algorithm,
defaults to `False`
gtf_path: GTF file for transcriptome information (required for --genomebam),
defaults to `None`
chromosomes_path: Tab separated file with chromosome names and lengths
(optional for --genomebam, but recommended), defaults to `None`

Returns:
Dictionary containing paths to generated files
Expand Down Expand Up @@ -1042,7 +1072,10 @@ def count(
out_dir,
threads=threads,
paired=paired,
genomebam=genomebam,
strand=strand,
gtf_path=gtf_path,
chromosomes_path=chromosomes_path,
)
else:
logger.info(
Expand Down Expand Up @@ -1271,7 +1304,10 @@ def count_smartseq3(
h5ad: bool = False,
by_name: bool = False,
inspect: bool = True,
genomebam: bool = False,
strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None,
gtf_path: Optional[str] = None,
chromosomes_path: Optional[str] = None,
) -> Dict[str, Union[str, Dict[str, str]]]:
"""Generates count matrices for Smartseq3.

Expand All @@ -1297,7 +1333,13 @@ def count_smartseq3(
`tcc=False`.
inspect: Whether or not to inspect the output BUS file and generate
the inspect.json
genomebam: Project pseudoalignments to genome sorted BAM file, defaults to
`False`
strand: Strandedness, defaults to `None`
gtf_path: GTF file for transcriptome information (required for --genomebam),
defaults to `None`
chromosomes_path: Tab separated file with chromosome names and lengths
(optional for --genomebam, but recommended), defaults to `None`

Returns:
Dictionary containing paths to generated files
Expand Down Expand Up @@ -1333,7 +1375,10 @@ def count_smartseq3(
out_dir,
threads=threads,
paired=True,
genomebam=genomebam,
strand=strand,
gtf_path=gtf_path,
chromosomes_path=chromosomes_path
)
else:
logger.info(
Expand Down Expand Up @@ -1511,9 +1556,12 @@ def count_velocity(
fragment_l: Optional[int] = None,
fragment_s: Optional[int] = None,
paired: bool = False,
genomebam: bool = False,
strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None,
umi_gene: bool = False,
em: bool = False,
gtf_path: Optional[str] = None,
chromosomes_path: Optional[str] = None,
) -> Dict[str, Union[Dict[str, str], str]]:
"""Generates RNA velocity matrices for single-cell RNA seq.

Expand Down Expand Up @@ -1556,11 +1604,17 @@ def count_velocity(
fragment_s: Standard deviation of fragment lengths, defaults to `None`
paired: Whether the fastqs are paired. Has no effect when a single
batch file is provided. Defaults to `False`
genomebam: Project pseudoalignments to genome sorted BAM file, defaults to
`False`
strand: Strandedness, defaults to `None`
umi_gene: Whether to perform gene-level UMI collapsing, defaults to
`False`
em: Whether to estimate gene abundances using EM algorithm, defaults to
`False`
gtf_path: GTF file for transcriptome information (required for --genomebam),
defaults to `None`
chromosomes_path: Tab separated file with chromosome names and lengths
(optional for --genomebam, but recommended), defaults to `None`

Returns:
Dictionary containing path to generated index
Expand Down Expand Up @@ -1597,7 +1651,10 @@ def count_velocity(
out_dir,
threads=threads,
paired=paired,
strand=strand
genomebam=genomebam,
strand=strand,
gtf_path=gtf_path,
chromosomes_path=chromosomes_path,
)
else:
logger.info(
Expand Down Expand Up @@ -1932,7 +1989,10 @@ def count_velocity_smartseq3(
h5ad: bool = False,
by_name: bool = False,
inspect: bool = True,
genomebam: bool = False,
strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None,
gtf_path: Optional[str] = None,
chromosomes_path: Optional[str] = None,
) -> Dict[str, Union[str, Dict[str, str]]]:
"""Generates count matrices for Smartseq3.

Expand All @@ -1958,7 +2018,13 @@ def count_velocity_smartseq3(
`tcc=False`.
inspect: Whether or not to inspect the output BUS file and generate
the inspect.json
genomebam: Project pseudoalignments to genome sorted BAM file, defaults to
`False`
strand: Strandedness, defaults to `None`
gtf_path: GTF file for transcriptome information (required for --genomebam),
defaults to `None`
chromosomes_path: Tab separated file with chromosome names and lengths
(optional for --genomebam, but recommended), defaults to `None`

Returns:
Dictionary containing paths to generated files
Expand Down Expand Up @@ -1993,7 +2059,10 @@ def count_velocity_smartseq3(
out_dir,
threads=threads,
paired=True,
genomebam=genomebam,
strand=strand,
gtf_path=gtf_path,
chromosomes_path=chromosomes_path
)
else:
logger.info(
Expand Down
50 changes: 46 additions & 4 deletions kb_python/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,13 @@ def parse_count(
if args.tcc and args.gene_names:
parser.error('`--gene-names` may not be used with `--tcc`')

if args.genomebam and not args.gtf:
parser.error('`--gtf` must be provided when using `--genomebam`.')
if args.genomebam and not args.chromosomes:
logger.warning(
'`--chromosomes` is recommended when using `--genomebam`'
)

# Check if batch TSV was provided.
batch_path = None
if len(args.fastqs) == 1:
Expand Down Expand Up @@ -483,8 +490,11 @@ def parse_count(
loom=args.loom,
h5ad=args.h5ad,
inspect=not args.no_inspect,
genomebam=args.genomebam,
strand=args.strand,
by_name=args.gene_names
by_name=args.gene_names,
gtf_path=args.gtf,
chromosomes_path=args.chromosomes,
)
else:
from .count import count_velocity
Expand Down Expand Up @@ -514,10 +524,13 @@ def parse_count(
fragment_l=args.fragment_l,
fragment_s=args.fragment_s,
paired=args.parity == 'paired',
genomebam=args.genomebam,
strand=args.strand,
umi_gene=args.umi_gene,
em=args.em,
by_name=args.gene_names
by_name=args.gene_names,
gtf_path=args.gtf,
chromosomes_path=args.chromosomes,
)
else:
if args.workflow == 'kite:10xFB' and args.x.upper() != '10XV3':
Expand All @@ -542,8 +555,11 @@ def parse_count(
loom=args.loom,
h5ad=args.h5ad,
inspect=not args.no_inspect,
genomebam=args.genomebam,
strand=args.strand,
by_name=args.gene_names
by_name=args.gene_names,
gtf_path=args.gtf,
chromosomes_path=args.chromosomes,
)
else:
from .count import count
Expand Down Expand Up @@ -572,10 +588,13 @@ def parse_count(
fragment_l=args.fragment_l,
fragment_s=args.fragment_s,
paired=args.parity == 'paired',
genomebam=args.genomebam,
strand=args.strand,
umi_gene=args.umi_gene,
em=args.em,
by_name=args.gene_names
by_name=args.gene_names,
gtf_path=args.gtf,
chromosomes_path=args.chromosomes,
)


Expand Down Expand Up @@ -991,6 +1010,29 @@ def setup_count_args(
default=None,
choices=['unstranded', 'forward', 'reverse']
)
parser_count.add_argument(
'--genomebam',
help='Project pseudoalignments to genome sorted BAM file.',
action='store_true',
default=False,
)
parser_count.add_argument(
'--gtf',
help=(
'GTF file for transcriptome information (required for --genomebam).'
),
type=str,
default=None,
)
parser_count.add_argument(
'--chromosomes',
metavar='chrom.sizes',
help=(
'Tab separated file with chromosome names and lengths (optional for --genomebam, but recommended).'
),
type=str,
default=None,
)
parser_count.add_argument(
'--workflow',
help=(
Expand Down
23 changes: 22 additions & 1 deletion kb_python/ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,28 @@ def download_reference(

logger.info('Extracting files from {}'.format(local_path))
with tarfile.open(local_path, 'r:gz') as f:
f.extractall(temp_dir)

def is_within_directory(directory, target):
    """Return whether `target` is located at or below `directory`.

    Both paths are normalized with `os.path.abspath` first, so `..`
    components in `target` are collapsed before the comparison.

    Args:
        directory: Path of the directory that must contain `target`
        target: Path to check

    Returns:
        `True` if `target` is `directory` itself or lies beneath it,
        `False` otherwise
    """
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)

    # `os.path.commonprefix` compares character by character, so it would
    # treat `/tmp/foo` as containing `/tmp/foobar`. `os.path.commonpath`
    # compares whole path components instead.
    try:
        common = os.path.commonpath([abs_directory, abs_target])
    except ValueError:
        # Raised when the paths have nothing in common (e.g. different
        # drives on Windows); the target cannot be inside the directory.
        return False

    return common == abs_directory

def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
    """Extract `tar` into `path` after checking member names for traversal.

    Every member reported by `tar.getmembers()` is checked, regardless of
    `members`. Only the member name joined onto `path` is checked.

    Args:
        tar: Open tar archive object providing `getmembers` and `extractall`
        path: Destination directory, defaults to `"."`
        members: Passed through to `tar.extractall`, defaults to `None`
        numeric_owner: Passed through to `tar.extractall`, defaults to
            `False`

    Raises:
        Exception: If any member name would resolve outside of `path`
    """
    # Lazily evaluated so that the scan stops at the first offending member.
    escapes_destination = (
        not is_within_directory(path, os.path.join(path, entry.name))
        for entry in tar.getmembers()
    )
    if any(escapes_destination):
        raise Exception("Attempted Path Traversal in Tar File")

    tar.extractall(path, members, numeric_owner=numeric_owner)

safe_extract(f, temp_dir)

for option in reference.files:
os.rename(
Expand Down