From 8bf75ec8f5cbf630d14cb35850fed232cc2b43dc Mon Sep 17 00:00:00 2001 From: TrellixVulnTeam Date: Wed, 26 Oct 2022 09:34:11 +0000 Subject: [PATCH 1/5] Adding tarfile member sanitization to extractall() --- kb_python/ref.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kb_python/ref.py b/kb_python/ref.py index d6caf63..ef250ec 100755 --- a/kb_python/ref.py +++ b/kb_python/ref.py @@ -341,7 +341,26 @@ def download_reference( logger.info('Extracting files from {}'.format(local_path)) with tarfile.open(local_path, 'r:gz') as f: - f.extractall(temp_dir) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(f, temp_dir) for option in reference.files: os.rename( From d9a03a0c2dc2642d03fbfa0f25b4a50dcdf3ec10 Mon Sep 17 00:00:00 2001 From: ricomnl Date: Wed, 4 Jan 2023 11:30:42 -0800 Subject: [PATCH 2/5] Added flags to create genomebam file with kallisto bus --- kb_python/constants.py | 2 + kb_python/count.py | 71 ++++++++++- kb_python/main.py | 45 ++++++- tests/test_count.py | 282 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 369 insertions(+), 31 deletions(-) diff --git a/kb_python/constants.py b/kb_python/constants.py index e893186..37f655e 100755 --- a/kb_python/constants.py +++ b/kb_python/constants.py @@ -31,6 +31,8 @@ GENE_NAME = 'gene' FEATURE_NAME = 'feature' TRANSCRIPT_NAME = 'transcript' +GENOMEBAM_FILENAME = 'pseudoalignments.bam' +GENOMEBAM_INDEX_FILENAME = 'pseudoalignments.bam.bai' UNFILTERED_COUNTS_DIR = 'counts_unfiltered' FILTERED_COUNTS_DIR = 'counts_filtered' diff --git a/kb_python/count.py b/kb_python/count.py index f15bfff..de08e46 100755 --- a/kb_python/count.py +++ b/kb_python/count.py @@ -33,6 +33,8 @@ FLENS_FILENAME, GENE_NAME, GENES_FILENAME, + GENOMEBAM_FILENAME, + GENOMEBAM_INDEX_FILENAME, INSPECT_FILENAME, INSPECT_INTERNAL_FILENAME, INSPECT_UMI_FILENAME, @@ -90,7 +92,10 @@ def kallisto_bus( n: bool = False, k: bool = False, paired: bool = False, + genomebam: bool = False, strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None, + gtf_path: Optional[str] = None, + chromosomes_path: Optional[str] = None, ) -> Dict[str, str]: """Runs `kallisto bus`. @@ -106,7 +111,13 @@ def kallisto_bus( defaults to `False` paired: Whether or not to supply the `--paired` flag, only used for bulk and smartseq2 samples, defaults to `False` + genomebam: Project pseudoalignments to genome sorted BAM file, defaults to + `False` strand: Strandedness, defaults to `None` + gtf_path: GTF file for transcriptome information (required for --genomebam), + defaults to `None` + chromosomes_path: Tab separated file with chromosome names and lengths + (optional for --genomebam, but recommended), defaults to `None` Returns: Dictionary containing paths to generated files @@ -137,6 +148,16 @@ def kallisto_bus( if paired: command += ['--paired'] results['flens'] = os.path.join(out_dir, FLENS_FILENAME) + if genomebam: + command += ['--genomebam'] + if gtf_path is not None: + command += ['-g', gtf_path] + if chromosomes_path is not None: + command += ['-c', chromosomes_path] + results['genomebam'] = os.path.join(out_dir, GENOMEBAM_FILENAME) + results['genomebam_index'] = os.path.join( + out_dir, GENOMEBAM_INDEX_FILENAME + ) if strand == 'unstranded': command += ['--unstranded'] elif strand == 'forward': @@ -955,9 +976,12 @@ def count( fragment_l: Optional[int] = None, fragment_s: Optional[int] = None, paired: bool = False, + genomebam: bool = False, strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None, umi_gene: bool = False, em: bool = False, + gtf_path: Optional[str] = None, + chromosomes_path: Optional[str] = None, ) -> Dict[str, Union[str, Dict[str, str]]]: """Generates count matrices for single-cell RNA seq. @@ -998,11 +1022,17 @@ def count( fragment_s: Standard deviation of fragment lengths, defaults to `None` paired: Whether the fastqs are paired. Has no effect when a single batch file is provided. Defaults to `False` + genomebam: Project pseudoalignments to genome sorted BAM file, defaults to + `False` strand: Strandedness, defaults to `None` umi_gene: Whether to perform gene-level UMI collapsing, defaults to `False` em: Whether to estimate gene abundances using EM algorithm, defaults to `False` + gtf_path: GTF file for transcriptome information (required for --genomebam), + defaults to `None` + chromosomes_path: Tab separated file with chromosome names and lengths + (optional for --genomebam, but recommended), defaults to `None` Returns: Dictionary containing paths to generated files @@ -1042,7 +1072,10 @@ def count( out_dir, threads=threads, paired=paired, + genomebam=genomebam, strand=strand, + gtf_path=gtf_path, + chromosomes_path=chromosomes_path, ) else: logger.info( @@ -1271,7 +1304,10 @@ def count_smartseq3( h5ad: bool = False, by_name: bool = False, inspect: bool = True, + genomebam: bool = False, strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None, + gtf_path: Optional[str] = None, + chromosomes_path: Optional[str] = None, ) -> Dict[str, Union[str, Dict[str, str]]]: """Generates count matrices for Smartseq3. @@ -1297,7 +1333,13 @@ def count_smartseq3( `tcc=False`. inspect: Whether or not to inspect the output BUS file and generate the inspect.json + genomebam: Project pseudoalignments to genome sorted BAM file, defaults to + `False` strand: Strandedness, defaults to `None` + gtf_path: GTF file for transcriptome information (required for --genomebam), + defaults to `None` + chromosomes_path: Tab separated file with chromosome names and lengths + (optional for --genomebam, but recommended), defaults to `None` Returns: Dictionary containing paths to generated files @@ -1333,7 +1375,10 @@ def count_smartseq3( out_dir, threads=threads, paired=True, + genomebam=genomebam, strand=strand, + gtf_path=gtf_path, + chromosomes_path=chromosomes_path ) else: logger.info( @@ -1511,9 +1556,12 @@ def count_velocity( fragment_l: Optional[int] = None, fragment_s: Optional[int] = None, paired: bool = False, + genomebam: bool = False, strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None, umi_gene: bool = False, em: bool = False, + gtf_path: Optional[str] = None, + chromosomes_path: Optional[str] = None, ) -> Dict[str, Union[Dict[str, str], str]]: """Generates RNA velocity matrices for single-cell RNA seq. @@ -1556,11 +1604,17 @@ def count_velocity( fragment_s: Standard deviation of fragment lengths, defaults to `None` paired: Whether the fastqs are paired. Has no effect when a single batch file is provided. Defaults to `False` + genomebam: Project pseudoalignments to genome sorted BAM file, defaults to + `False` strand: Strandedness, defaults to `None` umi_gene: Whether to perform gene-level UMI collapsing, defaults to `False` em: Whether to estimate gene abundances using EM algorithm, defaults to `False` + gtf_path: GTF file for transcriptome information (required for --genomebam), + defaults to `None` + chromosomes_path: Tab separated file with chromosome names and lengths + (optional for --genomebam, but recommended), defaults to `None` Returns: Dictionary containing path to generated index @@ -1597,7 +1651,10 @@ def count_velocity( out_dir, threads=threads, paired=paired, - strand=strand + genomebam=genomebam, + strand=strand, + gtf_path=gtf_path, + chromosomes_path=chromosomes_path, ) else: logger.info( @@ -1932,7 +1989,10 @@ def count_velocity_smartseq3( h5ad: bool = False, by_name: bool = False, inspect: bool = True, + genomebam: bool = False, strand: Optional[Literal['unstranded', 'forward', 'reverse']] = None, + gtf_path: Optional[str] = None, + chromosomes_path: Optional[str] = None, ) -> Dict[str, Union[str, Dict[str, str]]]: """Generates count matrices for Smartseq3. @@ -1958,7 +2018,13 @@ def count_velocity_smartseq3( `tcc=False`. inspect: Whether or not to inspect the output BUS file and generate the inspect.json + genomebam: Project pseudoalignments to genome sorted BAM file, defaults to + `False` strand: Strandedness, defaults to `None` + gtf_path: GTF file for transcriptome information (required for --genomebam), + defaults to `None` + chromosomes_path: Tab separated file with chromosome names and lengths + (optional for --genomebam, but recommended), defaults to `None` Returns: Dictionary containing paths to generated files @@ -1993,7 +2059,10 @@ def count_velocity_smartseq3( out_dir, threads=threads, paired=True, + genomebam=genomebam, strand=strand, + gtf_path=gtf_path, + chromosomes_path=chromosomes_path ) else: logger.info( diff --git a/kb_python/main.py b/kb_python/main.py index 0dcc4b1..bd71512 100755 --- a/kb_python/main.py +++ b/kb_python/main.py @@ -346,6 +346,9 @@ def parse_count( if args.tcc and args.gene_names: parser.error('`--gene-names` may not be used with `--tcc`') + if args.genomebam and not args.gtf: + parser.error('`--gtf` must be provided when using `--genomebam`.') + # Check if batch TSV was provided. batch_path = None if len(args.fastqs) == 1: @@ -483,8 +486,11 @@ def parse_count( loom=args.loom, h5ad=args.h5ad, inspect=not args.no_inspect, + genomebam=args.genomebam, strand=args.strand, - by_name=args.gene_names + by_name=args.gene_names, + gtf_path=args.gtf, + chromosomes_path=args.chromosomes, ) else: from .count import count_velocity @@ -514,10 +520,13 @@ def parse_count( fragment_l=args.fragment_l, fragment_s=args.fragment_s, paired=args.parity == 'paired', + genomebam=args.genomebam, strand=args.strand, umi_gene=args.umi_gene, em=args.em, - by_name=args.gene_names + by_name=args.gene_names, + gtf_path=args.gtf, + chromosomes_path=args.chromosomes, ) else: if args.workflow == 'kite:10xFB' and args.x.upper() != '10XV3': @@ -542,8 +551,11 @@ def parse_count( loom=args.loom, h5ad=args.h5ad, inspect=not args.no_inspect, + genomebam=args.genomebam, strand=args.strand, - by_name=args.gene_names + by_name=args.gene_names, + gtf_path=args.gtf, + chromosomes_path=args.chromosomes, ) else: from .count import count @@ -572,10 +584,13 @@ def parse_count( fragment_l=args.fragment_l, fragment_s=args.fragment_s, paired=args.parity == 'paired', + genomebam=args.genomebam, strand=args.strand, umi_gene=args.umi_gene, em=args.em, - by_name=args.gene_names + by_name=args.gene_names, + gtf_path=args.gtf, + chromosomes_path=args.chromosomes, ) @@ -991,6 +1006,28 @@ def setup_count_args( default=None, choices=['unstranded', 'forward', 'reverse'] ) + parser_count.add_argument( + '--genomebam', + help='Project pseudoalignments to genome sorted BAM file.', + action='store_true', + default=False, + ) + parser_count.add_argument( + '--gtf', + help=( + 'GTF file for transcriptome information (required for --genomebam).', + ), + type=str, + default=None, + ) + parser_count.add_argument( + '--chromosomes', + help=( + 'Tab separated file with chromosome names and lengths (optional for --genomebam, but recommended).' + ), + type=str, + default=None, + ) parser_count.add_argument( '--workflow', help=( diff --git a/tests/test_count.py b/tests/test_count.py index 96978d3..ab86974 100755 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -31,6 +31,8 @@ FLD_FILENAME, FLENS_FILENAME, GENES_FILENAME, + GENOMEBAM_FILENAME, + GENOMEBAM_INDEX_FILENAME, INSPECT_FILENAME, INSPECT_INTERNAL_FILENAME, INSPECT_UMI_FILENAME, @@ -1103,7 +1105,10 @@ def test_count_with_whitelist(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -1248,7 +1253,10 @@ def test_count_report(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -1394,7 +1402,10 @@ def test_count_convert(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -1554,7 +1565,10 @@ def test_count_cellranger(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -1726,7 +1740,10 @@ def test_count_filter(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(2, bustools_sort.call_count) bustools_sort.assert_has_calls([ @@ -1876,7 +1893,10 @@ def test_count_without_whitelist(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -2010,7 +2030,10 @@ def test_count_kite_convert(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -2185,7 +2208,10 @@ def test_count_kite_filter(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(2, bustools_sort.call_count) bustools_sort.assert_has_calls([ @@ -2345,7 +2371,10 @@ def test_count_kite_FB(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(3, bustools_sort.call_count) bustools_sort.assert_has_calls([ @@ -2496,7 +2525,10 @@ def test_count_bulk_multi_paired(self): out_dir, threads=threads, paired=True, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -2655,7 +2687,10 @@ def test_count_bulk_multi_single(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -2797,7 +2832,10 @@ def test_count_bulk_demux_paired(self): out_dir, threads=threads, paired=True, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) bustools_sort.assert_called_once_with( bus_path, @@ -2922,7 +2960,10 @@ def test_count_bulk_demux_single(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) bustools_sort.assert_called_once_with( bus_path, @@ -3088,7 +3129,10 @@ def test_count_bulk_demux_paired_tcc(self): out_dir, threads=threads, paired=True, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) bustools_sort.assert_called_once_with( bus_path, @@ -3265,7 +3309,10 @@ def test_count_bulk_demux_single_tcc(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) bustools_sort.assert_called_once_with( bus_path, @@ -3486,7 +3533,10 @@ def test_count_smartseq3(self): out_dir, threads=threads, paired=True, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -3844,7 +3894,10 @@ def test_count_smartseq3_tcc(self): out_dir, threads=threads, paired=True, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -4052,7 +4105,160 @@ def test_count_strand(self): out_dir, threads=threads, paired=False, - strand='unstranded' + genomebam=False, + strand='unstranded', + gtf_path=None, + chromosomes_path=None, + ) + self.assertEqual(bustools_sort.call_count, 2) + bustools_sort.assert_has_calls([ + call( + bus_path, + bus_s_path, + temp_dir=temp_dir, + threads=threads, + memory=memory + ), + call( + bus_sc_path, + bus_scs_path, + temp_dir=temp_dir, + threads=threads, + memory=memory + ) + ]) + bustools_inspect.assert_called_once_with( + bus_s_path, + inspect_path, + whitelist_path=self.whitelist_path, + ) + copy_or_create_whitelist.assert_not_called() + bustools_correct.assert_called_once_with( + bus_s_path, bus_sc_path, self.whitelist_path + ) + bustools_count.assert_called_once_with( + bus_scs_path, + counts_prefix, + self.t2g_path, + ecmap_path, + txnames_path, + tcc=False, + mm=False, + cm=False, + umi_gene=False, + em=False, + ) + convert_matrix.assert_not_called() + filter_with_bustools.assert_not_called() + + STATS.start.assert_called_once() + STATS.end.assert_called_once() + STATS.save.assert_called_once_with( + os.path.join(out_dir, KB_INFO_FILENAME) + ) + import_matrix_as_anndata.assert_not_called() + render_report.assert_not_called() + + def test_count_genomebam(self): + with mock.patch('kb_python.count.stream_fastqs') as stream_fastqs,\ + mock.patch('kb_python.count.kallisto_bus') as kallisto_bus,\ + mock.patch('kb_python.count.bustools_sort') as bustools_sort,\ + mock.patch('kb_python.count.bustools_inspect') as bustools_inspect,\ + mock.patch('kb_python.count.copy_or_create_whitelist') as copy_or_create_whitelist,\ + mock.patch('kb_python.count.bustools_correct') as bustools_correct,\ + mock.patch('kb_python.count.bustools_count') as bustools_count,\ + mock.patch('kb_python.count.convert_matrix') as convert_matrix,\ + mock.patch('kb_python.count.filter_with_bustools') as filter_with_bustools,\ + mock.patch('kb_python.count.STATS') as STATS,\ + mock.patch('kb_python.count.render_report') as render_report,\ + mock.patch('kb_python.count.import_matrix_as_anndata') as import_matrix_as_anndata: + out_dir = self.temp_dir + temp_dir = self.temp_dir + counts_prefix = os.path.join( + out_dir, UNFILTERED_COUNTS_DIR, COUNTS_PREFIX + ) + threads = 99999 + memory = 'TEST' + bus_path = os.path.join(out_dir, BUS_FILENAME) + ecmap_path = os.path.join(out_dir, ECMAP_FILENAME) + txnames_path = os.path.join(out_dir, TXNAMES_FILENAME) + info_path = os.path.join(out_dir, KALLISTO_INFO_FILENAME) + genomebam_path = os.path.join(out_dir, GENOMEBAM_FILENAME) + genomebam_index_path = os.path.join( + out_dir, GENOMEBAM_INDEX_FILENAME + ) + inspect_path = os.path.join(out_dir, INSPECT_FILENAME) + bus_s_path = os.path.join(temp_dir, BUS_S_FILENAME) + bus_sc_path = os.path.join(temp_dir, BUS_SC_FILENAME) + bus_scs_path = os.path.join(out_dir, BUS_UNFILTERED_FILENAME) + stream_fastqs.return_value = self.fastqs + kallisto_bus.return_value = { + 'bus': bus_path, + 'ecmap': ecmap_path, + 'txnames': txnames_path, + 'info': info_path, + 'genomebam': genomebam_path, + 'genomebam_index': genomebam_index_path, + } + bustools_sort.side_effect = [{ + 'bus': bus_s_path + }, { + 'bus': bus_scs_path + }] + bustools_inspect.return_value = {'inspect': inspect_path} + bustools_correct.return_value = {'bus': bus_sc_path} + bustools_count.return_value = { + 'mtx': '{}.mtx'.format(counts_prefix), + 'genes': '{}.genes.txt'.format(counts_prefix), + 'barcodes': '{}.barcodes.txt'.format(counts_prefix), + } + STATS.save.return_value = 'stats' + + self.assertEqual({ + 'stats': 'stats', + 'unfiltered': { + 'bus': bus_path, + 'ecmap': ecmap_path, + 'txnames': txnames_path, + 'info': info_path, + 'genomebam': genomebam_path, + 'genomebam_index': genomebam_index_path, + 'inspect': inspect_path, + 'bus_scs': bus_scs_path, + 'mtx': '{}.mtx'.format(counts_prefix), + 'genes': '{}.genes.txt'.format(counts_prefix), + 'barcodes': '{}.barcodes.txt'.format(counts_prefix), + } + }, + count.count( + self.index_path, + self.t2g_path, + self.technology, + out_dir, + self.fastqs, + whitelist_path=self.whitelist_path, + temp_dir=temp_dir, + threads=threads, + memory=memory, + genomebam=True, + strand='unstranded', + gtf_path=self.gtf_path, + )) + + stream_fastqs.assert_called_once_with( + self.fastqs, temp_dir=temp_dir + ) + kallisto_bus.assert_called_once_with( + self.fastqs, + self.index_path, + self.technology, + out_dir, + threads=threads, + paired=False, + genomebam=True, + strand='unstranded', + gtf_path=self.gtf_path, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 2) bustools_sort.assert_has_calls([ @@ -4277,7 +4483,10 @@ def test_count_velocity_with_whitelist(self): out_dir, threads=threads, paired=False, - strand=None + genomebam=False, + strand=None, + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 4) bustools_sort.assert_has_calls([ @@ -4592,8 +4801,11 @@ def test_count_velocity_cellranger(self): self.technology, out_dir, threads=threads, + paired=False, + genomebam=False, strand=None, - paired=False + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 4) bustools_sort.assert_has_calls([ @@ -4897,8 +5109,11 @@ def test_count_velocity_report(self): self.technology, out_dir, threads=threads, + paired=False, + genomebam=False, strand=None, - paired=False + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 4) bustools_sort.assert_has_calls([ @@ -5210,8 +5425,11 @@ def test_count_velocity_convert(self): self.technology, out_dir, threads=threads, + paired=False, + genomebam=False, strand=None, - paired=False + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 4) bustools_sort.assert_has_calls([ @@ -5502,8 +5720,11 @@ def test_count_velocity_without_whitelist(self): self.technology, out_dir, threads=threads, + paired=False, + genomebam=False, strand=None, - paired=False + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 4) bustools_sort.assert_has_calls([ @@ -5878,8 +6099,11 @@ def test_count_velocity_filter(self): self.technology, out_dir, threads=threads, + paired=False, + genomebam=False, strand=None, - paired=False + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 6) bustools_sort.assert_has_calls([ @@ -6310,8 +6534,11 @@ def test_count_velocity_filter_convert(self): self.technology, out_dir, threads=threads, + paired=False, + genomebam=False, strand=None, - paired=False + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 6) bustools_sort.assert_has_calls([ @@ -6710,8 +6937,11 @@ def test_count_velocity_strand(self): self.technology, out_dir, threads=threads, + paired=False, + genomebam=False, strand='unstranded', - paired=False + gtf_path=None, + chromosomes_path=None, ) self.assertEqual(bustools_sort.call_count, 4) bustools_sort.assert_has_calls([ From 0b9bf587e36e31387b22c455f11927a293ad9493 Mon Sep 17 00:00:00 2001 From: ricomnl Date: Wed, 4 Jan 2023 11:43:29 -0800 Subject: [PATCH 3/5] Fixed tuple issue and stored more expressive metavar for chromosomes arg --- kb_python/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kb_python/main.py b/kb_python/main.py index bd71512..32d0db7 100755 --- a/kb_python/main.py +++ b/kb_python/main.py @@ -1015,13 +1015,14 @@ def setup_count_args( parser_count.add_argument( '--gtf', help=( - 'GTF file for transcriptome information (required for --genomebam).', + 'GTF file for transcriptome information (required for --genomebam).' ), type=str, default=None, ) parser_count.add_argument( '--chromosomes', + metavar='chrom.sizes', help=( 'Tab separated file with chromosome names and lengths (optional for --genomebam, but recommended).' ), From de7666ec0e83c4c79f28b219d5349da75373d96a Mon Sep 17 00:00:00 2001 From: Lioscro Date: Sat, 14 Jan 2023 12:08:31 -0500 Subject: [PATCH 4/5] lint fix --- kb_python/ref.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/kb_python/ref.py b/kb_python/ref.py index ef250ec..dc560be 100755 --- a/kb_python/ref.py +++ b/kb_python/ref.py @@ -341,25 +341,27 @@ def download_reference( logger.info('Extracting files from {}'.format(local_path)) with tarfile.open(local_path, 'r:gz') as f: + def is_within_directory(directory, target): - + abs_directory = os.path.abspath(directory) abs_target = os.path.abspath(target) - + prefix = os.path.commonprefix([abs_directory, abs_target]) - + return prefix == abs_directory - - def safe_extract(tar, path=".", members=None, *, numeric_owner=False): - + + def safe_extract( + tar, path=".", members=None, *, numeric_owner=False + ): + for member in tar.getmembers(): member_path = os.path.join(path, member.name) if not is_within_directory(path, member_path): raise Exception("Attempted Path Traversal in Tar File") - - tar.extractall(path, members, numeric_owner=numeric_owner) - - + + tar.extractall(path, members, numeric_owner=numeric_owner) + safe_extract(f, temp_dir) for option in reference.files: From 78d675e512df848bd0653b0c7b03e05ea2a10c80 Mon Sep 17 00:00:00 2001 From: Lioscro Date: Sat, 14 Jan 2023 12:11:18 -0500 Subject: [PATCH 5/5] print warning when --chromosomes not provided with --genomebam --- kb_python/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kb_python/main.py b/kb_python/main.py index 32d0db7..5922430 100755 --- a/kb_python/main.py +++ b/kb_python/main.py @@ -348,6 +348,10 @@ def parse_count( if args.genomebam and not args.gtf: parser.error('`--gtf` must be provided when using `--genomebam`.') + if args.genomebam and not args.chromosomes: + logger.warning( + '`--chromosomes` is recommended when using `--genomebam`' + ) # Check if batch TSV was provided. batch_path = None