From 3cbb80db00df2d268ba382fc5bfcc44a6ce84614 Mon Sep 17 00:00:00 2001 From: Nick Waters Date: Fri, 19 Apr 2019 14:33:04 +0100 Subject: [PATCH] add comandline options for roarying --- README.md | 36 ++++++++++++++--------------- annofilt/get_complete_genomes.py | 7 ++++++ annofilt/make_annofilt_pangenome.py | 11 +++++---- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index f08a5e5..99cc5a9 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,21 @@ [![Build Status](https://travis-ci.org/nickp60/annofilt.svg?branch=master)](https://travis-ci.org/nickp60/annofilt)[![Coverage Status](https://coveralls.io/repos/github/nickp60/annofilt/badge.svg?branch=master)](https://coveralls.io/github/nickp60/annofilt?branch=master) -``` -#get a subset of Complete e coli to analyze -Rscript ../riboSeed/scripts/getCompleteGenomeSubset.R /home/nicholas/GitHub/riboSeed/manuscript_results/entropy/assembly_summary.txt ./colis/ Escherichia coli -gunzip ./colis/genomes/* -cd ./colis/genomes/ -# for each of them, run prokka to get annotation -counter=0; for i in *.fna; do prokka --outdir ./${i} --prefix coli13 --compliant --genus Escherichia --species coli --cpus 4 ${i}.fna; counter=$(($counter + 1)); done -# I got bored after 11 genomes. thats enough for a pangenome? right? right? why are you shaking your head -# build a pangenome -roary -p 4 -f 11complete_colis -e -r -v ./colis/genomes/*/*.gff - -# our test organism's annotation -prokka BA000007.2.fasta --outdir sample_prokka --cpus 4 - -# we also have a mini assembly to test on (see riboSeed toy genome) -prokka --outdir ./assembly_sample --compliant --genus Escherichia --species coli --cpus 4 ../riboSeed/manuscript_results/simulated_genome/test_consensus/final_de_fere_novo_assembly/contigs.fasta -``` + + + + + + + + + + + + + + + + + ![annofilt](https://github.com/nickp60/annofilt/blob/master/docs/icon/icon.svg) # The Problem @@ -45,7 +45,7 @@ create filtered .gff file # Building a reference pangenome of trusted genes To verify the length of annotated genes, we compare annotation length, alignement coverage, and evalue to a pangenme built of well-currated annotations for a given strain. To build a pangenome for your strain of interest, do the following: -1. Download as many complete genomes (in gff format) from RefSeq as desired (minimum of 10?, maybe?) +1. Download as many complete genomes from RefSeq as desired (minimum of 10?, maybe?) with `get_compete_genomes`` 2. Run Roary. This is a good time to explore their stringincy options for percentage identity (which defaults to 95%) 3. Move the `pan_genome_reference.fa` file to a convenient location for use with annofilt. This contains a representative nucleotide sequences for each gene in the core. diff --git a/annofilt/get_complete_genomes.py b/annofilt/get_complete_genomes.py index 485c7c3..1c93401 100644 --- a/annofilt/get_complete_genomes.py +++ b/annofilt/get_complete_genomes.py @@ -130,6 +130,12 @@ def main(args=None, logger=None): # get args if args is None: args = get_args() + if args.max_strains is None: + args.max_strains = args.number_of_strains + 10 + else: + if args.max_strains < args.number_of_strains: + raise ValueError("Maximum number of strains to try cannot be " + + "less than the number of genomes") output_root = os.path.abspath(os.path.expanduser(args.output)) if not os.path.isdir(output_root): sys.stderr.write("creating output directory %s\n" % output_root) @@ -171,6 +177,7 @@ def main(args=None, logger=None): logger.debug(" unzipping %s" % this_path) # -f force overwrite (on osx, -o overwrites like with `unzip`, but not linux?) unzip_cmd = "gunzip -f %s" % this_path + this_path = this_path.replace(".gz", "") subprocess.run(unzip_cmd, shell=sys.platform != "win32", stdout=subprocess.PIPE, diff --git a/annofilt/make_annofilt_pangenome.py b/annofilt/make_annofilt_pangenome.py index 969b743..3de7f1e 100644 --- a/annofilt/make_annofilt_pangenome.py +++ b/annofilt/make_annofilt_pangenome.py @@ -30,10 +30,11 @@ def get_args(): # pragma nocover help="output dir", required=True) optional = parser.add_argument_group('optional arguments') optional.add_argument( - "--assembly_summary", - dest="assembly_summary", - help="Path to assembly_summary.txt from NCBI; if not present, " + - "will be downloaded") + "--add_roary_args", + default="-e -n", + help="Args to pass on to Roary for pangenome construction. " + + "Default args to give to roary. " + + "-e -n means a fast core multifast is generated with MAFFT") optional.add_argument( "-t", "--threads", @@ -106,7 +107,7 @@ def main(args=None, logger=None): stderr=subprocess.PIPE, check=True) roary_out = os.path.join(output_root, args.experiment_name) roary_cmd = str( - "roary -p {args.threads} -f {roary_out} -r -e -n " + + "roary -p {args.threads} -f {roary_out} {args.add_roary_args} " + "-v {output_root}/*/*.gff >> {output_root}/roary.log 2>&1").format( **locals()) logger.info("Preparing Roary cmd")