add comandline options for roarying

nickp60 · Apr 19, 2019 · 3cbb80d · 3cbb80d
1 parent 29dfd8a
commit 3cbb80d
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -1,21 +1,21 @@
 [![Build Status](https://travis-ci.org/nickp60/annofilt.svg?branch=master)](https://travis-ci.org/nickp60/annofilt)[![Coverage Status](https://coveralls.io/repos/github/nickp60/annofilt/badge.svg?branch=master)](https://coveralls.io/github/nickp60/annofilt?branch=master)
-```
-#get a subset of Complete e coli to analyze
-Rscript ../riboSeed/scripts/getCompleteGenomeSubset.R /home/nicholas/GitHub/riboSeed/manuscript_results/entropy/assembly_summary.txt ./colis/ Escherichia coli
-gunzip ./colis/genomes/*
-cd ./colis/genomes/
-# for each of them, run prokka to get annotation
-counter=0; for i in *.fna; do prokka --outdir ./${i} --prefix coli13 --compliant --genus Escherichia --species coli --cpus 4 ${i}.fna; counter=$(($counter + 1)); done
-# I got bored after 11 genomes.  thats enough for a pangenome?  right? right? why are you shaking your head
-# build a pangenome
-roary -p 4 -f 11complete_colis -e -r -v ./colis/genomes/*/*.gff
-
-# our test organism's annotation
-prokka BA000007.2.fasta --outdir sample_prokka --cpus 4
-
-# we also have a mini assembly to test on (see riboSeed toy genome)
-prokka --outdir ./assembly_sample --compliant --genus Escherichia --species coli --cpus 4 ../riboSeed/manuscript_results/simulated_genome/test_consensus/final_de_fere_novo_assembly/contigs.fasta
-```
+<!-- ``` -->
+<!-- #get a subset of Complete e coli to analyze -->
+<!-- Rscript ../riboSeed/scripts/getCompleteGenomeSubset.R /home/nicholas/GitHub/riboSeed/manuscript_results/entropy/assembly_summary.txt ./colis/ Escherichia coli -->
+<!-- gunzip ./colis/genomes/* -->
+<!-- cd ./colis/genomes/ -->
+<!-- # for each of them, run prokka to get annotation -->
+<!-- counter=0; for i in *.fna; do prokka --outdir ./${i} --prefix coli13 --compliant --genus Escherichia --species coli --cpus 4 ${i}.fna; counter=$(($counter + 1)); done -->
+<!-- # I got bored after 11 genomes.  thats enough for a pangenome?  right? right? why are you shaking your head -->
+<!-- # build a pangenome -->
+<!-- roary -p 4 -f 11complete_colis -e -r -v ./colis/genomes/*/*.gff -->
+
+<!-- # our test organism's annotation -->
+<!-- prokka BA000007.2.fasta --outdir sample_prokka --cpus 4 -->
+
+<!-- # we also have a mini assembly to test on (see riboSeed toy genome) -->
+<!-- prokka --outdir ./assembly_sample --compliant --genus Escherichia --species coli --cpus 4 ../riboSeed/manuscript_results/simulated_genome/test_consensus/final_de_fere_novo_assembly/contigs.fasta -->
+<!-- ``` -->
 ![annofilt](https://github.com/nickp60/annofilt/blob/master/docs/icon/icon.svg)
 
 # The Problem
@@ -45,7 +45,7 @@ create filtered .gff file
 # Building a reference pangenome of trusted genes
 To verify the length of annotated genes, we compare annotation length, alignement coverage, and evalue to a pangenme built of well-currated annotations for a given strain.  To build a pangenome for your strain of interest, do the following:
 
-1. Download as many complete genomes (in gff format) from RefSeq as desired (minimum of 10?, maybe?)
+1. Download as many complete genomes from RefSeq as desired (minimum of 10?, maybe?) with `get_compete_genomes``
 2. Run Roary.  This is a good time to explore their stringincy options for percentage identity (which defaults to 95%)
 3. Move the `pan_genome_reference.fa` file to a convenient location for use with annofilt.  This contains a representative nucleotide sequences for each gene in the core.
 

diff --git a/annofilt/get_complete_genomes.py b/annofilt/get_complete_genomes.py
@@ -130,6 +130,12 @@ def main(args=None, logger=None):
     # get args
     if args is None:
         args = get_args()
+    if args.max_strains is None:
+        args.max_strains = args.number_of_strains + 10
+    else:
+        if args.max_strains < args.number_of_strains:
+            raise ValueError("Maximum number of strains to try cannot be " +
+                             "less than the number of genomes")
     output_root = os.path.abspath(os.path.expanduser(args.output))
     if not os.path.isdir(output_root):
         sys.stderr.write("creating output directory %s\n" % output_root)
@@ -171,6 +177,7 @@ def main(args=None, logger=None):
         logger.debug("    unzipping %s" % this_path)
         # -f force overwrite (on osx, -o overwrites like with `unzip`, but not linux?)
         unzip_cmd = "gunzip -f %s" % this_path
+        this_path = this_path.replace(".gz", "")
 
         subprocess.run(unzip_cmd, shell=sys.platform != "win32",
                        stdout=subprocess.PIPE,

diff --git a/annofilt/make_annofilt_pangenome.py b/annofilt/make_annofilt_pangenome.py
@@ -30,10 +30,11 @@ def get_args():  # pragma nocover
         help="output dir", required=True)
     optional = parser.add_argument_group('optional arguments')
     optional.add_argument(
-        "--assembly_summary",
-        dest="assembly_summary",
-        help="Path to assembly_summary.txt from NCBI; if not present, " +
-        "will be downloaded")
+        "--add_roary_args",
+        default="-e -n",
+        help="Args to pass on to Roary for pangenome construction. " +
+        "Default args to give to roary. " +
+        "-e -n means a fast core multifast is generated with MAFFT")
     optional.add_argument(
         "-t",
         "--threads",
@@ -106,7 +107,7 @@ def main(args=None, logger=None):
                            stderr=subprocess.PIPE, check=True)
     roary_out = os.path.join(output_root, args.experiment_name)
     roary_cmd = str(
-        "roary -p {args.threads} -f {roary_out} -r -e -n " +
+        "roary -p {args.threads} -f {roary_out} {args.add_roary_args} " +
         "-v {output_root}/*/*.gff >> {output_root}/roary.log 2>&1").format(
             **locals())
     logger.info("Preparing Roary cmd")