Skip to content

Commit

Permalink
add comandline options for roarying
Browse files Browse the repository at this point in the history
  • Loading branch information
nickp60 committed Apr 19, 2019
1 parent 29dfd8a commit 3cbb80d
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 23 deletions.
36 changes: 18 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
[![Build Status](https://travis-ci.org/nickp60/annofilt.svg?branch=master)](https://travis-ci.org/nickp60/annofilt)[![Coverage Status](https://coveralls.io/repos/github/nickp60/annofilt/badge.svg?branch=master)](https://coveralls.io/github/nickp60/annofilt?branch=master)
```
#get a subset of Complete e coli to analyze
Rscript ../riboSeed/scripts/getCompleteGenomeSubset.R /home/nicholas/GitHub/riboSeed/manuscript_results/entropy/assembly_summary.txt ./colis/ Escherichia coli
gunzip ./colis/genomes/*
cd ./colis/genomes/
# for each of them, run prokka to get annotation
counter=0; for i in *.fna; do prokka --outdir ./${i} --prefix coli13 --compliant --genus Escherichia --species coli --cpus 4 ${i}.fna; counter=$(($counter + 1)); done
# I got bored after 11 genomes. thats enough for a pangenome? right? right? why are you shaking your head
# build a pangenome
roary -p 4 -f 11complete_colis -e -r -v ./colis/genomes/*/*.gff
# our test organism's annotation
prokka BA000007.2.fasta --outdir sample_prokka --cpus 4
# we also have a mini assembly to test on (see riboSeed toy genome)
prokka --outdir ./assembly_sample --compliant --genus Escherichia --species coli --cpus 4 ../riboSeed/manuscript_results/simulated_genome/test_consensus/final_de_fere_novo_assembly/contigs.fasta
```
<!-- ``` -->
<!-- #get a subset of Complete e coli to analyze -->
<!-- Rscript ../riboSeed/scripts/getCompleteGenomeSubset.R /home/nicholas/GitHub/riboSeed/manuscript_results/entropy/assembly_summary.txt ./colis/ Escherichia coli -->
<!-- gunzip ./colis/genomes/* -->
<!-- cd ./colis/genomes/ -->
<!-- # for each of them, run prokka to get annotation -->
<!-- counter=0; for i in *.fna; do prokka --outdir ./${i} --prefix coli13 --compliant --genus Escherichia --species coli --cpus 4 ${i}.fna; counter=$(($counter + 1)); done -->
<!-- # I got bored after 11 genomes. thats enough for a pangenome? right? right? why are you shaking your head -->
<!-- # build a pangenome -->
<!-- roary -p 4 -f 11complete_colis -e -r -v ./colis/genomes/*/*.gff -->

<!-- # our test organism's annotation -->
<!-- prokka BA000007.2.fasta --outdir sample_prokka --cpus 4 -->

<!-- # we also have a mini assembly to test on (see riboSeed toy genome) -->
<!-- prokka --outdir ./assembly_sample --compliant --genus Escherichia --species coli --cpus 4 ../riboSeed/manuscript_results/simulated_genome/test_consensus/final_de_fere_novo_assembly/contigs.fasta -->
<!-- ``` -->
![annofilt](https://github.com/nickp60/annofilt/blob/master/docs/icon/icon.svg)

# The Problem
Expand Down Expand Up @@ -45,7 +45,7 @@ create filtered .gff file
# Building a reference pangenome of trusted genes
To verify the length of annotated genes, we compare annotation length, alignement coverage, and evalue to a pangenme built of well-currated annotations for a given strain. To build a pangenome for your strain of interest, do the following:

1. Download as many complete genomes (in gff format) from RefSeq as desired (minimum of 10?, maybe?)
1. Download as many complete genomes from RefSeq as desired (minimum of 10?, maybe?) with `get_compete_genomes``
2. Run Roary. This is a good time to explore their stringincy options for percentage identity (which defaults to 95%)
3. Move the `pan_genome_reference.fa` file to a convenient location for use with annofilt. This contains a representative nucleotide sequences for each gene in the core.

Expand Down
7 changes: 7 additions & 0 deletions annofilt/get_complete_genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ def main(args=None, logger=None):
# get args
if args is None:
args = get_args()
if args.max_strains is None:
args.max_strains = args.number_of_strains + 10
else:
if args.max_strains < args.number_of_strains:
raise ValueError("Maximum number of strains to try cannot be " +
"less than the number of genomes")
output_root = os.path.abspath(os.path.expanduser(args.output))
if not os.path.isdir(output_root):
sys.stderr.write("creating output directory %s\n" % output_root)
Expand Down Expand Up @@ -171,6 +177,7 @@ def main(args=None, logger=None):
logger.debug(" unzipping %s" % this_path)
# -f force overwrite (on osx, -o overwrites like with `unzip`, but not linux?)
unzip_cmd = "gunzip -f %s" % this_path
this_path = this_path.replace(".gz", "")

subprocess.run(unzip_cmd, shell=sys.platform != "win32",
stdout=subprocess.PIPE,
Expand Down
11 changes: 6 additions & 5 deletions annofilt/make_annofilt_pangenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ def get_args(): # pragma nocover
help="output dir", required=True)
optional = parser.add_argument_group('optional arguments')
optional.add_argument(
"--assembly_summary",
dest="assembly_summary",
help="Path to assembly_summary.txt from NCBI; if not present, " +
"will be downloaded")
"--add_roary_args",
default="-e -n",
help="Args to pass on to Roary for pangenome construction. " +
"Default args to give to roary. " +
"-e -n means a fast core multifast is generated with MAFFT")
optional.add_argument(
"-t",
"--threads",
Expand Down Expand Up @@ -106,7 +107,7 @@ def main(args=None, logger=None):
stderr=subprocess.PIPE, check=True)
roary_out = os.path.join(output_root, args.experiment_name)
roary_cmd = str(
"roary -p {args.threads} -f {roary_out} -r -e -n " +
"roary -p {args.threads} -f {roary_out} {args.add_roary_args} " +
"-v {output_root}/*/*.gff >> {output_root}/roary.log 2>&1").format(
**locals())
logger.info("Preparing Roary cmd")
Expand Down

0 comments on commit 3cbb80d

Please sign in to comment.