Skip to content

Commit

Permalink
restructure to move many sanity checks to the bachhand of the workflows
Browse files Browse the repository at this point in the history
This commit also includes bug fixes, so that metaspades is now working, and also this closes #954.
  • Loading branch information
ShaiberAlon committed Oct 22, 2018
1 parent 22d2e9b commit 21c0255
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 115 deletions.
19 changes: 7 additions & 12 deletions anvio/workflows/contigs/Snakefile
Expand Up @@ -31,15 +31,6 @@ if not slave_mode:

localrules: annotate_contigs_database

group_names = []
fasta_txt_file = M.get_param_value_from_config('fasta_txt', repress_default=True)

if fasta_txt_file:
filesnpaths.is_file_exists(fasta_txt_file)
fasta_information = u.get_TAB_delimited_file_as_dictionary(fasta_txt_file)
group_names = list(fasta_information.keys())
references_mode = True

# setting configuration for optional steps
run_remove_human_dna_using_centrifuge = M.get_param_value_from_config(["remove_human_dna_using_centrifuge", "run"]) == True

Expand All @@ -63,6 +54,10 @@ if run_taxonomy_with_centrifuge:

"config file. See documentation for more details.")

# TODO: it would have been better to have this in tha backhand
# there is no reason contigs workflow would be familiar with references mode.
# it should be that this is set to the default for contigs workflow
# and then in the case of assembly in metagenomics workflow then this value would be override
def get_raw_fasta(wildcards):
'''
Define the path to the input fasta files.
Expand All @@ -72,16 +67,16 @@ def get_raw_fasta(wildcards):
This function also deals with the different cases of "reference mode"
Vs. "assembly mode".
'''
if references_mode:
if M.references_mode:
# in 'reference mode' the input is the reference fasta
contigs = fasta_information[wildcards.group]['path']
contigs = M.fasta_information[wildcards.group]['path']
else:
# by default the input fasta is the assembly output
contigs = dirs_dict["FASTA_DIR"] + "/%s/final.contigs.fa" % wildcards.group
return contigs

rule generate_and_annotate_contigs_db:
input: expand(dirs_dict['CONTIGS_DIR'] + "/{group}-annotate_contigs_database.done", group=group_names)
input: expand(dirs_dict['CONTIGS_DIR'] + "/{group}-annotate_contigs_database.done", group=M.group_names)


rule anvi_script_reformat_fasta_prefix_only:
Expand Down
17 changes: 17 additions & 0 deletions anvio/workflows/contigs/__init__.py
Expand Up @@ -6,7 +6,9 @@


import anvio
import anvio.utils as u
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.workflows import WorkflowSuperClass
from anvio.errors import ConfigError
Expand Down Expand Up @@ -46,6 +48,8 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
self.run = run
self.progress = progress

self.group_names = []

# initialize the base class
WorkflowSuperClass.__init__(self)

Expand Down Expand Up @@ -93,3 +97,16 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
'--ignore-internal-stop-codons']

self.rule_acceptable_params_dict['anvi_gen_contigs_database'] = gen_contigs_params


def init(self):
super().init()

fasta_txt_file = self.get_param_value_from_config('fasta_txt', repress_default=True)

if fasta_txt_file:
filesnpaths.is_file_exists(fasta_txt_file)
self.fasta_information = u.get_TAB_delimited_file_as_dictionary(fasta_txt_file)
self.group_names = list(self.fasta_information.keys())
self.references_mode = True

98 changes: 9 additions & 89 deletions anvio/workflows/metagenomics/Snakefile
Expand Up @@ -101,86 +101,6 @@ if number_of_assemblers_in_config_file > 1:
"software, yet all the following software are " \
"included in your config file: %s. Please include only one." % assembly_software_in_config)

# get a list of the sample names
sample_names = list(M.samples_information['sample'])

references_mode = M.get_param_value_from_config('references_mode', repress_default=True)
fasta_txt_file = M.get_param_value_from_config('fasta_txt', repress_default=True)
if references_mode:
try:
filesnpaths.is_file_exists(fasta_txt_file)
except FilesNPathsError as e:
raise ConfigError('In references mode you must supply a fasta_txt file.')

if not references_mode:
# if it is reference mode then the group names have been assigned in the contigs Snakefile
# if it is not reference mode and no groups are supplied in the samples_txt then group names are sample names
group_names = sample_names

if fasta_txt_file and not references_mode:
raise ConfigError("In order to use reference fasta files you must set\
\"'references_mode': true\" in your config file, yet\
you didn't, but at the same time you supplied the following\
fasta_txt: %s. So we don't know what to do with this\
fasta_txt" % fasta_txt_file)

# Collecting information regarding groups.
if "group" in M.samples_information.columns:
# if groups were specified then members of a groups will be co-assembled.
group_names = list(M.samples_information['group'].unique())
# creating a dictionary with groups as keys and number of samples in
# the groups as values
group_sizes = M.samples_information['group'].value_counts().to_dict()

if references_mode:
# sanity check to see that groups specified in samples.txt match
# the names of fasta.
mismatch = set(group_names) - set(fasta_information.keys())
if mismatch:
raise ConfigError("Group names specified in the samples.txt \
file must match the names of fasta \
in the fasta.txt file. These are the \
mismatches: %s" % mismatch)
groups_in_fasta_information_but_not_in_samples_txt = set(fasta_information.keys()) - set(group_names)
if groups_in_fasta_information_but_not_in_samples_txt:
run.warning('The following group names appear in your fasta_txt\
but do not appear in your samples_txt. Maybe this is\
ok with you, but we thought you should know. This means\
that the metagenomics workflow will simply ignore these\
groups.')

else:
if references_mode:
# if the user didn't provide a group column in the samples.txt,
# in references mode the default is 'all_against_all'.
run.warning("No groups were provided in your samples_txt,\
hence 'all_against_all' mode has been automatically\
set to True.")
M.set_config_param('all_against_all', True)
else:
# if no groups were specified then each sample would be assembled
# separately
run.warning("No groups were specified in your samples_txt. This is fine.\
But we thought you should know. Any assembly will be performed\
on individual samples (i.e. NO co-assembly).")
M.samples_information['group'] = M.samples_information['sample']
group_names = list(sample_names)
group_sizes = dict.fromkeys(group_names,1)

if M.get_param_value_from_config('all_against_all', repress_default=True):
# in all_against_all, the size of each group is as big as the number
# of samples.
group_sizes = dict.fromkeys(group_names,len(sample_names))


if not references_mode and not (M.get_param_value_from_config(['anvi_script_reformat_fasta','run']) == True):
# in assembly mode (i.e. not in references mode) we always have
# to run reformat_fasta. The only reason for this is that
# the megahit output is temporary, and if we dont run
# reformat_fasta we will delete the output of meghit at the
# end of the workflow without saving a copy.
raise ConfigError("You can't skip reformat_fasta in assembly mode \
please change your config.json file")

# by default fastq files will be zipped after qc is done
run_gzip_fastqs = M.get_param_value_from_config(['gzip_fastqs', 'run']) == True
Expand All @@ -195,7 +115,7 @@ run_idba_ud = M.get_param_value_from_config(['idba_ud', 'run'])

run_megahit = M.get_param_value_from_config(['megahit', 'run']) == True

if not references_mode and number_of_assemblers_in_config_file!=1 :
if not M.references_mode and number_of_assemblers_in_config_file!=1 :
# Sanity check for assembly mode
# Make sure that user specified an assembler
raise ConfigError("You didn't specify any assembler for your metagenomes, and yet\
Expand All @@ -215,9 +135,9 @@ rule metagenomics_workflow_target_rule:
The final product of the workflow is an anvi'o merged profile directory
for each group
'''
input: expand("{DIR}/{group}/PROFILE.db", DIR=dirs_dict["MERGE_DIR"], group=group_names),
contigs_annotated = expand(dirs_dict["CONTIGS_DIR"] + "/{group}-annotate_contigs_database.done", group=group_names),
qc_report = dirs_dict["QC_DIR"] + "/qc-report.txt" if run_qc else expand(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db", group=group_names)
input: expand("{DIR}/{group}/PROFILE.db", DIR=dirs_dict["MERGE_DIR"], group=M.group_names),
contigs_annotated = expand(dirs_dict["CONTIGS_DIR"] + "/{group}-annotate_contigs_database.done", group=M.group_names),
qc_report = dirs_dict["QC_DIR"] + "/qc-report.txt" if run_qc else expand(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db", group=M.group_names)


rule iu_gen_configs:
Expand All @@ -231,7 +151,7 @@ rule iu_gen_configs:
# the input file is marked as 'ancient' so snakemake wouldn't run it
# just because a new path-to-raw-fastq-files.txt file was created.
input: ancient(M.get_param_value_from_config(['samples_txt']))
output: expand("{DIR}/{sample}.ini", DIR=dirs_dict["QC_DIR"], sample=sample_names)
output: expand("{DIR}/{sample}.ini", DIR=dirs_dict["QC_DIR"], sample=M.sample_names)
params:
dir=dirs_dict["QC_DIR"],
r1_prefix = M.get_rule_param("iu_gen_configs", "--r1-prefix"),
Expand Down Expand Up @@ -334,7 +254,7 @@ rule remove_short_reads_based_on_references:
rule gen_qc_report:
version: 1.0
log: dirs_dict["LOGS_DIR"] + "/gen_qc_report.log"
input: expand(dirs_dict["QC_DIR"] + "/{sample}-STATS.txt", sample=sample_names)
input: expand(dirs_dict["QC_DIR"] + "/{sample}-STATS.txt", sample=M.sample_names)
output: dirs_dict["QC_DIR"] + "/qc-report.txt"
threads: M.T('gen_qc_report')
resources: nodes = M.T('gen_qc_report')
Expand Down Expand Up @@ -439,7 +359,7 @@ rule merge_fastas_for_co_assembly:
r2 = r2_unzipped

# run fq2fa
fq2fa_output = os.path.join(dirs_dict["QC_DIR"], "%s_%s-merged.temp.fa" %s (r1, r2))
fq2fa_output = os.path.join(dirs_dict["QC_DIR"], "R1_R2-merged.temp.fa")
shell("fq2fa --merge %s %s %s >> %s 2>&1" % (r1, r2, fq2fa_output, log))

if need_to_uncompress_fastqs(r1, r2):
Expand Down Expand Up @@ -759,7 +679,7 @@ def get_cluster_contigs_param(wildcards):
else:
# if profiling to individual assembly then clustering contigs
# see --cluster-contigs in the help manu of anvi-profile
cluster_contigs = '--cluster-contigs' if group_sizes[wildcards.group] == 1 else ''
cluster_contigs = '--cluster-contigs' if M.group_sizes[wildcards.group] == 1 else ''
return cluster_contigs


Expand Down Expand Up @@ -953,7 +873,7 @@ rule anvi_merge:
# creating the expected output files for the rule
create_fake_output_files(_message, output)

elif group_sizes[wildcards.group] == 1:
elif M.group_sizes[wildcards.group] == 1:
# for individual assemblies, create a symlink to the profile database
#shell("ln -s {params.profile_dir}/*/* -t {params.output_dir} >> {log} 2>&1")
#shell("touch -h {params.profile_dir}/*/*")
Expand Down
114 changes: 102 additions & 12 deletions anvio/workflows/metagenomics/__init__.py
Expand Up @@ -44,6 +44,11 @@ def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
self.run_metaspades = None
self.use_scaffold_from_metaspades = None
self.remove_short_reads_based_on_references = None
self.references_mode = None
self.fasta_txt_file = None
self.samples_txt_file = None
self.sample_names = None
self.group_sizes = None

# initialize the base class
ContigsDBWorkflow.__init__(self)
Expand Down Expand Up @@ -139,29 +144,114 @@ def init(self):
# getting the samples information (names, [group], path to r1, path to r2) from samples.txt
self.samples_information = pd.read_csv(self.samples_txt_file, sep='\t', index_col=False)

if 'sample' not in self.samples_information.columns.values:
raise ConfigError("You know what. This '%s' file does not look anything like\
a samples file." % self.samples_txt_file)

for sample in self.samples_information['sample']:
try:
u.check_sample_id(sample)
except ConfigError as e:
raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: \
%s" % (self.samples_txt_file, e))

self.sanity_check()

# get a list of the sample names
self.sample_names = list(self.samples_information['sample'])
self.run_metaspades = self.get_param_value_from_config(['metaspades', 'run'])
self.use_scaffold_from_metaspades = self.get_param_value_from_config(['metaspades', 'use_scaffolds'])
self.references_mode = self.get_param_value_from_config('references_mode', repress_default=True)
self.fasta_txt_file = self.get_param_value_from_config('fasta_txt', repress_default=True)

self.sanity_check()


def sanity_check(self):
self.sanity_check_for_samples_txt()
self.sanity_check_for_kraken()
self.sanity_check_for_refereces_txt()


def sanity_check_for_refereces_txt(self):
if self.references_mode:
try:
filesnpaths.is_file_exists(self.fasta_txt_file)
except FilesNPathsError as e:
raise ConfigError('In references mode you must supply a fasta_txt file.')


if not self.references_mode:
# if it is reference mode then the group names have been assigned in the contigs Snakefile
# if it is not reference mode and no groups are supplied in the samples_txt then group names are sample names
self.group_names = self.sample_names

if self.fasta_txt_file and not self.references_mode:
raise ConfigError("In order to use reference fasta files you must set\
\"'references_mode': true\" in your config file, yet\
you didn't, but at the same time you supplied the following\
fasta_txt: %s. So we don't know what to do with this\
fasta_txt" % self.fasta_txt_file)

# Collecting information regarding groups.
if "group" in self.samples_information.columns:
# if groups were specified then members of a groups will be co-assembled.
self.group_names = list(self.samples_information['group'].unique())
# creating a dictionary with groups as keys and number of samples in
# the groups as values
self.group_sizes = self.samples_information['group'].value_counts().to_dict()

if self.references_mode:
# sanity check to see that groups specified in samples.txt match
# the names of fasta.
mismatch = set(self.group_names) - set(self.fasta_information.keys())
if mismatch:
raise ConfigError("Group names specified in the samples.txt \
file must match the names of fasta \
in the fasta.txt file. These are the \
mismatches: %s" % mismatch)
groups_in_fasta_information_but_not_in_samples_txt = set(self.fasta_information.keys()) - set(self.group_names)
if groups_in_fasta_information_but_not_in_samples_txt:
run.warning('The following group names appear in your fasta_txt\
but do not appear in your samples_txt. Maybe this is\
ok with you, but we thought you should know. This means\
that the metagenomics workflow will simply ignore these\
groups.')

else:
if self.references_mode:
# if the user didn't provide a group column in the samples.txt,
# in references mode the default is 'all_against_all'.
run.warning("No groups were provided in your samples_txt,\
hence 'all_against_all' mode has been automatically\
set to True.")
self.set_config_param('all_against_all', True)
else:
# if no groups were specified then each sample would be assembled
# separately
run.warning("No groups were specified in your samples_txt. This is fine.\
But we thought you should know. Any assembly will be performed\
on individual samples (i.e. NO co-assembly).")
self.samples_information['group'] = self.samples_information['sample']
self.group_names = list(self.sample_names)
self.group_sizes = dict.fromkeys(self.group_names,1)

if self.get_param_value_from_config('all_against_all', repress_default=True):
# in all_against_all, the size of each group is as big as the number
# of samples.
self.group_sizes = dict.fromkeys(self.group_names,len(self.sample_names))


if not self.references_mode and not (self.get_param_value_from_config(['anvi_script_reformat_fasta','run']) == True):
# in assembly mode (i.e. not in references mode) we always have
# to run reformat_fasta. The only reason for this is that
# the megahit output is temporary, and if we dont run
# reformat_fasta we will delete the output of meghit at the
# end of the workflow without saving a copy.
raise ConfigError("You can't skip reformat_fasta in assembly mode \
please change your config.json file")


def sanity_check_for_samples_txt(self):
if 'sample' not in self.samples_information.columns.values:
raise ConfigError("You know what. This '%s' file does not look anything like\
a samples file." % self.samples_txt_file)

for sample in self.samples_information['sample']:
try:
u.check_sample_id(sample)
except ConfigError as e:
raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: \
%s" % (self.samples_txt_file, e))

fastq_file_names = list(self.samples_information['r1']) + list(self.samples_information['r2'])
bad_fastq_names = [s for s in fastq_file_names if (not s.endswith('.fastq') and not s.endswith('.fastq.gz'))]
if bad_fastq_names:
Expand Down
3 changes: 1 addition & 2 deletions anvio/workflows/pangenomics/Snakefile
Expand Up @@ -46,8 +46,7 @@ else:
# the default name for the input fasta.txt file is "fasta.txt"
fasta_txt = M.get_param_value_from_config("fasta_txt")
filesnpaths.is_file_exists(fasta_txt)
fasta_information = pd.read_csv(fasta_txt, sep='\t', index_col=0).to_dict(orient='index')
ALL_SAMPLES = list(fasta_information.keys())
ALL_SAMPLES = list(M.fasta_information.keys())

internal_genomes_file = M.get_param_value_from_config(["anvi_gen_genomes_storage", "--internal-genomes"])
external_genomes_file = M.get_param_value_from_config(["anvi_gen_genomes_storage", "--external-genomes"])
Expand Down

0 comments on commit 21c0255

Please sign in to comment.