restructure to move many sanity checks to the bachhand of the workflows

This commit also includes bug fixes, so that metaspades is now working, and also this closes #954.
merenlab · Oct 22, 2018 · 21c0255 · 21c0255
1 parent 22d2e9b
commit 21c0255
Show file tree

Hide file tree

Showing 5 changed files with 136 additions and 115 deletions.
diff --git a/anvio/workflows/contigs/Snakefile b/anvio/workflows/contigs/Snakefile
@@ -31,15 +31,6 @@ if not slave_mode:
 
 localrules: annotate_contigs_database
 
-group_names = []
-fasta_txt_file = M.get_param_value_from_config('fasta_txt', repress_default=True)
-
-if fasta_txt_file:
-    filesnpaths.is_file_exists(fasta_txt_file)
-    fasta_information = u.get_TAB_delimited_file_as_dictionary(fasta_txt_file)
-    group_names = list(fasta_information.keys())
-    references_mode = True
-
 # setting configuration for optional steps
 run_remove_human_dna_using_centrifuge = M.get_param_value_from_config(["remove_human_dna_using_centrifuge", "run"]) == True
 
@@ -63,6 +54,10 @@ if run_taxonomy_with_centrifuge:
 
                           "config file. See documentation for more details.")
 
+# TODO: it would have been better to have this in tha backhand
+# there is no reason contigs workflow would be familiar with references mode.
+# it should be that this is set to the default for contigs workflow
+# and then in the case of assembly in metagenomics workflow then this value would be override
 def get_raw_fasta(wildcards):
     '''
         Define the path to the input fasta files.
@@ -72,16 +67,16 @@ def get_raw_fasta(wildcards):
         This function also deals with the different cases of "reference mode"
         Vs. "assembly mode".
     '''
-    if references_mode:
+    if M.references_mode:
         # in 'reference mode' the input is the reference fasta
-        contigs = fasta_information[wildcards.group]['path']
+        contigs = M.fasta_information[wildcards.group]['path']
     else:
         # by default the input fasta is the assembly output
         contigs = dirs_dict["FASTA_DIR"] + "/%s/final.contigs.fa" % wildcards.group
     return contigs
 
 rule generate_and_annotate_contigs_db:
-    input: expand(dirs_dict['CONTIGS_DIR'] + "/{group}-annotate_contigs_database.done", group=group_names)
+    input: expand(dirs_dict['CONTIGS_DIR'] + "/{group}-annotate_contigs_database.done", group=M.group_names)
 
 
 rule anvi_script_reformat_fasta_prefix_only:

diff --git a/anvio/workflows/contigs/__init__.py b/anvio/workflows/contigs/__init__.py
@@ -6,7 +6,9 @@
 
 
 import anvio
+import anvio.utils as u
 import anvio.terminal as terminal
+import anvio.filesnpaths as filesnpaths
 
 from anvio.workflows import WorkflowSuperClass
 from anvio.errors import ConfigError
@@ -46,6 +48,8 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
         self.run = run
         self.progress = progress
 
+        self.group_names = []
+
         # initialize the base class
         WorkflowSuperClass.__init__(self)
 
@@ -93,3 +97,16 @@ def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
                               '--ignore-internal-stop-codons']
 
         self.rule_acceptable_params_dict['anvi_gen_contigs_database'] = gen_contigs_params
+
+
+    def init(self):
+        super().init()
+
+        fasta_txt_file = self.get_param_value_from_config('fasta_txt', repress_default=True)
+
+        if fasta_txt_file:
+            filesnpaths.is_file_exists(fasta_txt_file)
+            self.fasta_information = u.get_TAB_delimited_file_as_dictionary(fasta_txt_file)
+            self.group_names = list(self.fasta_information.keys())
+            self.references_mode = True
+
diff --git a/anvio/workflows/metagenomics/Snakefile b/anvio/workflows/metagenomics/Snakefile
@@ -101,86 +101,6 @@ if number_of_assemblers_in_config_file > 1:
                         "software, yet all the following software are " \
                         "included in your config file: %s. Please include only one." % assembly_software_in_config)
 
-# get a list of the sample names
-sample_names = list(M.samples_information['sample'])
-
-references_mode = M.get_param_value_from_config('references_mode', repress_default=True)
-fasta_txt_file = M.get_param_value_from_config('fasta_txt', repress_default=True)
-if references_mode:
-    try:
-        filesnpaths.is_file_exists(fasta_txt_file)
-    except FilesNPathsError as e:
-        raise ConfigError('In references mode you must supply a fasta_txt file.')
-
-if not references_mode:
-    # if it is reference mode then the group names have been assigned in the contigs Snakefile
-    # if it is not reference mode and no groups are supplied in the samples_txt then group names are sample names
-    group_names = sample_names
-
-if fasta_txt_file and not references_mode:
-    raise ConfigError("In order to use reference fasta files you must set\
-                   \"'references_mode': true\" in your config file, yet\
-                   you didn't, but at the same time you supplied the following\
-                   fasta_txt: %s. So we don't know what to do with this\
-                   fasta_txt" % fasta_txt_file)
-
-# Collecting information regarding groups.
-if "group" in M.samples_information.columns:
-    # if groups were specified then members of a groups will be co-assembled.
-    group_names = list(M.samples_information['group'].unique())
-    # creating a dictionary with groups as keys and number of samples in
-    # the groups as values
-    group_sizes = M.samples_information['group'].value_counts().to_dict()
-
-    if references_mode:
-        # sanity check to see that groups specified in samples.txt match
-        # the names of fasta.
-        mismatch = set(group_names) - set(fasta_information.keys())
-        if mismatch:
-            raise ConfigError("Group names specified in the samples.txt \
-                               file must match the names of fasta \
-                               in the fasta.txt file. These are the \
-                               mismatches: %s" % mismatch)
-        groups_in_fasta_information_but_not_in_samples_txt = set(fasta_information.keys()) - set(group_names)
-        if groups_in_fasta_information_but_not_in_samples_txt:
-            run.warning('The following group names appear in your fasta_txt\
-                         but do not appear in your samples_txt. Maybe this is\
-                         ok with you, but we thought you should know. This means\
-                         that the metagenomics workflow will simply ignore these\
-                         groups.')
-
-else:
-    if references_mode:
-        # if the user didn't provide a group column in the samples.txt,
-        # in references mode the default is 'all_against_all'.
-        run.warning("No groups were provided in your samples_txt,\
-                     hence 'all_against_all' mode has been automatically\
-                     set to True.")
-        M.set_config_param('all_against_all', True)
-    else:
-        # if no groups were specified then each sample would be assembled
-        # separately
-        run.warning("No groups were specified in your samples_txt. This is fine.\
-                     But we thought you should know. Any assembly will be performed\
-                     on individual samples (i.e. NO co-assembly).")
-        M.samples_information['group'] = M.samples_information['sample']
-        group_names = list(sample_names)
-        group_sizes = dict.fromkeys(group_names,1)
-
-if M.get_param_value_from_config('all_against_all', repress_default=True):
-    # in all_against_all, the size of each group is as big as the number
-    # of samples.
-    group_sizes = dict.fromkeys(group_names,len(sample_names))
-
-
-if not references_mode and not (M.get_param_value_from_config(['anvi_script_reformat_fasta','run']) == True):
-    # in assembly mode (i.e. not in references mode) we always have
-    # to run reformat_fasta. The only reason for this is that
-    # the megahit output is temporary, and if we dont run
-    # reformat_fasta we will delete the output of meghit at the
-    # end of the workflow without saving a copy.
-    raise ConfigError("You can't skip reformat_fasta in assembly mode \
-                        please change your config.json file")
 
 # by default fastq files will be zipped after qc is done
 run_gzip_fastqs = M.get_param_value_from_config(['gzip_fastqs', 'run']) == True
@@ -195,7 +115,7 @@ run_idba_ud = M.get_param_value_from_config(['idba_ud', 'run'])
 
 run_megahit = M.get_param_value_from_config(['megahit', 'run']) == True
 
-if not references_mode and number_of_assemblers_in_config_file!=1 :
+if not M.references_mode and number_of_assemblers_in_config_file!=1 :
     # Sanity check for assembly mode
     # Make sure that user specified an assembler
     raise ConfigError("You didn't specify any assembler for your metagenomes, and yet\
@@ -215,9 +135,9 @@ rule metagenomics_workflow_target_rule:
         The final product of the workflow is an anvi'o merged profile directory
         for each group
     '''
-    input: expand("{DIR}/{group}/PROFILE.db", DIR=dirs_dict["MERGE_DIR"], group=group_names),
-           contigs_annotated = expand(dirs_dict["CONTIGS_DIR"] + "/{group}-annotate_contigs_database.done", group=group_names),
-           qc_report = dirs_dict["QC_DIR"] + "/qc-report.txt" if run_qc else expand(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db", group=group_names)
+    input: expand("{DIR}/{group}/PROFILE.db", DIR=dirs_dict["MERGE_DIR"], group=M.group_names),
+           contigs_annotated = expand(dirs_dict["CONTIGS_DIR"] + "/{group}-annotate_contigs_database.done", group=M.group_names),
+           qc_report = dirs_dict["QC_DIR"] + "/qc-report.txt" if run_qc else expand(dirs_dict["CONTIGS_DIR"] + "/{group}-contigs.db", group=M.group_names)
 
 
 rule iu_gen_configs:
@@ -231,7 +151,7 @@ rule iu_gen_configs:
     # the input file is marked as 'ancient' so snakemake wouldn't run it
     # just because a new path-to-raw-fastq-files.txt file was created.
     input: ancient(M.get_param_value_from_config(['samples_txt']))
-    output: expand("{DIR}/{sample}.ini", DIR=dirs_dict["QC_DIR"], sample=sample_names)
+    output: expand("{DIR}/{sample}.ini", DIR=dirs_dict["QC_DIR"], sample=M.sample_names)
     params:
         dir=dirs_dict["QC_DIR"],
         r1_prefix = M.get_rule_param("iu_gen_configs", "--r1-prefix"),
@@ -334,7 +254,7 @@ rule remove_short_reads_based_on_references:
 rule gen_qc_report:
     version: 1.0
     log: dirs_dict["LOGS_DIR"] + "/gen_qc_report.log"
-    input: expand(dirs_dict["QC_DIR"] + "/{sample}-STATS.txt", sample=sample_names)
+    input: expand(dirs_dict["QC_DIR"] + "/{sample}-STATS.txt", sample=M.sample_names)
     output: dirs_dict["QC_DIR"] + "/qc-report.txt"
     threads: M.T('gen_qc_report')
     resources: nodes = M.T('gen_qc_report')
@@ -439,7 +359,7 @@ rule merge_fastas_for_co_assembly:
                 r2 = r2_unzipped
 
             # run fq2fa
-            fq2fa_output = os.path.join(dirs_dict["QC_DIR"], "%s_%s-merged.temp.fa" %s (r1, r2))
+            fq2fa_output = os.path.join(dirs_dict["QC_DIR"], "R1_R2-merged.temp.fa")
             shell("fq2fa --merge %s %s %s >> %s 2>&1" % (r1, r2, fq2fa_output, log))
 
             if need_to_uncompress_fastqs(r1, r2):
@@ -759,7 +679,7 @@ def get_cluster_contigs_param(wildcards):
     else:
         # if profiling to individual assembly then clustering contigs
         # see --cluster-contigs in the help manu of anvi-profile
-        cluster_contigs = '--cluster-contigs' if group_sizes[wildcards.group] == 1 else ''
+        cluster_contigs = '--cluster-contigs' if M.group_sizes[wildcards.group] == 1 else ''
     return cluster_contigs
 
 
@@ -953,7 +873,7 @@ rule anvi_merge:
             # creating the expected output files for the rule
             create_fake_output_files(_message, output)
 
-        elif group_sizes[wildcards.group] == 1:
+        elif M.group_sizes[wildcards.group] == 1:
             # for individual assemblies, create a symlink to the profile database
             #shell("ln -s {params.profile_dir}/*/* -t {params.output_dir} >> {log} 2>&1")
             #shell("touch -h {params.profile_dir}/*/*")

diff --git a/anvio/workflows/metagenomics/__init__.py b/anvio/workflows/metagenomics/__init__.py
@@ -44,6 +44,11 @@ def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
         self.run_metaspades = None
         self.use_scaffold_from_metaspades = None
         self.remove_short_reads_based_on_references = None
+        self.references_mode = None
+        self.fasta_txt_file = None
+        self.samples_txt_file = None
+        self.sample_names = None
+        self.group_sizes = None
 
         # initialize the base class
         ContigsDBWorkflow.__init__(self)
@@ -139,29 +144,114 @@ def init(self):
         # getting the samples information (names, [group], path to r1, path to r2) from samples.txt
         self.samples_information = pd.read_csv(self.samples_txt_file, sep='\t', index_col=False)
 
-        if 'sample' not in self.samples_information.columns.values:
-            raise ConfigError("You know what. This '%s' file does not look anything like\
-                               a samples file." % self.samples_txt_file)
-
-        for sample in self.samples_information['sample']:
-            try:
-                u.check_sample_id(sample)
-            except ConfigError as e:
-                raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: \
-                                   %s" % (self.samples_txt_file, e))
-
-        self.sanity_check()
 
+        # get a list of the sample names
+        self.sample_names = list(self.samples_information['sample'])
         self.run_metaspades = self.get_param_value_from_config(['metaspades', 'run'])
         self.use_scaffold_from_metaspades = self.get_param_value_from_config(['metaspades', 'use_scaffolds'])
+        self.references_mode = self.get_param_value_from_config('references_mode', repress_default=True)
+        self.fasta_txt_file = self.get_param_value_from_config('fasta_txt', repress_default=True)
+
+        self.sanity_check()
 
 
     def sanity_check(self):
         self.sanity_check_for_samples_txt()
         self.sanity_check_for_kraken()
+        self.sanity_check_for_refereces_txt()
+
+
+    def sanity_check_for_refereces_txt(self):
+        if self.references_mode:
+            try:
+                filesnpaths.is_file_exists(self.fasta_txt_file)
+            except FilesNPathsError as e:
+                raise ConfigError('In references mode you must supply a fasta_txt file.')
+
+
+        if not self.references_mode:
+            # if it is reference mode then the group names have been assigned in the contigs Snakefile
+            # if it is not reference mode and no groups are supplied in the samples_txt then group names are sample names
+            self.group_names = self.sample_names
+
+        if self.fasta_txt_file and not self.references_mode:
+            raise ConfigError("In order to use reference fasta files you must set\
+                           \"'references_mode': true\" in your config file, yet\
+                           you didn't, but at the same time you supplied the following\
+                           fasta_txt: %s. So we don't know what to do with this\
+                           fasta_txt" % self.fasta_txt_file)
+
+        # Collecting information regarding groups.
+        if "group" in self.samples_information.columns:
+            # if groups were specified then members of a groups will be co-assembled.
+            self.group_names = list(self.samples_information['group'].unique())
+            # creating a dictionary with groups as keys and number of samples in
+            # the groups as values
+            self.group_sizes = self.samples_information['group'].value_counts().to_dict()
+
+            if self.references_mode:
+                # sanity check to see that groups specified in samples.txt match
+                # the names of fasta.
+                mismatch = set(self.group_names) - set(self.fasta_information.keys())
+                if mismatch:
+                    raise ConfigError("Group names specified in the samples.txt \
+                                       file must match the names of fasta \
+                                       in the fasta.txt file. These are the \
+                                       mismatches: %s" % mismatch)
+                groups_in_fasta_information_but_not_in_samples_txt = set(self.fasta_information.keys()) - set(self.group_names)
+                if groups_in_fasta_information_but_not_in_samples_txt:
+                    run.warning('The following group names appear in your fasta_txt\
+                                 but do not appear in your samples_txt. Maybe this is\
+                                 ok with you, but we thought you should know. This means\
+                                 that the metagenomics workflow will simply ignore these\
+                                 groups.')
+
+        else:
+            if self.references_mode:
+                # if the user didn't provide a group column in the samples.txt,
+                # in references mode the default is 'all_against_all'.
+                run.warning("No groups were provided in your samples_txt,\
+                             hence 'all_against_all' mode has been automatically\
+                             set to True.")
+                self.set_config_param('all_against_all', True)
+            else:
+                # if no groups were specified then each sample would be assembled
+                # separately
+                run.warning("No groups were specified in your samples_txt. This is fine.\
+                             But we thought you should know. Any assembly will be performed\
+                             on individual samples (i.e. NO co-assembly).")
+                self.samples_information['group'] = self.samples_information['sample']
+                self.group_names = list(self.sample_names)
+                self.group_sizes = dict.fromkeys(self.group_names,1)
+
+        if self.get_param_value_from_config('all_against_all', repress_default=True):
+            # in all_against_all, the size of each group is as big as the number
+            # of samples.
+            self.group_sizes = dict.fromkeys(self.group_names,len(self.sample_names))
+
+
+        if not self.references_mode and not (self.get_param_value_from_config(['anvi_script_reformat_fasta','run']) == True):
+            # in assembly mode (i.e. not in references mode) we always have
+            # to run reformat_fasta. The only reason for this is that
+            # the megahit output is temporary, and if we dont run
+            # reformat_fasta we will delete the output of meghit at the
+            # end of the workflow without saving a copy.
+            raise ConfigError("You can't skip reformat_fasta in assembly mode \
+                                please change your config.json file")
 
 
     def sanity_check_for_samples_txt(self):
+        if 'sample' not in self.samples_information.columns.values:
+            raise ConfigError("You know what. This '%s' file does not look anything like\
+                               a samples file." % self.samples_txt_file)
+
+        for sample in self.samples_information['sample']:
+            try:
+                u.check_sample_id(sample)
+            except ConfigError as e:
+                raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: \
+                                   %s" % (self.samples_txt_file, e))
+
         fastq_file_names = list(self.samples_information['r1']) + list(self.samples_information['r2'])
         bad_fastq_names = [s for s in fastq_file_names if (not s.endswith('.fastq') and not s.endswith('.fastq.gz'))]
         if bad_fastq_names:

diff --git a/anvio/workflows/pangenomics/Snakefile b/anvio/workflows/pangenomics/Snakefile
@@ -46,8 +46,7 @@ else:
 # the default name for the input fasta.txt file is "fasta.txt"
 fasta_txt = M.get_param_value_from_config("fasta_txt")
 filesnpaths.is_file_exists(fasta_txt)
-fasta_information = pd.read_csv(fasta_txt, sep='\t', index_col=0).to_dict(orient='index')
-ALL_SAMPLES = list(fasta_information.keys())
+ALL_SAMPLES = list(M.fasta_information.keys())
 
 internal_genomes_file = M.get_param_value_from_config(["anvi_gen_genomes_storage", "--internal-genomes"])
 external_genomes_file = M.get_param_value_from_config(["anvi_gen_genomes_storage", "--external-genomes"])