Merge pull request #360 from metagenome-atlas/dram
DRAM functional annotation
SilasK committed Sep 13, 2021
2 parents b848e0a + 8bc509d commit bc220ef
Showing 8 changed files with 303 additions and 16 deletions.
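In short, the PR wires a DRAM annotation subworkflow into atlas: a new `rules/dram.smk`, a pinned conda environment, a standalone `annotate.smk` entry point, and a `get_genome_folder` helper that replaces the module-level `genome_dir` variable in the existing rules. The final `dram` meta-rule collects the distillate and the KEGG-module table. A rough sketch (not part of this commit) of requesting that target through Snakemake's pre-v8 Python API; snakefile path, config file, and core count are placeholders:

import snakemake

# Hypothetical driver: ask the main workflow for the new "dram" meta-rule.
completed = snakemake.snakemake(
    "atlas/Snakefile",            # includes rules/dram.smk as of this PR
    targets=["dram"],             # distillate + kegg_modules.tsv
    configfiles=["config.yaml"],  # placeholder atlas config
    cores=8,
    use_conda=True,               # rules below declare conda envs (envs/dram.yaml)
)
assert completed, "DRAM annotation workflow failed"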
1 change: 1 addition & 0 deletions atlas/Snakefile
@@ -221,6 +221,7 @@ include: "rules/binning.smk"
 include: "rules/genomes.smk"
 include: "rules/genecatalog.smk"
 include: "rules/gtdbtk.smk"
+include: "rules/dram.smk"
 
 
 CONDAENV = "envs"  # overwrite definition in download.smk
20 changes: 20 additions & 0 deletions atlas/annotate.smk
@@ -0,0 +1,20 @@
import os
import re
import sys
import tempfile
#import pandas as pd
#import numpy as np

from snakemake.utils import logger, min_version

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(workflow.snakefile)),"scripts"))
import utils

from conf import update_config
config = update_config(config)

TMPDIR = config.get('tmpdir', tempfile.gettempdir())

#CONDAENV = "envs" # overwrite definition in download.smk

include: "rules/dram.smk"
6 changes: 6 additions & 0 deletions atlas/envs/dram.yaml
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- dram=1.2
206 changes: 206 additions & 0 deletions atlas/rules/dram.smk
@@ -0,0 +1,206 @@

DBDIR = config['database_dir']




def get_dram_config(wildcards):
    # a user-supplied dram_config_file takes precedence over the downloaded default
    return config.get('dram_config_file', f"{DBDIR}/DRAM.config")

rule dram_download:
    output:
        dbdir=directory(f"{DBDIR}/Dram/"),
        config=f"{DBDIR}/DRAM.config",
    threads:
        config['threads']
    resources:
        mem=config['mem'],
        time=config["runtime"]["default"],
    log:
        "log/dram/download_dram.log"
    benchmark:
        "log/benchmarks/dram/download_dram.tsv"
    conda:
        "../envs/dram.yaml"
    shell:
        " DRAM-setup.py prepare_databases "
        " --output_dir {output.dbdir} "
        " --threads {threads} "
        " --verbose "
        " --skip_uniref "
        " &> {log} "
        " ; "
        " DRAM-setup.py export_config --output_file {output.config}"




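Note the config round trip: `dram_download` exports DRAM's database locations to a standalone file, and `DRAM_set_db_loc` below re-imports them into whichever conda environment runs the annotation rules. A minimal sketch of the same hand-off outside Snakemake (the subcommands are taken from the rules themselves; paths are hypothetical):

import subprocess

config_file = "databases/DRAM.config"  # hypothetical location

# after prepare_databases: persist the database locations ...
subprocess.run(
    ["DRAM-setup.py", "export_config", "--output_file", config_file],
    check=True,
)

# ... and load them back in a fresh environment before DRAM.py annotate runs
subprocess.run(
    ["DRAM-setup.py", "import_config", "--config_loc", config_file],
    check=True,
)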

localrules: DRAM_set_db_loc, get_lists_of_genomes

rule DRAM_set_db_loc:
    input:
        get_dram_config
    output:
        touch("logs/dram_config_imported")
    threads:
        1
    conda:
        "../envs/dram.yaml"
    shell:
        "DRAM-setup.py import_config --config_loc {input}"


checkpoint get_lists_of_genomes:
    input:
        get_genome_folder
    output:
        directory(temp("annotations/dram/genome_lists"))
    run:
        from glob import glob

        all_fasta_files = glob(input[0] + "/*.fasta")

        if len(all_fasta_files) == 0:
            raise Exception(f"No genome fasta files found in folder {input[0]}")

        os.makedirs(output[0])

        N = 10  # genomes per subset list, not the number of subsets
        for i in range(0, len(all_fasta_files) // N + 1):
            subset = all_fasta_files[i * N:(i + 1) * N]
            if not subset:
                continue  # skip the empty tail when the genome count is a multiple of N
            with open(output[0] + f'/subset_{i+1}.txt', 'w') as outf:
                outf.write(' '.join(subset))


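A quick illustration of the batching above: with 23 genomes and N = 10, three subset lists are written (sizes 10, 10, 3). A self-contained sketch, file names hypothetical:

# standalone rerun of the batching logic in get_lists_of_genomes
all_fasta_files = [f"genomes/MAG{i:03d}.fasta" for i in range(23)]  # hypothetical

N = 10
subsets = [all_fasta_files[i * N:(i + 1) * N] for i in range(len(all_fasta_files) // N + 1)]
subsets = [s for s in subsets if s]  # drop the empty tail when len is a multiple of N

assert [len(s) for s in subsets] == [10, 10, 3]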

def dram_annotate_input(wildcards, input):

    fasta_files = open(input.genome_list).read().strip().split()

    assert len(fasta_files) > 0, f"Empty genome list: {input.genome_list}"

    return ' '.join(f'--input_fasta {fasta}' for fasta in fasta_files)

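The helper above expands a subset list into repeated `--input_fasta` flags for `DRAM.py annotate`; for a two-genome list (names hypothetical) the expansion looks like this:

fasta_files = ["genomes/MAG001.fasta", "genomes/MAG002.fasta"]
flags = ' '.join(f'--input_fasta {fasta}' for fasta in fasta_files)
assert flags == "--input_fasta genomes/MAG001.fasta --input_fasta genomes/MAG002.fasta"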
rule DRAM_annotate:
    input:
        genome_folder=get_genome_folder,
        genome_list="annotations/dram/genome_lists/{subset}.txt",
        flag=rules.DRAM_set_db_loc.output,
    output:
        outdir=directory("annotations/dram/intermediate_files/{subset}")
    threads:
        config['threads']
    resources:
        mem=config['simplejob_mem'],
        time=config['runtime']['default'],
    conda:
        "../envs/dram.yaml"
    shadow:
        "shallow"
    params:
        gtdb_file="gtdbtk.bac120.summary.tsv",  # currently unused in the shell command
        input=dram_annotate_input,
    log:
        "log/dram/run_dram/{subset}.log"
    benchmark:
        "log/benchmarks/dram/run_dram/{subset}.tsv"
    shell:
        " DRAM.py annotate "
        " {params.input} "
        " --output_dir {output.outdir} "
        " --prodigal_mode single "
        " --threads {threads} "
        " --verbose &> {log}"


def get_all_dram(wildcards):

    genome_lists_folder = checkpoints.get_lists_of_genomes.get().output[0]

    all_subsets = glob_wildcards(os.path.join(genome_lists_folder, "{subset}.txt")).subset

    if len(all_subsets) == 0:
        raise Exception(
            f"No genome list file is found in {genome_lists_folder}\n"
            "Remove this empty folder and start anew."
        )

    return expand(rules.DRAM_annotate.output.outdir, subset=all_subsets)

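`get_all_dram` recovers the `{subset}` stems from whatever files the checkpoint wrote; a small illustration of `glob_wildcards` on a throwaway folder (assuming snakemake is importable):

import os
import tempfile
from snakemake.io import glob_wildcards

tmp = tempfile.mkdtemp()
for name in ("subset_1.txt", "subset_2.txt"):
    open(os.path.join(tmp, name), "w").close()

subsets = glob_wildcards(os.path.join(tmp, "{subset}.txt")).subset
assert sorted(subsets) == ["subset_1", "subset_2"]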

DRAM_ANNOTATION_FILES = ['annotations.tsv', 'rrnas.tsv', 'trnas.tsv']

localrules: concat_annotations

rule concat_annotations:
    input:
        get_all_dram
    output:
        expand("annotations/dram/{annotation}", annotation=DRAM_ANNOTATION_FILES)
    resources:
        time=config['runtime']['default'],
        # mem = config['mem']
    run:
        # from utils import io
        for i, annotation_file in enumerate(DRAM_ANNOTATION_FILES):

            input_files = [os.path.join(dram_folder, annotation_file) for dram_folder in input]

            # rrnas.tsv and trnas.tsv are only written when DRAM finds hits;
            # keep only the files that exist (i == 0 is annotations.tsv, always present)
            if i != 0:
                input_files = [f for f in input_files if os.path.exists(f)]

            # header from the first file, then the data rows of every file
            shell(f"head -n1 {input_files[0]} > {output[i]} ")
            for f in input_files:
                shell(f"tail -n+2 {f} >> {output[i]}")

            # io.pandas_concat(input_files, output[i], sep='\t', index_col=0, axis=0)
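The commented-out `io.pandas_concat` call above hints at the intended pandas alternative to the `head`/`tail` shell concatenation; a minimal sketch of the same operation:

import pandas as pd

def concat_tables(input_files, output_file):
    # read every per-subset table, stack rows, write one table with a single header
    tables = [pd.read_csv(f, sep='\t', index_col=0) for f in input_files]
    pd.concat(tables, axis=0).sort_index().to_csv(output_file, sep='\t')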

rule DRAM_distill:
    input:
        expand("annotations/dram/{annotation}", annotation=DRAM_ANNOTATION_FILES),
        flag=rules.DRAM_set_db_loc.output,
    output:
        outdir=directory("annotations/dram/distil")
    threads:
        1
    resources:
        mem=config['mem'],
        time=config['runtime']['default'],
    conda:
        "../envs/dram.yaml"
    log:
        "log/dram/distil.log"
    shell:
        # the positional inputs follow DRAM_ANNOTATION_FILES:
        # annotations.tsv, rrnas.tsv, trnas.tsv
        " DRAM.py distill "
        " --input_file {input[0]} "
        " --rrna_path {input[1]} "
        " --trna_path {input[2]} "
        " --output_dir {output} "
        " &> {log}"


rule get_all_modules:
    input:
        "annotations/dram/annotations.tsv",
    output:
        "annotations/dram/kegg_modules.tsv"
    threads:
        1
    resources:
        mem=config['simplejob_mem'],
        time=config['runtime']['default'],
    conda:
        "../envs/dram.yaml"
    log:
        "log/dram/get_all_modules.log"
    script:
        "../scripts/DRAM_get_all_modules.py"


rule dram:
    input:
        "annotations/dram/distil",
        "annotations/dram/kegg_modules.tsv",
32 changes: 19 additions & 13 deletions atlas/rules/genomes.smk
@@ -1,9 +1,3 @@
if "genome_dir" in config:
genome_dir = config["genome_dir"]
assert os.path.exists(genome_dir), f"genome_dir ({genome_dir}) doesn't exist"
else:
genome_dir = "genomes/genomes"


## dRep
localrules:
@@ -173,11 +167,23 @@ checkpoint rename_genomes:
         "rename_genomes.py"
 
 
-def get_genomes_(wildcards):
+def get_genome_folder(wildcards):
+
+    if 'genome_dir' in config:
+
+        genome_dir = config['genome_dir']
+
+        assert os.path.exists(genome_dir), f"genome_dir ({genome_dir}) doesn't exist"
+
+        if genome_dir == "genomes/genomes":
+            checkpoints.rename_genomes.get()  # make sure the checkpoint has passed
+    else:
+        genome_dir = checkpoints.rename_genomes.get().output.dir
+
+    return genome_dir
+
+
+def get_genomes_(wildcards):
+
+    genome_dir = get_genome_folder(wildcards)
     genomes = glob_wildcards(os.path.join(genome_dir, "{genome}.fasta")).genome
 
     if len(genomes) == 0:
@@ -195,7 +201,7 @@ def get_genomes_(wildcards):
 rule run_all_checkm_lineage_wf:
     input:
         touched_output="logs/checkm_init.txt",
-        dir=genome_dir,
+        dir=get_genome_folder,
     output:
         "genomes/checkm/completeness.tsv",
         "genomes/checkm/storage/tree/concatenated.fasta",
@@ -230,7 +236,7 @@ rule run_all_checkm_lineage_wf:
 
 rule build_db_genomes:
     input:
-        genome_dir,
+        get_genome_folder,
     output:
         index="ref/genome/3/summary.txt",
         fasta=temp("genomes/all_contigs.fasta"),
@@ -422,7 +428,7 @@ rule combine_bined_coverages_MAGs:
 
 # rule predict_genes_genomes:
 #     input:
-#         dir=genome_dir
+#         dir=get_genome_folder
 #     output:
 #         directory("genomes/annotations/genes")
 #     conda:
@@ -514,7 +520,7 @@ rule run_prokka_bins:
 
 
 def genome_all_prokka_input(wildcards):
-    genome_dir = genome_dir
+    genome_dir = get_genome_folder(wildcards)
     return expand(
         "genomes/annotations/prokka/{genome}/{genome}.tsv",
         genome=glob_wildcards(os.path.join(genome_dir, "{genome}.fasta")).genome,
4 changes: 2 additions & 2 deletions atlas/rules/gtdbtk.smk
@@ -3,7 +3,7 @@ gtdb_dir = "genomes/taxonomy/gtdb"
 
 rule identify:
     input:
-        dir=genome_dir,
+        dir=get_genome_folder,
         flag=rules.download_gtdb.output,
     output:
         directory(f"{gtdb_dir}/identify"),
@@ -45,7 +45,7 @@ checkpoint align:
 rule classify:
     input:
         rules.align.output,
-        genome_dir=genome_dir,
+        genome_dir=get_genome_folder,
     output:
         directory(f"{gtdb_dir}/classify"),
     threads: config["threads"]  # pplacer needs much memory for not many threads
44 changes: 44 additions & 0 deletions atlas/scripts/DRAM_get_all_modules.py
@@ -0,0 +1,44 @@
#! /usr/bin/env python3


import os
import sys
import logging
import traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logger.error(
        ''.join(
            ["Uncaught exception: ",
             *traceback.format_exception(exc_type, exc_value, exc_traceback)]
        )
    )


# Install exception handler
sys.excepthook = handle_exception


import pandas as pd

annotation_file = snakemake.input[0]
module_output_table = snakemake.output[0]

from mag_annotator.utils import get_database_locs
from mag_annotator.summarize_genomes import build_module_net, make_module_coverage_frame

annotations = pd.read_csv(annotation_file, sep='\t', index_col=0)

db_locs = get_database_locs()
if 'module_step_form' not in db_locs:
    raise ValueError('Module step form location must be set in order to summarize genomes')

module_steps_form = pd.read_csv(db_locs['module_step_form'], sep='\t')

all_module_nets = {
    module: build_module_net(module_df)
    for module, module_df in module_steps_form.groupby('module')
}

# module coverage per genome; groupby_column='fasta' groups hits by source MAG
module_coverage_frame = make_module_coverage_frame(annotations, all_module_nets, groupby_column='fasta')

module_coverage_frame.to_csv(module_output_table, sep='\t')
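
Downstream, the module table can be inspected with pandas; a quick sketch (the column layout is whatever `make_module_coverage_frame` emits and is not pinned down here):

import pandas as pd

modules = pd.read_csv("annotations/dram/kegg_modules.tsv", sep='\t', index_col=0)
print(modules.shape)
print(modules.head())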
6 changes: 5 additions & 1 deletion atlas/scripts/utils/io.py
@@ -84,4 +84,8 @@ def pandas_concat(
 
     out = pd.concat(Tables, axis=axis, **concat_arguments).sort_index()
 
-    out.to_csv(output_table, sep=sep, **save_arguments)
+    del Tables  # drop the per-file frames before writing the concatenated table
+
+    out.to_csv(output_table, sep=sep, **save_arguments)
+
+    del out

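For reference, the helper's call pattern matches the commented-out line in `concat_annotations` above; a hedged usage sketch with hypothetical file names:

from utils import io

# stack several per-subset annotation tables into one, keeping a single header row
io.pandas_concat(
    ["annotations/dram/intermediate_files/subset_1/annotations.tsv",
     "annotations/dram/intermediate_files/subset_2/annotations.tsv"],
    "annotations/dram/annotations.tsv",
    sep='\t', index_col=0, axis=0,
)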