Merge pull request #360 from metagenome-atlas/dram
DRAM functional annotation
SilasK committed Sep 13, 2021
2 parents b848e0a + 8bc509d commit bc220ef
Showing 8 changed files with 303 additions and 16 deletions.
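In short, the PR wires a DRAM annotation subworkflow into atlas: a new `rules/dram.smk`, a pinned conda environment, a standalone `annotate.smk` entry point, and a `get_genome_folder` helper that replaces the module-level `genome_dir` variable in the existing rules. The final `dram` meta-rule collects the distillate and the KEGG-module table. A rough sketch (not part of this commit) of requesting that target through Snakemake's pre-v8 Python API; snakefile path, config file, and core count are placeholders:

import snakemake

# Hypothetical driver: ask the main workflow for the new "dram" meta-rule.
completed = snakemake.snakemake(
    "atlas/Snakefile",            # includes rules/dram.smk as of this PR
    targets=["dram"],             # distillate + kegg_modules.tsv
    configfiles=["config.yaml"],  # placeholder atlas config
    cores=8,
    use_conda=True,               # rules below declare conda envs (envs/dram.yaml)
)
assert completed, "DRAM annotation workflow failed"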
1 change: 1 addition & 0 deletions atlas/Snakefile
@@ -221,6 +221,7 @@ include: "rules/binning.smk"
 include: "rules/genomes.smk"
 include: "rules/genecatalog.smk"
 include: "rules/gtdbtk.smk"
+include: "rules/dram.smk"
 
 
 CONDAENV = "envs"  # overwrite definition in download.smk
20 changes: 20 additions & 0 deletions atlas/annotate.smk
@@ -0,0 +1,20 @@
import os
import re
import sys
import tempfile
#import pandas as pd
#import numpy as np

from snakemake.utils import logger, min_version

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(workflow.snakefile)),"scripts"))
import utils

from conf import update_config
config = update_config(config)

TMPDIR = config.get('tmpdir', tempfile.gettempdir())

#CONDAENV = "envs" # overwrite definition in download.smk

include: "rules/dram.smk"
6 changes: 6 additions & 0 deletions atlas/envs/dram.yaml
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- dram=1.2
206 changes: 206 additions & 0 deletions atlas/rules/dram.smk
@@ -0,0 +1,206 @@

DBDIR = config['database_dir']




def get_dram_config(wildcards):
    # a user-supplied dram_config_file takes precedence over the downloaded default
    return config.get('dram_config_file', f"{DBDIR}/DRAM.config")

rule dram_download:
    output:
        dbdir=directory(f"{DBDIR}/Dram/"),
        config=f"{DBDIR}/DRAM.config",
    threads:
        config['threads']
    resources:
        mem=config['mem'],
        time=config["runtime"]["default"],
    log:
        "log/dram/download_dram.log"
    benchmark:
        "log/benchmarks/dram/download_dram.tsv"
    conda:
        "../envs/dram.yaml"
    shell:
        " DRAM-setup.py prepare_databases "
        " --output_dir {output.dbdir} "
        " --threads {threads} "
        " --verbose "
        " --skip_uniref "
        " &> {log} "
        " ; "
        " DRAM-setup.py export_config --output_file {output.config}"




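Note the config round trip: `dram_download` exports DRAM's database locations to a standalone file, and `DRAM_set_db_loc` below re-imports them into whichever conda environment runs the annotation rules. A minimal sketch of the same hand-off outside Snakemake (the subcommands are taken from the rules themselves; paths are hypothetical):

import subprocess

config_file = "databases/DRAM.config"  # hypothetical location

# after prepare_databases: persist the database locations ...
subprocess.run(
    ["DRAM-setup.py", "export_config", "--output_file", config_file],
    check=True,
)

# ... and load them back in a fresh environment before DRAM.py annotate runs
subprocess.run(
    ["DRAM-setup.py", "import_config", "--config_loc", config_file],
    check=True,
)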

localrules: DRAM_set_db_loc, get_lists_of_genomes

rule DRAM_set_db_loc:
    input:
        get_dram_config
    output:
        touch("logs/dram_config_imported")
    threads:
        1
    conda:
        "../envs/dram.yaml"
    shell:
        "DRAM-setup.py import_config --config_loc {input}"


checkpoint get_lists_of_genomes:
    input:
        get_genome_folder
    output:
        directory(temp("annotations/dram/genome_lists"))
    run:
        from glob import glob

        all_fasta_files = glob(input[0] + "/*.fasta")

        if len(all_fasta_files) == 0:
            raise Exception(f"No genome fasta files found in folder {input[0]}")

        os.makedirs(output[0])

        N = 10  # genomes per subset list, not the number of subsets
        for i in range(0, len(all_fasta_files) // N + 1):
            subset = all_fasta_files[i * N:(i + 1) * N]
            if not subset:
                continue  # skip the empty tail when the genome count is a multiple of N
            with open(output[0] + f'/subset_{i+1}.txt', 'w') as outf:
                outf.write(' '.join(subset))


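A quick illustration of the batching above: with 23 genomes and N = 10, three subset lists are written (sizes 10, 10, 3). A self-contained sketch, file names hypothetical:

# standalone rerun of the batching logic in get_lists_of_genomes
all_fasta_files = [f"genomes/MAG{i:03d}.fasta" for i in range(23)]  # hypothetical

N = 10
subsets = [all_fasta_files[i * N:(i + 1) * N] for i in range(len(all_fasta_files) // N + 1)]
subsets = [s for s in subsets if s]  # drop the empty tail when len is a multiple of N

assert [len(s) for s in subsets] == [10, 10, 3]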

def dram_annotate_input(wildcards, input):

    fasta_files = open(input.genome_list).read().strip().split()

    assert len(fasta_files) > 0, f"Empty genome list: {input.genome_list}"

    return ' '.join(f'--input_fasta {fasta}' for fasta in fasta_files)

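The helper above expands a subset list into repeated `--input_fasta` flags for `DRAM.py annotate`; for a two-genome list (names hypothetical) the expansion looks like this:

fasta_files = ["genomes/MAG001.fasta", "genomes/MAG002.fasta"]
flags = ' '.join(f'--input_fasta {fasta}' for fasta in fasta_files)
assert flags == "--input_fasta genomes/MAG001.fasta --input_fasta genomes/MAG002.fasta"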
rule DRAM_annotate:
    input:
        genome_folder=get_genome_folder,
        genome_list="annotations/dram/genome_lists/{subset}.txt",
        flag=rules.DRAM_set_db_loc.output,
    output:
        outdir=directory("annotations/dram/intermediate_files/{subset}")
    threads:
        config['threads']
    resources:
        mem=config['simplejob_mem'],
        time=config['runtime']['default'],
    conda:
        "../envs/dram.yaml"
    shadow:
        "shallow"
    params:
        gtdb_file="gtdbtk.bac120.summary.tsv",  # currently unused in the shell command
        input=dram_annotate_input,
    log:
        "log/dram/run_dram/{subset}.log"
    benchmark:
        "log/benchmarks/dram/run_dram/{subset}.tsv"
    shell:
        " DRAM.py annotate "
        " {params.input} "
        " --output_dir {output.outdir} "
        " --prodigal_mode single "
        " --threads {threads} "
        " --verbose &> {log}"


def get_all_dram(wildcards):

    genome_lists_folder = checkpoints.get_lists_of_genomes.get().output[0]

    all_subsets = glob_wildcards(os.path.join(genome_lists_folder, "{subset}.txt")).subset

    if len(all_subsets) == 0:
        raise Exception(
            f"No genome list file is found in {genome_lists_folder}\n"
            "Remove this empty folder and start anew."
        )

    return expand(rules.DRAM_annotate.output.outdir, subset=all_subsets)

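`get_all_dram` recovers the `{subset}` stems from whatever files the checkpoint wrote; a small illustration of `glob_wildcards` on a throwaway folder (assuming snakemake is importable):

import os
import tempfile
from snakemake.io import glob_wildcards

tmp = tempfile.mkdtemp()
for name in ("subset_1.txt", "subset_2.txt"):
    open(os.path.join(tmp, name), "w").close()

subsets = glob_wildcards(os.path.join(tmp, "{subset}.txt")).subset
assert sorted(subsets) == ["subset_1", "subset_2"]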

DRAM_ANNOTATION_FILES = ['annotations.tsv', 'rrnas.tsv', 'trnas.tsv']

localrules: concat_annotations

rule concat_annotations:
    input:
        get_all_dram
    output:
        expand("annotations/dram/{annotation}", annotation=DRAM_ANNOTATION_FILES)
    resources:
        time=config['runtime']['default'],
        # mem = config['mem']
    run:
        # from utils import io
        for i, annotation_file in enumerate(DRAM_ANNOTATION_FILES):

            input_files = [os.path.join(dram_folder, annotation_file) for dram_folder in input]

            # rrnas.tsv and trnas.tsv are only written when DRAM finds hits;
            # keep only the files that exist (i == 0 is annotations.tsv, always present)
            if i != 0:
                input_files = [f for f in input_files if os.path.exists(f)]

            # header from the first file, then the data rows of every file
            shell(f"head -n1 {input_files[0]} > {output[i]} ")
            for f in input_files:
                shell(f"tail -n+2 {f} >> {output[i]}")

            # io.pandas_concat(input_files, output[i], sep='\t', index_col=0, axis=0)
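The commented-out `io.pandas_concat` call above hints at the intended pandas alternative to the `head`/`tail` shell concatenation; a minimal sketch of the same operation:

import pandas as pd

def concat_tables(input_files, output_file):
    # read every per-subset table, stack rows, write one table with a single header
    tables = [pd.read_csv(f, sep='\t', index_col=0) for f in input_files]
    pd.concat(tables, axis=0).sort_index().to_csv(output_file, sep='\t')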

rule DRAM_distill:
    input:
        expand("annotations/dram/{annotation}", annotation=DRAM_ANNOTATION_FILES),
        flag=rules.DRAM_set_db_loc.output,
    output:
        outdir=directory("annotations/dram/distil")
    threads:
        1
    resources:
        mem=config['mem'],
        time=config['runtime']['default'],
    conda:
        "../envs/dram.yaml"
    log:
        "log/dram/distil.log"
    shell:
        # the positional inputs follow DRAM_ANNOTATION_FILES:
        # annotations.tsv, rrnas.tsv, trnas.tsv
        " DRAM.py distill "
        " --input_file {input[0]} "
        " --rrna_path {input[1]} "
        " --trna_path {input[2]} "
        " --output_dir {output} "
        " &> {log}"


rule get_all_modules:
    input:
        "annotations/dram/annotations.tsv",
    output:
        "annotations/dram/kegg_modules.tsv"
    threads:
        1
    resources:
        mem=config['simplejob_mem'],
        time=config['runtime']['default'],
    conda:
        "../envs/dram.yaml"
    log:
        "log/dram/get_all_modules.log"
    script:
        "../scripts/DRAM_get_all_modules.py"


rule dram:
    input:
        "annotations/dram/distil",
        "annotations/dram/kegg_modules.tsv",
32 changes: 19 additions & 13 deletions atlas/rules/genomes.smk
@@ -1,9 +1,3 @@
if "genome_dir" in config:
genome_dir = config["genome_dir"]
assert os.path.exists(genome_dir), f"genome_dir ({genome_dir}) doesn't exist"
else:
genome_dir = "genomes/genomes"


## dRep
localrules:
@@ -173,11 +167,23 @@ checkpoint rename_genomes:
         "rename_genomes.py"
 
 
-def get_genomes_(wildcards):
+def get_genome_folder(wildcards):
+
+    if 'genome_dir' in config:
+
+        genome_dir = config['genome_dir']
+
+        assert os.path.exists(genome_dir), f"genome_dir ({genome_dir}) doesn't exist"
+
+        if genome_dir == "genomes/genomes":
+            checkpoints.rename_genomes.get()  # make sure the checkpoint has passed
+    else:
+        genome_dir = checkpoints.rename_genomes.get().output.dir
+
+    return genome_dir
+
+
+def get_genomes_(wildcards):
+
+    genome_dir = get_genome_folder(wildcards)
     genomes = glob_wildcards(os.path.join(genome_dir, "{genome}.fasta")).genome
 
     if len(genomes) == 0:
@@ -195,7 +201,7 @@ def get_genomes_(wildcards):
 rule run_all_checkm_lineage_wf:
     input:
         touched_output="logs/checkm_init.txt",
-        dir=genome_dir,
+        dir=get_genome_folder,
     output:
         "genomes/checkm/completeness.tsv",
         "genomes/checkm/storage/tree/concatenated.fasta",
@@ -230,7 +236,7 @@ rule run_all_checkm_lineage_wf:
 
 rule build_db_genomes:
     input:
-        genome_dir,
+        get_genome_folder,
     output:
         index="ref/genome/3/summary.txt",
         fasta=temp("genomes/all_contigs.fasta"),
@@ -422,7 +428,7 @@ rule combine_bined_coverages_MAGs:
 
 # rule predict_genes_genomes:
 #     input:
-#         dir=genome_dir
+#         dir=get_genome_folder
 #     output:
 #         directory("genomes/annotations/genes")
 #     conda:
@@ -514,7 +520,7 @@ rule run_prokka_bins:
 
 
 def genome_all_prokka_input(wildcards):
-    genome_dir = genome_dir
+    genome_dir = get_genome_folder(wildcards)
     return expand(
         "genomes/annotations/prokka/{genome}/{genome}.tsv",
         genome=glob_wildcards(os.path.join(genome_dir, "{genome}.fasta")).genome,
4 changes: 2 additions & 2 deletions atlas/rules/gtdbtk.smk
@@ -3,7 +3,7 @@ gtdb_dir = "genomes/taxonomy/gtdb"
 
 rule identify:
     input:
-        dir=genome_dir,
+        dir=get_genome_folder,
         flag=rules.download_gtdb.output,
     output:
         directory(f"{gtdb_dir}/identify"),
@@ -45,7 +45,7 @@ checkpoint align:
 rule classify:
     input:
         rules.align.output,
-        genome_dir=genome_dir,
+        genome_dir=get_genome_folder,
     output:
         directory(f"{gtdb_dir}/classify"),
     threads: config["threads"]  # pplacer needs much memory for not many threads
44 changes: 44 additions & 0 deletions atlas/scripts/DRAM_get_all_modules.py
@@ -0,0 +1,44 @@
#! /usr/bin/env python3


import os
import sys
import logging
import traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logger.error(
        ''.join(
            ["Uncaught exception: ",
             *traceback.format_exception(exc_type, exc_value, exc_traceback)]
        )
    )


# Install exception handler
sys.excepthook = handle_exception


import pandas as pd

annotation_file = snakemake.input[0]
module_output_table = snakemake.output[0]

from mag_annotator.utils import get_database_locs
from mag_annotator.summarize_genomes import build_module_net, make_module_coverage_frame

annotations = pd.read_csv(annotation_file, sep='\t', index_col=0)

db_locs = get_database_locs()
if 'module_step_form' not in db_locs:
    raise ValueError('Module step form location must be set in order to summarize genomes')

module_steps_form = pd.read_csv(db_locs['module_step_form'], sep='\t')

all_module_nets = {
    module: build_module_net(module_df)
    for module, module_df in module_steps_form.groupby('module')
}

# module coverage per genome; groupby_column='fasta' groups hits by source MAG
module_coverage_frame = make_module_coverage_frame(annotations, all_module_nets, groupby_column='fasta')

module_coverage_frame.to_csv(module_output_table, sep='\t')
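
Downstream, the module table can be inspected with pandas; a quick sketch (the column layout is whatever `make_module_coverage_frame` emits and is not pinned down here):

import pandas as pd

modules = pd.read_csv("annotations/dram/kegg_modules.tsv", sep='\t', index_col=0)
print(modules.shape)
print(modules.head())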
6 changes: 5 additions & 1 deletion atlas/scripts/utils/io.py
@@ -84,4 +84,8 @@ def pandas_concat(
 
     out = pd.concat(Tables, axis=axis, **concat_arguments).sort_index()
 
-    out.to_csv(output_table, sep=sep, **save_arguments)
+    del Tables  # drop the per-file frames before writing the concatenated table
+
+    out.to_csv(output_table, sep=sep, **save_arguments)
+
+    del out

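For reference, the helper's call pattern matches the commented-out line in `concat_annotations` above; a hedged usage sketch with hypothetical file names:

from utils import io

# stack several per-subset annotation tables into one, keeping a single header row
io.pandas_concat(
    ["annotations/dram/intermediate_files/subset_1/annotations.tsv",
     "annotations/dram/intermediate_files/subset_2/annotations.tsv"],
    "annotations/dram/annotations.tsv",
    sep='\t', index_col=0, axis=0,
)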