Skip to content

Commit

Permalink
support great KMCP
Browse files Browse the repository at this point in the history
  • Loading branch information
alienzj committed May 29, 2022
1 parent fd1e145 commit 0cf75f3
Show file tree
Hide file tree
Showing 7 changed files with 243 additions and 0 deletions.
28 changes: 28 additions & 0 deletions quantpi/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,33 @@ params:
reads_len: 100
level: ["G", "S"]

# DNA-to-DNA
kmcp:
do:
bacteriome: True
mycobiome: True
virome: True
database:
bacteriome: /home/zhujie/databases/ecogenomics/KMCP/bacteriome/GTDB_r202/gtdb.kmcp
mycobiome: /home/zhujie/databases/ecogenomics/KMCP/mycobiome/RefSeq_r208/refseq-fungi.kmcp
virome: /home/zhujie/databases/ecogenomics/KMCP/virome/GenBank_246/genbank-viral.kmcp
ncbi_taxdump: /home/zhujie/.taxonkit
search:
reads_mode: "Single-end" # "Paired-end" # Recommended Single-end
min_query_len: 30
min_query_cov: 0.55
external_opts: ""
profile:
min_query_cov: 0.55
min_chunks_reads: 50
min_chunks_fraction: 0.8
max_chunks_depth_stdev: 2
min_uniq_reads: 20
min_hic_ureads: 5
min_hic_ureads_qcov: 0.75
min_hic_ureads_prop: 0.1
external_opts: ""

# DNA-to-marker
metaphlan:
do_v2: False
Expand Down Expand Up @@ -253,6 +280,7 @@ envs:
simulate: "envs/simulate.yaml"
soap: "envs/soap.yaml"
kraken2: "envs/kraken2.yaml"
kmcp: "envs/kmcp.yaml"
metaphlan2: "envs/biobakery2.yaml"
metaphlan3: "envs/biobakery3.yaml"
humann2: "envs/biobakery2.yaml"
Expand Down
6 changes: 6 additions & 0 deletions quantpi/envs/kmcp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- kmcp=0.8.2
15 changes: 15 additions & 0 deletions quantpi/profiles/sge/cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,21 @@ profiling_bracken_merge:
output: "logs/04.{rule}/{rule}.{wildcards.level}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.level}.{jobid}.e"

profiling_kmcp_search:
mem: "1G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.kmcp_db}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.kmcp_db}.{jobid}.e"

profiling_kmcp_search_merge:
mem: "1G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.sample}.{jobid}.e"

profiling_kmcp_profile:
mem: "1G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.profiling_mode}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.profiling_mode}.{jobid}.e"

profiling_bgi_soap:
mem: "3G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{jobid}.o"
Expand Down
15 changes: 15 additions & 0 deletions quantpi/profiles/slurm/cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,21 @@ profiling_bracken_merge:
output: "logs/04.{rule}/{rule}.{wildcards.level}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.level}.{jobid}.e"

profiling_kmcp_search:
mem: "1G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.kmcp_db}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.kmcp_db}.{jobid}.e"

profiling_kmcp_search_merge:
mem: "1G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.sample}.{jobid}.e"

profiling_kmcp_profile:
mem: "1G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.profiling_mode}.{jobid}.o"
error: "logs/04.{rule}/{rule}.{wildcards.sample}.{wildcards.profiling_mode}.{jobid}.e"

profiling_bgi_soap:
mem: "3G"
output: "logs/04.{rule}/{rule}.{wildcards.sample}.{jobid}.o"
Expand Down
2 changes: 2 additions & 0 deletions quantpi/rules/profiling_function.smk
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,7 @@ rule profiling_all:
rules.profiling_bracken_all.input,
rules.profiling_metaphlan2_all.input,
rules.profiling_metaphlan3_all.input,
rules.profiling_kmcp_all.input,
rules.profiling_bgi_soap_all.input,
rules.profiling_bowtie2_all.input,
rules.profiling_jgi_all.input,
Expand All @@ -803,6 +804,7 @@ localrules:
profiling_bracken_all,
profiling_metaphlan2_all,
profiling_metaphlan3_all,
profiling_kmcp_all,
profiling_bgi_soap_all,
profiling_bowtie2_all,
profiling_jgi_all,
Expand Down
163 changes: 163 additions & 0 deletions quantpi/rules/profiling_kmer.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
if len(KMCP_DBS) > 0:
rule profiling_kmcp_search:
input:
reads = profiling_input_with_short_reads,
db_dir = lambda wildcards: config["params"]["profiling"]["kmcp"]["database"][wildcards.kmcp_db]
output:
os.path.join(config["output"]["profiling"],
"search/kmcp/{sample}/{sample}.kmcp_search@{kmcp_db}.tsv.gz")
conda:
config["envs"]["kmcp"]
log:
os.path.join(config["output"]["profiling"],
"logs/kmcp/search/{sample}.kmcp_search@{kmcp_db}.log")
benchmark:
os.path.join(config["output"]["profiling"],
"benchmark/kmcp/search/{sample}.kmcp_search@{kmcp_db}.benchmark.txt")
params:
reads_mode = config["params"]["profiling"]["kmcp"]["search"]["reads_mode"],
reads_layout = 1 if IS_PE else 0,
min_query_len = config["params"]["profiling"]["kmcp"]["search"]["min_query_len"],
min_query_cov = config["params"]["profiling"]["kmcp"]["search"]["min_query_cov"],
external_opts = config["params"]["profiling"]["kmcp"]["search"]["external_opts"]
priority:
20
threads:
config["params"]["profiling"]["threads"]
shell:
'''
if [ {params.reads_layout} -eq 1 ] && [ "{params.reads_mode}" == "Paired-end" ]
then
kmcp search \
--threads {threads} \
--load-whole-db \
--db-dir {input.db_dir} \
--min-query-len {params.min_query_len} \
--min-query-cov {params.min_query_cov} \
{params.external_opts} \
-1 {input.reads[0]} \
-2 {input.reads[1]} \
--out-file {output} \
--log {log}
else
kmcp search \
--threads {threads} \
--load-whole-db \
--db-dir {input.db_dir} \
--min-query-len {params.min_query_len} \
--min-query-cov {params.min_query_cov} \
{input.reads} \
--out-file {output} \
--log {log}
fi
'''


rule profiling_kmcp_search_merge:
input:
expand(os.path.join(config["output"]["profiling"],
"search/kmcp/{{sample}}/{{sample}}.kmcp_search@{kmcp_db}.tsv.gz"),
kmcp_db=KMCP_DBS)
output:
os.path.join(config["output"]["profiling"],
"search/kmcp/{sample}/{sample}.kmcp_search@all.tsv.gz")
conda:
config["envs"]["kmcp"]
log:
os.path.join(config["output"]["profiling"],
"logs/kmcp/search_merge/{sample}.kmcp_search_merge.log")
benchmark:
os.path.join(config["output"]["profiling"],
"benchmark/kmcp/search_merge/{sample}.kmcp_search_merge.benchmark.txt")
shell:
'''
kmcp merge \
{input} \
--out-file {output} \
--log {log}
'''


KMCP_PROFILING_MODE = {
"pathogen_detection": 0,
"higher_recall": 1,
"high_recall": 2,
"default": 3,
"high_precision": 4,
"higher_precision": 5
}


rule profiling_kmcp_profile:
input:
search = os.path.join(config["output"]["profiling"],
"search/kmcp/{sample}/{sample}.kmcp_search@all.tsv.gz"),
taxid = KMCP_TAXIDMAP,
taxdump = config["params"]["profiling"]["kmcp"]["database"]["ncbi_taxdump"]
output:
default_profile = os.path.join(config["output"]["profiling"],
"profile/kmcp/{sample}/{sample}.kmcp.default_format.profile.{profiling_mode}.tsv.gz"),
metaphlan_profile = os.path.join(config["output"]["profiling"],
"profile/kmcp/{sample}/{sample}.kmcp.metaphlan_format.profile.{profiling_mode}.tsv.gz"),
cami_profile = os.path.join(config["output"]["profiling"],
"profile/kmcp/{sample}/{sample}.kmcp.CAMI_format.profile.{profiling_mode}.tsv.gz"),
binning_result = os.path.join(config["output"]["profiling"],
"profile/kmcp/{sample}/{sample}.kmcp.binning.{profiling_mode}.gz")
conda:
config["envs"]["kmcp"]
log:
os.path.join(config["output"]["profiling"],
"logs/kmcp/profile/{sample}.kmcp_profile_{profiling_mode}.log")
benchmark:
os.path.join(config["output"]["profiling"],
"benchmark/kmcp/profile/{sample}.kmcp_profile_{profiling_mode}.benchmark.txt")
params:
profiling_mode = lambda wildcards: KMCP_PROFILING_MODE[wildcards.profiling_mode],
min_query_cov = config["params"]["profiling"]["kmcp"]["profile"]["min_query_cov"],
min_chunks_reads = config["params"]["profiling"]["kmcp"]["profile"]["min_chunks_reads"],
min_chunks_fraction = config["params"]["profiling"]["kmcp"]["profile"]["min_chunks_fraction"],
max_chunks_depth_stdev = config["params"]["profiling"]["kmcp"]["profile"]["max_chunks_depth_stdev"],
min_uniq_reads = config["params"]["profiling"]["kmcp"]["profile"]["min_uniq_reads"],
min_hic_ureads = config["params"]["profiling"]["kmcp"]["profile"]["min_hic_ureads"],
min_hic_ureads_qcov = config["params"]["profiling"]["kmcp"]["profile"]["min_hic_ureads_qcov"],
min_hic_ureads_prop = config["params"]["profiling"]["kmcp"]["profile"]["min_hic_ureads_prop"],
external_opts = config["params"]["profiling"]["kmcp"]["profile"]["external_opts"]
shell:
'''
kmcp profile \
{input.search} \
--taxid-map {input.taxid} \
--taxdump {input.taxdump} \
--level species \
--mode {params.profiling_mode} \
--min-query-cov {params.min_query_cov} \
--min-chunks-reads {params.min_chunks_reads} \
--min-chunks-fraction {params.min_chunks_fraction} \
--max-chunks-depth-stdev {params.max_chunks_depth_stdev} \
--min-uniq-reads {params.min_uniq_reads} \
--min-hic-ureads {params.min_hic_ureads} \
--min-hic-ureads-qcov {params.min_hic_ureads_qcov} \
--min-hic-ureads-prop {params.min_hic_ureads_prop} \
{params.external_opts} \
--out-prefix {output.default_profile} \
--metaphlan-report {output.metaphlan_profile} \
--cami-report {output.cami_profile} \
--binning-result {output.binning_result} \
--log {log}
'''


rule profiling_kmcp_all:
input:
expand([
os.path.join(config["output"]["profiling"],
"profile/kmcp/{sample}/{sample}.kmcp.{profile_format}.profile.{profiling_mode}.tsv.gz"),
os.path.join(config["output"]["profiling"],
"profile/kmcp/{sample}/{sample}.kmcp.binning.{profiling_mode}.gz")],
sample=SAMPLES_ID_LIST,
profile_format=["default_format", "metaphlan_format", "CAMI_format"],
profiling_mode=list(KMCP_PROFILING_MODE.keys()))

else:
rule profiling_kmcp_all:
input:
14 changes: 14 additions & 0 deletions quantpi/snakefiles/profiling_wf.smk
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,26 @@ READS_FORMAT = "sra" \
else "fastq"


KMCP_DBS = []
KMCP_TAXIDMAP = []
if config["params"]["profiling"]["kmcp"]["do"]["bacteriome"]:
KMCP_DBS.append("bacteriome")
KMCP_TAXIDMAP.append(config["params"]["profiling"]["kmcp"]["database"]["bacteriome"])
if config["params"]["profiling"]["kmcp"]["do"]["mycobiome"]:
KMCP_DBS.append("mycobiome")
KMCP_TAXIDMAP.append(config["params"]["profiling"]["kmcp"]["database"]["mycobiome"])
if config["params"]["profiling"]["kmcp"]["do"]["virome"]:
KMCP_DBS.append("virome")
KMCP_TAXIDMAP.append(config["params"]["profiling"]["kmcp"]["database"]["virome"])


include: "../rules/simulate.smk"
include: "../rules/raw.smk"
include: "../rules/trimming.smk"
include: "../rules/rmhost.smk"
include: "../rules/qcreport.smk"
include: "../rules/profiling_dna.smk"
include: "../rules/profiling_kmer.smk"
include: "../rules/profiling_marker.smk"
include: "../rules/profiling_custom.smk"
include: "../rules/profiling_function.smk"
Expand Down

0 comments on commit 0cf75f3

Please sign in to comment.