Add GMM-Demux #5641

Closed · wants to merge 18 commits

31 changes: 19 additions & 12 deletions modules/nf-core/cellranger/count/templates/cellranger_count.py
@@ -4,11 +4,12 @@

Copyright (c) Gregor Sturm 2023 - MIT License
"""
from subprocess import run

import re
import shlex
from pathlib import Path
from subprocess import run
from textwrap import dedent
import shlex
import re


def chunk_iter(seq, size):
@@ -34,11 +35,11 @@ def chunk_iter(seq, size):
# Match R1 in the filename, but only if it is followed by a non-digit or non-character
# match "file_R1.fastq.gz", "file.R1_000.fastq.gz", etc. but
# do not match "SRR12345", "file_INFIXR12", etc
filename_pattern = r'([^a-zA-Z0-9])R1([^a-zA-Z0-9])'
filename_pattern = r"([^a-zA-Z0-9])R1([^a-zA-Z0-9])"

for i, (r1, r2) in enumerate(chunk_iter(fastqs, 2), start=1):
# double escapes are required because nextflow processes this python 'template'
if re.sub(filename_pattern, r'\\1R2\\2', r1.name) != r2.name:
if re.sub(filename_pattern, r"\\1R2\\2", r1.name) != r2.name:
raise AssertionError(
dedent(
f"""\
@@ -58,13 +59,19 @@ def chunk_iter(seq, size):
run(
# fmt: off
[
"cellranger", "count",
"--id", "${prefix}",
"--fastqs", str(fastq_all),
"--transcriptome", "${reference.name}",
"--localcores", "${task.cpus}",
"--localmem", "${task.memory.toGiga()}",
*shlex.split("""${args}""")
"cellranger",
"count",
"--id",
"${prefix}",
"--fastqs",
str(fastq_all),
"--transcriptome",
"${reference.name}",
"--localcores",
"${task.cpus}",
"--localmem",
"${task.memory.toGiga()}",
*shlex.split("""${args}"""),
],
# fmt: on
check=True,
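
As an aside on the filename_pattern hunk in this file: only the quoting of the pattern changes, while the pattern itself and the comment describing it stay the same. A minimal Groovy sanity check of the documented behaviour (the filenames are taken from that comment and are illustrative; the Python template additionally double-escapes the backreferences because Nextflow renders it as a template):

    // Pattern copied from the template: R1 must be flanked by non-alphanumeric characters.
    def filenamePattern = /([^a-zA-Z0-9])R1([^a-zA-Z0-9])/

    // Flanked occurrences are rewritten to R2 ...
    assert 'file_R1.fastq.gz'.replaceAll(filenamePattern, '$1R2$2') == 'file_R2.fastq.gz'
    assert 'file.R1_000.fastq.gz'.replaceAll(filenamePattern, '$1R2$2') == 'file.R2_000.fastq.gz'

    // ... while accession-style or infix occurrences are left untouched.
    assert 'SRR12345_1.fastq.gz'.replaceAll(filenamePattern, '$1R2$2') == 'SRR12345_1.fastq.gz'
    assert 'file_INFIXR12.fastq.gz'.replaceAll(filenamePattern, '$1R2$2') == 'file_INFIXR12.fastq.gz'

The same substitution is what the template uses to confirm that each R1 file is followed by its matching R2 file before handing the directory to Cell Ranger.
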
26 changes: 17 additions & 9 deletions modules/nf-core/cellranger/multi/templates/cellranger_multi.py
@@ -5,11 +5,11 @@
Copyright (c) Felipe Almeida 2024 - MIT License
"""

from subprocess import run
import re
import shlex
from pathlib import Path
from subprocess import run
from textwrap import dedent
import shlex
import re


def chunk_iter(seq, size):
@@ -36,7 +36,9 @@ def chunk_iter(seq, size):
# - ...
# Since we require fastq files in the input channel to be ordered such that a R1/R2 pair
# of files follows each other, ordering will get us a sequence of [R1, R2, R1, R2, ...]
fastqs = sorted(p for p in Path(".").glob(f"fastqs/{modality}/*/*") if p.name != EMPTY_FILE)
fastqs = sorted(
p for p in Path(".").glob(f"fastqs/{modality}/*/*") if p.name != EMPTY_FILE
)
assert len(fastqs) % 2 == 0

# target directory in which the renamed fastqs will be placed
@@ -46,7 +48,9 @@ def chunk_iter(seq, size):
for i, (r1, r2) in enumerate(chunk_iter(fastqs, 2), start=1):
# will we rename the files or just move it with the same name to
# the 'fastq_all' directory which is where files are expected?
if "${skip_renaming}" == "true": # nf variables are true/false, which are different from Python
if (
"${skip_renaming}" == "true"
): # nf variables are true/false, which are different from Python
resolved_name_r1 = r1.name
resolved_name_r2 = r2.name

@@ -140,20 +144,24 @@ def chunk_iter(seq, size):
# check the extra data that is included
#
if len("${include_cmo}") > 0:
with open("${cmo_csv_text}", "r") as input_conf:
with open("${cmo_csv_text}") as input_conf:
config_txt = config_txt + "\\n${include_cmo}\\n" + input_conf.read() + "\\n"

if len("${include_beam}") > 0:
with open("${beam_csv_text}", "r") as input_conf, open("${beam_antigen_csv}", "r") as input_csv:
with open("${beam_csv_text}") as input_conf, open(
"${beam_antigen_csv}"
) as input_csv:
config_txt = config_txt + "\\n${include_beam}\\n" + input_conf.read() + "\\n"
config_txt = config_txt + "[feature]\\n" + input_csv.read() + "\\n"

if len("${include_frna}") > 0:
with open("${frna_csv_text}", "r") as input_conf:
with open("${frna_csv_text}") as input_conf:
config_txt = config_txt + "\\n${include_frna}\\n" + input_conf.read() + "\\n"

# Remove blank lines from config text
config_txt = "\\n".join([line for line in config_txt.split("\\n") if line.strip() != ""])
config_txt = "\\n".join(
[line for line in config_txt.split("\\n") if line.strip() != ""]
)

# Save config file
with open("${config}", "w") as file:
@@ -2,11 +2,11 @@

# Written by Pranathi Vemuri, later modified by Jonathan Manning and released under the MIT license.

import os
import logging
import os
import platform
from typing import Iterator, Tuple
from itertools import groupby
from typing import Iterator, Tuple


def setup_logging() -> logging.Logger:
@@ -3,11 +3,11 @@

"""Provide functions to merge multiple versions.yml files."""


import yaml
import platform
from textwrap import dedent

import yaml


def _make_versions_html(versions):
"""Generate a tabular HTML output of all versions for MultiQC."""
@@ -58,7 +59,9 @@ def main():
}

with open("$versions") as f:
versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module
versions_by_process = (
yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module
)

# aggregate versions by the module name (derived from fully-qualified process name)
versions_by_module = {}
51 changes: 36 additions & 15 deletions modules/nf-core/custom/tx2gene/templates/tx2gene.py
@@ -2,13 +2,12 @@

# Written by Lorena Pantano with subsequent reworking by Jonathan Manning. Released under the MIT license.

import logging
import argparse
import glob
import logging
import os
import platform
import re
from collections import Counter, defaultdict, OrderedDict
from collections import Counter, OrderedDict
from collections.abc import Set
from typing import Dict

@@ -17,6 +16,7 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def format_yaml_like(data: dict, indent: int = 0) -> str:
"""Formats a dictionary to a YAML-like string.

@@ -36,6 +36,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str


def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]:
"""
Read the top 100 transcripts from the quantification file.
@@ -50,9 +51,13 @@ def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]:
try:
# Find the quantification file within the directory
quant_file_path = glob.glob(os.path.join(quant_dir, "*", file_pattern))[0]
with open(quant_file_path, "r") as file_handle:
with open(quant_file_path) as file_handle:
# Read the file and extract the top 100 transcripts
return {line.split()[0] for i, line in enumerate(file_handle) if i > 0 and i <= 100}
return {
line.split()[0]
for i, line in enumerate(file_handle)
if i > 0 and i <= 100
}
except IndexError:
# Log an error and raise a FileNotFoundError if the quant file does not exist
logger.error("No quantification files found.")
@@ -81,7 +86,9 @@ def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str:
attributes_str = cols[8]
attributes = dict(re.findall(r'(\\S+) "(.*?)(?<!\\\\)";', attributes_str))

votes.update(key for key, value in attributes.items() if value in transcripts)
votes.update(
key for key, value in attributes.items() if value in transcripts
)

if not votes:
# Error out if no matching attribute is found
@@ -123,7 +130,12 @@ def parse_attributes(attributes_text: str) -> Dict[str, str]:


def map_transcripts_to_gene(
quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str
quant_type: str,
gtf_file: str,
quant_dir: str,
gene_id: str,
extra_id_field: str,
output_file: str,
) -> bool:
"""
Map transcripts to gene names and write the output to a file.
@@ -140,7 +152,9 @@ def map_transcripts_to_gene(
bool: True if the operation was successful, False otherwise.
"""
# Read the top transcripts based on quantification type
transcripts = read_top_transcripts(quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv")
transcripts = read_top_transcripts(
quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv"
)
# Discover the attribute that corresponds to transcripts in the GTF
transcript_attribute = discover_transcript_attribute(gtf_file, transcripts)

@@ -156,28 +170,35 @@ def map_transcripts_to_gene(
attr_dict = parse_attributes(cols[8])
if gene_id in attr_dict and transcript_attribute in attr_dict:
# Create a unique identifier for the transcript-gene combination
transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id])
transcript_gene_pair = (
attr_dict[transcript_attribute],
attr_dict[gene_id],
)

# Check if the combination has already been seen
if transcript_gene_pair not in seen:
# If it's a new combination, write it to the output and add to the seen set
extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id])
output_handle.write(f"{attr_dict[transcript_attribute]}\\t{attr_dict[gene_id]}\\t{extra_id}\\n")
output_handle.write(
f"{attr_dict[transcript_attribute]}\\t{attr_dict[gene_id]}\\t{extra_id}\\n"
)
seen.add(transcript_gene_pair)

return True


# Main function to parse arguments and call the mapping function
if __name__ == "__main__":
if '${task.ext.prefix}' != "null":
if "${task.ext.prefix}" != "null":
prefix = "${task.ext.prefix}."
elif '$meta.id' != "null":
prefix = '${meta.id}.'
elif "$meta.id" != "null":
prefix = "${meta.id}."
else:
prefix = ''
prefix = ""

if not map_transcripts_to_gene('$quant_type', '$gtf', 'quants', '$id', '$extra', f"{prefix}tx2gene.tsv"):
if not map_transcripts_to_gene(
"$quant_type", "$gtf", "quants", "$id", "$extra", f"{prefix}tx2gene.tsv"
):
logger.error("Failed to map transcripts to genes.")

# Write the versions
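
For context on the prefix handling in this template: in nf-core pipelines task.ext.prefix is normally supplied from the pipeline configuration. A minimal sketch of such a config entry, assuming the process is addressed by a selector like CUSTOM_TX2GENE (the selector name and the suffix are illustrative, not part of this PR):

    process {
        withName: 'CUSTOM_TX2GENE' {
            // Resolved per task; the template appends the trailing dot itself.
            ext.prefix = { "${meta.id}.annotated" }
        }
    }

When no ext.prefix is configured, the template falls back to "${meta.id}." and finally to an empty prefix.
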
9 changes: 9 additions & 0 deletions modules/nf-core/gmmdemux/environment.yml
@@ -0,0 +1,9 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
name: "gmmdemux"
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- "bioconda::gmm-demux=0.2.2.3"
67 changes: 67 additions & 0 deletions modules/nf-core/gmmdemux/main.nf
@@ -0,0 +1,67 @@

process GMMDEMUX {
tag "$meta.id"
label 'process_low'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gmm-demux:0.2.2.3--pyh7cba7a3_0':
'biocontainers/gmm-demux:0.2.2.3--pyh7cba7a3_0' }"

input:
tuple val(meta), path(cell_hashing_barcodes,stageAs: "hto_files/*"), path(cell_hashing_matrix,stageAs: "hto_files/*"),path(cell_hashing_features,stageAs: "hto_files/*"),val(hto_names)
val type_report
val summary_report
val skip

output:
tuple val(meta), path("${meta.id}/barcodes.tsv.gz" ), emit: barcodes
tuple val(meta), path("${meta.id}/*.mtx.gz" ), emit: matrix
tuple val(meta), path("${meta.id}/features.tsv.gz" ), emit: features
tuple val(meta), path("${meta.id}/classification_report_${meta.id}" ), emit: classification_report
tuple val(meta), path("summary_report_${meta.id}.txt" ), emit: summary_report, optional: true
path "versions.yml" , emit: versions
Comment on lines +18 to +23 (@tstoeriko, Contributor, Jun 5, 2024):

Suggested change:

    tuple val(meta), path("${prefix}/barcodes.tsv.gz") , emit: barcodes
    tuple val(meta), path("${prefix}/*.mtx.gz") , emit: matrix
    tuple val(meta), path("${prefix}/features.tsv.gz") , emit: features
    tuple val(meta), path("${prefix}/classification_report_${prefix}"), emit: classification_report
    tuple val(meta), path("summary_report_${prefix}.txt") , emit: summary_report, optional: true
    path "versions.yml" , emit: versions


when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def type_report = type_report ? "-f test/classification_report_${prefix}" : "-s test/classification_report_${prefix}"
Review comment (Contributor):

Suggested change:

    def type_report = type_report ? "-f ${prefix}/classification_report_${prefix}" : "-s ${prefix}/classification_report_${prefix}"

def skip = skip ? "--skip $skip" : ""
def summary_rep = summary_report ? "-r summary_report_${prefix}.txt" : ""
def VERSION = '0.2.2.3' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
"""
if [[ ${summary_report} == true ]]; then
cat /dev/null > summary_report_${prefix}.txt
echo "summary report file created"
fi
Comment on lines +36 to +39 (Contributor):

Is this necessary because the tool itself does not create the summary report file?

GMM-demux $args \\
$type_report \\
$summary_rep \\
$skip \\
hto_files \\
$hto_names
cat <<-END_VERSIONS > versions.yml
"${task.process}":
GMM-Demux: $VERSION
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
def VERSION = '0.2.2.3'
"""
mkdir test
touch test/barcodes.tsv.gz
touch test/features.tsv.gz
touch test/matrix.mtx.gz
Comment on lines +56 to +59 (Contributor):

Suggested change:

    touch barcodes.tsv.gz
    touch features.tsv.gz
    touch matrix.mtx.gz

Reply (Contributor Author):

I tried with the suggested lines, but it crashes the test for stub.

Reply (@tstoeriko, Contributor, Jun 5, 2024):

I think the paths just need to be updated to match the new output patterns, then it should hopefully work.

Suggested change:

    mkdir "${prefix}"
    touch "${prefix}/barcodes.tsv.gz"
    touch "${prefix}/features.tsv.gz"
    touch "${prefix}/matrix.mtx.gz"
    touch "${prefix}/classification_report_${prefix}"
    touch "summary_report_${prefix}.txt"

touch test/classification_report_test

cat <<-END_VERSIONS > versions.yml
"${task.process}":
GMM-Demux: $VERSION
END_VERSIONS
"""
}
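
To round off the new module, a minimal sketch of how GMMDEMUX might be wired into a workflow, based only on the input and output declarations above. The include path follows the usual nf-core module layout; the meta map, file paths, and HTO names are illustrative placeholders rather than values from this PR:

    include { GMMDEMUX } from './modules/nf-core/gmmdemux/main'

    workflow {
        // One emission per sample, matching the declared input tuple:
        // meta map, HTO barcodes/matrix/features files, and the HTO names value.
        ch_hto = Channel.of([
            [ id: 'sample1' ],                // meta (illustrative)
            file('hto_demo/barcodes.tsv.gz'), // cell_hashing_barcodes (illustrative path)
            file('hto_demo/matrix.mtx.gz'),   // cell_hashing_matrix (illustrative path)
            file('hto_demo/features.tsv.gz'), // cell_hashing_features (illustrative path)
            'HTO_1,HTO_2'                     // hto_names (illustrative)
        ])

        GMMDEMUX(
            ch_hto,
            true, // type_report: truthy selects -f, falsy selects -s in the script block above
            true, // summary_report: also create summary_report_<prefix>.txt
            []    // skip: empty, so no --skip argument is added
        )

        GMMDEMUX.out.classification_report.view()
    }

Any additional GMM-Demux command-line options would go through task.ext.args, as in other nf-core modules.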