diff --git a/README.md b/README.md index 450e725..fb8c2cf 100644 --- a/README.md +++ b/README.md @@ -116,15 +116,21 @@ assembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. +### Submission study + +All data submitted through this pipeline must be associated with an ENA study (project). You can either pass an accession of your existing study via `--submission_study`or provide a metadata file via `--study_metadata` and the pipeline will register the study with ENA before submitting your data. + +See the [usage documentation](docs/usage.md#submission-study) for more details. + ### Required parameters: -| Parameter | Description | -| -------------------- | --------------------------------------------------------------------------------- | -| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` | -| `--input` | Path to the samplesheet describing the data to be submitted | -| `--outdir` | Path to the output directory for pipeline results | -| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to | -| `--centre_name` | Name of the submitter's organisation | +| Parameter | Description | +| ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------- | +| `--mode` | Type of the data to be submitted. 
Options: `[mags, bins, metagenomic_assemblies]` | +| `--input` | Path to the samplesheet describing the data to be submitted | +| `--outdir` | Path to the output directory for pipeline results | +| `--submission_study` OR `--study_metadata` | ENA study accession (PRJ/ERP) to submit the data to OR metadata file in JSON/TSV/CSV format to register new study | +| `--centre_name` | Name of the submitter's organisation | ### Optional parameters: diff --git a/assets/study_metadata.json b/assets/study_metadata.json new file mode 100644 index 0000000..fbc2b28 --- /dev/null +++ b/assets/study_metadata.json @@ -0,0 +1,6 @@ +{ + "alias": "study-example-2026", + "study_title": "Example metagenome study", + "study_abstract": "Description of the study aims and methods.", + "existing_study_type": "Metagenomics" +} diff --git a/assets/study_metadata.tsv b/assets/study_metadata.tsv new file mode 100644 index 0000000..2389f1d --- /dev/null +++ b/assets/study_metadata.tsv @@ -0,0 +1,2 @@ +alias study_title study_abstract existing_study_type +study-example-2026 Example metagenome study Description of the study aims and methods. 
Metagenomics diff --git a/bin/submit_study.py b/bin/submit_study.py new file mode 100755 index 0000000..28c1f9a --- /dev/null +++ b/bin/submit_study.py @@ -0,0 +1,633 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import csv +import datetime +import hashlib +import json +import logging +import os +import sys +import xml.etree.ElementTree as ET +from io import BytesIO +from pathlib import Path +from typing import Any, Final + +import click +import requests +from requests.auth import HTTPBasicAuth + + +# ----------------------------------------------------------- +# Logging +# ----------------------------------------------------------- + +logging.basicConfig( + format="%(levelname)s: %(message)s", + level=logging.INFO, + stream=sys.stderr, +) +logger = logging.getLogger() + + +# ----------------------------------------------------------- +# Credentials +# ----------------------------------------------------------- + + +def get_credentials() -> tuple[str, str]: + """Read ENA credentials from environment variables. + + Returns: + Tuple of (*username*, *password*). + + Raises: + SystemExit: If either variable is unset or empty. + """ + username = os.environ.get("ENA_WEBIN", "").strip() + password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() + if not username or not password: + logger.error("ENA_WEBIN and ENA_WEBIN_PASSWORD environment variables must be set") + sys.exit(1) + return username, password + + +# ----------------------------------------------------------- +# ENA API helpers +# ----------------------------------------------------------- + +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" + + +def submit_xml( + base_url: str, + auth: HTTPBasicAuth, + xml_bytes: bytes, +) -> ET.Element: + """Submit an XML document to ENA via Webin REST API v2. + + Args: + base_url: ENA submission service base URL. + auth: HTTP basic-auth credentials. 
+ xml_bytes: Serialised XML submission document. + + Returns: + Parsed receipt XML element tree root. + """ + url = f"{base_url}/submit" + headers = { + "Content-Type": "application/xml", + "Accept": "application/xml", + } + resp = requests.post( + url, data=xml_bytes, + headers=headers, auth=auth, timeout=120, + ) + resp.raise_for_status() + return ET.fromstring(resp.content) + + +# ----------------------------------------------------------- +# XML utilities +# ----------------------------------------------------------- + + +def xml_to_bytes(root: ET.Element) -> bytes: + """Serialise an ElementTree element to UTF-8 bytes.""" + tree = ET.ElementTree(root) + buf = BytesIO() + tree.write(buf, encoding="UTF-8", xml_declaration=True) + return buf.getvalue() + + +# ----------------------------------------------------------- +# Hold-until date validation +# ----------------------------------------------------------- + +_MAX_HOLD_YEARS: Final = 2 + + +def validate_hold_until(hold_until: str) -> datetime.date: + """Parse and validate a hold-until date string. + + Args: + hold_until: Date string in ``YYYY-MM-DD`` format. + + Returns: + Parsed date. + + Raises: + click.BadParameter: If the date format is invalid, + in the past, or more than 2 years from today. + """ + try: + hold_date = datetime.date.fromisoformat(hold_until) + except ValueError: + raise click.BadParameter( + f"Invalid date format: {hold_until!r}. Expected YYYY-MM-DD." + ) from None + + today = datetime.date.today() + max_date = today.replace(year=today.year + _MAX_HOLD_YEARS) + + if hold_date > max_date: + raise click.BadParameter( + f"Hold date {hold_until} is more than {_MAX_HOLD_YEARS} years from today" + f" ({today}). Maximum allowed: {max_date}." + ) + + if hold_date <= today: + raise click.BadParameter( + f"Hold date {hold_until} is not in the future (today is {today})." 
+ ) + + return hold_date + + +# ----------------------------------------------------------- +# Study metadata field definitions +# ----------------------------------------------------------- + +#: Fields that must be present and non-empty in every record. +_REQUIRED_FIELDS: Final[frozenset[str]] = frozenset({ + "alias", + "study_title", +}) + +#: Fields that are recognised but optional. +_OPTIONAL_FIELDS: Final[frozenset[str]] = frozenset({ + "project_name", + "study_abstract", + "study_description", + "existing_study_type", + "new_study_type", +}) + +#: All recognised field names (required + optional). +_ALL_FIELDS: Final[frozenset[str]] = _REQUIRED_FIELDS | _OPTIONAL_FIELDS + + +# ----------------------------------------------------------- +# File loading (JSON, CSV, TSV) +# ----------------------------------------------------------- + + +def extract_records_from_tabular( + filepath: str | Path, + delimiter: str = ",", +) -> list[dict[str, str]]: + """Extract record dicts from a CSV or TSV file. + + Only columns present in _ALL_FIELDS are retained; + unknown columns are ignored. + + Args: + filepath: Path to the tabular file. + delimiter: Column delimiter character. + + Returns: + List of record dicts. + """ + records = [] + + with open(filepath, newline="", encoding="utf-8") as fh: + reader = csv.DictReader(fh, delimiter=delimiter) + for line in reader: + record = {} + for col in _ALL_FIELDS: + value = line.get(col, "").strip() + if value: + record[col] = value + if record: + records.append(record) + + return records + + +def extract_records_from_json( + filepath: str | Path, +) -> list[dict[str, Any]]: + """Extract record dicts from a JSON file. + + Handle two JSON shapes: + + * Plain list of dicts. + * Single record object (no wrapper). + + Args: + filepath: Path to the JSON file. + + Returns: + List of record dicts, or [] if unrecognised. 
+ """ + with open(filepath) as fh: + input_data = json.load(fh) + + if isinstance(input_data, list): + return input_data + + if isinstance(input_data, dict): + return [input_data] + + return [] + + +def load_and_validate_input_file( + filepath: str | Path, +) -> list[dict[str, Any]]: + """Load and validate records from a supported file format. + + Supported formats: JSON, CSV, TSV. Other formats will cause a ValueError. + Records are validated against _REQUIRED_FIELDS before being returned; + missing required fields will cause a ValueError. + + Args: + filepath: Path to the input file. + + Returns: + List of record dicts. If the file format is + unrecognised (based on file extension) or required fields are missing, + raises ValueError. + """ + ext = Path(filepath).suffix.lower() + if ext == ".json": + records = extract_records_from_json(filepath) + elif ext == ".csv": + records = extract_records_from_tabular(filepath, delimiter=",") + elif ext == ".tsv": + records = extract_records_from_tabular(filepath, delimiter="\t") + else: + raise ValueError(f"Unsupported file format: {ext}. Supported: .json, .csv, .tsv") + + if not records: + raise ValueError(f"File {filepath} seems to be empty. 
Check the format and content.") + + for record in records: + for field in _REQUIRED_FIELDS: + if not record.get(field, "").strip(): + raise ValueError( + f"Record with alias {record.get('alias', '')} is missing required field: {field}" + ) + + return records + + +# ----------------------------------------------------------- +# Result output +# ----------------------------------------------------------- + + +def write_results( + results: dict[str, list[dict[str, Any]]], + output_path: Path | None, +) -> None: + """Write JSON results to file or stdout.""" + json_str = json.dumps(results, indent=2) + if output_path: + with open(output_path, "w") as fh: + fh.write(json_str + "\n") + logger.info("Results written to %s", output_path) + else: + print(json_str) + + +# ----------------------------------------------------------- +# XML construction +# ----------------------------------------------------------- + + +def build_submission_xml( + studies: list[dict[str, Any]], + hold_until: str | None = None, + action: str = "ADD", + test: bool = False, +) -> ET.Element: + """Build a ```` XML document for submitting studies. + + Args: + studies: Study metadata dicts. + hold_until: Optional hold-until date string + (``YYYY-MM-DD``). + action: Submission action — ``"ADD"`` for new studies + or ``"MODIFY"`` to update existing ones. + test: If ``True``, append a timestamp-based hash to aliases + for uniqueness in test submissions. + + Returns: + Root ```` element. 
+ """ + webin = ET.Element("WEBIN") + + # SUBMISSION_SET + submission_set = ET.SubElement(webin, "SUBMISSION_SET") + submission = ET.SubElement(submission_set, "SUBMISSION") + sub_alias = f"study-submission-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" + submission.set("alias", sub_alias) + actions = ET.SubElement(submission, "ACTIONS") + main_action = ET.SubElement(actions, "ACTION") + ET.SubElement(main_action, action.upper()) + if hold_until: + hold_action = ET.SubElement(actions, "ACTION") + hold_el = ET.SubElement(hold_action, "HOLD") + hold_el.set("HoldUntilDate", hold_until) + + # PROJECT_SET + project_set = ET.SubElement(webin, "PROJECT_SET") + for study in studies: + _add_project_element(project_set, study, test=test) + return webin + + +def _add_project_element( + project_set: ET.Element, + study: dict[str, Any], + test: bool = False, +) -> None: + """Append a ```` element to *project_set*.""" + alias = study.get("alias", "") + if test: + # Append 8-character hash of current timestamp for uniqueness in test mode + timestamp_hash = hashlib.md5( + datetime.datetime.now().isoformat().encode() + ).hexdigest()[:8] + alias = f"{alias}_{timestamp_hash}" + + project = ET.SubElement(project_set, "PROJECT") + project.set("alias", alias) + + name_text = study.get("project_name", study.get("study_title", "")) + if name_text: + name_el = ET.SubElement(project, "NAME") + name_el.text = name_text + + title_el = ET.SubElement(project, "TITLE") + title_el.text = study.get("study_title", "") + + desc_text = ( + study.get("study_abstract") + or study.get("study_description", "") + ) + if desc_text: + desc_el = ET.SubElement(project, "DESCRIPTION") + desc_el.text = desc_text + + sp = ET.SubElement(project, "SUBMISSION_PROJECT") + ET.SubElement(sp, "SEQUENCING_PROJECT") + # TODO: Check existing_study_type and new_study_type metadata fields, do we need those? 
+ study_type = study.get("existing_study_type") + if study_type: + attrs = ET.SubElement( + project, "PROJECT_ATTRIBUTES", + ) + _add_project_attribute( + attrs, "existing_study_type", study_type, + ) + new_type = study.get("new_study_type") + if new_type and study_type == "Other": + _add_project_attribute( + attrs, "new_study_type", new_type, + ) + + +def _add_project_attribute( + parent: ET.Element, + tag_text: str, + value_text: str, +) -> None: + """Append a ```` to *parent*.""" + attr = ET.SubElement(parent, "PROJECT_ATTRIBUTE") + tag_el = ET.SubElement(attr, "TAG") + tag_el.text = tag_text + val_el = ET.SubElement(attr, "VALUE") + val_el.text = value_text + + +# ----------------------------------------------------------- +# Receipt parsing +# ----------------------------------------------------------- + + +def parse_xml_receipt( + receipt_root: ET.Element, +) -> tuple[bool, list[dict[str, str]], list[str]]: + """Parse an ENA XML receipt for study submissions. + + Args: + receipt_root: Root element of the receipt XML. + + Returns: + Tuple of (*success*, *accessions*, *messages*). + """ + success = receipt_root.get("success", "false").lower() == "true" + accessions: list[dict[str, str]] = [] + messages: list[str] = [] + + msgs_el = receipt_root.find("MESSAGES") + if msgs_el is not None: + for info in msgs_el.findall("INFO"): + messages.append(f"INFO: {info.text}") + for err in msgs_el.findall("ERROR"): + messages.append(f"ERROR: {err.text}") + + # TODO: "accession" should be present for successful submissions + # TODO: remove get default and log error if missing. 
+ for proj in receipt_root.findall("PROJECT"): + acc_info: dict[str, str] = { + "alias": proj.get("alias", ""), + "accession": proj.get("accession", ""), + "status": proj.get("status", ""), + "holdUntilDate": proj.get("holdUntilDate", ""), + } + ext = proj.find("EXT_ID") + if ext is not None: + acc_info["external_accession"] = ext.get("accession", "") + acc_info["external_type"] = ext.get("type", "") + accessions.append(acc_info) + + # Some receipts use STUDY instead of PROJECT. + for study in receipt_root.findall("STUDY"): + accessions.append({ + "alias": study.get("alias", ""), + "accession": study.get("accession", ""), + "status": study.get("status", ""), + }) + + return success, accessions, messages + + +# ----------------------------------------------------------- +# Submission helper +# ----------------------------------------------------------- + + +def _do_submission( + base_url: str, + auth: Any, + xml_bytes: bytes, + action: str, + results: dict[str, list[dict[str, Any]]], + env_label: str, + dry_run: bool, +) -> bool: + """Validate, optionally submit, and parse one batch. + + Args: + base_url: ENA submission base URL. + auth: HTTP basic-auth credentials. + xml_bytes: Serialised XML submission document. + action: Label for log messages (``"ADD"`` or + ``"MODIFY"``). + results: Results dict to accumulate into. + env_label: ``"TEST server"`` or ``"LIVE server"``. + dry_run: If ``True``, skip the actual submission. + + Returns: + ``True`` if the batch succeeded (or dry run). 
+ """ + if dry_run: + logger.info("DRY RUN — skipping %s submission", action) + logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) + return True + + logger.info("Submitting %s to ENA (%s)...", action, env_label) + try: + receipt_root = submit_xml(base_url, auth, xml_bytes) + except requests.exceptions.HTTPError as exc: + logger.error("HTTP error during %s submission: %s", action, exc) + if exc.response is not None: + logger.error("Response body: %s", exc.response.text) + return False + + success, accessions, receipt_messages = parse_xml_receipt(receipt_root) + for msg in receipt_messages: + logger.info(" Receipt: %s", msg) + + if success: + logger.info("%s SUCCESSFUL", action) + for acc in accessions: + ext = acc.get("external_accession", "") + ext_suffix = f" (study: {ext})" if ext else "" + logger.info( + " %s: alias=%s accession=%s status=%s%s", + action, acc["alias"], acc["accession"], acc["status"], ext_suffix, + ) + results["submitted"].append(acc) + else: + logger.error("%s FAILED", action) + receipt_xml_str = ET.tostring( + receipt_root, encoding="unicode", + ) + logger.error("Receipt XML: %s", receipt_xml_str) + results["failed"].extend(accessions) + + return success + + +# ----------------------------------------------------------- +# Main +# ----------------------------------------------------------- + +@click.command( + help="Submit studies to ENA via the Webin REST API v2.", +) +@click.option( + "--input", "input_file", + required=True, + type=click.Path(exists=True, path_type=Path), + help="Path to study metadata file (JSON, CSV, or TSV)", +) +@click.option( + "--test", "use_test", + is_flag=True, default=False, + help="Use the ENA test service (submissions are discarded daily)", +) +@click.option( + "--hold-until", + default=None, + help="Hold studies private until this date (YYYY-MM-DD, max 2 years from now)", +) +@click.option( + "--output", + type=click.Path(path_type=Path), + default=None, + help="Path to write JSON accession results 
(default: stdout)", +) +@click.option( + "--validate", + is_flag=True, default=False, + help="Validate and build XML but do not submit to ENA", +) +def main( + input_file: Path, + use_test: bool, + hold_until: str | None, + output: Path | None, + validate: bool, +) -> None: + """Submit studies to ENA via the Webin REST API v2.""" + username, password = get_credentials() + + env_label = "TEST server" if use_test else "LIVE server" + logger.info("ENA Study Submission — environment: %s", env_label) + base_url = TEST_URL if use_test else PROD_URL + + auth = HTTPBasicAuth(username, password) + logger.debug("Auth username: %s", username) + + if hold_until: + validate_hold_until(hold_until) + + # -- Step 1: Load input file ------------------------- + logger.info("Loading input: %s", input_file) + try: + studies = load_and_validate_input_file(input_file) + except ValueError as exc: + # Re-raise as click.BadParameter to get nice error formatting without a full stack trace + raise click.BadParameter(str(exc), param_hint="--input") from exc + + logger.info("Loaded %d study/studies from input", len(studies)) + + results: dict[str, list[dict[str, Any]]] = { + "submitted": [], + "failed": [], + } + + # -- Step 2: Build and submit XML -------------------- + logger.info("Building ADD XML for %d study/studies...", len(studies)) + xml_root = build_submission_xml( + studies, + hold_until=hold_until, + action="ADD", + test=use_test, + ) + xml_bytes = xml_to_bytes(xml_root) + logger.info("XML document size: %d bytes", len(xml_bytes)) + logger.debug("Generated XML:\n%s", xml_bytes.decode("utf-8")) + ok = _do_submission( + base_url, auth, xml_bytes, + action="ADD", + results=results, + env_label=env_label, + dry_run=validate, + ) + + if not ok: + sys.exit(1) + + # -- Step 3: Output results -------------------------- + write_results(results, output) + + logger.info("=" * 60) + logger.info("SUBMISSION SUMMARY") + logger.info(" Submitted (ADD): %d", len(results["submitted"])) + for 
submission in results["submitted"]: + alias = submission["alias"] + accession = submission["accession"] + external_accession = submission["external_accession"] + logger.info(f" {alias} -> {accession} ({external_accession})") + logger.info("=" * 60) + + +if __name__ == "__main__": + main() # type: ignore[call-arg] diff --git a/conf/test_assembly.config b/conf/test_assembly.config index d94b5bc..389e102 100644 --- a/conf/test_assembly.config +++ b/conf/test_assembly.config @@ -30,8 +30,6 @@ params { mode = "metagenomic_assemblies" submission_study = "PRJEB98843" - ena_raw_reads_study_accession = "PRJEB65995" - library = "metagenome" centre_name = "TEST_CENTER" } diff --git a/conf/test_assembly_no_study_complete_metadata.config b/conf/test_assembly_no_study_complete_metadata.config new file mode 100644 index 0000000..b1c96d7 --- /dev/null +++ b/conf/test_assembly_no_study_complete_metadata.config @@ -0,0 +1,35 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode metagenomic_assemblies complete_metadata profile' + config_profile_description = 'Single-case assembly test with complete metadata values provided' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_complete_metadata.csv' + + mode = "metagenomic_assemblies" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" + + test_upload = true + +} diff --git a/conf/test_mag_no_study_complete_metadata.config b/conf/test_mag_no_study_complete_metadata.config new file mode 100644 index 0000000..aea18b1 --- /dev/null +++ b/conf/test_mag_no_study_complete_metadata.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '16.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags complete_metadata profile' + config_profile_description = 'Single-case MAG test with complete metadata values provided' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_complete_metadata.csv' + + mode = "mags" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} diff --git a/docs/usage.md b/docs/usage.md index 0833bb6..ad32375 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -99,6 +99,70 @@ assembly_002,data/assembly_002.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 An example file is available at [assets/samplesheet_assembly.csv](../assets/samplesheet_assembly.csv). +## Submission study + +All data submitted through this pipeline must be associated with an ENA study (project). You have two options: + +### Option 1 — Use an existing study + +If you already have an ENA study, pass its accession (starting with `PRJ` or `ERP`) via `--submission_study`: + +```bash +--submission_study PRJEB12345 +``` + +You can create a study manually via the [Webin Portal](https://www.ebi.ac.uk/ena/submit/webin/login) and then use the assigned accession here. + +### Option 2 — Register a new study automatically + +Provide a study metadata file via `--study_metadata` and the pipeline will register the study with ENA before submitting your data: + +```bash +--study_metadata study_metadata.json +``` + +The pipeline accepts JSON, CSV, and TSV formats. 
+ +#### JSON formats + +Single study as a flat object: + +```json +{ + "alias": "study-gut-2026", + "study_title": "Gut microbiome study", + "study_abstract": "Characterisation of gut microbial communities" +} +``` + +#### CSV format + +```csv +alias,study_title,study_abstract +study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities +``` + +#### TSV format + +```tsv +alias study_title study_abstract +study-soil-2026 Soil microbiome study Survey of soil microbiota +``` + +#### Study metadata fields + +| Field | Required | Description | +| --------------------- | -------- | ---------------------------------------------------------------------------- | +| `study_title` | Yes | Descriptive title of the study. | +| `alias` | Yes | Unique project alias within your Webin account. Max length is 50 characters. | +| `study_abstract` | No | Free-text abstract describing the study. | +| `study_description` | No | Alternative to `study_abstract`. | +| `project_name` | No | Project name. Defaults to `study_title`. | +| `existing_study_type` | No | ENA study type (e.g. `Metagenomics`, `Other`). | +| `new_study_type` | No | Custom study type. Only used when `existing_study_type` is set to `Other`. | + +An example metadata file is available at [assets/study_metadata.json](../assets/study_metadata.json). 
+ ## Running the pipeline General command template: diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test b/modules/local/generate_assembly_manifest/tests/main.nf.test index efbdb86..d04d7a2 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test @@ -27,7 +27,7 @@ nextflow_process { assert process.success assertAll( { assert snapshot( - process.out, + sanitizeOutput(process.out), path(process.out.versions[0]).yaml ).match() }, { assert process.out.manifest.size() == 1 }, diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index 7fef896..cf8a9e1 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -11,7 +11,7 @@ ] ], "1": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" ], "manifest": [ [ @@ -22,57 +22,46 @@ ] ], "versions": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.3" + "assembly_uploader": null } } ], + "timestamp": "2026-03-13T14:02:14.937082", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": "2025-10-30T15:10:02.229709" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } }, "GENERATE_ASSEMBLY_MANIFEST completes with expected outputs": { "content": [ { - "0": [ - [ - { - "id": "test" - }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" - ] - ], - "1": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" - ], "manifest": [ [ { "id": "test" }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" + "233126d4c4d0.manifest:md5,cacedcfcce220081e7aa2f98c2f4ffd6" ] ], "versions": [ 
- "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.3" + "assembly_uploader": "assembly_uploader 1.3.4" } } ], + "timestamp": "2026-03-13T12:32:23.722449", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": "2025-10-30T15:09:57.708757" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/modules/local/genome_upload/main.nf b/modules/local/genome_upload/main.nf index f8bf1a5..3c5d348 100644 --- a/modules/local/genome_upload/main.nf +++ b/modules/local/genome_upload/main.nf @@ -11,6 +11,7 @@ process GENOME_UPLOAD { path(mags) path(table_for_upload) val(mags_or_bins_flag) + val(submission_study) output: path "results/{MAG,bin}_upload/manifests*/*.manifest" , emit: manifests @@ -34,7 +35,7 @@ process GENOME_UPLOAD { export ENA_WEBIN_PASSWORD=\$WEBIN_PASSWORD genome_upload \\ - -u $params.submission_study \\ + -u $submission_study \\ --genome_info ${table_for_upload} \\ --centre_name $params.centre_name \\ --${mags_or_bins_flag} \\ diff --git a/modules/local/registerstudy/environment.yml b/modules/local/registerstudy/environment.yml index 80dd37e..2faa83d 100644 --- a/modules/local/registerstudy/environment.yml +++ b/modules/local/registerstudy/environment.yml @@ -4,7 +4,7 @@ channels: - conda-forge - bioconda dependencies: - # TODO nf-core: List required Conda package(s). - # Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - # For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. 
- - "bioconda::assembly_uploader=1.3.2" + - conda-forge::python>=3.12 + - conda-forge::pip + - pip: + - mgnify-pipelines-toolkit==1.4.21 diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 0621043..573a38c 100644 --- a/modules/local/registerstudy/main.nf +++ b/modules/local/registerstudy/main.nf @@ -3,54 +3,47 @@ process REGISTERSTUDY { label 'process_single' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/pip_assembly-uploader:2a65298c0161c561" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mgnify-pipelines-toolkit:1.4.21--pyhdfd78af_0': + 'biocontainers/mgnify-pipelines-toolkit:1.4.21--pyhdfd78af_0' }" - input: - tuple val(meta), val(study), val(center), val(library) + // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. + // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: + // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } + input: + tuple val(meta), path(study_metadata) output: - tuple val(meta), env("STUDY_ID"), emit: study_accession - path "versions.yml" , emit: versions + tuple val(meta), path("*_accessions.json"), emit: accessions + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - echo "Generate study XMLs" - study_xmls \\ - $args \\ - --study ${study} \\ - --library ${library} \\ - --center ${center} \\ - - echo "Submit study to ENA" - submit_study \\ - $args2 \\ - --directory ${study}_upload \\ - --study ${study} 2>&1 | tee report.log - - STUDY_ID=\$(grep 'A new study accession has been created' report.log | grep -oE '(PRJ|ERP)[[:alnum:]_]+[[:digit:]]+') + submit_study.py \\ + --input 
${study_metadata} \\ + --output ${prefix}_accessions.json \\ + ${args} cat <<-END_VERSIONS > versions.yml "${task.process}": - assembly_uploader: \$(study_xmls --version) + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") END_VERSIONS """ stub: - def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}.report + echo '{"submitted":[],"failed":[]}' > ${prefix}_accessions.json cat <<-END_VERSIONS > versions.yml "${task.process}": - assembly_uploader: \$(study_xmls --version) + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") END_VERSIONS """ } diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index c459a19..f0e6ce7 100644 --- a/modules/local/registerstudy/meta.yml +++ b/modules/local/registerstudy/meta.yml @@ -1,18 +1,27 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "registerstudy" -description: This module registers a study and project and generates accessions that will be used for metagenomic assembly uploads in ENA. The study generated will reference reads from an already public project. +description: | + Submit a new study to ENA via the Webin drop-box XML submission service. + Reads study metadata from a JSON, CSV, or TSV file, + builds SUBMISSION XML and PROJECT XML, and submits to ENA. + Credentials come from the WEBIN_ACCOUNT and WEBIN_PASSWORD Nextflow secrets, + which are mapped to the ENA_WEBIN and ENA_WEBIN_PASSWORD environment + variables inside the process. keywords: - - assembly - - register + - ena + - submission - study + - project + - webin tools: - - "registerstudy": - description: "Nextflow module to register study/project to upload primary metagenome and metatranscriptome - assemblies to ENA on a per-study basis. 
The scripts generate xmls to register a new study and create manifests - necessary for submission of assemblies using webin-cli." - homepage: "https://github.com/EBI-Metagenomics/assembly_uploader" - documentation: "https://github.com/EBI-Metagenomics/assembly_uploader" - tool_dev_url: "None" + - mgnify-pipelines-toolkit: + description: | + A toolkit of utilities used in MGnify metagenomics pipelines, + including click, requests, and other dependencies required by + the ENA submission scripts. + homepage: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + documentation: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + tool_dev_url: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit doi: "" licence: ["Apache-2.0"] identifier: null @@ -21,50 +30,38 @@ input: - - meta: type: map description: | - Groovy Map containing sample information + Groovy Map containing sample information. e.g. `[ id:'sample1' ]` - - study: - type: value - description: | - Study accession with raw reads public in ENA. - Example: "PRJNA312520" - - - center: - type: value + - study_metadata: + type: file description: | - Name of the sequencing or submitting center. - Example: "Wellcome Sanger Institute" - - - library: - type: value - description: | - Library information associated with the study. - Example: "metagenome" - enum: - - metagenome - - metatranscriptome + Study metadata file in JSON, CSV, or TSV format. + JSON may be a plain list of dicts or a single dict of study records. + Required fields per record: study_title, alias. + pattern: "*.{json,csv,tsv}" output: - study_accession: - - - meta: + - accessions: + - meta: type: map description: | - Groovy Map containing sample information + Groovy Map containing sample information. e.g. `[ id:'sample1' ]` - - study: - type: value + - "*_accessions.json": + type: file description: | - Study accession registered in ENA. 
- Example: "PRJEB312520" - versions: - - "versions.yml": - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: "http://edamontology.org/format_3750" # YAML + JSON file containing the submission results with keys: + submitted (newly created accessions) and failed. + pattern: "*_accessions.json" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" authors: - - "@alisha246" + - "@timrozday" + - "@ochkalova" maintainers: - - "@alisha246" + - "@timrozday" + - "@ochkalova" diff --git a/modules/local/registerstudy/nextflow.config b/modules/local/registerstudy/nextflow.config deleted file mode 100644 index 3f71a8e..0000000 --- a/modules/local/registerstudy/nextflow.config +++ /dev/null @@ -1,9 +0,0 @@ -process { - withName: REGISTERSTUDY { - ext.args2 = '--test' - } -} -env { - ENA_WEBIN = secrets.WEBIN_ACCOUNT - ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD -} diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index d11a6d1..2ec967f 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -1,25 +1,20 @@ -// TODO nf-core: Once you have added the required tests, please run the following command to build this file: -// nf-core modules test registerstudy nextflow_process { name "Test Process REGISTERSTUDY" script "../main.nf" - config "../nextflow.config" + config "./nextflow.config" process "REGISTERSTUDY" - tag "modules" tag "registerstudy" - test("registerstudy - should register a study on ENA test server") { + test("registerstudy - submission to ENA test server (JSON metadata)") { when { process { """ input[0] = [ - [ id:'test', single_end:false ], // meta map - "PRJNA318468", - "EMG", - "metagenome" + [ id:'example_study' ], + file("$projectDir/assets/study_metadata.json", checkIfExists: true) ] """ } @@ -28,23 +23,45 @@ 
nextflow_process { then { assertAll( { assert process.success }, - //TODO improve assertions + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } ) } } - test("registerstudy - stub") { + test("registerstudy - submission to ENA test server (TSV metadata)") { + + when { + process { + """ + input[0] = [ + [ id:'example_study_tsv' ], + file("$projectDir/assets/study_metadata.tsv", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } + ) + } + } + test("registerstudy - stub") { options "-stub" when { process { """ input[0] = [ - [ id:'test', single_end:false ], // meta map - "PRJNA318468", - "EMG", - "metagenome" + [ id:'example_study' ], + file("$projectDir/assets/study_metadata.json", checkIfExists: true) ] """ } @@ -53,10 +70,8 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } - //TODO improve assertions + { assert snapshot(sanitizeOutput(process.out)).match() } ) } - } } diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 1dd3a79..d1cb6ea 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -1,65 +1,17 @@ { - "registerstudy - report - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "" - ] - ], - "1": [ - "versions.yml:md5,ea872d341a2054fde3b2c8f06bbf8177" - ], - "study_accession": [ - [ - { - "id": "test", - "single_end": false - }, - "" - ] - ], - "versions": [ - "versions.yml:md5,ea872d341a2054fde3b2c8f06bbf8177" - ] - } 
- ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.0" - }, - "timestamp": "2025-10-28T16:35:02.331026" - }, "registerstudy - stub": { "content": [ { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "" - ] - ], - "1": [ - "versions.yml:md5,1d079512d28737f6b925e85563aa2c53" - ], - "study_accession": [ + "accessions": [ [ { - "id": "test", - "single_end": false + "id": "example_study" }, - "" + "example_study_accessions.json:md5,83600b2fb33a560c25351dbd4a9bdba2" ] ], "versions": [ - "versions.yml:md5,1d079512d28737f6b925e85563aa2c53" + "versions.yml:md5,29d54944e57cbb7cb12b7605f13fd0fc" ] } ], @@ -67,6 +19,6 @@ "nf-test": "0.9.0", "nextflow": "25.04.1" }, - "timestamp": "2025-10-30T14:58:53.721718" + "timestamp": "2026-03-25T10:54:18.30373" } } \ No newline at end of file diff --git a/modules/local/registerstudy/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config new file mode 100644 index 0000000..0a1acb3 --- /dev/null +++ b/modules/local/registerstudy/tests/nextflow.config @@ -0,0 +1,18 @@ +// Test configuration for REGISTERSTUDY module. +// --test : use the ENA dev server (submissions are discarded daily) +// --validate : validate and build XML but do not submit to ENA +// +// Dummy credentials are sufficient for --validate mode since +// no HTTP calls are made. 
For real submission tests, replace with secrets: +// env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } + +process { + withName: REGISTERSTUDY { + ext.args = '--test' + } +} + +env { + ENA_WEBIN = secrets.ENA_WEBIN + ENA_WEBIN_PASSWORD = secrets.ENA_WEBIN_PASSWORD +} diff --git a/nextflow.config b/nextflow.config index d0c9146..1cb8aff 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,10 +13,7 @@ params { input = null mode = null // {mags, bins, metagenomic_assemblies} - // TODO rewrite register_study script to remove this unnecessary parameters - ena_raw_reads_study_accession = null - library = null - + study_metadata = null submission_study = null centre_name = null upload_tpa = false @@ -190,6 +187,8 @@ profiles { test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } + test_assembly_no_study_complete_metadata { includeConfig 'conf/test_assembly_no_study_complete_metadata.config' } + test_mag_no_study_complete_metadata { includeConfig 'conf/test_mag_no_study_complete_metadata.config' } } // Load nf-core custom profiles from different institutions diff --git a/nextflow_schema.json b/nextflow_schema.json index 12399c3..83b1ed2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -303,16 +303,13 @@ "description": "ENA study accession (PRJ/ERP) to submit the data to", "help_text": "Current implementation of pipeline requires to pre-register ENA project (PRJ/ERP) where you want to upload data to. Documentation how to register study: https://ena-docs.readthedocs.io/en/latest/submit/study.html" }, - "library": { + "study_metadata": { "type": "string", - "enum": ["metagenome", "metatranscriptome"], - "description": "Type of library for the submission. 
Required for creation of the new submission study.", - "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title." - }, - "ena_raw_reads_study_accession": { - "type": "string", - "description": "ENA study accession (PRJ/ERP) of the raw reads study associated with the assembly submission. Required for creation of the new submission study.", - "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title and description." + "format": "file-path", + "exists": true, + "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. Required when submission_study is not provided.", + "help_text": "File containing study metadata fields (required: study_title and alias, optional: study_abstract, existing_study_type, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", + "fa_icon": "fas fa-file-alt" }, "webincli_submit": { "type": "boolean", @@ -323,6 +320,20 @@ } } }, + "oneOf": [ + { + "required": ["submission_study"], + "not": { + "required": ["study_metadata"] + } + }, + { + "required": ["study_metadata"], + "not": { + "required": ["submission_study"] + } + } + ], "allOf": [ { "$ref": "#/$defs/input_output_options" diff --git a/nf-test.config b/nf-test.config index 3525ead..613fc05 100644 --- a/nf-test.config +++ b/nf-test.config @@ -19,6 +19,6 @@ config { // load the necessary plugins plugins { - load "nft-utils@0.0.3" + load "nft-utils@0.0.9" } } diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index befa2db..3ff34eb 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-11-20T09:32:34+00:00", - "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set WEBIN_ACCOUNT \"Webin-XXX\"`\n\n`nextflow secrets set WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- 
`stats_generation_software`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nColumns that required for now, but will be optional in the nearest future:\n\n- `completeness`\n- `contamination`\n- `genome_coverage`\n- `rRNA_presence`\n- `NCBI_lineage`\n\nThose fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package. They are described in [docs](https://github.com/EBI-Metagenomics/genome_uploader/blob/main/README.md#input-tsv-and-fields).\n\nExample `samplesheet_genome.csv`:\n\n```csv\nsample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,rRNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,false,marine,cable_bacteria,marine_sediment,false,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria\n```\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and 
nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Required parameters:\n\n| Parameter | Description |\n| -------------------- | --------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `upload/manifests/`: generated manifest files for submission\n- `upload/webin_cli/`: ENA Webin CLI reports\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. 
D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set WEBIN_ACCOUNT \"Webin-XXX\"`\n\n`nextflow secrets set WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\nFor detailed descriptions of all samplesheet columns, see the [usage documentation](docs/usage.md#samplesheet-input).\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or 
`.fasta.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- `stats_generation_software`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nColumns that required for now, but will be optional in the nearest future:\n\n- `completeness`\n- `contamination`\n- `genome_coverage`\n- `RNA_presence`\n- `NCBI_lineage`\n\nThose fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package.\n\nExample `samplesheet_genome.csv`:\n\n```csv\nsample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable_bacteria,marine_sediment,No,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria\n```\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this 
page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Required parameters:\n\n| Parameter | Description |\n| -------------------- | --------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `upload/manifests/`: generated manifest files for submission\n- `upload/webin_cli/`: ENA Webin CLI reports\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. 
D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" diff --git a/tests/assembly_no_study_complete_metadata.nf.test b/tests/assembly_no_study_complete_metadata.nf.test new file mode 100644 index 0000000..b6c857b --- /dev/null +++ b/tests/assembly_no_study_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test assembly submission workflow stub - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_assembly" + tag "test_assembly_no_study_complete_metadata" + profile "test_assembly_no_study_complete_metadata" + + test("-profile test_assembly_no_study_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + 
removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_no_study_complete_metadata.nf.test b/tests/mag_no_study_complete_metadata.nf.test new file mode 100644 index 0000000..d585286 --- /dev/null +++ b/tests/mag_no_study_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_no_study_complete_metadata" + profile "test_mag_no_study_complete_metadata" + + test("-profile test_mag_no_study_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index 918e1d7..a7897ba 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -99,6 +99,9 @@ workflow ASSEMBLYSUBMIT { .map { 
meta, coverage_file -> // Read the file and calculate average def lines = coverage_file.readLines() + if (lines.size() < 2) { + return [meta, 0.0] + } def coverages = lines[1..-1].collect { line -> line.split('\t')[1] as Double } @@ -139,6 +142,7 @@ workflow ASSEMBLYSUBMIT { def content = "${header}\n${row}" def csv_file = file("${params.outdir}/${params.mode}/${meta.id}_assembly_metadata.csv") + csv_file.parent.toFile().mkdirs() csv_file.text = content [meta, csv_file] @@ -149,11 +153,16 @@ workflow ASSEMBLYSUBMIT { // Use provided study accession directly study_accession_ch = channel.of(params.submission_study) } else { - // Register a new study + // Register a new study using the study metadata file REGISTERSTUDY( - [[id:"study"], params.ena_raw_reads_study_accession, params.centre_name, params.library ] + channel.of([[id: "study"], file(params.study_metadata)]) ) - study_accession_ch = REGISTERSTUDY.out.study_accession.map { _meta, accession -> accession } + ch_versions = ch_versions.mix(REGISTERSTUDY.out.versions) + study_accession_ch = REGISTERSTUDY.out.accessions + .map { _meta, json -> + def data = new groovy.json.JsonSlurper().parse(json) + data.submitted[0]?.accession + } } // Generate assembly manifest files and submit them to ENA diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index 23d653c..4b8ca08 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -5,6 +5,7 @@ */ include { GENOME_UPLOAD } from '../modules/local/genome_upload' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' +include { REGISTERSTUDY } from '../modules/local/registerstudy/main' include { RENAME_FASTA_FOR_CATPACK } from '../modules/local/rename_fasta_for_catpack' include { COVERM_GENOME } from '../modules/nf-core/coverm/genome' @@ -246,12 +247,28 @@ workflow GENOMESUBMIT { newLine: true ) - //GENOME_UPLOAD( - // genome_fasta.map{meta, fasta -> fasta}.collect(), - // genome_metadata_csv, - // params.mode - //) - //ch_versions = 
ch_versions.mix( GENOME_UPLOAD.out.versions ) + def study_accession_ch + if (params.submission_study) { + study_accession_ch = channel.of(params.submission_study) + } else { + REGISTERSTUDY( + channel.of([[id: "study"], file(params.study_metadata)]) + ) + ch_versions = ch_versions.mix(REGISTERSTUDY.out.versions) + study_accession_ch = REGISTERSTUDY.out.accessions + .map { _meta, json -> + def data = new groovy.json.JsonSlurper().parse(json) + data.submitted[0]?.accession + } + } + + GENOME_UPLOAD( + genome_fasta.map{meta, fasta -> fasta}.collect(), + genome_metadata_csv, + params.mode, + study_accession_ch.first() + ) + ch_versions = ch_versions.mix( GENOME_UPLOAD.out.versions ) //manifests_ch = GENOME_UPLOAD.out.manifests.flatten() // .map { manifest ->